###### Kernel notebook for cleaning up the textual data for model building purposes

In [None]:
### Imports ####

import matplotlib
from matplotlib import pyplot as plt
matplotlib.pyplot.style.use('ggplot')

from sklearn import datasets, linear_model, metrics, model_selection, pipeline, preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import numpy as np
import pandas as pd
import nltk
from bs4 import BeautifulSoup
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [None]:
### Load the custom extended stop words plus the train and test data ###

# One extra stop word per line; strip the newline and surrounding whitespace.
with open('stoppers.txt', encoding='utf-8') as f:
    stoppers = [line.strip() for line in f.readlines()]

# Start from NLTK's English stop-word list and add the custom entries to it.
stopwords_new = stopwords.words('english')
stopwords_new.extend(stoppers)

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [None]:
### Functions for cleaning data ###

def remove_html(text):
    """Return the text content of ``text`` with its HTML markup stripped.

    The input is parsed by BeautifulSoup using the ``lxml`` parser and the
    text of the resulting document is returned.
    """
    return BeautifulSoup(text, 'lxml').get_text()

def remove_punctuation(text):
    """Return ``text`` joined back together without its punctuation items.

    Every item yielded by iterating ``text`` (single characters when ``text``
    is a string) is kept only if it is not found in ``string.punctuation``.
    """
    kept = []
    for item in text:
        if item in string.punctuation:
            continue
        kept.append(item)
    return "".join(kept)

# Shared tokenizer built on the pattern \w+ (runs of word characters); used by
# the cleaning cells below to split lower-cased dialogue into tokens.
tokenizer = RegexpTokenizer(r'\w+')

def remove_stopwords(text):
    """Return the tokens of ``text`` that are not stop words.

    Args:
        text: Iterable of hashable tokens (e.g. the list of strings produced
            by ``tokenizer.tokenize``).

    Returns:
        A list holding, in their original order, the tokens that do not
        appear in the module-level ``stopwords_new`` collection.
    """
    # ``stopwords_new`` is a list, so every ``in`` test against it is a linear
    # scan. Build a set once per call so each token lookup is constant time.
    stop_set = set(stopwords_new)
    return [w for w in text if w not in stop_set]

# Single WordNet lemmatizer instance reused by word_lemmatizer.
lemmatizer = WordNetLemmatizer()

def word_lemmatizer(text):
    """Lemmatize every token in ``text`` and return the results as a list."""
    return list(map(lemmatizer.lemmatize, text))

# Single Porter stemmer instance reused by word_stemmer.
stemmer = PorterStemmer()

def word_stemmer(text):
    """Stem every token in ``text`` and return them as one string.

    Unlike ``word_lemmatizer``, the result is not a list: the stemmed tokens
    are joined with single spaces.
    """
    stems = (stemmer.stem(token) for token in text)
    return " ".join(stems)

def process_arg(x: str) -> str:
    """Turn the string form of a quoted list into plain text.

    The first and last characters are dropped (the enclosing brackets for
    input such as ``"[u'Drama', u'Comedy']"``), then ``u`` string prefixes
    and all single quotes are removed, giving ``"Drama, Comedy"``.

    Args:
        x: Text of a bracketed list of quoted labels.

    Returns:
        The text without the outer characters, ``u'`` prefixes and single
        quotes; separators between items are left as they were.
    """
    import re  # local import keeps this helper self-contained

    inner = x[1:-1]
    # Strip ``u'`` only where it opens a literal (not preceded by a word
    # character). A blanket replace("u'", "") also removed the final letter
    # of any label ending in "u", because that letter is followed by the
    # closing quote: 'Kung Fu' came out as 'Kung F'.
    inner = re.sub(r"(?<!\w)u'", "", inner)
    return inner.replace("'", "")

In [None]:
### Clean dialogue text (training set) ###

# Per dialogue: strip HTML -> lower-case and tokenize -> drop stop words ->
# stem the tokens and join them back into one space-separated string.
train_data['dialogue'] = (
    train_data['dialogue']
    .apply(remove_html)
    .apply(lambda raw: tokenizer.tokenize(raw.lower()))
    .apply(remove_stopwords)
    .apply(word_stemmer)
)

In [None]:
### Clean genres ###

# Run every genres entry through process_arg to strip its list-style markup.
train_data['genres'] = train_data['genres'].apply(process_arg)

In [None]:
### Clean dialogue text (test set) ###

# Same steps as for the training dialogue: strip HTML -> lower-case and
# tokenize -> drop stop words -> stem and join into one string.
test_data['dialogue'] = (
    test_data['dialogue']
    .apply(remove_html)
    .apply(lambda raw: tokenizer.tokenize(raw.lower()))
    .apply(remove_stopwords)
    .apply(word_stemmer)
)

In [None]:
### Export data ###

# Write both cleaned frames to CSV, leaving out the DataFrame index column.
train_data.to_csv(path_or_buf='clean_train_stem.csv', index=False)
test_data.to_csv(path_or_buf='clean_test_stem.csv', index=False)