In [10]:
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

import pandas as pd
import re 

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from bs4 import BeautifulSoup


# Load the labelled training reviews and print the first five rows as a sanity check.
df = pd.read_csv(filepath_or_buffer='../datasets/train.csv')
print(df.head(n=5))


                                              review  sentiment
0  I know that Chill Wills usually played lovable...          1
1  The arrival of an world famous conductor sets ...          1
2  This documentary is such a wonderful example o...          1
3  I really tried to like this movie but in the e...          0
4  Not one of Monogram's better(not trying to be ...          0


In [11]:
# Print the complete text of the first review (row label 0) to inspect the raw markup.
print(df.at[0, 'review'])

I know that Chill Wills usually played lovable old sorts in Westerns. But his role in this segment is something I've remembered for a long time. Wills could be a first rate villain. Yes, Burgess Meredith's Fall was correct! That look in Hepplewhite's eye! It expressed porcine greed, ignorance, and the threat of violence all at once. Quite a performance, I think.<br /><br />The segment itself was a good one, too. Question: couldn't the little black bag cure alcoholism? I guess it did, sort of, with Fall. But the doctor would have been wise to apply the cure, if he had it, as quickly as possible to Hepplewhite.<br /><br />There is one moment that was annoying but also necessary. And it is something that appears to recur in these Night Gallery segments. It's Serling's constant need to sermonize. For that's what we got, one more time, with Dr. Fall. I don't know what was more frustrating, losing the black bag and all its miracles or not being to stop Fall from preaching about the bag's ben

In [12]:
def preprocessor(text):
    """Strip HTML, lowercase, collapse punctuation, and keep emoticons.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup such as ``<br />``.

    Returns
    -------
    str
        The lowercased text with every run of non-word characters replaced
        by a single space, followed by one space and the emoticons found in
        the input (space-separated, with any ``-`` nose removed). The
        separating space is always appended, so the result ends with a space
        when no emoticon was found.
    """
    # Remove HTML tags. A space separator keeps words that were divided only
    # by a tag (e.g. "great<br /><br />movie") from being fused into a single
    # token; the extra spaces are collapsed by the \W+ substitution below.
    text = BeautifulSoup(text, 'html.parser').get_text(separator=' ')

    # Emoticons, e.g. :) :-P =D -- eyes, optional nose, mouth.
    # Raw string: '\)' and '\(' are invalid escapes in a normal string literal.
    emoticon_pattern = r'(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    # Collect emoticons before lowercasing so their original case is kept,
    # then cut them out of the running text.
    emoticons = re.findall(emoticon_pattern, text)
    text = re.sub(emoticon_pattern, '', text)

    # Lowercase, squash non-word runs to one space, and append the emoticons
    # at the end; replace('-', '') drops the nose so ':-)' and ':)' coincide.
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', '')
    return text

In [14]:
# Run the text cleaner over every review, store the result back in the same
# column, and print the first five rows.
cleaned_reviews = df['review'].apply(preprocessor)
df['review'] = cleaned_reviews
print(df.head(n=5))

                                              review  sentiment
0  i know that chill wills usually played lovable...          1
1  the arrival of an world famous conductor sets ...          1
2  this documentary is such a wonderful example o...          1
3  i really tried to like this movie but in the e...          0
4  not one of monogram s better not trying to be ...          0


In [15]:
# Fetch the NLTK stop-word corpus (no-op when already present) and load the
# English list.
nltk.download('stopwords')
stop = stopwords.words('english')

def tokenizer_stem_nostop(text):
    """Split text on whitespace, drop stop words, and Porter-stem the rest.

    Parameters
    ----------
    text : str
        Text to tokenize; leading and trailing whitespace is ignored.

    Returns
    -------
    list of str
        Stemmed tokens in their original order. A token is kept only if it
        is not in the module-level ``stop`` list (compared before stemming)
        and it starts with an ASCII letter.
    """
    porter = PorterStemmer()
    # Set lookup is O(1) per token instead of scanning the stop-word list;
    # built per call so later changes to the global `stop` are still honoured.
    stop_words = set(stop)
    # Raw strings: '\s' is an invalid escape in a normal string literal.
    tokens = re.split(r'\s+', text.strip())
    # re.match anchors at the start, so this keeps tokens whose first
    # character is a letter (and discards the empty token from empty input).
    return [porter.stem(w) for w in tokens
            if w not in stop_words and re.match(r'[a-zA-Z]+', w)]

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mygodimatomato/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# Turn each review string into its list of stemmed, stop-word-free tokens,
# store the lists back in the same column, and print the first five rows.
stemmed_reviews = df['review'].apply(tokenizer_stem_nostop)
df['review'] = stemmed_reviews
print(df.head(n=5))

                                              review  sentiment
0  [know, chill, will, usual, play, lovabl, old, ...          1
1  [arriv, world, famou, conductor, set, unexpect...          1
2  [documentari, wonder, exampl, entertain, amaz,...          1
3  [realli, tri, like, movi, end, work, seen, kit...          0
4  [one, monogram, better, tri, amus, either, cha...          0


In [17]:
# Persist the processed training frame without the index column.
# NOTE: 'review' holds Python lists here, so each cell is written as the
# list's text form and will load back as a string unless it is parsed.
df.to_csv(path_or_buf='../datasets/train_processed.csv', index=False)

In [18]:
# Run the test reviews through the same two stages used for training
# (clean, then tokenize/stem) and write the result without the index column.
# NOTE: this rebinds `df`; the processed training frame is no longer
# reachable under that name after this cell.
df = pd.read_csv('../datasets/test.csv')
df['review'] = df['review'].apply(preprocessor).apply(tokenizer_stem_nostop)
df.to_csv('../datasets/test_processed.csv', index=False)

In [None]:
# Reload the processed test set from disk. The 'review' column was written
# from Python lists, so presumably it comes back as strings -- verify before use.
test_dataset = pd.read_csv(filepath_or_buffer='../datasets/test_processed.csv')

