# Import

In [1]:
import numpy as np
import scipy as sp
import pandas as pd
from matplotlib import pyplot 
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import EnglishStemmer
import re
import glob
import nltk

In [2]:
# Load the review dataset from the CSV file in the working directory into `df`.
df = pd.read_csv ('IMDB-Dataset.csv')
# Preview the first 10 rows as the cell output.
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


# Transform

In [3]:
# Read the stop-word file (one entry per line). The `with` block closes the
# file handle as soon as the lines are read; `stop_words` stays bound to the
# (now closed) file object and `stop_list` to the raw lines, as before.
with open("stop_words.txt") as stop_words:
    stop_list = stop_words.readlines()

# Strip the line terminator from every entry, including the last one. The last
# line only ends in "\n" when the file has a trailing newline, so rstrip handles
# both cases, and an empty file simply yields an empty list.
transformed_stop_list = [word.rstrip("\n") for word in stop_list]

In [4]:
# Read the positive-word file (one entry per line). The `with` block closes the
# file handle as soon as the lines are read; `positive_words` stays bound to the
# (now closed) file object and `positive_list` to the raw lines, as before.
with open("positive_words.txt") as positive_words:
    positive_list = positive_words.readlines()

# Strip the line terminator from every entry, including the last one. The last
# line only ends in "\n" when the file has a trailing newline, so rstrip handles
# both cases, and an empty file simply yields an empty list.
transformed_positive_list = [word.rstrip("\n") for word in positive_list]

In [5]:
# Read the negative-word file (one entry per line). The `with` block closes the
# file handle as soon as the lines are read; `negative_words` stays bound to the
# (now closed) file object and `negative_list` to the raw lines, as before.
with open("negative_words.txt") as negative_words:
    negative_list = negative_words.readlines()

# Strip the line terminator from every entry, including the last one. The last
# line only ends in "\n" when the file has a trailing newline, so rstrip handles
# both cases, and an empty file simply yields an empty list.
transformed_negative_list = [word.rstrip("\n") for word in negative_list]

# Set Clean-up

In [6]:
def remove_html(text):
    """Parse ``text`` with BeautifulSoup's "html.parser" and return its plain text."""
    return BeautifulSoup(text, "html.parser").get_text()

# Remove every square-bracketed span, brackets and enclosed text included
def remove_between_square_brackets(text):
    """Return ``text`` with each ``[...]`` span deleted.

    A span starts at ``[`` and runs to the next ``]``. An opening bracket
    with no closing bracket after it is left untouched.
    """
    # Raw string: "\[" and "\]" are regex escapes, not string escapes. In a
    # plain literal they are invalid escape sequences and trigger a warning.
    return re.sub(r'\[[^]]*\]', '', text)

def remove_special_characters(text, remove_digits=True):
    """Return ``text`` with everything except ASCII letters, digits and whitespace removed.

    Parameters
    ----------
    text : str
        Input string.
    remove_digits : bool, optional
        Accepted for interface compatibility only. It has no effect: digits
        are always kept, exactly as before.
        NOTE(review): confirm whether digits were meant to be stripped by default.
    """
    # The range must be "A-Z". The previous "A-z" also covered the ASCII
    # characters between "Z" and "a" ([ \ ] ^ _ `), so they were never removed.
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

# Strip the noisy parts of a review in one pass
def clean_text(text):
    """Clean ``text`` in three steps, applied in this order:
    HTML removal, square-bracket span removal, special-character removal.
    """
    return remove_special_characters(
        remove_between_square_brackets(
            remove_html(text)
        )
    )
# Apply clean_text to every entry of the 'review' column and overwrite the
# column in place; the original raw text is no longer available in `df` afterwards.
df['review'] = df['review'].apply(clean_text)

In [7]:
# Fetch the 'punkt' tokenizer data used by nltk.word_tokenize (the log below
# shows it is skipped when the package is already up to date).
nltk.download('punkt')
# Module-level stemmer instance shared by stem_words().
stemmer = EnglishStemmer()

def stem_words(tokenizedList):
    """Return a new list holding the stem of each token in ``tokenizedList``.

    Uses the module-level ``stemmer``; the input order is preserved.
    """
    return [stemmer.stem(token) for token in tokenizedList]

def smarter_tokenize_and_preprocess(text):
    """Tokenize ``text`` with ``nltk.word_tokenize`` and return the stemmed tokens as a list."""
    return stem_words(nltk.word_tokenize(text))
# Replace each review string with its list of stemmed tokens (see the table
# output further down). Not idempotent: re-running this cell would pass the
# token lists, not strings, back into the tokenizer.
df['review'] = df['review'].apply(smarter_tokenize_and_preprocess)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gebruiker\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
# DataFrame.dropna() returns a new frame and leaves `df` untouched, so the
# result has to be assigned back for rows with missing values to be removed.
df = df.dropna()
# Keep the frame as the cell's displayed output.
df

Unnamed: 0,review,sentiment
0,"[one, of, the, other, review, has, mention, th...",positive
1,"[a, wonder, littl, product, the, film, techniq...",positive
2,"[i, thought, this, was, a, wonder, way, to, sp...",positive
3,"[basic, there, a, famili, where, a, littl, boy...",negative
4,"[petter, mattei, love, in, the, time, of, mone...",positive
...,...,...
49995,"[i, thought, this, movi, did, a, down, right, ...",positive
49996,"[bad, plot, bad, dialogu, bad, act, idiot, dir...",negative
49997,"[i, am, a, cathol, taught, in, parochi, elemen...",negative
49998,"[im, go, to, have, to, disagre, with, the, pre...",negative


# Split set

In [8]:
# Preview the first 10 rows of the processed frame before splitting it.
df.head(10)

Unnamed: 0,review,sentiment
0,"[one, of, the, other, review, has, mention, th...",positive
1,"[a, wonder, littl, product, the, film, techniq...",positive
2,"[i, thought, this, was, a, wonder, way, to, sp...",positive
3,"[basic, there, a, famili, where, a, littl, boy...",negative
4,"[petter, mattei, love, in, the, time, of, mone...",positive
5,"[probabl, my, alltim, favorit, movi, a, stori,...",positive
6,"[i, sure, would, like, to, see, a, resurrect, ...",positive
7,"[this, show, was, an, amaz, fresh, innov, idea...",negative
8,"[encourag, by, the, posit, comment, about, thi...",negative
9,"[if, you, like, origin, gut, wrench, laughter,...",positive


In [10]:
# Rows of `df` whose sentiment label is 'positive'.
positive = df.loc[df['sentiment'].eq('positive')]

In [12]:
# Rows of `df` whose sentiment label is 'negative'.
negative = df.loc[df['sentiment'].eq('negative')]

In [14]:
# Randomly take half of the positive reviews for training. random_state pins
# the draw so a fresh Restart & Run All reproduces the same split.
positive_train = positive.sample(frac = 0.5, random_state = 42)
# The test half is what remains of `positive` after removing the training
# rows. Dropping from `df` instead would leave every negative review in the
# "positive" test set.
# Assumes index labels are unique so drop() removes exactly the sampled rows — TODO confirm.
positive_test = positive.drop(positive_train.index)

In [15]:
# Randomly take half of the negative reviews for training. random_state pins
# the draw so a fresh Restart & Run All reproduces the same split.
negative_train = negative.sample(frac = 0.5, random_state = 42)
# The test half is what remains of `negative` after removing the training
# rows. Dropping from `df` instead would leave every positive review in the
# "negative" test set.
# Assumes index labels are unique so drop() removes exactly the sampled rows — TODO confirm.
negative_test = negative.drop(negative_train.index)