# Movie Reviews Classification

# Library

In [1]:
import pandas as pd
import nltk
nltk.download('stopwords')

# Normalize text
import re
from bs4 import BeautifulSoup
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import WordNetLemmatizer

from IPython.display import clear_output
clear_output(wait=False)

# Importing and analysing data

In [2]:
data = pd.read_csv("./data/IMDB_Dataset.csv", converters={"review": lambda x: x.lower()})
print(data.shape)
data.head()

(50000, 2)


Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


The data set is balanced, this is very important to not get any bias.

In [3]:
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

# Splitting data into training and testing data sets
The training set will consist of 2 thirds of the original data set with an even number of positive and negative reviews.

The testing set will consist of a third of the original data set, we won't make it balanced, we'll take one random third of the last rows.

In [4]:
positive_set = data.loc[data['sentiment'] == 'positive']
training_set = positive_set.iloc[:17500]

In [5]:
negative_set = data.loc[data['sentiment'] == 'negative']
training_set = pd.concat([training_set, negative_set.iloc[:17500]])

In [6]:
del positive_set, negative_set

In [7]:
testing_set = data.iloc[35000:]
testing_set['sentiment'].value_counts()

sentiment
positive    7510
negative    7490
Name: count, dtype: int64

In [8]:
# Shuffling the data sets to remove bias
training_set = training_set.sample(frac=1).reset_index(drop=True)
testing_set = testing_set.sample(frac=1).reset_index(drop=True)

In [9]:
print(training_set.shape)
training_set.head()

(35000, 2)


Unnamed: 0,review,sentiment
0,one of the best! as being a fan of the civil w...,positive
1,having decided some time ago to collect the fi...,negative
2,"true love, i truly enjoyed and loved this movi...",positive
3,the story is about ankush (abhay deol) - who i...,positive
4,i have not seen it since 1998 and yet i still ...,positive


In [10]:
print(testing_set.shape)
testing_set.head()

(15000, 2)


Unnamed: 0,review,sentiment
0,in its depiction of a miserable milanese under...,negative
1,arnold once again in the 80's demonstrated tha...,positive
2,i never fail to be amazed and horrified by the...,positive
3,this movie was beautiful and touching. it touc...,positive
4,bette davis brings her full trunk of tics to t...,negative


# Normalizing data

To classify text, we have to normalize it so a computer can understand it.

The steps will be :

- Putting all the text in lower case (With the converters parameter of read_csv, we already did this step.)
- Removing punctuation and html specific keyword (such as < br/ >) : we'll use re and BeautifulSoup.
- Transforming the text into tokens
- Removing any stopword : it consists of removing words with poor semantic interest, nltk has a built in list to which I'll add the words "movie" and "review".
- Using a lemmatizer on the text : it consists of using the same word for derivatives of said word, removing conjugation from verbs, plural from words etc.



In [11]:
# We'll use a simple, general tokeniser
toktoktokeniser = ToktokTokenizer()

In [12]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords.append("movie")
stopwords.append("review")

In [13]:
def normalize_text(text):
    # lower case -> done in read_csv
    # removing html specific keywords
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    
    # removing all punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # transforming txt into tokens
    tokens = toktoktokeniser.tokenize(text)
    # removing stopwords
    tokens_to_filter = [token.strip() for token in tokens]
    tokens = [token for token in tokens_to_filter if token not in stopwords]
    
    # using a lemmatizer on the text
    lemma = WordNetLemmatizer()
    final_tokens = [lemma.lemmatize(word) for word in tokens]
    text = ' '.join(final_tokens)
    return text

In [16]:
training_set['review'] = training_set['review'].apply(normalize_text)
clear_output(wait=False)
training_set

Unnamed: 0,review,sentiment
0,one best fan civil war pleased first installme...,positive
1,decided time ago collect film billy bob thornt...,negative
2,true love truly enjoyed loved fun funny inspir...,positive
3,story ankush abhay deol professional marriage ...,positive
4,seen since 1998 yet still cant get head stop r...,positive
...,...,...
34995,despite one john cusack demi moore early film ...,negative
34996,costly film produced sir alexander korda hg we...,positive
34997,probably blame sure hell expected go title bla...,negative
34998,first series brilliant easily one best horror ...,negative


In [15]:
testing_set['review'] = testing_set['review'].apply(normalize_text)
clear_output(wait=False)
testing_set

Unnamed: 0,review,sentiment
0,depiction miserable milanese underclass film p...,negative
1,arnold 80 demonstrated king action one liner f...,positive
2,never fail amazed horrified evil predicated hi...,positive
3,beautiful touching touched place deep inside i...,positive
4,bette davis brings full trunk tic miserable fl...,negative
...,...,...
14995,wish film middle aged people intellectual jour...,positive
14996,halloween film get watch every time day yeari ...,positive
14997,robert heinlein classic novel starship trooper...,positive
14998,return cabin lake way stand original one main ...,negative


# 