# Movie Reviews Classification

# Library

In [1]:
import pandas as pd
import nltk


nltk.download('stopwords')
from IPython.display import clear_output
clear_output(wait=False)

# Importing and analysing data

In [2]:
data = pd.read_csv("./data/IMDB_Dataset.csv", converters={"review": lambda x: x.lower()})
print(data.shape)
data.head()

(50000, 2)


Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


The data set is balanced, this is very important to not get any bias.

In [3]:
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

# Splitting data into training and testing data sets
The training set will consist of 2 thirds of the original data set with an even number of positive and negative reviews.

The testing set will consist of a third of the original data set, we won't make it balanced, we'll take one random third of the last rows.

In [10]:
positive_set = data.loc[data['sentiment'] == 'positive']
training_set = positive_set.iloc[:17500]

In [11]:
negative_set = data.loc[data['sentiment'] == 'negative']
training_set = pd.concat([training_set, negative_set.iloc[:17500]])

In [12]:
del positive_set, negative_set

In [13]:
testing_set = data.iloc[35000:]
testing_set['sentiment'].value_counts()

sentiment
positive    7510
negative    7490
Name: count, dtype: int64

In [14]:
# Shuffling the data sets to remove bias
training_set = training_set.sample(frac=1).reset_index(drop=True)
testing_set = testing_set.sample(frac=1).reset_index(drop=True)

In [15]:
print(training_set.shape)
training_set.head()

(35000, 2)


Unnamed: 0,review,sentiment
0,oz is by far the best show ever to grace telev...,positive
1,this is definitely one of the best kung fu mov...,negative
2,"like others, i have seen and studied most of t...",positive
3,two things are changed from then.first of all ...,negative
4,tom & jerry are visiting africa and disguise t...,negative


In [16]:
print(testing_set.shape)
testing_set.head()

(15000, 2)


Unnamed: 0,review,sentiment
0,"it's all there: two classic anti-hero buddies,...",positive
1,"""love and human remains"" is one of those obvio...",negative
2,what can i say? curse of monkey island is fant...,positive
3,lost is an extremely well made tv series about...,positive
4,"""when a small bavarian village is beset with a...",negative


# Normalizing data

To classify text, we have to normalize it so a computer can understand it.

The steps will be :

- Putting all the text in lower case
- Removing punctuation and html specific keyword (such as < br/ >)
- Transforming the text into tokens
- Removing any stopword
- Using a lemmatizer on the text

With the converters parameter of read_csv, we already turned the text into lowercase.

In [45]:
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
def normalize_text(text):
    # lower case -> done in read_csv
    # removing all punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # removing html specific keywords
    #todo
    
    # transforming txt into tokens
    tokens = tokenizer.tokenize(text)
    # removing stopwords
    stop=set(stopwords.words('english'))
    tokens_to_filter = [token.strip() for token in tokens]
    tokens = [token for token in tokens_to_filter if token not in stopwords]
    # using a lemmatizer on the text
    lemma = WordNetLemmatizer()
    [lemma.lemmatize(word] for word in tokens]
    return tokens

In [None]:
data['review'] = data['review'].apply(normalize_text)