# Review Classification 

In [1]:
import pandas as pd
import numpy as np
import os
import string
import textwrap
import re
import nltk
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score

In [3]:
data = pd.read_csv("./../DATA/IMDB Dataset.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
data.shape

(50000, 2)

In [5]:
data = data.iloc[:10000]

In [6]:
data.shape

(10000, 2)

In [7]:
# frequency count
data.sentiment.value_counts()

sentiment
positive    5028
negative    4972
Name: count, dtype: int64

In [8]:
# check for missing values
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [9]:
# check for duplicates
data.duplicated().sum()

17

In [10]:
# drop duplicates
data = data.drop_duplicates()
data.shape

(9983, 2)

In [11]:
def print_wrapped(text, max_cols=80):
    wrapped = textwrap.fill(text, width=max_cols)
    print(wrapped)

In [12]:
def remove_html(text):
    text = re.sub(r"<.*?>", "", text)
    return text

In [13]:
# remove html tags
data.review = data.review.apply(remove_html)

In [14]:
print_wrapped(data.review[3])

Basically there's a family where a little boy (Jake) thinks there's a zombie in
his closet & his parents are fighting all the time.This movie is slower than a
soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.OK,
first of all when you're going to make a film you must Decide if its a thriller
or a drama! As a drama the movie is watchable. Parents are divorcing & arguing
like in real life. And then we have Jake with his closet which totally ruins all
the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a
drama with some meaningless thriller spots.3 out of 10 just for the well playing
parents & descent dialogs. As for the shots with Jake: just ignore them.


In [15]:
eng_stopwords = stopwords.words("english")

In [16]:
def remove_stopwords(text: str):
    text = " ".join([word for word in text.split() if word not in eng_stopwords])
    return text

In [17]:
# remove stopwords
data.review = data.review.apply(remove_stopwords)

In [18]:
print_wrapped(data.review[3])

Basically there's family little boy (Jake) thinks there's zombie closet &
parents fighting time.This movie slower soap opera... suddenly, Jake decides
become Rambo kill zombie.OK, first going make film must Decide thriller drama!
As drama movie watchable. Parents divorcing & arguing like real life. And Jake
closet totally ruins film! I expected see BOOGEYMAN similar movie, instead
watched drama meaningless thriller spots.3 10 well playing parents & descent
dialogs. As shots Jake: ignore them.


In [19]:
# prepare data 
labelEncoder = LabelEncoder()
X = data.review
y = labelEncoder.fit_transform(data.sentiment)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=1)

In [21]:
X_train.shape, X_test.shape

((7986,), (1997,))

In [22]:
# BOW 
countVectorizer = CountVectorizer()
bow_X_train = countVectorizer.fit_transform(X_train)
bow_X_test = countVectorizer.transform(X_test)

In [23]:
pd.DataFrame(data=bow_X_train.toarray(), columns=countVectorizer.get_feature_names_out()).head(3)

Unnamed: 0,00,000,00001,007,00am,00s,01,01pm,02,03,...,était,étoile,être,ísnt,île,ïn,óli,önsjön,über,ünfaithful
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
pd.DataFrame(data=bow_X_test.toarray(), columns=countVectorizer.get_feature_names_out()).head(3)

Unnamed: 0,00,000,00001,007,00am,00s,01,01pm,02,03,...,était,étoile,être,ísnt,île,ïn,óli,önsjön,über,ünfaithful
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# train model 
model = GaussianNB()
model.fit(bow_X_train.toarray(), y_train)

In [26]:
y_pred = model.predict(bow_X_test.toarray())

In [27]:
confusion_matrix(y_pred, y_test)

array([[717, 499],
       [235, 546]])

In [28]:
accuracy_score(y_pred, y_test)

0.6324486730095142

In [29]:
# RandomForestClassifier
rfc_model = RandomForestClassifier()
rfc_model.fit(bow_X_train.toarray(), y_train)

In [30]:
y_pred_rfc = rfc_model.predict(bow_X_test.toarray())

In [31]:
confusion_matrix(y_pred_rfc, y_test)

array([[801, 173],
       [151, 872]])

In [32]:
accuracy_score(y_pred_rfc, y_test)

0.8377566349524287

In [48]:
# ngrams 
countVectorizer = CountVectorizer(ngram_range=(2, 2), max_features=5000)

bow_X_train = countVectorizer.fit_transform(X_train).toarray()
bow_X_test = countVectorizer.transform(X_test).toarray()

In [35]:
rfc_model_ngram = RandomForestClassifier()
rfc_model_ngram.fit(bow_X_train, y_train)

In [38]:
y_pred_ngram = rfc_model_ngram.predict(bow_X_test)

In [39]:
confusion_matrix(y_pred_ngram, y_test)

array([[772, 316],
       [180, 729]])

In [None]:
accuracy_score(y_pred_ngram, y_test) # bow is better 

0.7516274411617426

In [49]:
# tfidf
tfidfTransformer = TfidfTransformer()
tfidf_X_train = tfidfTransformer.fit_transform(bow_X_train).toarray()
tfidf_X_test = tfidfTransformer.transform(bow_X_test).toarray()


In [51]:
rfc_model_tfidf = RandomForestClassifier()
rfc_model_tfidf.fit(tfidf_X_train, y_train)

In [52]:
y_pred_tfidf = rfc_model_tfidf.predict(tfidf_X_test)

In [53]:
confusion_matrix(y_pred_tfidf, y_test)

array([[755, 276],
       [197, 769]])

In [55]:
accuracy_score(y_pred_tfidf, y_test)

0.7631447170756134