## Import Data

In [1]:
# importing numpy and pandas
import numpy as np
import pandas as pd

# Location of the labelled IMDB reviews
file_path = 'data/IMDB_training.csv'

# Load the CSV and discard the leftover 'Unnamed: 0' column in one chained expression
df = pd.read_csv(file_path).drop(columns=['Unnamed: 0'])

# Preview the first five rows
df.head(5)

Unnamed: 0,review,sentiment
0,Out to Sea was a great movie. I expected comed...,positive
1,"This may not be the worst movie ever made, but...",negative
2,It has been since 1972 that I saw this movie a...,positive
3,"As so many others have written, this is a wond...",positive
4,"Rounding out the 1929-30 all-talkie ""Our Gang""...",negative


In [2]:
# How are the reviews distributed across the two sentiment labels?
# Summing a boolean mask counts the rows where the comparison is True.
neg_count = int((df['sentiment'] == 'negative').sum())
print('negative reviews:', neg_count)

pos_count = int((df['sentiment'] == 'positive').sum())
print('positive reviews:', pos_count)

negative reviews: 15079
positive reviews: 14921


### Create smaller sample dataset

In [3]:
# Keep a random subset of 5,000 reviews.
# The fixed random_state makes the draw repeatable, so every downstream
# split, model and metric is reproducible on a fresh "Restart & Run All".
df = df.sample(n=5000, random_state=42)

df.shape

(5000, 2)

## Prep Data

In [4]:
# split test and training data
from sklearn.model_selection import train_test_split

# Hold out 30% of the reviews for evaluation; random_state pins the shuffle for a given df
train_x, test_x, train_y, test_y = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.3,
    random_state=42,
)

print(f'training: {len(train_x)}')
print(f'test: {len(test_x)}')

training: 3500
test: 1500


In [5]:
#use regex to remove punctuation and lowercase everything

import re

# Raw strings keep the regex escapes (\s, \-, \/, \[) away from Python's own
# string-escape processing, which warns about unknown escapes in normal literals.

# Punctuation that is deleted outright
REPLACE_NO_SPACE = re.compile(r"""[.;:!'?,"()\[\]]""")
# Double HTML line breaks, hyphens and slashes become a single space
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess_reviews(reviews):
    """Lowercase each review and normalise its punctuation.

    Each review is lowercased, then characters matched by REPLACE_NO_SPACE
    are removed, then matches of REPLACE_WITH_SPACE are replaced by a space.

    Parameters
    ----------
    reviews : iterable of str
        The raw review texts.

    Returns
    -------
    list of str
        The cleaned reviews, in the same order as the input.
    """
    return [
        REPLACE_WITH_SPACE.sub(" ", REPLACE_NO_SPACE.sub("", line.lower()))
        for line in reviews
    ]

# Clean the training and test reviews with the same routine
train_x_clean, test_x_clean = (
    preprocess_reviews(split) for split in (train_x, test_x)
)

### Bag of words vectorization (TF-IDF weighted)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer

# fit the vectorizer on the training dataset and transform it
vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x_clean)

# apply the already-fitted vectorizer to the test dataset (transform only, no re-fit)
test_x_vectors = vectorizer.transform(test_x_clean)

# vocabulary_ maps every learned term to its column index, so its length is the
# feature count. It is used instead of get_feature_names(), which was deprecated
# in scikit-learn 1.0 and removed in 1.2.
print('word count:', len(vectorizer.vocabulary_))
# NOTE(review): todense() materialises the full documents x vocabulary matrix only
# to print a truncated view; consider printing train_x_vectors.shape or a small slice.
print(train_x_vectors.todense())

word count: 37111
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## Classification

### Linear SVM

In [10]:
from sklearn import svm

# Support vector classifier with a linear kernel; every setting is spelled out explicitly
clf_svm = svm.SVC(
    kernel='linear',
    degree=3,
    gamma='scale',
    coef0=0.0,
    shrinking=True,
    probability=False,
    tol=0.5,
    cache_size=200,
    class_weight=None,
    verbose=False,
    max_iter=-1,
    decision_function_shape='ovr',
    random_state=None,
)

# train on the vectorized training reviews
clf_svm.fit(train_x_vectors, train_y)

# show the raw text of one test review
print(test_x.iloc[6])

# predict the sentiment of that same review
clf_svm.predict(test_x_vectors[6])

This seemed really similar to the CHILD'S PLAY movies except so much worse. A lawyer tries to save a criminal, who was convicted of killing his son, from execution. She fails. The lawyer's daughter then finds a puppet that the killer had buried with his son and is immediately attached to it. Then after several people are seriously injured they find the little girl secretly talking to the doll saying that she didn't hurt anyone. Throughout this movie I found myself asking myself ' why am I watching this cheeze?' over and over. The end sucked so bad that I went and watched the Disney cartoon version right after and slept with the light on.


array(['negative'], dtype=object)

### Decision Tree

In [11]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so repeated runs build the same model and report the same scores
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vectors, train_y)

# use classifier to predict sentiment
clf_dec.predict(test_x_vectors[6])

array(['positive'], dtype=object)

### Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

# Raise the iteration cap as the original note recommended ("use: max_iter=1000
# to avoid error"), so the solver has room to converge on this feature space.
clf_log = LogisticRegression(max_iter=1000)
clf_log.fit(train_x_vectors, train_y)

# use classifier to predict sentiment
clf_log.predict(test_x_vectors[6])



array(['negative'], dtype=object)

### Ridge Regression

In [14]:
from sklearn.linear_model import RidgeClassifier

# The 'sag' solver shuffles the data, so pin random_state to make the fit repeatable.
# NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2.
# On newer versions this call fails; replacing it needs an explicit scaling step
# and a re-tuned alpha, so it is left unchanged here. TODO confirm target version.
clf_rdg = RidgeClassifier(alpha = 3, solver = 'sag', normalize = True, tol = 0.0001, random_state = 42)
clf_rdg.fit(train_x_vectors, train_y)

# use classifier to predict sentiment
clf_rdg.predict(test_x_vectors[6])

array(['negative'], dtype='<U8')

## Evaluation

### Accuracy Score

In [15]:
from sklearn.metrics import accuracy_score

# Accuracy of each fitted classifier on the held-out test vectors
for tag, model in [('SVM', clf_svm), ('DEC', clf_dec), ('LOG', clf_log), ('RDG', clf_rdg)]:
    print(f'{tag}:', accuracy_score(test_y, model.predict(test_x_vectors)))

SVM: 0.8486666666666667
DEC: 0.6666666666666666
LOG: 0.8406666666666667
RDG: 0.8446666666666667


In [16]:
from sklearn.metrics import precision_score

# Precision of each classifier, treating "negative" as the positive class
for tag, model in [('SVM', clf_svm), ('DEC', clf_dec), ('LOG', clf_log), ('RDG', clf_rdg)]:
    print(f'{tag}:', precision_score(test_y, model.predict(test_x_vectors), pos_label="negative"))

SVM: 0.8687150837988827
DEC: 0.6697736351531292
LOG: 0.8706896551724138
RDG: 0.8312182741116751


### F1 Scores

In [17]:
from sklearn.metrics import f1_score

# Per-class F1 ([negative, positive]) for SVM, decision tree, logistic regression, ridge — in that order
for model in (clf_svm, clf_dec, clf_log, clf_rdg):
    print(f1_score(test_y, model.predict(test_x_vectors), average=None, labels=['negative', 'positive']))

[0.84568321 0.85153695]
[0.66799469 0.66532798]
[0.83528601 0.84570691]
[0.84899546 0.84008236]


### Test Manually

In [18]:
# create test set
test_set = ['great show', 'bad movie, do not watch', 'what a waste of time']

# clean the new reviews with the same preprocessing the training data received,
# then transform to a sparse matrix with the already-fitted vectorizer
new_test = vectorizer.transform(preprocess_reviews(test_set))

# use logistic regression classifier to predict sentiment
clf_log.predict(new_test)

array(['positive', 'negative', 'negative'], dtype=object)