## Import Data

In [1]:
# importing numpy and pandas
import numpy as np
import pandas as pd

file_path = './data/IMDB Dataset.csv'

# reading the csv file into a pandas dataframe
df = pd.read_csv(file_path)

df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [2]:
# what's the ratio between positive and negative reviews?
neg_count = len(df[df['sentiment']=='negative'])
print('negative reviews:', neg_count)

pos_count = len(df[df['sentiment']=='positive'])
print('positive reviews:', pos_count)

negative reviews: 25000
positive reviews: 25000


### Create smaller sample dataset

In [3]:
df = df.sample(5000)

df.shape

(5000, 2)

## Prep Data

In [4]:
# split test and training data
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(df['review'], df['sentiment'], test_size=0.3, random_state=42)

print('training:', len(train_x))
print('test:', len(test_x))

training: 3500
test: 1500


In [5]:
#use regex to remove punctuation and lowercase everything

import re

REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess_reviews(reviews):
    reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]
    
    return reviews

train_x_clean = preprocess_reviews(train_x)
test_x_clean = preprocess_reviews(test_x)

train_x_clean

['saw this movie when it came out and then a couple more times years later im watching it now 20 years later and its still a very good story does it wreak of lifetime movie network yes but alas lifetime was not even in existence back then so it needed somehwere to air the cast was excellent the story was a little schmaltzy two women become close friends and unbeknownst to either one friend is having an affair with he other friends husband shes invited over to the house for a dinner party which is how she discovers that her lover is the husband of her best friend she is horrified and tries to break off the affair shortly afterwards he is tragically killed in a car accident which is devatating for both women of course the wife finds out by accident about this affair and wants holly out of her life now but their friendship is able to prevail because they need each other i thought it was a very good story a great cast and perfomrances i really enjoyed it',
 'it is amazing what you can see 

### Bag of words vectorization

In [6]:
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer

# apply vectorizer to training dataset
vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

# apply vectorizer to test dataset
test_x_vectors = vectorizer.transform(test_x)

print('word count:', len(vectorizer.get_feature_names()))
print(train_x_vectors.todense())

word count: 33035
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## Classification

### Linear SVM

In [7]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=.5, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', random_state=None)

# fit training data
clf_svm.fit(train_x_vectors, train_y)

# have a look at a test data sample
print(test_x.values[4])

# use classifier to predict sentiment of sample
clf_svm.predict(test_x_vectors[4])

I've seen the first of the dwarf-Movies and sometimes I had little fun watching it. There are many famous TV/Comedyactors appearing in the first part and presented, in fantasy costumes, typical little episodes of their Stand-Up-Program and exactly that is the problem the second movie has to struggle with. Everything was already there....nothing new to obtain. You're familiar with most of the often boring and dumb "jokes" and you always feel like their goal was to put in every Comedylooser of the last decade who wants to get back on stage. There's nothing important about the story: typical fairy-tale story of Rumpelstiltskin, without any importance. I expected something like that but that's nothing I could complain about. I'm actually complaining about the lazy story writers who had an entire background story; their only business was to get many jokes and parodies inside but they didn't get it anyway. This crap is except the great appearance of Helge Schneider a total waste of time and 

array(['negative'], dtype=object)

### Decision Tree

In [8]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

# use classifier to predict sentiment
clf_dec.predict(test_x_vectors[4])

array(['positive'], dtype=object)

### Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression() # use: max_iter=1000 to avoid error
clf_log.fit(train_x_vectors, train_y)

# use classifier to predict sentiment
clf_log.predict(test_x_vectors[4])



array(['negative'], dtype=object)

### Ridge Regression

In [10]:
from sklearn.linear_model import RidgeClassifier

clf_rdg = RidgeClassifier(alpha = 3, solver = 'sag', normalize = True, tol = 0.0001)
clf_rdg.fit(train_x_vectors, train_y)

# use classifier to predict sentiment
clf_rdg.predict(test_x_vectors[4])

array(['negative'], dtype='<U8')

## Evaluation

### Accuracy Score

In [11]:
from sklearn.metrics import accuracy_score

print('SVM:', accuracy_score(test_y, clf_svm.predict(test_x_vectors)))
print('DEC:', accuracy_score(test_y, clf_dec.predict(test_x_vectors)))
print('LOG:', accuracy_score(test_y, clf_log.predict(test_x_vectors)))
print('RDG:', accuracy_score(test_y, clf_rdg.predict(test_x_vectors)))

SVM: 0.8546666666666667
DEC: 0.6646666666666666
LOG: 0.846
RDG: 0.8453333333333334


In [12]:
from sklearn.metrics import precision_score

print('SVM:', precision_score(test_y, clf_svm.predict(test_x_vectors), pos_label="negative"))
print('DEC:', precision_score(test_y, clf_dec.predict(test_x_vectors), pos_label="negative"))
print('LOG:', precision_score(test_y, clf_log.predict(test_x_vectors), pos_label="negative"))
print('RDG:', precision_score(test_y, clf_rdg.predict(test_x_vectors), pos_label="negative"))

SVM: 0.8698630136986302
DEC: 0.6666666666666666
LOG: 0.8604651162790697
RDG: 0.8254950495049505


### F1 Scores

In [13]:
from sklearn.metrics import f1_score

print(f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=['negative', 'positive']))
print(f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=['negative', 'positive']))
print(f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=['negative', 'positive']))
print(f1_score(test_y, clf_rdg.predict(test_x_vectors), average=None, labels=['negative', 'positive']))

[0.85349462 0.85582011]
[0.66973079 0.65944482]
[0.84486232 0.84712111]
[0.85185185 0.83821478]


### Test Manually

In [14]:
# create test set
test_set = ['great show', 'bad movie, do not watch', 'what a waste of time']

# transform to sparse matrix
new_test = vectorizer.transform(test_set)

# use logistic regression classifier to predict sentiment
clf_log.predict(new_test)

array(['positive', 'negative', 'negative'], dtype=object)