## Import Data

In [1]:
# importing numpy and pandas
import numpy as np
import pandas as pd

file_path = './data/IMDB Dataset.csv'

# reading the csv file into a pandas dataframe
df = pd.read_csv(file_path)

df.head

<bound method NDFrame.head of                                                   review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]>

In [3]:
# what's the ratio between positive and negative reviews?
neg_count = len(df[df['sentiment']=='negative'])
print('negative reviews:', neg_count)

pos_count = len(df[df['sentiment']=='positive'])
print('positive reviews:', pos_count)

negative reviews: 25000
positive reviews: 25000


### Create smaller sample dataset

In [4]:
df = df.sample(5000)

df.shape

(5000, 2)

## Prep Data

In [6]:
# split test and training data
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(df['review'], df['sentiment'], test_size=0.3, random_state=42)

print('training:', len(train_x))
print('test:', len(test_x))

training: 3500
test: 1500


### Bag of words vectorization

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# example text
text = [
     'This is the first document.',
     'This document is the second document.',
     'And this is the third one.',
     'Is this the first document?',
]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(text)

print('bag of words:', vectorizer.get_feature_names())
print('word count:', len(vectorizer.get_feature_names()))
print(X.toarray())

bag of words: ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
word count: 9
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [8]:
from sklearn.feature_extraction.text import CountVectorizer #, TfidfVectorizer

# apply vectorizer to training dataset
vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

# apply vectorizer to test dataset
test_x_vectors = vectorizer.transform(test_x)

print('count:', len(vectorizer.get_feature_names()))
print(train_x_vectors.toarray())

count: 33039
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## Classification

### Linear SVM

In [10]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

# fit training data
clf_svm.fit(train_x_vectors, train_y)

# have a look at a test data sample
print(test_x.values[4])

# use classifyer to predict sentiment
clf_svm.predict(test_x_vectors[4])

This is a great movie that I don't think gets enough credit as Saturday Night Fever or Grease in John Travolta's career. He plays a man who is in love with a girl but is too pig headed to admit his feelings to her. Instead, he wants to engage in mechanical bull riding because he thinks it will show his manhood. Even though it was made in 1980, it is still timely today. The great country music soundtrack is terrific. 10/10


array(['positive'], dtype=object)

### Decision Tree

In [11]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

# use classifyer to predict sentiment
clf_dec.predict(test_x_vectors[4])

array(['positive'], dtype=object)

### Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression() # use: max_iter=1000 to avoid error
clf_log.fit(train_x_vectors, train_y)

# use classifyer to predict sentiment
clf_log.predict(test_x_vectors[4])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array(['positive'], dtype=object)

### Ridge Regression

In [18]:
from sklearn.linear_model import RidgeClassifier

clf_rdg = RidgeClassifier() #use: alpha = 3, solver = 'sag', normalize = True, tol = 0.0001
clf_rdg.fit(train_x_vectors, train_y)

# use classifyer to predict sentiment
clf_rdg.predict(test_x_vectors[4])

array(['positive'], dtype='<U8')

## Evaluation

### Accuracy Score

In [19]:
from sklearn.metrics import accuracy_score

print('SVM:', accuracy_score(test_y, clf_svm.predict(test_x_vectors)))
print('DEC:', accuracy_score(test_y, clf_dec.predict(test_x_vectors)))
print('LOG:', accuracy_score(test_y, clf_log.predict(test_x_vectors)))
print('RDG:', accuracy_score(test_y, clf_rdg.predict(test_x_vectors)))

SVM: 0.842
DEC: 0.666
LOG: 0.852
RDG: 0.7873333333333333


In [20]:
from sklearn.metrics import precision_score

print('SVM:', precision_score(test_y, clf_svm.predict(test_x_vectors), pos_label="negative"))
print('DEC:', precision_score(test_y, clf_dec.predict(test_x_vectors), pos_label="negative"))
print('LOG:', precision_score(test_y, clf_log.predict(test_x_vectors), pos_label="negative"))
print('RDG:', precision_score(test_y, clf_rdg.predict(test_x_vectors), pos_label="negative"))

SVM: 0.8398950131233596
DEC: 0.6730245231607629
LOG: 0.8577181208053691
RDG: 0.793010752688172


### F1 Scores

In [21]:
from sklearn.metrics import f1_score

print(f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=['negative', 'positive']))
print(f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=['negative', 'positive']))
print(f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=['negative', 'positive']))
print(f1_score(test_y, clf_rdg.predict(test_x_vectors), average=None, labels=['negative', 'positive']))

[0.8437706  0.84018881]
[0.66353257 0.6684315 ]
[0.852 0.852]
[0.78719146 0.78747502]


### Test Manually

In [23]:
# create test set
test_set = ['great show', 'bad movie, do not watch', 'what a waste of time']

# transform to sparse matrix
new_test = vectorizer.transform(test_set)

# use logistic regression classifier to predict sentiment
clf_log.predict(new_test)

array(['positive', 'negative', 'negative'], dtype=object)