This notebook illustrates how to build and compare different models for classifying the sentiment of movie reviews: Logistic Regression, k-nearest neighbors (KNN), and Support Vector Machines (SVM).

In [1]:
import pandas as pd

#### LOAD DATASETS ####

# Locations of the train and test CSV files.
train_data_file = "train_small.csv"
test_data_file = "test_small.csv"

# Read both splits into data frames and report how many rows each starts with.
train_data_df = pd.read_csv(train_data_file)
test_data_df = pd.read_csv(test_data_file)
print("Original train set: ", len(train_data_df))
print("Original test set: ", len(test_data_df))

### CLEAN DATASETS ###
# Keep only the rows that have review text, then report the new sizes.
train_data_df = train_data_df.loc[train_data_df["review"].notnull()]
test_data_df = test_data_df.loc[test_data_df["review"].notnull()]
print("After removing empty reviews, train set size: ", len(train_data_df))
print("After removing empty reviews, test set size: ", len(test_data_df))

# Keep only the rows that have a sentiment label.
train_data_df = train_data_df.loc[train_data_df["sentiment"].notnull()]
test_data_df = test_data_df.loc[test_data_df["sentiment"].notnull()]
print("After removing instances with no labels, train set size: ", len(train_data_df))
print("After removing instances with no labels, test set size: ", len(test_data_df))

# Show the first five rows of the cleaned train set.
display(train_data_df.head(n=5))

Original train set:  10000
Original test set:  2500
After removing empty reviews, train set size:  10000
After removing empty reviews, test set size:  2500
After removing instances with no labels, train set size:  10000
After removing instances with no labels, test set size:  2500


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Count-based feature extraction using scikit-learn CountVectorizer

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# The raw review text (no extra preprocessing) is the input for model building.
train_text = train_data_df["review"]
test_text = test_data_df["review"]

# Count-based vectorizer restricted to unigrams: ngram_range=(1, 1).
vectorizer = CountVectorizer(ngram_range=(1, 1))

# Learn the vocabulary from the training reviews and build their count matrix.
train_data_cv = vectorizer.fit_transform(train_text)

# Print the vocabulary size followed by the first 100 (word, column index) pairs.
print(len(vectorizer.vocabulary_), " ... ", list(vectorizer.vocabulary_.items())[:100], "\n")

print(train_data_cv.shape, "\n")

# Vectorize the test reviews with the vocabulary learned from the training set.
test_data_cv = vectorizer.transform(test_text)
print(test_data_cv.shape, "\n")


52522  ...  [('one', 32799), ('of', 32624), ('the', 46654), ('other', 33143), ('reviewers', 38996), ('has', 21177), ('mentioned', 29519), ('that', 46645), ('after', 1404), ('watching', 50735), ('just', 25230), ('oz', 33501), ('episode', 15679), ('you', 52219), ('ll', 27402), ('be', 4335), ('hooked', 22227), ('they', 46753), ('are', 2793), ('right', 39215), ('as', 3012), ('this', 46806), ('is', 24390), ('exactly', 16108), ('what', 51062), ('happened', 21019), ('with', 51540), ('me', 29232), ('br', 6007), ('first', 17476), ('thing', 46779), ('struck', 44844), ('about', 788), ('was', 50689), ('its', 24476), ('brutality', 6474), ('and', 2204), ('unflinching', 48907), ('scenes', 40654), ('violence', 50167), ('which', 51117), ('set', 41482), ('in', 23209), ('from', 18505), ('word', 51720), ('go', 19676), ('trust', 48109), ('not', 32203), ('show', 42050), ('for', 17988), ('faint', 16652), ('hearted', 21401), ('or', 32960), ('timid', 47073), ('pulls', 36772), ('no', 32020), ('punches', 36802),

scipy.sparse._csr.csr_matrix

In [6]:
# Feature matrices: the count-based representations of the train and test reviews.
x_train, x_test = train_data_cv, test_data_cv

# True sentiment labels for the train and test reviews.
y_train, y_test = train_data_df["sentiment"], test_data_df["sentiment"]

Once the training and testing data are ready, we can train models using different algorithms and compare the accuracy of the models.

## Logistic Regression using count-based features

In [7]:
# import evaluation libraries
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

# import logistic regression
from sklearn.linear_model import LogisticRegression

# Logistic regression with the libLinear solver; random_state is fixed to 0.
# fit() returns the fitted estimator, so construction and training are chained.
model1 = LogisticRegression(solver='liblinear', random_state=0).fit(x_train, y_train)

# Predict a sentiment label for every review in the test set.
predictions = model1.predict(x_test)

# Report overall accuracy, then per-label metrics and the confusion matrix.
print("Accuracy score: ", accuracy_score(y_test, predictions))
print(
    "Individual label performance: ",
    classification_report(y_test, predictions),
    confusion_matrix(y_test, predictions),
    sep="\n",
)

Accuracy score:  0.8716
Individual label performance: 
              precision    recall  f1-score   support

    negative       0.88      0.87      0.87      1257
    positive       0.87      0.88      0.87      1243

    accuracy                           0.87      2500
   macro avg       0.87      0.87      0.87      2500
weighted avg       0.87      0.87      0.87      2500

[[1088  169]
 [ 152 1091]]


Now modify the code above so that predictions are made and evaluated on the training set. What do you observe? Why?

In [10]:
# Experiment with other LogisticRegression settings:
#   solver : 'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'
#   penalty: 'l1', 'l2', 'elasticnet', 'none'
#   C      : regularization parameter, float, default=1.0
model2 = LogisticRegression(C=10, penalty='l2', solver='liblinear', random_state=0)
model2.fit(x_train, y_train)
predictions = model2.predict(x_test)

# Report overall accuracy, per-label metrics and the confusion matrix.
print("Accuracy score: ", accuracy_score(y_test, predictions))
print("Individual label performance: ")
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))

# Last expression of the cell: the notebook displays the predicted labels.
predictions

Accuracy score:  0.8624
Individual label performance: 
              precision    recall  f1-score   support

    negative       0.87      0.86      0.86      1257
    positive       0.86      0.86      0.86      1243

    accuracy                           0.86      2500
   macro avg       0.86      0.86      0.86      2500
weighted avg       0.86      0.86      0.86      2500

[[1081  176]
 [ 168 1075]]


array(['negative', 'negative', 'negative', ..., 'positive', 'positive',
       'positive'], dtype=object)

## KNN classifier using count-based features

In [18]:
from sklearn.neighbors import KNeighborsClassifier
# KNN classifier with n_neighbors=9, trained on the count-based features.
# fit() returns the fitted estimator, so construction and training are chained.
neigh = KNeighborsClassifier(n_neighbors=9).fit(x_train, y_train)
predictions = neigh.predict(x_test)

# Report overall accuracy, then per-label metrics and the confusion matrix.
print("Accuracy score: ", accuracy_score(y_test, predictions))
print(
    "Individual label performance: ",
    classification_report(y_test, predictions),
    confusion_matrix(y_test, predictions),
    sep="\n",
)

# Also show the raw predicted labels.
print(predictions)

Accuracy score:  0.6348
Individual label performance: 
              precision    recall  f1-score   support

           0       0.66      0.55      0.60      1257
           1       0.61      0.72      0.66      1243

    accuracy                           0.63      2500
   macro avg       0.64      0.64      0.63      2500
weighted avg       0.64      0.63      0.63      2500

[[697 560]
 [353 890]]
[1 1 0 ... 0 1 0]


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


## SVM classifier using count-based features

Note that the input data provided to the SVM algorithm is prepared quite differently from the data given to the previous models. Observe the steps below and work out why and how we prepare the data fed to this model.

In [16]:
from sklearn import svm
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import SGDClassifier

# observe the label format of data
print("y_train for few examples ",y_train[0:20])

# Encode the labels as integers. The encoder is fitted on the TRAINING labels
# only and then reused for the test labels, so both splits share one
# label-to-integer mapping. Re-fitting on the test labels would learn a second,
# independent mapping; transform() also raises an error if the test set
# contains a label that never appeared in training.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# observe the format of label after transformation
print("y_train after encoding for those examples ",y_train[0:10])


# Feature scaling: the scaler is fitted on the training features only and then
# applied to both splits. with_mean=False turns off mean-centering.
sc = StandardScaler(with_mean=False)
x_train_std = sc.fit_transform(x_train)
x_test_std = sc.transform(x_test)


# observe the input format of data
print("x_train for few examples ",x_train[0:5])

# observe the format of data after transformation
print("x_train after scaling for those examples ",x_train_std[0:5])


# Create SVM model. random_state is fixed (as in the logistic regression
# cells) so that repeated runs give the same result.
# Alternative to try: clf1 = SGDClassifier()
clf1 = svm.LinearSVC(random_state=0)

clf1.fit(x_train_std, y_train)

# predict on test data
predictions = clf1.predict(x_test_std)

# Report overall accuracy, per-label metrics and the confusion matrix.
print ("Accuracy score: ", accuracy_score(y_test, predictions))
print ("Individual label performance: ")
print (classification_report(y_test, predictions))
print (confusion_matrix(y_test, predictions))

y_train for few examples  [1 1 1 0 1 1 1 0 0 1 0 0 0 0 1 0 1 0 1 0]
y_train after encoding for those examples  [1 1 1 0 1 1 1 0 0 1]
x_train for few examples    (0, 788)	1
  (0, 963)	1
  (0, 1404)	1
  (0, 1432)	1
  (0, 1491)	1
  (0, 1774)	1
  (0, 2145)	1
  (0, 2204)	6
  (0, 2603)	1
  (0, 2793)	2
  (0, 2904)	1
  (0, 3009)	1
  (0, 3012)	4
  (0, 3378)	1
  (0, 3588)	2
  (0, 4335)	2
  (0, 4446)	1
  (0, 4574)	1
  (0, 5123)	1
  (0, 6007)	6
  (0, 6474)	1
  (0, 6832)	2
  (0, 7063)	1
  (0, 7158)	1
  (0, 7828)	1
  :	:
  (4, 46670)	1
  (4, 46680)	1
  (4, 46739)	4
  (4, 46753)	1
  (4, 46779)	1
  (4, 46806)	2
  (4, 47054)	2
  (4, 47195)	7
  (4, 47680)	1
  (4, 48685)	1
  (4, 49485)	2
  (4, 49719)	1
  (4, 50249)	1
  (4, 50286)	1
  (4, 50724)	1
  (4, 50794)	1
  (4, 50809)	4
  (4, 51062)	1
  (4, 51101)	1
  (4, 51117)	1
  (4, 51509)	1
  (4, 51540)	1
  (4, 51731)	1
  (4, 51754)	1
  (4, 52206)	1
x_train after scaling for those examples    (0, 788)	0.9291190302891659
  (0, 963)	23.58700498884833
  (0, 1404)

In [20]:
# Explore parameters to LinearSVC like C, penalty
#   penalty: 'l1', 'l2'
#   loss   : 'hinge', 'squared_hinge'
#   C      : regularization parameter, float, default=1.0
#
# Alternative model to try:
#   clf2 = svm.LinearSVC(loss='hinge', penalty='l2', C=0.12, max_iter=1000)

# Linear SVM (hinge loss, L2 penalty) trained with stochastic gradient descent.
# SGD shuffles the training data, so random_state is fixed to make the
# reported scores reproducible across runs.
clf2 = SGDClassifier(loss="hinge", penalty="l2", max_iter=500, random_state=0)

clf2.fit(x_train_std, y_train)

# predict on test data
predictions = clf2.predict(x_test_std)

# Report overall accuracy, per-label metrics and the confusion matrix.
print ("Accuracy score: ", accuracy_score(y_test, predictions))
print ("Individual label performance: ")
print (classification_report(y_test, predictions))
print (confusion_matrix(y_test, predictions))

Accuracy score:  0.7764
Individual label performance: 
              precision    recall  f1-score   support

           0       0.77      0.79      0.78      1257
           1       0.78      0.76      0.77      1243

    accuracy                           0.78      2500
   macro avg       0.78      0.78      0.78      2500
weighted avg       0.78      0.78      0.78      2500

[[991 266]
 [293 950]]


## Using tf-idf features with original reviews

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# tf-idf vectorizer with default settings.
tf = TfidfVectorizer()

# Fit the vectorizer on the training reviews and build their tf-idf matrix.
train_data_tfidf = tf.fit_transform(train_text)
print(train_data_tfidf.shape, "\n")

# Vectorize the test reviews with the already-fitted vectorizer.
test_data_tfidf = tf.transform(test_text)
print(test_data_tfidf.shape, "\n")

# Keep the learned idf weights of the fitted vectorizer.
idf = tf.idf_

(10000, 52522) 

(2500, 52522) 



In [22]:
# Switch the feature matrices to the tf-idf representations.
x_train, x_test = train_data_tfidf, test_data_tfidf

# Reload the sentiment labels from the data frames.
y_train, y_test = train_data_df["sentiment"], test_data_df["sentiment"]

## Logistic Regression using tf-idf vectors

In [23]:
# Logistic regression (libLinear solver, random_state fixed to 0) on the tf-idf features.
# fit() returns the fitted estimator, so construction and training are chained.
model = LogisticRegression(solver='liblinear', random_state=0).fit(x_train, y_train)
predictions = model.predict(x_test)

# Report overall accuracy, then per-label metrics and the confusion matrix.
print("Accuracy score: ", accuracy_score(y_test, predictions))
print(
    "Individual label performance: ",
    classification_report(y_test, predictions),
    confusion_matrix(y_test, predictions),
    sep="\n",
)

Accuracy score:  0.878
Individual label performance: 
              precision    recall  f1-score   support

    negative       0.88      0.87      0.88      1257
    positive       0.87      0.88      0.88      1243

    accuracy                           0.88      2500
   macro avg       0.88      0.88      0.88      2500
weighted avg       0.88      0.88      0.88      2500

[[1097  160]
 [ 145 1098]]


## KNN classifier using tf-idf vectors

In [None]:
# KNN classifier with n_neighbors=3, trained on the tf-idf features.
# fit() returns the fitted estimator, so construction and training are chained.
neigh = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
predictions = neigh.predict(x_test)

# Report overall accuracy, then per-label metrics and the confusion matrix.
print("Accuracy score: ", accuracy_score(y_test, predictions))
print(
    "Individual label performance: ",
    classification_report(y_test, predictions),
    confusion_matrix(y_test, predictions),
    sep="\n",
)

If you use the datasets from last week's lab, this step may be extremely slow, given the amount of computation KNN needs and the large feature space. That's why we use train_small and test_small :)

## SVM classifier using tf-idf vectors

In [None]:
# Encode the labels as integers. The encoder is fitted on the TRAINING labels
# only and then reused for the test labels, so both splits share one
# label-to-integer mapping. Re-fitting on the test labels would learn a second,
# independent mapping; transform() also raises an error if the test set
# contains a label that never appeared in training.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Feature Scaling: the scaler is fitted on the training features only and then
# applied to both splits. with_mean=False turns off mean-centering.
sc = StandardScaler(with_mean=False)
x_train_std = sc.fit_transform(x_train)
x_test_std = sc.transform(x_test)

# Create SVM model. random_state is fixed (as in the logistic regression
# cells) so that repeated runs give the same result.
clf = svm.LinearSVC(random_state=0)
clf.fit(x_train_std, y_train)

# predict on test data
predictions = clf.predict(x_test_std)

# Report overall accuracy, per-label metrics and the confusion matrix.
print ("Accuracy score: ", accuracy_score(y_test, predictions))
print ("Individual label performance: ")
print (classification_report(y_test, predictions))
print (confusion_matrix(y_test, predictions))

How do count and tf-idf feature vectors compare for different classifiers?