This notebook illustrates the process of building and comparing different models for classifying movie review sentiment, including Logistic Regression, k-nearest neighbors (KNN), and Support Vector Machines (SVM).

In [None]:
import pandas as pd

#### LOAD DATASETS ####

train_data_file = "train_small.csv"
test_data_file = "test_small.csv"

# Read the train and test CSV files into data frames and report their initial sizes
train_data_df = pd.read_csv(train_data_file)
test_data_df = pd.read_csv(test_data_file)
print("Original train set: ", len(train_data_df))
print("Original test set: ", len(test_data_df))

### CLEAN DATASETS ###
# Drop every row whose "review" text is missing, then report the new sizes
train_data_df = train_data_df.dropna(subset=["review"])
test_data_df = test_data_df.dropna(subset=["review"])
print("After removing empty reviews, train set size: ", len(train_data_df))
print("After removing empty reviews, test set size: ", len(test_data_df))

# Drop every row whose "sentiment" label is missing
train_data_df = train_data_df.dropna(subset=["sentiment"])
test_data_df = test_data_df.dropna(subset=["sentiment"])
print("After removing instances with no labels, train set size: ", len(train_data_df))
print("After removing instances with no labels, test set size: ", len(test_data_df))

# Show the first 5 rows of the cleaned train set
display(train_data_df.head(5))

### Count-based feature extraction using scikit-learn CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# The models are built from the original (unprocessed) review text
train_text = train_data_df["review"]
test_text = test_data_df["review"]

# ngram_range=(1, 1) keeps single words (unigrams) only
vectorizer = CountVectorizer(ngram_range=(1, 1))

# Learn the vocabulary from the training reviews and build their count matrix
train_data_cv = vectorizer.fit_transform(train_text)

# Show the vocabulary size, followed by the first 100 entries of the learned dictionary
vocabulary_entries = list(vectorizer.vocabulary_.items())
print(len(vocabulary_entries), " ... ", vocabulary_entries[:100], "\n")

print(train_data_cv.shape, "\n")

# Encode the test reviews with the vocabulary learned from the training set
test_data_cv = vectorizer.transform(test_text)
print(test_data_cv.shape, "\n")

In [None]:
# Bind the count-based features and the sentiment labels to the names used by the models below
x_train, x_test = train_data_cv, test_data_cv
y_train, y_test = train_data_df["sentiment"], test_data_df["sentiment"]

Once the training and testing data are ready, we can train models using different algorithms and compare the accuracy of the models.

## Logistic Regression using count-based features

In [None]:
# import evaluation libraries
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

# import logistic regression
from sklearn.linear_model import LogisticRegression

# Logistic regression backed by the liblinear solver, which tends to be more efficient;
# fit() returns the estimator itself, so construction and training are chained
model1 = LogisticRegression(solver='liblinear', random_state=0).fit(x_train, y_train)

# Predict the sentiment of every test review
predictions = model1.predict(x_test)

# Report overall accuracy, then per-label metrics, then the confusion matrix
print("Accuracy score: ", accuracy_score(y_test, predictions))
print("Individual label performance: ")
for report in (classification_report(y_test, predictions),
               confusion_matrix(y_test, predictions)):
    print(report)

Now modify the code above so that predictions are made and evaluated on the training set. What do you observe? Why?

In [None]:
# Try other solvers, l1/l2 penalties and values of the regularization parameter C.
#
#   solver  : {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}
#   penalty : {'l1', 'l2', 'elasticnet', 'none'}
#   C       : regularization parameter, float, default=1.0
model2 = LogisticRegression(
    solver='liblinear',
    penalty='l2',
    C=10,
    random_state=0,
)
model2.fit(x_train, y_train)
predictions = model2.predict(x_test)

# Same evaluation as for the first model: accuracy, per-label report, confusion matrix
print("Accuracy score: ", accuracy_score(y_test, predictions))
print("Individual label performance: ")
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))

## KNN classifier using count-based features

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest neighbors with k=3, trained on the count-based features
neigh = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
predictions = neigh.predict(x_test)

# Report overall accuracy, then per-label metrics, then the confusion matrix
print("Accuracy score: ", accuracy_score(y_test, predictions))
print("Individual label performance: ")
for report in (classification_report(y_test, predictions),
               confusion_matrix(y_test, predictions)):
    print(report)

Experiment with different k parameters. What do you observe?

## SVM classifier using count-based features

Note the significant difference between the form of the input data provided to the SVM algorithm and the data used by the previous models: the labels are encoded and the features are scaled before training. Observe the steps and find out why and how we prepare the data fed to this model.

In [None]:
from sklearn import svm
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import SGDClassifier

# observe the label format of the data before encoding
print("y_train for few examples ",y_train[0:10])

# Encode the sentiment labels as integers.
# The encoder is fitted on the TRAINING labels only and then reused for the test
# labels, so both sets are guaranteed to share the same label -> integer mapping.
# (Re-fitting on the test labels would learn a second, independent mapping that
# silently diverges from the training one whenever the label sets differ.)
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# observe the format of the labels after transformation
print("y_train after encoding for those examples ",y_train[0:10])


# Feature scaling
# The scaler is fitted on the training features only and applied to both sets.
# with_mean=False scales without centering; centering is not supported for
# sparse matrices such as the output of CountVectorizer.
sc = StandardScaler(with_mean=False)
sc.fit(x_train)
x_train_std = sc.transform(x_train)
x_test_std = sc.transform(x_test)


# observe the input format of the data
print("x_train for few examples ",x_train[0:5])

# observe the format of the data after transformation
print("x_train after scaling for those examples ",x_train_std[0:5])


# Create SVM model
# random_state is fixed so that repeated runs give the same result, matching
# the LogisticRegression models above.
clf1 = svm.LinearSVC(random_state=0)

# Alternative linear classifier to experiment with:
#clf1 = SGDClassifier()

clf1.fit(x_train_std, y_train)

# predict on test data
predictions = clf1.predict(x_test_std)

# Report overall accuracy, then per-label metrics, then the confusion matrix
print ("Accuracy score: ", accuracy_score(y_test, predictions))
print ("Individual label performance: ")
print (classification_report(y_test, predictions))
print (confusion_matrix(y_test, predictions))

In [None]:
# Explore parameters to LinearSVC like C, penalty
#
# penalty: {'l1', 'l2'}
# loss:    {'hinge', 'squared_hinge'}
# C:       regularization parameter, float, default=1.0

# Create SVM model
# random_state is fixed so that repeated runs of this cell give the same result,
# matching the LogisticRegression models, which are also seeded with 0.
clf2 = svm.LinearSVC(loss='hinge', penalty='l2', C=0.12, max_iter=1000, random_state=0)

# Alternative linear classifier to experiment with:
# clf2 = SGDClassifier(loss="hinge", penalty="l2", max_iter=500)

# x_train_std / x_test_std and the encoded y_train / y_test come from the previous SVM cell
clf2.fit(x_train_std, y_train)

# predict on test data
predictions = clf2.predict(x_test_std)

# Report overall accuracy, then per-label metrics, then the confusion matrix
print ("Accuracy score: ", accuracy_score(y_test, predictions))
print ("Individual label performance: ")
print (classification_report(y_test, predictions))
print (confusion_matrix(y_test, predictions))

## Using tf-idf features with original reviews

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# tf-idf vectorizer with default settings
tf = TfidfVectorizer()

# Learn the vocabulary and idf weights from the training reviews and encode them
train_data_tfidf = tf.fit_transform(train_text)
print(train_data_tfidf.shape, "\n")

# Encode the test reviews with the vocabulary and weights learned from the training set
test_data_tfidf = tf.transform(test_text)
print(test_data_tfidf.shape, "\n")

# Keep the learned idf weights available for inspection
idf = tf.idf_

In [None]:
# Rebind the model inputs to the tf-idf features and reset the labels to the original sentiment column
x_train, x_test = train_data_tfidf, test_data_tfidf
y_train, y_test = train_data_df["sentiment"], test_data_df["sentiment"]

## Logistic Regression using tf-idf vectors

In [None]:
# Logistic regression (liblinear solver) trained on the tf-idf features
model = LogisticRegression(solver='liblinear', random_state=0).fit(x_train, y_train)
predictions = model.predict(x_test)

# Report overall accuracy, then per-label metrics, then the confusion matrix
print("Accuracy score: ", accuracy_score(y_test, predictions))
print("Individual label performance: ")
for report in (classification_report(y_test, predictions),
               confusion_matrix(y_test, predictions)):
    print(report)

## KNN classifier using tf-idf vectors

In [None]:
# k-nearest neighbors with k=3, trained on the tf-idf features
neigh = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
predictions = neigh.predict(x_test)

# Report overall accuracy, then per-label metrics, then the confusion matrix
print("Accuracy score: ", accuracy_score(y_test, predictions))
print("Individual label performance: ")
for report in (classification_report(y_test, predictions),
               confusion_matrix(y_test, predictions)):
    print(report)

If you use the datasets from last week's lab, this step may be extremely slow, given the computation needed for KNN and the large feature space. That's why we use `train_small.csv` and `test_small.csv` here :)

## SVM classifier using tf-idf vectors

In [None]:
# Encode the sentiment labels as integers.
# The encoder is fitted on the TRAINING labels only and then reused for the test
# labels, so both sets are guaranteed to share the same label -> integer mapping.
# (Re-fitting on the test labels would learn a second, independent mapping that
# silently diverges from the training one whenever the label sets differ.)
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Feature Scaling
# The scaler is fitted on the training features only and applied to both sets.
# with_mean=False scales without centering; centering is not supported for
# sparse matrices such as the output of TfidfVectorizer.
sc = StandardScaler(with_mean=False)
sc.fit(x_train)
x_train_std = sc.transform(x_train)
x_test_std = sc.transform(x_test)

# Create SVM model
# random_state is fixed so that repeated runs give the same result, matching
# the LogisticRegression models, which are also seeded with 0.
clf = svm.LinearSVC(random_state=0)
clf.fit(x_train_std, y_train)

# predict on test data
predictions = clf.predict(x_test_std)

# Report overall accuracy, then per-label metrics, then the confusion matrix
print ("Accuracy score: ", accuracy_score(y_test, predictions))
print ("Individual label performance: ")
print (classification_report(y_test, predictions))
print (confusion_matrix(y_test, predictions))

How do count and tf-idf feature vectors compare for different classifiers?