In [1]:
# This is the book example of sentiment analysis
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

In [2]:
# Locations
path_movie_dataset = os.path.join("book_code", "Section 6", "movies.txt")

In [3]:
# Load the dataset
dataset = pd.read_csv(path_movie_dataset, sep = '\t', header = None, names = ['Sentiment', 'Review'])

In [5]:
# Print out a sample of the dataset
dataset.head()

Unnamed: 0,Sentiment,Review
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


In [6]:
dataset.shape

(6918, 2)

In [7]:
# Convert the text into features without removing the stop words
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(dataset['Review'])
X_counts.shape

(6918, 2132)

In [8]:
# We now split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_counts, dataset.Sentiment, test_size=0.2, random_state=111)

In [9]:
# MLP with size of 5 and 2, L2 regularization term is 1e-5
clf = MLPClassifier(alpha = 1e-5, hidden_layer_sizes = (5, 2), random_state = 1)

In [10]:
# Train the neural network
clf.fit(X = X_train, y = y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [11]:
# Checking accuracy on the training set
y_pred = clf.predict(X_train)
print("Accuracy on the training set: {:.2f}".format(accuracy_score(y_train, y_pred)))

Accuracy on the training set: 1.00


In [12]:
# A "near perfect" accuracy on the training set may be a sign of overfitting, let's double check
scores = cross_val_score(clf, X_train, y_train, cv = 5)
print("Cross validation score {:.2f} (+/- {:.2f})".format(scores.mean(), scores.std() * 2))

Cross validation score 0.99 (+/- 0.01)


In [13]:
# Now let's repeat the process but removing the stop words
no_stop_words_count_vect = CountVectorizer(stop_words = 'english')
X_nstopw_count = no_stop_words_count_vect.fit_transform(dataset['Review'])
X_nstopw_count.shape

(6918, 1921)

In [14]:
# Train and testing subdatasets
X_nstopw_train, X_nstopw_test, y_nstopw_train, y_nstopw_test = train_test_split(X_nstopw_count, dataset.Sentiment, test_size = 0.2, random_state = 1234)

In [15]:
# ANN model
clf_nstopw = MLPClassifier(max_iter = 1000, alpha = 1e-5, hidden_layer_sizes = (5, 2), random_state = 1)

In [16]:
# Train the ANN model
clf.fit(X = X_nstopw_train, y = y_nstopw_train)



MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)