### Sentiment analysis of movie (IMDB) reviews using dataset provided by the ACL 2011 paper, see http://ai.stanford.edu/~amaas/data/sentiment/.

#### Dataset can be downloaded separately from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz, but wont be necessary as the download process has been embedded in the notebook and source file.

In [None]:
!pip install nltk
!pip install --upgrade gensim

import numpy as np
import os
import os.path

from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
import nltk

import wget
import tarfile
import glob
from gensim.models import Word2Vec

import time

In [2]:
if not os.path.isfile('aclImdb_v1.tar.gz'):
    wget.download('http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz')   

if not os.path.isdir('aclImdb'):  
    tar = tarfile.open("aclImdb_v1.tar.gz")
    tar.extractall()
    tar.close()


100% [####################################################]           80M / 80M

In [3]:
time_beginning_of_notebook = time.time()
SAMPLE_SIZE=1000
positive_sample_file_list = glob.glob(os.path.join('aclImdb/train/pos', "*.txt"))
positive_sample_file_list = positive_sample_file_list[:SAMPLE_SIZE]

negative_sample_file_list = glob.glob(os.path.join('aclImdb/train/neg', "*.txt"))
negative_sample_file_list = negative_sample_file_list[:SAMPLE_SIZE]

import re

# load doc into memory
# regex to clean markup elements 
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding='utf8')
    # read all text
    text = re.sub('<[^>]*>', ' ', file.read())
    #text = file.read()
    # close the file
    file.close()
    return text


# Data exploration

In [4]:
positive_strings = [load_doc(x) for x in positive_sample_file_list]
#print('\n Positive reviews \n ',positive_strings[:5])

negative_strings = [load_doc(x) for x in negative_sample_file_list]
#print('\n Negative reviews \n ', negative_strings[:5])
    

In [5]:
positive_tokenized = [word_tokenize(s) for s in positive_strings]
#print('\n Positive tokenized 1 \n {} \n\n Positive tokenized 2 \n {}'. format(positive_tokenized[1], positive_tokenized[2]))


In [6]:
negative_tokenized = [word_tokenize(s) for s in negative_strings]
#print('\n Negative tokenized 1 \n {} \n\n  Negative tokenized 2 \n {}'. format(negative_tokenized[1], negative_tokenized[2]))

In [7]:
# load doc into memory
with open('aclImdb/imdb.vocab', encoding='utf8') as f:
    #content = f.readlines()
    universe_vocabulary = [x.strip() for x in f.readlines()]

print("Word count across all reviews (before stripping tokens):", sum([len(token) for token in positive_tokenized]))

#Checking the not alphanumeric characters in vocabulary
non_alphanumeric_set = set()
for word in universe_vocabulary:
    non_alphanumeric_set |= set(re.findall('\W', word))
print('Non alphanumeric characters found in universe vocabulary', non_alphanumeric_set)


stripped_positive_tokenized = []
for tokens in positive_tokenized:
  stripped_positive_tokenized.append([token.lower() for token in tokens if token.lower() in universe_vocabulary])

print("Word count across all reviews (after stripping tokens):", sum([len(token) for token in stripped_positive_tokenized]))

Word count across all reviews (before stripping tokens): 269386
Non alphanumeric characters found in universe vocabulary {':', '?', ')', '!', ']', '(', ';', '=', '[', '-', '}', "'"}
Word count across all reviews (after stripping tokens): 233406


In [8]:
print("Word count across all reviews (before stripping tokens):", sum([len(token) for token in positive_tokenized]))
stripped_negative_tokenized = []
for tokens in negative_tokenized:
  stripped_negative_tokenized.append([token.lower() for token in tokens if token.lower() in universe_vocabulary])

print("Word count across all reviews (after stripping tokens):", sum([len(token) for token in stripped_negative_tokenized]))

Word count across all reviews (before stripping tokens): 269386
Word count across all reviews (after stripping tokens): 223289


## Modelling 

We have decided to do the use the below models and vectorisation techniques to test our their accuracy / score, the idea is to use a one model and one vectorization technique and plot a score.

**Simple models**

- Logistic Regression
- Random Forst
- LSTM
- GRU
- CNN

**Vectorisation techniques**
- Bag of Words
- TFIDF (weighted bag of words)
- mapping words to plain integers
- mapping words to embedding vectors

** Embeddings to try
- Word2Vec
- FastText
- Glove

## Logistic Regression.
## Introducing Pipeline: http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

## Introducing TfdfVectorizer: http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

## Introducing cross_val_score http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html


<br>


In [9]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

df_positives = pd.DataFrame({'reviews':[load_doc(x) for x in positive_sample_file_list], 'sentiment': np.ones(SAMPLE_SIZE)})
df_negatives = pd.DataFrame({'reviews':[load_doc(x) for x in negative_sample_file_list], 'sentiment': np.zeros(SAMPLE_SIZE)})

print("Positive review(s):", df_positives['reviews'][1])
print("Negative review(s):", df_negatives['reviews'][1])

df = pd.concat([df_positives, df_negatives], ignore_index=True)

df = shuffle(df)

X_train, X_test, y_train, y_test = train_test_split(df['reviews'], df['sentiment'], test_size=0.25)




Positive review(s): Way, way back in the 1980s, long before NAFTA was drafted and corporations began to shed their national identities, the United States and Japan were at each other's throat in the world manufacturing race. Remember sayings like 'Union Yes!,' 'the Japanese are taking this country over,' and 'Americans are lazy?'  As the Reagan era winded down and corporations edged towards a global marketplace, director Ron Howard made one of several trips into the comedy genre with his 1986 smash 'Gung Ho,' which drew over $36 million in U.S. box office receipts. While in many ways dated, Howard's tongue-in-cheek story of colliding cultures in the workplace still offers hard truth for industrial life today.  'Gung Ho' focuses on Hunt Stevenson (Michael Keaton), the automakers union rep from Hadleyville, a small, depressed town in the foothills of Pennsylvania. Stevenson has been asked to visit the Assan Motor Company in Tokyo (similar to real-life Toyota), which is considering a U.S.

CountVec = CountVectorizer()
# Creating the BoW with the set of all the documents and transforming the documents in feature vectors
bag_of_words = CountVec.fit_transform(df['reviews'])

print(type(bag_of_words))
print('\n Number of raws {} (documents) -- Number of columns {} (vocabulary) \n'.format(bag_of_words.shape[0], bag_of_words.shape[1]))

# https://machinelearningmastery.com/sparse-matrices-for-machine-learning/
# This is a sparse matrix 
print('\n Type of bag_of_words {} \n'.format(type(bag_of_words)))
sparsity = 1.0 - bag_of_words.nnz / (bag_of_words.shape[0] * bag_of_words.shape[1])
print('\n Sparsity {} \n'.format(sparsity))

# This is a  
print('\n Type of bag_of_words.toarray {} \n'.format(type(bag_of_words.toarray())))

# In CountVec we have the vocabulary as an attribute
print('\n Type of CountVec.vocabulary {} \n'.format(type(CountVec.vocabulary_)))
print('A sample of CountVec.vocabulary_ {}'.format([(k, v) for k, v in CountVec.vocabulary_.items() if v < 1000]))

# In bag_of_words we have the vector features representing each single document 
print('\n Type of bag_of_words.toarray() {} \n')
print('\n First feature vector, representing the first document \n', bag_of_words[0, :])

# Lets get our training and test data
X_train, X_test, y_train, y_test = train_test_split(bag_of_words.toarray(), df['sentiment'].values, test_size=0.25)

clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))



results = [(predicted, actual) for predicted, actual in zip(clf.predict(X_test),  y_test) 
           if  predicted == actual]

print('Percentage of correct predicted values{}'.format(len(results)/len(y_test)))


## Logistic Regress model using Bag of Words vectorisation technique

In [10]:
CountVec = CountVectorizer()
lr_CV = Pipeline([('vect', CountVec), ('clf', LogisticRegression(random_state=0))])
lr_CV.fit(X_train, y_train)
print('Train accuracy {}'.format(lr_CV.score(X_train, y_train)))
print('Test accuracy {}'.format(lr_CV.score(X_test, y_test)))

# # Trying with cross_val_score
lr = LogisticRegression()
k_folds = 10
X_train_CV = CountVec.fit_transform(X_train)
type(X_train_CV)
print('Train accuracy list {} '.format(cross_val_score(lr, X_train_CV, y_train, cv= k_folds))) 
print('Train accuracy mean {} '.format(cross_val_score(lr, X_train_CV, y_train, cv= k_folds).mean()))



Train accuracy 1.0
Test accuracy 0.83




Train accuracy list [0.81333333 0.88       0.82666667 0.82666667 0.84       0.76666667
 0.80666667 0.79333333 0.82       0.81333333] 




Train accuracy mean 0.8186666666666668 


## Logistic Regress model using TfidfVectorizer vectorisation technique

In [11]:
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

lr_tfidf = Pipeline([('vect', tfidf), ('clf', LogisticRegression(random_state=0))])
lr_tfidf.fit(X_train, y_train)
print('Train accuracy {}'.format(lr_tfidf.score(X_train, y_train)))
print('Test accuracy {}'.format(lr_tfidf.score(X_test, y_test)))

# Trying with cross_val_score
lr = LogisticRegression()
k_folds = 10
X_train_tfidf = tfidf.fit_transform(X_train)
print('Train accuracy list {} '.format(cross_val_score(lr, X_train_tfidf, y_train, cv= k_folds))) 
print('Train accuracy mean {} '.format(cross_val_score(lr, X_train_tfidf, y_train, cv= k_folds).mean()))


Train accuracy 0.9693333333333334
Test accuracy 0.838




Train accuracy list [0.80666667 0.84       0.81333333 0.84666667 0.81333333 0.75333333
 0.79333333 0.79333333 0.81333333 0.82666667] 
Train accuracy mean 0.8099999999999999 


## Logistic Regress model using TfidfVectorizer and different values for C hyperparameter

In [12]:
C_values = np.arange(1,2,0.1)
results = []

for value in C_values:   
    lr_tfidf = Pipeline([('vect', tfidf), ('clf', LogisticRegression(random_state=0, C=value))])
    lr_tfidf.fit(X_train, y_train)
    train_score = lr_tfidf.score(X_train, y_train)
    score = lr_tfidf.score(X_test, y_test)
    print('C_value {} Test Score {} Train_score {}'.format(value, score, train_score))
    results.append(score)

time_end_of_notebook = time.time()

C_value 1.0 Test Score 0.838 Train_score 0.9693333333333334
C_value 1.1 Test Score 0.84 Train_score 0.9713333333333334
C_value 1.2000000000000002 Test Score 0.84 Train_score 0.9733333333333334
C_value 1.3000000000000003 Test Score 0.84 Train_score 0.9766666666666667
C_value 1.4000000000000004 Test Score 0.838 Train_score 0.9813333333333333
C_value 1.5000000000000004 Test Score 0.838 Train_score 0.9833333333333333
C_value 1.6000000000000005 Test Score 0.842 Train_score 0.9846666666666667
C_value 1.7000000000000006 Test Score 0.844 Train_score 0.9846666666666667
C_value 1.8000000000000007 Test Score 0.844 Train_score 0.9866666666666667
C_value 1.9000000000000008 Test Score 0.842 Train_score 0.9893333333333333


In [13]:
table_models_vectorization = pd.DataFrame(
     {'Models':                   ["Logistic Regression", "Logistic Regression", "Logistic Regression"], 
      'Vectorisation techniques': ["Bag of Words",        "Word2Vec", "TFIDF"], 
      'Score':                    [score,                 "Pending", lr_tfidf.score(X_train, y_train) ]},
    columns=['Models','Vectorisation techniques','Score']
)
print("Sample size:", SAMPLE_SIZE)

duration = time_end_of_notebook - time_beginning_of_notebook

print("Full notebook execution duration:", duration, "seconds")
print("Full notebook execution duration:", duration / 60, "minutes")

table_models_vectorization

Sample size: 1000
Full notebook execution duration: 103.69539499282837 seconds
Full notebook execution duration: 1.7282565832138062 minutes


Unnamed: 0,Models,Vectorisation techniques,Score
0,Logistic Regression,Bag of Words,0.842
1,Logistic Regression,Word2Vec,Pending
2,Logistic Regression,TFIDF,0.989333


**The below two code blocks replaces the original/inital BoW implementation using Scikit-learn**

In [14]:
CountVec = CountVectorizer()
# Creating the BoW with the set of all the documents and transforming the documents in feature vectors
bag_of_words = CountVec.fit_transform(df['reviews'])

print(type(bag_of_words))
print('\n Number of raws {} (documents) -- Number of columns {} (vocabulary) \n'.format(bag_of_words.shape[0], bag_of_words.shape[1]))

# https://machinelearningmastery.com/sparse-matrices-for-machine-learning/
# This is a sparse matrix 
print('\n Type of bag_of_words {} \n'.format(type(bag_of_words)))
sparsity = 1.0 - bag_of_words.nnz / (bag_of_words.shape[0] * bag_of_words.shape[1])
print('\n Sparsity {} \n'.format(sparsity))

# This is a  
print('\n Type of bag_of_words.toarray {} \n'.format(type(bag_of_words.toarray())))

# In CountVec we have the vocabulary as an attribute
print('\n Type of CountVec.vocabulary {} \n'.format(type(CountVec.vocabulary_)))
print('A sample of CountVec.vocabulary_ {}'.format([(k, v) for k, v in CountVec.vocabulary_.items() if v < 1000]))

# In bag_of_words we have the vector features representing each single document 
print('\n Type of bag_of_words.toarray() {} \n')
print('\n First feature vector, representing the first document \n', bag_of_words[0, :2])

<class 'scipy.sparse.csr.csr_matrix'>

 Number of raws 2000 (documents) -- Number of columns 25092 (vocabulary) 


 Type of bag_of_words <class 'scipy.sparse.csr.csr_matrix'> 


 Sparsity 0.9946102144109676 


 Type of bag_of_words.toarray <class 'numpy.ndarray'> 


 Type of CountVec.vocabulary <class 'dict'> 

A sample of CountVec.vocabulary_ [('2005', 181), ('1890', 69), ('1912', 80), ('admirer', 590), ('1908', 77), ('abandonment', 346), ('43', 258), ('aforementioned', 698), ('admirably', 585), ('100th', 11), ('acquitted', 501), ('1990s', 161), ('1982', 152), ('allan', 882), ('75', 297), ('1968', 135), ('adieu', 573), ('alkie', 879), ('adhering', 572), ('abused', 411), ('alfredo', 859), ('2006', 182), ('400', 253), ('amato', 965), ('amerian', 998), ('ambient', 981), ('advancing', 633), ('adaptation', 540), ('acute', 530), ('acutely', 531), ('adams', 537), ('alvin', 956), ('14', 33), ('2003', 179), ('1998', 169), ('1800s', 51), ('allergic', 890), ('amateur', 961), ('35mm', 241), ('agg

In [15]:
# Lets get our training and test data
X_train, X_test, y_train, y_test = train_test_split(bag_of_words.toarray(), df['sentiment'].values, test_size=0.25)


In [16]:
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)
print('Using score function: {}'.format(clf.score(X_test, y_test)))


results = [(predicted, actual) for predicted, actual in zip(clf.predict(X_test),  y_test) 
           if  predicted == actual]

print('Percentage of correct predicted values: {}'.format(len(results)/len(y_test)))

Using score function: 0.814
Percentage of correct predicted values: 0.814


### RandomForestClassifer

In [17]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=1000)
clf.fit(X_train, y_train)
print('Using score function: {}'.format(clf.score(X_test, y_test)))


results = [(predicted, actual) for predicted, actual in zip(clf.predict(X_test),  y_test) 
           if  predicted == actual]

print('Percentage of correct predicted values: {}'.format(len(results)/len(y_test)))

Using score function: 0.822
Percentage of correct predicted values: 0.822


### RandomForestClassifer with TFIDF (Pipeline)

In [18]:
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)


X_train, X_test, y_train, y_test = train_test_split(df['reviews'], df['sentiment'], test_size=0.25)

lr_tfidf = Pipeline([('vect', tfidf), ('clf', RandomForestClassifier(n_estimators=1000))])
lr_tfidf.fit(X_train, y_train)
print('Train accuracy {}'.format(lr_tfidf.score(X_train, y_train)))
print('Test accuracy {}'.format(lr_tfidf.score(X_test, y_test)))

# Trying with cross_val_score
lr = LogisticRegression()
k_folds = 10
X_train_tfidf = tfidf.fit_transform(X_train)
print('Train accuracy list {} '.format(cross_val_score(lr, X_train_tfidf, y_train, cv= k_folds))) 
print('Train accuracy mean {} '.format(cross_val_score(lr, X_train_tfidf, y_train, cv= k_folds).mean()))


Train accuracy 1.0
Test accuracy 0.826




Train accuracy list [0.82781457 0.78145695 0.82666667 0.81333333 0.72666667 0.9
 0.88       0.80666667 0.82550336 0.83892617] 
Train accuracy mean 0.8227034386713484 


### LSTM with Keras (Sequential model)


Please note that the below code is executed on GPU instances on Colab, this wont work on your local machine, use the flag to enable/disable running in CPU or GPU mode, set `run_in_GPU_mode_on_colab=false` in order to be able to run in CPU mode.

In [56]:
# NOTE this is broken - tokenisation is incorrect. Will fix in next update
import tensorflow as tf

def lstm_keras():
  from keras.models import Sequential
  from keras.layers import Dense, Activation, Embedding, LSTM
  from keras.preprocessing.text import Tokenizer
  from sklearn.preprocessing import LabelBinarizer


  X_train, X_test, y_train, y_test = train_test_split(df['reviews'], df['sentiment'], test_size=0.25)

  vocab_size = 1000
  tokenize = Tokenizer(num_words=vocab_size)
  tokenize.fit_on_texts(X_train)

  encoded_X_train = tokenize.texts_to_matrix(X_train)
  encoded_X_test = tokenize.texts_to_matrix(X_test)

  encoder = LabelBinarizer()
  encoder.fit(y_train)
  encoded_y_train = encoder.transform(y_train)
  encoded_y_test = encoder.transform(y_test)

  max_features = 1000
  model = Sequential()
  model.add(Embedding(max_features, 128, dropout=0.2))
  model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))  # try using a GRU instead, for fun
  model.add(Dense(1))
  model.add(Activation('sigmoid'))

  model.compile(loss='binary_crossentropy',
               optimizer='adam',
               metrics=['accuracy'])

  batch_size=64
  epochs=10
  history = model.fit(encoded_X_train, encoded_y_train, 
                  batch_size=batch_size, 
                  epochs=epochs, 
                  verbose=1, 
                  validation_split=0.1)

  score = model.evaluate(encoded_X_test, encoded_y_test, 
                         batch_size=batch_size, verbose=1)
  print('Test score:', score[0])
  print('Test accuracy:', score[1])


run_in_GPU_mode_on_colab=False

if run_in_GPU_mode_on_colab:  
  config = tf.ConfigProto()
  config.gpu_options.allow_growth = True

  with tf.device('/gpu:0'):
    session_gpu = tf.Session(config=config)
    session_gpu.run(tf.global_variables_initializer())
    session_gpu.run(tf.tables_initializer())
    start = time.time()
    session_gpu.run(lstm_keras())
    end = time.time()
    gpu_time = end - start
    print('Duration on the GPU: {} seconds'.format(gpu_time))
else:
  start = time.time()
  lstm_keras()
  end = time.time()
  cpu_time = end - start
  print('Duration on the CPU: {} seconds'.format(cpu_time))



Train on 1350 samples, validate on 150 samples
Epoch 1/2
Epoch 2/2
Test score: 0.6931333312988281
Test accuracy: 0.4959999995231628
Duration on the CPU: 128.34940028190613


TODO
- dense neural network with bag of words
- dense neural network with fixed size input
- LSTM
- CNN