# Sentiment Analysis In Textual Movie Reviews

Maxime Tchibozo

In [150]:
import string

In [151]:
import os
os.chdir('C:\\Users\\Max Tchibozo\\Desktop\\SD-TSIA214\\TP2\\data\\data')

In [152]:
# Authors: Alexandre Gramfort
#          Chloe Clavel
# License: BSD Style.
# TP Cours ML Telecom ParisTech MDI343

import os.path as op
import numpy as np

from sklearn.base import BaseEstimator, ClassifierMixin

###############################################################################
# Load data
print("Loading dataset")

from glob import glob
filenames_neg = sorted(glob(op.join('..', 'data', 'imdb1', 'neg', '*.txt')))
filenames_pos = sorted(glob(op.join('..', 'data', 'imdb1', 'pos', '*.txt')))
texts_neg = [open(f).read() for f in filenames_neg]
texts_pos = [open(f).read() for f in filenames_pos]
texts = texts_neg + texts_pos
y = np.ones(len(texts), dtype=np.int)
y[:len(texts_neg)] = 0.

print("%d documents" % len(texts))


Loading dataset
2000 documents


In [153]:
y[999],y[1000]#This is the moment when we go from the negative reviews to the positive reviews

(0, 1)

# Question 1

In [154]:
import string
###############################################################################
# Start part to fill in

def count_words(texts):
    """Vectorize text : return count of each word in the text snippets

    Parameters
    ----------
    texts : list of str
        The texts

    Returns
    -------
    vocabulary : dict
        A dictionary that points to an index in counts for each word.
    counts : ndarray, shape (n_samples, n_features)
        The counts of each word in each text.
        n_samples == number of documents.
        n_features == number of words in vocabulary.
    """
    punctuation = list(string.punctuation)+['\n'] #We also want to remove the newline characters
    
    total_word_list = [] #will contain the words of all texts without separation
    text_word_list = [] #will separate the words of each text in a different list item
    
    for text in texts:
        for punct in punctuation:
            text = text.replace(punct,' ')
        word_list = text.split(' ')
        word_list = [x for x in word_list if x != ''] #We remove the empty strings : '' which is useless
        
        total_word_list += word_list # We build vocabulary thanks to the list of all words in all the texts
        text_word_list.append(word_list)
        
    words = list(set(total_word_list))
    vocabulary = {}
    
    for i in range(len(words)):
        vocabulary[words[i]] = i

    counts = np.zeros((2000,len(words))) #there are 2000 documents

    for i in range(len(text_word_list)):
        for j in range(len(text_word_list[i])):
            index = vocabulary[text_word_list[i][j]] #This is the index of the word in vocabulary
            counts[i][index] += 1
    
    return vocabulary, counts
    
    
count_words(texts)
        

({'graded': 0,
  'sayles': 1,
  'stubby': 2,
  'wholesome': 3,
  'rachmaninov': 4,
  'aftermath': 5,
  'rubs': 6,
  'mnemonic': 7,
  'directional': 8,
  'rich': 9,
  'alarmed': 10,
  'inversion': 11,
  '1865': 12,
  'shards': 13,
  'gosnell': 14,
  'orwellian': 15,
  'synchs': 16,
  'buffoonish': 17,
  'hailed': 18,
  'motions': 19,
  'mtcts1': 20,
  'greenhouse': 21,
  'hole\x14': 22,
  'osmond': 23,
  'phenomenas': 24,
  'planets': 25,
  'lingered': 26,
  'standards': 27,
  'treetops': 28,
  'steadiocam': 29,
  'excavating': 30,
  'weaponesque': 31,
  'imaginary': 32,
  'truant': 33,
  'wiper': 34,
  'zoe': 35,
  'barenboim': 36,
  'cromwell': 37,
  'residential': 38,
  'machinist': 39,
  'impregnating': 40,
  'precedes': 41,
  'saigon': 42,
  'oaf': 43,
  'ferguson': 44,
  'roberts': 45,
  'flicker': 46,
  'treacherous': 47,
  'motley': 48,
  'starphoenix': 49,
  'sails': 50,
  'dejection': 51,
  'subsidies': 52,
  'svenwara': 53,
  'ferrell': 54,
  'squashed': 55,
  'luggage': 56,


In [155]:
vocabulary , counts = count_words(texts)

There are 39443 different words in the vocabulary of the IMDB Movie Review database.

It is important to realise that some of these words do not hold much semantic meaning because of the pre-processing we have done: 

Composite words (i.e "Jean-Claude") and more generally words with any type of punctuation (i.e: O-M-G) are split into several individal sub-words ("Jean","Claude" and "O","M","G").

# Question 2

The class is attributed to reviews is recognized through the first explicit and identifiable rating.

These ratings are specified through star and numerical values i.e : "8/10", "four out of five", and "OUT OF  ****: ***"

There is one noteworthy specificality, which is that in the case where the identified rating is a 5 star rating with decimal points (i.e 2.5 stars, 3.5 stars), the associated rating will be the truncated value of the rating (resp. 2 stars, 3 stars).

The authors explain that this is not problematic, given that the class output of $\pm 1$ will be erroneous only when the rating was 2.5. And even then, it could be said that average reviews are negative reviews.  

# Question 3

In [157]:
class NB(BaseEstimator, ClassifierMixin):
    def __init__(self):
        pass

    def fit(self, X, y): #This corresponds to TrainMultinomialNB
        #X = counts
        #y = vecteur de {0,1}  label de la classe
        N = X.shape[0]
        self.prior = np.zeros(2)        
        counts_neg = X[:N//2][:]
        counts_pos = X[N//2:][:]
        counts = [counts_neg,counts_pos]
        self.condprob = np.zeros((X.shape[1],2))
        for c in range(2): #for each class (0 or 1)
            Nc = len(y)/2 #half of the docs are positive, the other half are negative
            self.prior[c] = Nc/N
            current_counts = counts[c] #This is the counts matrix of the given class
            c_total_counts = 0
            
            T = np.zeros((2, X.shape[1]))
            sums = np.zeros(2)
            for t in range(current_counts.shape[1]):
                T[c][t] = np.sum(list(current_counts[:, t]))
                sums[c] += T[c][t]

            for t in range(current_counts.shape[1]):
                self.condprob[t][c] = (T[c][t] + 1) / (sums[c] + current_counts.shape[1])
        return self

    def predict(self, X):
        
        n = X.shape[0]
        predictions = np.zeros(n, dtype=int)
        
        for i in range(X.shape[0]):
            W = np.argwhere(X[i] != 0).flatten() #Where X points are != 0

            score = np.zeros(2)
            for c in range(2):
                score[c] = np.log(self.prior[c])
                for t in W:
                    score[c] += np.log(self.condprob[t][c])
            
            predictions[i] = np.argmax(score)
      
        return predictions

    def score(self, X, y):
        return np.mean(self.predict(X) == y)

# Count words in text
vocabulary, X = count_words(texts)

# Try to fit, predict and score
nb = NB()
nb.fit(X[::2], y[::2])
print('The score on the complete X dataset is : '+str(nb.score(X[1::2], y[1::2])))


The score on the complete X dataset is : 0.82


# Cross-Validation 5-Folds

In [158]:
from sklearn.model_selection import cross_val_score

print('The 5-fold cross-validation score is : '+str(cross_val_score(NB(), X, y, cv=5).mean()))

The 5-fold cross-validation score is : 0.8255000000000001


# Stop-Words

In [159]:
with open ('english.stop','r') as f:
    lines = f.readlines()
stop_words = [x[:-1] for x in lines]
stop_words

['a',
 "a's",
 'able',
 'about',
 'above',
 'according',
 'accordingly',
 'across',
 'actually',
 'after',
 'afterwards',
 'again',
 'against',
 "ain't",
 'all',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'an',
 'and',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'apart',
 'appear',
 'appreciate',
 'appropriate',
 'are',
 "aren't",
 'around',
 'as',
 'aside',
 'ask',
 'asking',
 'associated',
 'at',
 'available',
 'away',
 'awfully',
 'b',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'believe',
 'below',
 'beside',
 'besides',
 'best',
 'better',
 'between',
 'beyond',
 'both',
 'brief',
 'but',
 'by',
 'c',
 "c'mon",
 "c's",
 'came',
 'can',
 "can't",
 'cannot',
 'cant',
 'cause',
 'causes',
 'certain',
 'certainly',
 'changes',
 'clearly',
 'co',
 'com',
 'come',
 'c

In [160]:
def count_words(texts):
    """Vectorize text : return count of each word in the text snippets

    Parameters
    ----------
    texts : list of str
        The texts

    Returns
    -------
    vocabulary : dict
        A dictionary that points to an index in counts for each word.
    counts : ndarray, shape (n_samples, n_features)
        The counts of each word in each text.
        n_samples == number of documents.
        n_features == number of words in vocabulary.
    """
    
    punctuation = list(string.punctuation)+['\n']+stop_words #We now remove the stopwords
    
    total_word_list = [] #will contain the words of all texts without separation
    text_word_list = [] #will separate the words of each text in a different list item
    
    for text in texts:
        for punct in punctuation:
            text = text.replace(punct,' ')
        
        word_list = text.split(' ')
        word_list = [x for x in word_list if x != ''] #We remove the empty strings : '' which is useless
        
        total_word_list += word_list # We build vocabulary thanks to the list of all words in all the texts
        text_word_list.append(word_list)
        
    words = list(set(total_word_list))
    vocabulary = {}
    
    for i in range(len(words)):
        vocabulary[words[i]] = i

    counts = np.zeros((2000,len(words))) #there are 2000 documents

    for i in range(len(text_word_list)):
        for j in range(len(text_word_list[i])):
            index = vocabulary[text_word_list[i][j]] #This is the index of the word in vocabulary
            counts[i][index] += 1
    
    return vocabulary, counts
    
    
count_words(texts)
        

({'1935': 0,
  '1912': 1,
  '300': 2,
  '50000': 3,
  '1865': 4,
  '1980': 5,
  '44': 6,
  '209': 7,
  '122': 8,
  '1956': 9,
  '65': 10,
  '640': 11,
  '118': 12,
  '1986': 13,
  '\x13': 14,
  '1952': 15,
  '1400': 16,
  '230': 17,
  '20': 18,
  '\x12': 19,
  '1938': 20,
  '700': 21,
  '1942': 22,
  '75': 23,
  '1925': 24,
  '\x05\x05': 25,
  '67': 26,
  '1972': 27,
  '1871': 28,
  '111': 29,
  '983': 30,
  '2259': 31,
  '1960': 32,
  '125': 33,
  '2654': 34,
  '30': 35,
  '1932': 36,
  '54': 37,
  '747': 38,
  '460': 39,
  '1982': 40,
  '1961': 41,
  '357': 42,
  '1862': 43,
  '1800': 44,
  '8216': 45,
  '41': 46,
  '19': 47,
  '1984': 48,
  '87': 49,
  '175': 50,
  '81': 51,
  '49': 52,
  '1957': 53,
  '140': 54,
  '1975': 55,
  '1987': 56,
  '3654': 57,
  '2050': 58,
  '1830': 59,
  '1991': 60,
  '1792': 61,
  '007': 62,
  '999': 63,
  '63': 64,
  '254': 65,
  '5000': 66,
  '1995': 67,
  '1985': 68,
  '1923': 69,
  '105': 70,
  '98': 71,
  '1922': 72,
  '143': 73,
  '133': 74,
  '1

In [161]:
from sklearn.model_selection import cross_val_score
vocabulary, X = count_words(texts)
nb = NB()

print('The 5-fold cross-validation score WITHOUT the stop-words is : '+str(cross_val_score(nb, X, y, cv=5).mean()))


The 5-fold cross-validation score WITHOUT the stop-words is : 0.5405


Removing the stop-words worsens the performance.

# Scikit-Learn Use

In [162]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

In [163]:
vectorizer =  CountVectorizer()
X = vectorizer.fit_transform(texts)
nb = MultinomialNB()
print('The 5-fold cross-validation score of the sklearn function WITHOUT the stop-words is : '+str(cross_val_score(nb, X, y, cv=5).mean()))

The 5-fold cross-validation score of the sklearn function WITHOUT the stop-words is : 0.8145


The default scikitlearn libraries yield a much better score than our hand-made estimator.

# LinearSVC

In [164]:
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X,y)
print('The 5-fold cross-validation score of the Linear SVC WITHOUT the stop-words is : '+str(cross_val_score(clf, X, y, cv=5).mean()))

The 5-fold cross-validation score of the Linear SVC WITHOUT the stop-words is : 0.8325000000000001


LinearSVC yields a similar score to the Sklearn Bayesian approach. However, this score is slightly higher than our hand-made Bayesian estimator.

# NLTK Stemming

In [165]:
from nltk import SnowballStemmer

In [166]:
stemmer = SnowballStemmer(language="english")
stemmer.stem('congratulations')

'congratul'

In [167]:
def count_words(texts):
    """Vectorize text : return count of each word in the text snippets

    Parameters
    ----------
    texts : list of str
        The texts

    Returns
    -------
    vocabulary : dict
        A dictionary that points to an index in counts for each word.
    counts : ndarray, shape (n_samples, n_features)
        The counts of each word in each text.
        n_samples == number of documents.
        n_features == number of words in vocabulary.
    """
    punctuation = list(string.punctuation)+['\n'] #We also want to remove the newline characters
    
    total_word_list = [] #will contain the words of all texts without separation
    text_word_list = [] #will separate the words of each text in a different list item
    
    for text in texts:
        for punct in punctuation:
            text = text.replace(punct,' ')
        word_list = text.split(' ')
        word_list = [stemmer.stem(x) for x in word_list if x != ''] #We remove the empty strings : '' which is useless & We stem the words as they are added
        
        total_word_list += word_list # We build vocabulary thanks to the list of all words in all the texts
        text_word_list.append(word_list)
        
    words = list(set(total_word_list))
    vocabulary = {}
    
    for i in range(len(words)):
        vocabulary[words[i]] = i

    counts = np.zeros((2000,len(words))) #there are 2000 documents

    for i in range(len(text_word_list)):
        for j in range(len(text_word_list[i])):
            index = vocabulary[text_word_list[i][j]] #This is the index of the word in vocabulary
            counts[i][index] += 1
    
    return vocabulary, counts
    
    
count_words(texts)
    

({'shadi': 0,
  'rachmaninov': 1,
  'reput': 2,
  'aftermath': 3,
  'unwieldi': 4,
  'rich': 5,
  '1865': 6,
  'orwellian': 7,
  'depriv': 8,
  'buffoonish': 9,
  'andi': 10,
  'mtcts1': 11,
  'hole\x14': 12,
  'osmond': 13,
  'rhapsodi': 14,
  'titshot': 15,
  'allegori': 16,
  'glu': 17,
  'remast': 18,
  'instantan': 19,
  'steadiocam': 20,
  'slobber': 21,
  'truant': 22,
  'wiper': 23,
  'zoe': 24,
  'barenboim': 25,
  'machinist': 26,
  'introduc': 27,
  'saigon': 28,
  'oaf': 29,
  'ferguson': 30,
  'flicker': 31,
  'roberts': 32,
  'inanim': 33,
  'motley': 34,
  'starphoenix': 35,
  'svenwara': 36,
  'mullal': 37,
  'extremel': 38,
  'shape': 39,
  'dreamworld': 40,
  'norad': 41,
  'slashfest': 42,
  'fullyload': 43,
  'sexpot': 44,
  'aboard': 45,
  'folk': 46,
  'dodg': 47,
  'lighthead': 48,
  'bartlebi': 49,
  'potboil': 50,
  'romulus': 51,
  'arbuthnot': 52,
  'movement': 53,
  'highpoint': 54,
  'attribut': 55,
  'litani': 56,
  'sling': 57,
  'luftwaff': 58,
  'unwort

In [168]:
vocabulary, X = count_words(texts)

# Try to fit, predict and score
nb = NB()
print('The 5-fold cross-validation score of our hand-made estimator after stemming is : '+str(cross_val_score(nb, X, y, cv=5).mean()))

The 5-fold cross-validation score of our hand-made estimator after stemming is : 0.8210000000000001


Conclusion : 

We observe that both the stemmed and non-stemming lead to comparable results : ~0.82 on a 5-fold cross-validation. This is extremely interesting, as it means that we analyze a much smaller text dataset.

The stemmed text dataset contains only partial words, and all words which have the same root will become identical. Our set of words is much smaller as it only contains those very roots.

Stemming amounts to compressing the information, meaning we can process large amounts of text in a shorter amount of time, with comparable results.

# Part of Speech

In [169]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

[nltk_data] Downloading package punkt to C:\Users\Max
[nltk_data]     Tchibozo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Max Tchibozo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to C:\Users\Max
[nltk_data]     Tchibozo\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [170]:
from nltk import pos_tag, word_tokenize

In [171]:
pos_tag(word_tokenize("John's big idea isn't all that bad very strong. To eating, to be or not to be"),tagset='universal')


[('John', 'NOUN'),
 ("'s", 'PRT'),
 ('big', 'ADJ'),
 ('idea', 'NOUN'),
 ('is', 'VERB'),
 ("n't", 'ADV'),
 ('all', 'DET'),
 ('that', 'ADP'),
 ('bad', 'ADJ'),
 ('very', 'ADV'),
 ('strong', 'ADJ'),
 ('.', '.'),
 ('To', 'PRT'),
 ('eating', 'VERB'),
 (',', '.'),
 ('to', 'PRT'),
 ('be', 'VERB'),
 ('or', 'CONJ'),
 ('not', 'ADV'),
 ('to', 'PRT'),
 ('be', 'VERB')]

In [172]:
accepted_words = ['NOUN','VERB','ADV','ADJ']

In [173]:
def count_words(texts):
    """Vectorize text : return count of each word in the text snippets

    Parameters
    ----------
    texts : list of str
        The texts

    Returns
    -------
    vocabulary : dict
        A dictionary that points to an index in counts for each word.
    counts : ndarray, shape (n_samples, n_features)
        The counts of each word in each text.
        n_samples == number of documents.
        n_features == number of words in vocabulary.
    """
    total_word_list = [] #will contain the words of all texts without separation
    text_word_list = [] #will separate the words of each text in a different list item
    
    for text in texts:
        sentence = pos_tag(word_tokenize(text),tagset='universal')
        word_list = [x[0] for x in sentence if x[1] in accepted_words] #We retain only the Nouns, verbs, adverbs and adjectives
        total_word_list += word_list # We build vocabulary thanks to the list of all words in all the texts
        text_word_list.append(word_list)
        
    words = list(set(total_word_list))
    vocabulary = {}
    
    for i in range(len(words)):
        vocabulary[words[i]] = i

    counts = np.zeros((2000,len(words))) #there are 2000 documents

    for i in range(len(text_word_list)):
        for j in range(len(text_word_list[i])):
            index = vocabulary[text_word_list[i][j]] #This is the index of the word in vocabulary
            counts[i][index] += 1
    
    return vocabulary, counts
    
    
count_words(texts)
    

({"o'barr": 0,
  'graded': 1,
  'anti-depressant': 2,
  'sayles': 3,
  'stubby': 4,
  'tag-line': 5,
  'wholesome': 6,
  'rachmaninov': 7,
  'aftermath': 8,
  'rubs': 9,
  'mnemonic': 10,
  'directional': 11,
  'jewelry-sporting': 12,
  'rich': 13,
  'alarmed': 14,
  'inversion': 15,
  'straight-out': 16,
  'shards': 17,
  'gosnell': 18,
  'sub-inspired': 19,
  'orwellian': 20,
  'buffoonish': 21,
  'jack/rose': 22,
  'hailed': 23,
  'motions': 24,
  'mtcts1': 25,
  'darkness_': 26,
  'greenhouse': 27,
  'hole\x14': 28,
  'osmond': 29,
  'phenomenas': 30,
  'planets': 31,
  'lingered': 32,
  'standards': 33,
  'chinese-american': 34,
  'treetops': 35,
  'out-of-body': 36,
  'steadiocam': 37,
  'excavating': 38,
  'weaponesque': 39,
  'imaginary': 40,
  'truant': 41,
  'wiper': 42,
  'zoe': 43,
  'barenboim': 44,
  'cromwell': 45,
  'residential': 46,
  'non-fans': 47,
  'machinist': 48,
  'impregnating': 49,
  'precedes': 50,
  'saigon': 51,
  'oaf': 52,
  'ferguson': 53,
  'roberts': 

In [174]:
vocabulary, X = count_words(texts)

# Try to fit, predict and score
nb = NB()
print('The 5-fold cross-validation score of our hand-made estimator after stemming is : '+str(cross_val_score(nb, X, y, cv=5).mean()))

The 5-fold cross-validation score of our hand-made estimator after stemming is : 0.8355


Using the ntlk library and keeping only nouns, verbs, adverbs and adjectives yields the best results! 

One might still prefer the stemming approach for computational and compression reasons. 