# Sentiment analysis in textual movie reviews

### Load data

In [1]:
import os.path as op
import re
import numpy as np

In [2]:
print("Loading dataset")

from glob import glob

# Review files live in one folder per polarity; sorting keeps the document
# order reproducible from one run to the next.
filenames_neg, filenames_pos = (
    sorted(glob(op.join('data', 'imdb1', polarity, '*.txt')))
    for polarity in ('neg', 'pos')
)

def open_perso(f):
    """Return the whole content of the text file located at path ``f``."""
    with open(f, "r") as handle:
        content = handle.read()
    return content

# Read every review; negatives are placed first so that labels can be
# assigned by position below.
texts_neg = [open_perso(f) for f in filenames_neg]
texts_pos = [open_perso(f) for f in filenames_pos]
texts = texts_neg + texts_pos

# Label vector: 0 for the leading negative reviews, 1 for the positive ones.
# The builtin ``int`` is used because the ``np.int`` alias no longer exists
# in recent NumPy releases.
y = np.ones(len(texts), dtype=int)
y[:len(texts_neg)] = 0

print("%d documents" % len(texts))

Loading dataset
2000 documents


## Implementation of the classifier

### 1. Complete the count_words function that will count the number of occurrences of each distinct word in a list of strings and return the vocabulary (a Python dictionary) and the counts

In [3]:
def count_words(texts, stop_words=None):
    """Vectorize texts: return the count of each word in each text snippet.

    Punctuation characters and line breaks are replaced by spaces, then each
    text is split on single spaces; empty tokens are dropped. Words are
    case-sensitive and indexed in order of first appearance.

    Parameters
    ----------
    texts : list of str
        The texts.

    stop_words : unused here. (Only for compatibility with Q.5)

    Returns
    -------
    vocabulary : dict
        A dictionary that points to an index in counts for each word.
    counts : ndarray, shape (n_samples, n_features)
        The counts of each word in each text.
        n_samples == number of documents.
        n_features == number of words in vocabulary.
    """
    # Raw string: same character class as before, without relying on
    # invalid string escapes such as "\-" or "\@".
    separators = re.compile(r"[\n\r\-_@$&,:;.!?'\"]")

    # Tokenize every document once; both passes below reuse the result.
    tokenized = [
        [word for word in separators.sub(" ", text).split(" ") if word]
        for text in texts
    ]

    # Vocabulary: each new word gets the next free column index.
    vocabulary = {}
    for tokens in tokenized:
        for word in tokens:
            if word not in vocabulary:
                vocabulary[word] = len(vocabulary)

    # Count words per document in a dense (n_samples, n_features) array.
    counts = np.zeros((len(tokenized), len(vocabulary)))
    for i, tokens in enumerate(tokenized):
        for word in tokens:
            counts[i, vocabulary[word]] += 1

    return vocabulary, counts


### 2. Explain how positive and negative classes have been assigned to movie reviews (see poldata.README.2.0 file)

Classes have been assigned using explicit rating patterns found in the reviews. For instance, expressions such as "8/10" or "four out of five" are recognized as explicit ratings. Given the extracted rating, a class can then be assigned: for example, with a five-star system, three-and-a-half stars and up are considered positive.

### 3. Complete the NBclass to implement the Naive Bayes classifier

The way the vocabulary is used here is not optimal: a new vocabulary has to be computed for each X passed to predict. It would probably be better to split the count_words function implemented here into two parts, one that builds the vocabulary and another that counts the words given a vocabulary.

Moreover, one could use the **counts** array rather than **texts** as X for the classifier (some operations could be faster with optimized NumPy arrays), especially when computing the cross-validation score: the texts could be transformed into counts only once, since computing the counts is what takes the most time.

In [5]:
from sklearn.base import BaseEstimator, ClassifierMixin

class NB(BaseEstimator, ClassifierMixin):
    """Multinomial Naive Bayes classifier operating on raw text documents.

    Documents are vectorized with ``count_words``. ``fit`` estimates class
    priors and add-one smoothed per-class word probabilities; ``predict``
    combines them in log space and returns the most likely label.

    Parameters
    ----------
    stop_words : list of str, optional
        Forwarded to ``count_words`` (used in Q.5). ``None`` (the default)
        is forwarded as an empty list.

    Attributes
    ----------
    vocab : dict
        Word -> column index, as returned by ``count_words`` during ``fit``.
    classes : dict
        Label -> row index, in order of first appearance in ``Y``.
    classes_inv : dict
        Row index -> label (inverse of ``classes``).
    class_probability : ndarray, shape (n_classes,)
        Estimated P(cls = c).
    word_prob_in_class : ndarray, shape (n_classes, n_features)
        Estimated P(word = w | cls = c); each row sums to 1.
    """

    def __init__(self, stop_words=None):
        # Hyper-parameter, stored untouched so that get_params / clone
        # inherited from BaseEstimator see exactly what was passed in.
        self.stop_words = stop_words

        # Fitted state, rebuilt from scratch by every call to fit.
        self.vocab = dict()
        self.classes = dict()
        self.classes_inv = dict()  # Useful to invert classes at the end
        self.class_probability = np.array([])
        self.word_prob_in_class = np.array([])

    def fit(self, X, Y):
        """Estimate class priors and per-class word probabilities.

        Parameters
        ----------
        X : list of str
            Training documents.
        Y : sequence
            One label per document.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` and ``Y`` do not have the same length.
        """
        if len(X) != len(Y):
            raise ValueError("Sizes of X and Y don't match.")

        # Link each class to an index. The mappings are reset first so that
        # refitting never reuses indices left over from a previous fit.
        self.classes = dict()
        self.classes_inv = dict()
        for cls in Y:
            if cls not in self.classes:
                index = len(self.classes)
                self.classes[cls] = index
                self.classes_inv[index] = cls
        n_classes = len(self.classes)

        # Row index of the class of every training document.
        class_index = np.array([self.classes[cls] for cls in Y], dtype=int)

        # P(cls = c) for all c
        self.class_probability = (
            np.bincount(class_index, minlength=n_classes) / len(Y)
        )

        stop_words = [] if self.stop_words is None else self.stop_words
        self.vocab, counts = count_words(X, stop_words)

        # P(word = w_i | cls = c) for all w_i, c
        # Smoothing: every (class, word) pair starts with a count of 1.
        word_counts = np.ones((n_classes, len(self.vocab)))
        for k in range(n_classes):
            # Sum the count rows of all documents belonging to class k.
            word_counts[k] += counts[class_index == k].sum(axis=0)

        # Normalize each line (class)
        self.word_prob_in_class = (
            word_counts / word_counts.sum(axis=1, keepdims=True)
        )

        return self

    def predict(self, X):
        """Return the most probable label for each document of ``X``.

        Parameters
        ----------
        X : list of str
            Documents to classify.

        Returns
        -------
        ndarray, shape (n_samples,)
            Predicted labels, taken from the labels seen during ``fit``.
        """
        stop_words = [] if self.stop_words is None else self.stop_words
        vocab, counts = count_words(X, stop_words)

        # Every document starts at the log prior of each class.
        scores = np.tile(np.log(self.class_probability), (len(X), 1))

        # Pair the column of each word in the new counts with its column in
        # the fitted vocabulary; words unseen during fit are skipped.
        shared = [
            (column, self.vocab[word])
            for word, column in vocab.items()
            if word in self.vocab
        ]
        if shared:
            columns, fitted_columns = (list(idx) for idx in zip(*shared))
            log_prob = np.log(self.word_prob_in_class[:, fitted_columns])
            # (n_samples, n_shared) . (n_shared, n_classes): adds
            # count * log P(word | class) for every shared word at once.
            scores += np.dot(counts[:, columns], log_prob.T)

        # Map the winning row index back to the original label; indexing a
        # label array keeps the labels' own dtype (not only integers).
        labels = np.array(
            [self.classes_inv[k] for k in range(len(self.classes_inv))]
        )
        return labels[np.argmax(scores, axis=1)]

    def score(self, X, Y):
        """Return the fraction of documents of ``X`` whose prediction equals ``Y``."""
        return np.mean(self.predict(X) == Y)

In [6]:
# Fit on the whole corpus and score on the same documents: this is the
# accuracy on the training data, not a held-out estimate.
print("Score on the train set :", NB().fit(texts, y).score(texts, y))

Score on the train set : 0.9655


### 4. Evaluate the performance of your classifier in cross-validation 5-folds. 

In [7]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the hand-written classifier on the raw texts.
scores = cross_val_score(NB(), texts, y, cv=5)

In [8]:
# Average of the cross-validation scores computed in the previous cell.
print("Mean score on a 5-fold CV : ", scores.mean())

Mean score on a 5-fold CV :  0.8115


### 5. Change the count_words function to ignore the “stop words” in the file data/english.stop. Are the performances improved ?

In [9]:
def count_words(texts, stop_words=None):
    """Vectorize texts: return the count of each non-stop word in each text.

    Punctuation characters and line breaks are replaced by spaces, then each
    text is split on single spaces; empty tokens and stop words are dropped.
    Words are case-sensitive and indexed in order of first appearance.

    Parameters
    ----------
    texts : list of str
        The texts.
    stop_words : iterable of str, optional
        The words to ignore. ``None`` (the default) ignores nothing.

    Returns
    -------
    vocabulary : dict
        A dictionary that points to an index in counts for each word.
    counts : ndarray, shape (n_samples, n_features)
        The counts of each word in each text.
        n_samples == number of documents.
        n_features == number of words in vocabulary.
    """
    # A set gives constant-time membership tests; the input is typically a
    # list, which would be scanned linearly for every single token.
    ignored = set(stop_words) if stop_words else set()

    # Raw string: same character class as before, without relying on
    # invalid string escapes such as "\-" or "\@".
    separators = re.compile(r"[\n\r\-_@$&,:;.!?'\"]")

    # Tokenize and filter every document once; both passes below reuse it.
    tokenized = [
        [
            word
            for word in separators.sub(" ", text).split(" ")
            if word and word not in ignored
        ]
        for text in texts
    ]

    # Vocabulary: each new word gets the next free column index.
    vocabulary = {}
    for tokens in tokenized:
        for word in tokens:
            if word not in vocabulary:
                vocabulary[word] = len(vocabulary)

    # Count words per document in a dense (n_samples, n_features) array.
    counts = np.zeros((len(tokenized), len(vocabulary)))
    for i, tokens in enumerate(tokenized):
        for word in tokens:
            counts[i, vocabulary[word]] += 1

    return vocabulary, counts

In [10]:
# Load the stop words into a set. Splitting on any whitespace drops the empty
# entry a trailing newline would leave and any stray spaces around a word;
# a set makes each ``word in stop_words`` test constant-time instead of a
# scan of the whole list.
with open("data/english.stop", "r") as f:
    stop_words = set(f.read().split())

In [11]:
# Same training-set accuracy as before, this time ignoring the stop words.
print("Score on the train set :", NB(stop_words).fit(texts, y).score(texts, y))

Score on the train set : 0.976


In [12]:
# 5-fold cross-validation of the classifier with stop words removed.
scores = cross_val_score(NB(stop_words), texts, y, cv=5)
print("Mean score on a 5-fold CV : ", scores.mean())

Mean score on a 5-fold CV :  0.8019999999999999


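Removing the stop words does not improve the performance: the mean 5-fold CV score goes from 0.8115 to 0.802, even though the score on the train set rises slightly (0.9655 to 0.976).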

## Scikit-learn use 

### 1. Compare your implementation with scikitlearn. 

#### With words 

In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

In [14]:
# Word counts feeding a multinomial Naive Bayes model, chained as a single
# estimator so the vectorizer is refitted together with the classifier.
vectorizer = CountVectorizer()
clf = MultinomialNB()

bayes_classifier = Pipeline(steps=[
    ("Vectorize", vectorizer),
    ("Classify", clf),
])

In [15]:
# Training-set accuracy of the scikit-learn pipeline (fit and score on the same data).
print("Score on the train set:", bayes_classifier.fit(texts,y).score(texts, y))

Score on the train set: 0.97


In [16]:
# 5-fold cross-validation of the scikit-learn pipeline on the raw texts.
scores = cross_val_score(bayes_classifier, texts, y, cv=5)
print("Mean score on a 5-fold CV : ", scores.mean())

Mean score on a 5-fold CV :  0.812


#### With char 

In [17]:
# Same pipeline as the word-based one, except that the vectorizer is asked
# to analyze characters instead of words.
vectorizer = CountVectorizer(analyzer='char')
clf = MultinomialNB()
bayes_classifier = Pipeline(steps=[
    ("Vectorize", vectorizer),
    ("Classify", clf),
])

scores = cross_val_score(estimator=bayes_classifier, X=texts, y=y, cv=5)
print("Mean score on a 5-fold CV : ", scores.mean())

Mean score on a 5-fold CV :  0.6094999999999999


Sklearn is much quicker to compute. (Note, however, that its word-counting step is split in two, fit and transform; I still believe we should do the same in part I.)

It gives about the same result as in part I. Counting characters is far less effective than counting words, which makes sense since the meaning of the words is lost when we do so.

### 2. Test another classification method from scikit-learn (e.g. LinearSVC, LogisticRegression).

In [18]:
from sklearn.svm import LinearSVC

# Word counts feeding a linear SVM; the iteration cap is raised to 20000.
vectorizer = CountVectorizer()
clf = LinearSVC(max_iter=20000)
svc_classifier = Pipeline(steps=[
    ("Vectorize", vectorizer),
    ("Classify", clf),
])

scores = cross_val_score(estimator=svc_classifier, X=texts, y=y, cv=5)
print("Mean score on a 5-fold CV : ", scores.mean())

Mean score on a 5-fold CV :  0.8325000000000001


Performance is slightly better with an SVM model, but it takes a bit more time to compute than the Naive Bayes classifier from sklearn.

### 3. Use NLTK library in order to process a stemming. 

In [19]:
from nltk import SnowballStemmer

class Counter:
    """Do the same as count_words but including a stemmer and separating the fit from the transform (as in sklearn).

    Parameters
    ----------
    stop_words : iterable of str, optional
        Words to ignore. They are compared with the tokens *after* stemming.
    stemmer : object with a ``stem(word)`` method, optional
        Applied to every token when given.

    Attributes
    ----------
    vocab : dict
        Word -> column index, built by ``fit`` in order of first appearance.
    n_features : int
        Number of entries in ``vocab``.
    """

    # Characters replaced by a space before splitting. Raw string: same
    # character class as before, without invalid escapes such as "\-".
    _SEPARATORS = re.compile(r"[\n\r\-_@$&,:;.!?'\"]")

    def __init__(self, stop_words=None, stemmer=None):
        self.vocab = dict()
        self.n_features = 0

        self.stop_words = []
        if stop_words:
            self.stop_words = stop_words
        self.stemmer = stemmer

    def _tokenize(self, texts):
        """Yield, for each text, its list of stemmed, non-empty, non-stop tokens."""
        # Built once per call: constant-time membership tests per token.
        ignored = frozenset(self.stop_words)
        for text in texts:
            tokens = []
            for word in self._SEPARATORS.sub(" ", text).split(" "):
                if self.stemmer:
                    word = self.stemmer.stem(word)
                if word and word not in ignored:
                    tokens.append(word)
            yield tokens

    def _learn_vocab(self, tokenized):
        """Rebuild ``vocab`` / ``n_features`` from an iterable of token lists."""
        self.vocab = dict()
        for tokens in tokenized:
            for word in tokens:
                if word not in self.vocab:
                    self.vocab[word] = len(self.vocab)
        self.n_features = len(self.vocab)

    def _count(self, tokenized, n_samples):
        """Return the (n_samples, n_features) count array for the token lists."""
        counts = np.zeros((n_samples, self.n_features))  # Could use sparse matrix as sklearn ?
        for i, tokens in enumerate(tokenized):
            for word in tokens:
                j = self.vocab.get(word)
                # Words absent from the fitted vocabulary are ignored.
                if j is not None:
                    counts[i, j] += 1
        return counts

    def fit(self, texts, *args, **kwargs):
        """Learn the vocabulary of ``texts``; extra arguments are ignored.

        Returns
        -------
        self
        """
        self._learn_vocab(self._tokenize(texts))
        return self

    def transform(self, texts, *args, **kwargs):
        """Count, for each text, the occurrences of every vocabulary word.

        Returns
        -------
        counts : ndarray, shape (n_samples, n_features)
        """
        return self._count(self._tokenize(texts), len(texts))

    def fit_transform(self, texts, *args, **kwargs):
        """Equivalent to ``fit(texts).transform(texts)``, tokenizing only once.

        Stemming is the expensive step, so the token lists are computed a
        single time and reused for both the vocabulary and the counts.
        """
        tokenized = list(self._tokenize(texts))
        self._learn_vocab(tokenized)
        return self._count(tokenized, len(tokenized))

In [20]:
# Custom vectorizer with English Snowball stemming, followed by the
# scikit-learn multinomial Naive Bayes model.
clf = MultinomialNB()
counter = Counter(stemmer=SnowballStemmer("english"))
bayes_classifier = Pipeline(steps=[
    ("Vectorize", counter),
    ("Classify", clf),
])

scores = cross_val_score(estimator=bayes_classifier, X=texts, y=y, cv=5)
print("Mean score on a 5-fold CV : ", scores.mean())

Mean score on a 5-fold CV :  0.8099999999999999


Stemming hasn't improved the performance.

### 4. Filter words by grammatical category (POS: Part Of Speech) and keep only nouns, verbs, adverbs and adjectives for classification.

In [21]:
from nltk import pos_tag

class Counter:
    """Do the same as count_words but keep only the right grammatical category
    Also separating the fit from the transform (as in sklearn).

    Each text is split into words, tagged with ``pos_tag``, and only the
    words whose tag starts with N, V, R or J are kept.

    Parameters
    ----------
    stop_words : iterable of str, optional
        Words to ignore.

    Attributes
    ----------
    vocab : dict
        Word -> column index, built by ``fit`` in order of first appearance.
    n_features : int
        Number of entries in ``vocab``.
    """

    # Characters replaced by a space before splitting. Sentence punctuation
    # (,:;.!?) is deliberately not in this class, unlike in count_words.
    # Raw string: same characters as before, without invalid escapes.
    _SEPARATORS = re.compile(r"[\n\r\-_@$&'\"]")

    # Initial letters of the tags to keep.
    # NOTE(review): matching on the first letter only presumably also keeps
    # tags such as RP (particle) with the Penn Treebank tagset - confirm.
    _KEPT_TAG_PREFIXES = ("N", "V", "R", "J")

    def __init__(self, stop_words=None):
        self.vocab = dict()
        self.n_features = 0

        self.stop_words = []
        if stop_words:
            self.stop_words = stop_words

    def _tokenize(self, texts):
        """Yield, for each text, the list of kept (tag-filtered, non-stop) words."""
        # Built once per call: constant-time membership tests per token.
        ignored = frozenset(self.stop_words)
        for text in texts:
            words = [w for w in self._SEPARATORS.sub(" ", text).split(" ") if w]
            yield [
                word
                for word, tag in pos_tag(words)
                # startswith also copes with an empty tag, where indexing
                # the first character would raise.
                if tag.startswith(self._KEPT_TAG_PREFIXES)
                and word
                and word not in ignored
            ]

    def _learn_vocab(self, tokenized):
        """Rebuild ``vocab`` / ``n_features`` from an iterable of token lists."""
        self.vocab = dict()
        for tokens in tokenized:
            for word in tokens:
                if word not in self.vocab:
                    self.vocab[word] = len(self.vocab)
        self.n_features = len(self.vocab)

    def _count(self, tokenized, n_samples):
        """Return the (n_samples, n_features) count array for the token lists."""
        counts = np.zeros((n_samples, self.n_features))  # Could use sparse matrix as sklearn ?
        for i, tokens in enumerate(tokenized):
            for word in tokens:
                j = self.vocab.get(word)
                # Words absent from the fitted vocabulary are ignored.
                if j is not None:
                    counts[i, j] += 1
        return counts

    def fit(self, texts, *args, **kwargs):
        """Learn the vocabulary of ``texts``; extra arguments are ignored.

        Returns
        -------
        self
        """
        self._learn_vocab(self._tokenize(texts))
        return self

    def transform(self, texts, *args, **kwargs):
        """Count, for each text, the occurrences of every vocabulary word.

        Returns
        -------
        counts : ndarray, shape (n_samples, n_features)
        """
        return self._count(self._tokenize(texts), len(texts))

    def fit_transform(self, texts, *args, **kwargs):
        """Equivalent to ``fit(texts).transform(texts)``, tagging only once.

        The token lists are computed a single time and reused for both the
        vocabulary and the counts, so each text goes through ``pos_tag``
        once instead of twice.
        """
        tokenized = list(self._tokenize(texts))
        self._learn_vocab(tokenized)
        return self._count(tokenized, len(tokenized))

In [22]:
# POS-filtering vectorizer followed by multinomial Naive Bayes; any failure
# inside a fold is raised instead of being turned into a score.
clf = MultinomialNB()
counter = Counter()
bayes_classifier = Pipeline(steps=[
    ("Vectorize", counter),
    ("Classify", clf),
])

scores = cross_val_score(
    estimator=bayes_classifier, X=texts, y=y, cv=5, error_score='raise'
)
print("Mean score on a 5-fold CV : ", scores.mean())

Mean score on a 5-fold CV :  0.8074999999999999


It doesn't improve the result either, and it takes much more time. Here too, we should work with counts rather than texts to reduce the computation time!