# Final Project

*Luca Colombo*

In [1]:
import os
import re
import string

import numpy as np
import pandas as pd
from nltk import pos_tag, bigrams
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB


def read_files(path):
    """Read one file and return its content as a single line of text.

    The file is read as raw bytes and decoded as UTF-8 with the
    'surrogateescape' error handler, so bytes that are not valid UTF-8 are
    kept as lone surrogate code points instead of raising an error.

    Parameters
    ----------
    path : str
        Location of the file to read.

    Returns
    -------
    str
        The decoded content with every newline replaced by one space.
    """
    with open(path, 'rb') as file:
        data = file.read().decode('utf8', 'surrogateescape')
    #a newline separates words just like a space does: replacing it with ''
    #would glue the last word of a line to the first word of the next line
    return data.replace('\n', ' ')


def senti_reader(sentiment, stemmer = False):
    """Load and tokenize every review of one sentiment class.

    The reviews are the '.txt' files found in the sub-folder named
    `sentiment` of 'review_polarity.v2/txt_sentoken/'; positive and negative
    reviews are therefore expected to live in separate folders. Files are
    processed in sorted file-name order, so the position of a review in the
    result follows its file name.

    Parameters
    ----------
    sentiment : str
        Name of the sub-folder to read (the notebook uses 'pos' and 'neg').
    stemmer : bool, optional
        When true, every token is stemmed with the Porter stemmer.

    Returns
    -------
    list of list of str
        One token list per review. Punctuation characters are removed from
        every token and tokens that end up empty are dropped.
    """
    folder = os.path.join('review_polarity.v2/txt_sentoken/', sentiment)
    files = [os.path.join(folder, x) for x in sorted(os.listdir(folder)) if x.endswith('.txt')]
    #loop-invariant helpers are built once instead of once per file
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    #the stemmer object gets its own name so the boolean flag is not overwritten
    porter = PorterStemmer() if stemmer else None
    out = []
    for path in files:
        data = read_files(path)
        #remove extra spaces, then tokenize
        review = word_tokenize(data.strip())
        #remove punctuation, then drop the tokens that became empty
        clean_review = [punctuation.sub('', word) for word in review]
        clean_review = [word for word in clean_review if word]
        if porter is not None:
            clean_review = [porter.stem(token) for token in clean_review]
        out.append(clean_review)
    return out


def show_most_informative_features(vectorizer, classifier, n=10):
    """Print the n highest-count features of each class.

    Features are ranked by `classifier.feature_count_`, i.e. by the count
    accumulated for each feature within a class during fitting. The ranking
    is by count only; it does not measure how well a word separates the
    two classes.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must expose `get_feature_names_out()` or the older
        `get_feature_names()`.
    classifier : fitted classifier
        Must expose `classes_` and `feature_count_`. Row 1 is reported as
        the positive class and row 0 as the negative class.
    n : int, optional
        Number of features printed per class.

    Returns
    -------
    None
        The result is printed.
    """
    class_labels = classifier.classes_
    #get_feature_names() was removed in scikit-learn 1.2: prefer the current
    #accessor and fall back to the old one for older installations
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    #sorting (count, name) pairs in reverse puts the largest counts first
    topn_pos_class = sorted(zip(classifier.feature_count_[1], feature_names), reverse=True)[:n]
    topn_neg_class = sorted(zip(classifier.feature_count_[0], feature_names), reverse=True)[:n]

    print("Important words in positive reviews")
    for coef, feature in topn_pos_class:
        print(class_labels[1], coef, feature)
    print("-----------------------------------------")
    print("Important words in negative reviews")
    for coef, feature in topn_neg_class:
        print(class_labels[0], coef, feature)


def filter_adj_adv(text):
    """Keep only the adjectives and adverbs of a single tokenized review.

    The tokens are tagged with `pos_tag`; a token is kept when its tag is
    one of JJ, JJR, JJS, RB, RBR or RBS.

    Parameters
    ----------
    text : list of str
        Tokens of one review.

    Returns
    -------
    list of str
        The kept tokens, in their original order.
    """
    wanted_tags = ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
    kept = []
    for word, tag in pos_tag(text):
        if tag in wanted_tags:
            kept.append(word)
    return kept


def pos_neg_word_ratio(review, positive_words, negative_words):
    """Return the ratio of positive words over negative words in a review.

    Every token of the review is counted, repetitions included; a token
    present in both word collections counts on both sides.

    Parameters
    ----------
    review : iterable of str
        Tokens of one review.
    positive_words : iterable of str
        Words considered positive.
    negative_words : iterable of str
        Words considered negative.

    Returns
    -------
    int or float
        Number of positive tokens divided by number of negative tokens.
        When the review holds no negative token the division is impossible
        and the number of positive tokens is returned instead.
    """
    #set membership is constant time; scanning a list of thousands of words
    #for every token of every review is needlessly slow
    if not isinstance(positive_words, (set, frozenset)):
        positive_words = frozenset(positive_words)
    if not isinstance(negative_words, (set, frozenset)):
        negative_words = frozenset(negative_words)
    pos = 0
    neg = 0
    for word in review:
        if word in positive_words:
            pos += 1
        if word in negative_words:
            neg += 1
    if neg != 0:
        return pos/neg
    return pos

## Problem 1

In [2]:
#load in memory all positive and negative reviews
pos = senti_reader('pos')
neg = senti_reader('neg')

#perform train-test split
#notice that we have sorted reviews in list by filename
#so we can take first 900 as train and last 100 as test and we will
#have the required split ('cv9XX' reviews are in test)
n_train, n_test = 900, 100
train_pos = pos[:n_train]
test_pos = pos[n_train:n_train + n_test]
train_neg = neg[:n_train]
test_neg = neg[n_train:n_train + n_test]

#make train and test set, combining positive and negative reviews
train = train_pos + train_neg
test = test_pos + test_neg
#make labels for train and test; they are derived from the actual split
#so that labels and reviews cannot get out of step
train_labels = [1] * len(train_pos) + [0] * len(train_neg)
test_labels = [1] * len(test_pos) + [0] * len(test_neg)

#fail early if a folder holds fewer reviews than the split expects
assert len(train) == 2 * n_train, 'unexpected number of training reviews'
assert len(test) == 2 * n_test, 'unexpected number of test reviews'

#small sanity check: want as many observations as labels
len(test_labels) == len(test)

True

#### Model 1

In [3]:
#binary bag of words: a feature only records whether a word occurs in a review
vectorizer = CountVectorizer(binary=True)
train_features = vectorizer.fit_transform([' '.join(tokens) for tokens in train])
test_features = vectorizer.transform([' '.join(tokens) for tokens in test])

#fit the Naive Bayes classifier (fit returns the fitted estimator)
nb_clf = MultinomialNB().fit(train_features, train_labels)

#predict the held-out reviews and measure the accuracy
predictions = nb_clf.predict(test_features)
accuracy = accuracy_score(test_labels, predictions)
print('Accuracy is {}%\n'.format(100 * accuracy))

#show the 5 highest-count words of each class
show_most_informative_features(vectorizer, nb_clf, 5)

Accuracy is 86.0%

Important words in positive reviews
1 900.0 the
1 900.0 of
1 899.0 to
1 899.0 is
1 899.0 and
-----------------------------------------
Important words in negative reviews
0 899.0 the
0 899.0 of
0 899.0 and
0 898.0 to
0 898.0 is


#### Model 2

In [4]:
#plain bag of words: a feature holds the number of occurrences of a word
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform([' '.join(tokens) for tokens in train])
test_features = vectorizer.transform([' '.join(tokens) for tokens in test])

#fit the Naive Bayes classifier (fit returns the fitted estimator)
nb_clf = MultinomialNB().fit(train_features, train_labels)

#predict the held-out reviews and measure the accuracy
predictions = nb_clf.predict(test_features)
accuracy = accuracy_score(test_labels, predictions)
print('Accuracy is {}%\n'.format(100 * accuracy))

#show the 5 highest-count words of each class
show_most_informative_features(vectorizer, nb_clf, 5)

Accuracy is 82.5%

Important words in positive reviews
1 37055.0 the
1 17716.0 and
1 16623.0 of
1 14733.0 to
1 12891.0 is
-----------------------------------------
Important words in negative reviews
0 31363.0 the
0 13981.0 and
0 13798.0 of
0 13784.0 to
0 10408.0 is


#### Model 3

In [5]:
#reduce every review to its adjectives and adverbs
train_adj_adv = list(map(filter_adj_adv, train))
test_adj_adv = list(map(filter_adj_adv, test))

#count the occurrences of the remaining words
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform([' '.join(tokens) for tokens in train_adj_adv])
test_features = vectorizer.transform([' '.join(tokens) for tokens in test_adj_adv])

#fit the Naive Bayes classifier (fit returns the fitted estimator)
nb_clf = MultinomialNB().fit(train_features, train_labels)

#predict the held-out reviews and measure the accuracy
predictions = nb_clf.predict(test_features)
accuracy = accuracy_score(test_labels, predictions)
print('Accuracy is {}%\n'.format(100 * accuracy))

#show the 5 highest-count words of each class
show_most_informative_features(vectorizer, nb_clf, 5)

Accuracy is 85.0%

Important words in positive reviews
1 2643.0 not
1 1660.0 nt
1 1632.0 more
1 1289.0 so
1 1247.0 most
-----------------------------------------
Important words in negative reviews
0 2416.0 not
0 2128.0 nt
0 1482.0 so
0 1391.0 just
0 1361.0 more


#### Model 4

In [6]:
#tf-idf weights with sublinear tf; English stop words, words found in fewer
#than 5 reviews and words found in more than 80% of the reviews are dropped
vectorizer = TfidfVectorizer(min_df = 5, max_df = 0.8, stop_words='english', sublinear_tf=True)
train_features = vectorizer.fit_transform([' '.join(tokens) for tokens in train])
test_features = vectorizer.transform([' '.join(tokens) for tokens in test])

#fit the Naive Bayes classifier (fit returns the fitted estimator)
nb_clf = MultinomialNB().fit(train_features, train_labels)

#predict the held-out reviews and measure the accuracy
predictions = nb_clf.predict(test_features)
accuracy = accuracy_score(test_labels, predictions)
print('Accuracy is {}%\n'.format(100 * accuracy))

#show the 5 highest-weight words of each class
show_most_informative_features(vectorizer, nb_clf, 5)

Accuracy is 86.5%

Important words in positive reviews
1 22.492825932912133 movie
1 18.509989709201456 like
1 17.841205636352782 does
1 16.896498279193935 story
1 16.830390064481563 just
-----------------------------------------
Important words in negative reviews
0 27.110438636527537 movie
0 21.182340191886198 like
0 19.920244154434403 just
0 18.74488216623521 bad
0 17.754498252923117 does


#### Model 5

In [7]:
#binary bag of bigrams: a feature records whether a word pair occurs in a review
vectorizer = CountVectorizer(binary=True, ngram_range=(2,2))
train_features = vectorizer.fit_transform([' '.join(tokens) for tokens in train])
test_features = vectorizer.transform([' '.join(tokens) for tokens in test])

#fit the Naive Bayes classifier (fit returns the fitted estimator)
nb_clf = MultinomialNB().fit(train_features, train_labels)

#predict the held-out reviews and measure the accuracy
predictions = nb_clf.predict(test_features)
accuracy = accuracy_score(test_labels, predictions)
print('Accuracy is {}%\n'.format(100 * accuracy))

#show the 5 highest-count bigrams of each class
show_most_informative_features(vectorizer, nb_clf, 5)

Accuracy is 84.0%

Important words in positive reviews
1 845.0 of the
1 786.0 in the
1 682.0 the film
1 657.0 to the
1 635.0 and the
-----------------------------------------
Important words in negative reviews
0 828.0 of the
0 796.0 in the
0 640.0 the film
0 619.0 to be
0 578.0 to the


Model 4 is the best-performing model (86.5% accuracy) and model 2 is the worst (82.5%). Looking at the 5 most important words for positive and negative reviews, we notice that the two lists overlap substantially in every model and that most of the words are stop words. This is expected, because the lists rank words by how often they occur in each class, not by how well they separate the two classes.   
Moving beyond the purely numerical evaluation based on accuracy, no model produces a list of important words that clearly distinguishes positive from negative reviews, so all the models perform poorly from this point of view.

## Problem 2

In [8]:
#load in memory all positive and negative reviews, this time use Porter's stemmer
pos_stem = senti_reader('pos', stemmer = True)
neg_stem = senti_reader('neg', stemmer = True)

#perform train-test split
#notice that we have sorted reviews in list by filename
#so we can take first 900 as train and last 100 as test and we will
#have the required split ('cv9XX' reviews are in test)
n_train, n_test = 900, 100
train_pos_stem = pos_stem[:n_train]
test_pos_stem = pos_stem[n_train:n_train + n_test]
train_neg_stem = neg_stem[:n_train]
test_neg_stem = neg_stem[n_train:n_train + n_test]

#make train and test set, combining positive and negative reviews
train_stem = train_pos_stem + train_neg_stem
test_stem = test_pos_stem + test_neg_stem
#make labels for train and test; they are derived from the actual split
#so that labels and reviews cannot get out of step
train_labels_stem = [1] * len(train_pos_stem) + [0] * len(train_neg_stem)
test_labels_stem = [1] * len(test_pos_stem) + [0] * len(test_neg_stem)

#fail early if a folder holds fewer reviews than the split expects
assert len(train_stem) == 2 * n_train, 'unexpected number of training reviews'
assert len(test_stem) == 2 * n_test, 'unexpected number of test reviews'

#small sanity check: want as many observations as labels
len(test_labels_stem) == len(test_stem)

True

#### Model 1

In [9]:
#binary bag of words on the stemmed reviews
vectorizer = CountVectorizer(binary=True)
train_features_stem = vectorizer.fit_transform([' '.join(tokens) for tokens in train_stem])
test_features_stem = vectorizer.transform([' '.join(tokens) for tokens in test_stem])

#fit the Naive Bayes classifier (fit returns the fitted estimator)
nb_clf = MultinomialNB().fit(train_features_stem, train_labels_stem)

#predict the held-out reviews and measure the accuracy
predictions = nb_clf.predict(test_features_stem)
accuracy = accuracy_score(test_labels_stem, predictions)
print('Accuracy is {}%\n'.format(100 * accuracy))

#show the 5 highest-count stems of each class
show_most_informative_features(vectorizer, nb_clf, 5)

Accuracy is 85.5%

Important words in positive reviews
1 900.0 the
1 900.0 of
1 899.0 to
1 899.0 is
1 899.0 and
-----------------------------------------
Important words in negative reviews
0 899.0 the
0 899.0 of
0 899.0 and
0 898.0 to
0 898.0 is


#### Model 2

In [10]:
#plain bag of words on the stemmed reviews
vectorizer = CountVectorizer()
train_features_stem = vectorizer.fit_transform([' '.join(tokens) for tokens in train_stem])
test_features_stem = vectorizer.transform([' '.join(tokens) for tokens in test_stem])

#fit the Naive Bayes classifier (fit returns the fitted estimator)
nb_clf = MultinomialNB().fit(train_features_stem, train_labels_stem)

#predict the held-out reviews and measure the accuracy
predictions = nb_clf.predict(test_features_stem)
accuracy = accuracy_score(test_labels_stem, predictions)
print('Accuracy is {}%\n'.format(100 * accuracy))

#show the 5 highest-count stems of each class
show_most_informative_features(vectorizer, nb_clf, 5)

Accuracy is 83.5%

Important words in positive reviews
1 37055.0 the
1 17716.0 and
1 16627.0 of
1 14733.0 to
1 12891.0 is
-----------------------------------------
Important words in negative reviews
0 31363.0 the
0 13981.0 and
0 13799.0 of
0 13784.0 to
0 10408.0 is


#### Model 3

In [11]:
#reduce every stemmed review to the tokens tagged as adjectives or adverbs
train_adj_adv_stem = list(map(filter_adj_adv, train_stem))
test_adj_adv_stem = list(map(filter_adj_adv, test_stem))

#count the occurrences of the remaining stems
vectorizer = CountVectorizer()
train_features_stem = vectorizer.fit_transform([' '.join(tokens) for tokens in train_adj_adv_stem])
test_features_stem = vectorizer.transform([' '.join(tokens) for tokens in test_adj_adv_stem])

#fit the Naive Bayes classifier (fit returns the fitted estimator)
nb_clf = MultinomialNB().fit(train_features_stem, train_labels_stem)

#predict the held-out reviews and measure the accuracy
predictions = nb_clf.predict(test_features_stem)
accuracy = accuracy_score(test_labels_stem, predictions)
print('Accuracy is {}%\n'.format(100 * accuracy))

#show the 5 highest-count stems of each class
show_most_informative_features(vectorizer, nb_clf, 5)

Accuracy is 82.5%

Important words in positive reviews
1 2656.0 not
1 1880.0 hi
1 1633.0 more
1 1590.0 nt
1 1269.0 so
-----------------------------------------
Important words in negative reviews
0 2420.0 not
0 1976.0 nt
0 1449.0 so
0 1391.0 just
0 1368.0 hi


#### Model 4

In [12]:
#tf-idf weights with sublinear tf on the stemmed reviews; English stop words,
#stems found in fewer than 5 reviews or in more than 80% of them are dropped
vectorizer = TfidfVectorizer(min_df = 5, max_df = 0.8, stop_words='english', sublinear_tf=True)
train_features_stem = vectorizer.fit_transform([' '.join(tokens) for tokens in train_stem])
test_features_stem = vectorizer.transform([' '.join(tokens) for tokens in test_stem])

#fit the Naive Bayes classifier (fit returns the fitted estimator)
nb_clf = MultinomialNB().fit(train_features_stem, train_labels_stem)

#predict the held-out reviews and measure the accuracy
predictions = nb_clf.predict(test_features_stem)
accuracy = accuracy_score(test_labels_stem, predictions)
print('Accuracy is {}%\n'.format(100 * accuracy))

#show the 5 highest-weight stems of each class
show_most_informative_features(vectorizer, nb_clf, 5)

Accuracy is 84.5%

Important words in positive reviews
1 23.34833590332707 wa
1 21.172466772419387 charact
1 20.7167222993606 like
1 19.75065962931403 make
1 19.33995912700573 time
-----------------------------------------
Important words in negative reviews
0 25.890721206769395 wa
0 23.61681537014885 like
0 21.61590003839188 charact
0 21.10268885729731 just
0 19.952151004324193 bad


#### Model 5

In [13]:
#binary bag of bigrams on the stemmed reviews
vectorizer = CountVectorizer(binary=True, ngram_range=(2,2))
train_features_stem = vectorizer.fit_transform([' '.join(tokens) for tokens in train_stem])
test_features_stem = vectorizer.transform([' '.join(tokens) for tokens in test_stem])

#fit the Naive Bayes classifier (fit returns the fitted estimator)
nb_clf = MultinomialNB().fit(train_features_stem, train_labels_stem)

#predict the held-out reviews and measure the accuracy
predictions = nb_clf.predict(test_features_stem)
accuracy = accuracy_score(test_labels_stem, predictions)
print('Accuracy is {}%\n'.format(100 * accuracy))

#show the 5 highest-count bigrams of each class
show_most_informative_features(vectorizer, nb_clf, 5)

Accuracy is 83.5%

Important words in positive reviews
1 845.0 of the
1 786.0 in the
1 684.0 the film
1 657.0 to the
1 635.0 and the
-----------------------------------------
Important words in negative reviews
0 828.0 of the
0 796.0 in the
0 643.0 the film
0 621.0 to be
0 578.0 to the


Looking at the results, using the Porter stemmer does not seem worthwhile: the runtime is longer and the accuracy of the models does not change substantially (in fact, it decreases for four of the five models).

## Problem 3

In [None]:
#the lexicon is a tab-separated file without header: word, category, 0/1 flag
NRC = pd.read_csv('NRC_Emotion.txt', sep = '\t', names = ['TargetWord', 'AffectCategory', 'AssociationFlag'])
#keep the rows of the two relevant categories (positive and negative)
#whose flag says that the word is associated with the category
NRC = NRC[NRC.AffectCategory.isin(['positive', 'negative']) & (NRC.AssociationFlag == 1)]
positive_words = NRC.loc[NRC.AffectCategory == 'positive', 'TargetWord'].tolist()
negative_words = NRC.loc[NRC.AffectCategory == 'negative', 'TargetWord'].tolist()
#size of each word list
for word_list in (positive_words, negative_words):
    print(len(word_list))

2312
3324


In [None]:
#get ratios in train and test data
train_ratios = [pos_neg_word_ratio(review, positive_words, negative_words) for review in train]
test_ratios = [pos_neg_word_ratio(review, positive_words, negative_words) for review in test]

#train Naive Bayes classifier
#MultinomialNB cannot be used with a single feature column: the per-class
#probability of the only feature is always 1, so every review would receive
#the class prior and the ratio would be ignored. GaussianNB models the
#continuous ratio itself.
nb_clf = GaussianNB()
#reshape(-1, 1) turns the list of ratios into a one-column feature matrix
nb_clf.fit(np.array(train_ratios).reshape(-1, 1), train_labels)

#get predictions
predictions = nb_clf.predict(np.array(test_ratios).reshape(-1, 1))

#evaluate classifier
accuracy = accuracy_score(test_labels, predictions)
print('Accuracy is {}%\n'.format(accuracy*100))

This is by far the worst-performing model. One possible explanation lies in the size of the dictionary: we only have 2312 positive words and 3324 negative words, and there is no guarantee that these words cover the terms normally used to review a movie. In addition, each review is reduced to a single number, which discards most of the information available to the bag-of-words models. These factors could contribute to the poor performance of this model.

## Problem 4

I will focus on Model 1 only; however, most of the considerations presented here can also be applied to the other models to try to improve their performance.

In [None]:
#start from NLTK's English stop words
updated_SW = list(stopwords.words('english'))

#add words that are common in movie reviews regardless of the sentiment
updated_SW.extend([
    "film",
    "movie",
    "character",
    "characters",
    "plot",
    "story",
    "time",
    "like",
    "would",
    "could",
])

In [None]:
#binary bag of words without the words of the custom stop word list
vectorizer = CountVectorizer(binary=True, stop_words=updated_SW)
train_features = vectorizer.fit_transform([' '.join(tokens) for tokens in train])
test_features = vectorizer.transform([' '.join(tokens) for tokens in test])

#fit the Naive Bayes classifier (fit returns the fitted estimator)
nb_clf = MultinomialNB().fit(train_features, train_labels)

#predict the held-out reviews and measure the accuracy
predictions = nb_clf.predict(test_features)
accuracy = accuracy_score(test_labels, predictions)
print('Accuracy is {}%\n'.format(100 * accuracy))

#show the 5 highest-count words of each class
show_most_informative_features(vectorizer, nb_clf, 5)

Intuition suggests that removing stop words would help the performance of the model. Surprisingly, this is not the case, as accuracy drops after removing stop words. Marginal improvements appear when we start using a custom list of stop words that adds words that are often found in movie reviews and have no bearing on the sentiment of the review.

In [None]:
#binary bag of words restricted to words found in at least 5 reviews
#and in at most 75% of the reviews
vectorizer = CountVectorizer(binary=True, max_df=0.75, min_df=5)
train_features = vectorizer.fit_transform([' '.join(tokens) for tokens in train])
test_features = vectorizer.transform([' '.join(tokens) for tokens in test])

#fit the Naive Bayes classifier (fit returns the fitted estimator)
nb_clf = MultinomialNB().fit(train_features, train_labels)

#predict the held-out reviews and measure the accuracy
predictions = nb_clf.predict(test_features)
accuracy = accuracy_score(test_labels, predictions)
print('Accuracy is {}%\n'.format(100 * accuracy))

#show the 5 highest-count words of each class
show_most_informative_features(vectorizer, nb_clf, 5)

We can improve the performance of the model by tuning the `max_df` and `min_df` parameters.   
`max_df = 0.75` allows us to automatically detect and filter stop words based on the document frequency of terms within the corpus. When building the vocabulary, terms with a document frequency strictly higher than 75% are ignored.   
`min_df = 5` allows us to remove infrequent words. When building the vocabulary, terms that appear in fewer than 5 documents are ignored.

Normalization could also help improve the performance of this model, as the length of reviews varies drastically across different people.