## Baseline

In [63]:
import tarfile
import nltk
import sys
import random
import numpy as np
import sklearn
import os
from sklearn.model_selection import KFold, ShuffleSplit
from sklearn.metrics import precision_recall_fscore_support, f1_score, precision_score, recall_score, accuracy_score
import string
import re
from sklearn.model_selection import train_test_split
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.metrics import *
import collections

In [64]:
polaritytar = tarfile.open("../Data/review_polarity.tar.gz", "r")
polaritytar.extractall('../Data/Polarity_Data')

In [65]:
# lines negative reviews
neglines = []
for nfilename in os.listdir('../Data/Polarity_Data/txt_sentoken/neg'):
    openFile = open(('../Data/Polarity_Data/txt_sentoken/neg/' + nfilename),"r")
    neglines = openFile.readlines()

# lines positive reviews
poslines = []
for pfilename in os.listdir('../Data/Polarity_Data/txt_sentoken/pos'):
    openFile = open(('../Data/Polarity_Data/txt_sentoken/pos/' + pfilename),"r")
    poslines = openFile.readlines()
    

In [66]:
# tokens positive reviews
poslines_tokens = []
for line in poslines:
    l = []
    for word in line.split():
        l.append(word)
    poslines_tokens.append(l)
    
# tokens negative reviews
neglines_tokens = []
for line in neglines:
    l = []
    for word in line.split():
        l.append(word)
    neglines_tokens.append(l)

In [67]:
### helper function to remove non-alphanumeric characters and lowercase each token ###
def clean_tokens(tokens):
    cleaned_tokens = []
    
    for token in tokens:
        cleaned_tokens.append(token.lower())

    return cleaned_tokens

In [68]:
positive_cleaned_tokens = []
negative_cleaned_tokens = []

for tokens in poslines_tokens:
    positive_cleaned_tokens.append(clean_tokens(tokens))

for tokens in neglines_tokens:
    negative_cleaned_tokens.append(clean_tokens(tokens))

In [69]:
def create_model(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tokens)

In [70]:
positive_tokens_for_model = create_model(positive_cleaned_tokens)
negative_tokens_for_model = create_model(negative_cleaned_tokens)

In [71]:
positive_dataset = [(t,"Positive") for t in positive_tokens_for_model]
negative_dataset = [(t,"Negative") for t in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset


In [72]:
np_dataset = np.array(dataset)

In [73]:
# use k-fold cross validation with k = 10 to train and test
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
mean_accuracy, mean_precision = list(), list()

for train_i, test_i in kfold.split(np_dataset):
    
    xtrain, xtest = np_dataset[train_i], np_dataset[test_i]
    classifier = NaiveBayesClassifier.train(xtrain)

    mean_accuracy.append(classify.accuracy(classifier, xtest))
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(xtest):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)   

    mean_precision.append(precision(refsets['Positive'], testsets['Positive']))
    mean_precision.append(precision(refsets['Negative'], testsets['Negative']))

# print the mean accuracy across all the folds
print("Accuracy:", np.mean(mean_accuracy))
print("Precision:", np.mean(mean_precision))


Accuracy: 0.8272727272727274
Precision: 0.8582738095238096


## Improved

In [74]:
import tarfile
import collections
import nltk
import sys
import random
import string
import re
import os
import numpy as np
import sklearn
from sklearn.model_selection import KFold, ShuffleSplit
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from nltk import classify
from nltk.classify import SklearnClassifier
from nltk import NaiveBayesClassifier, DecisionTreeClassifier
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.metrics.scores import precision, recall
from nltk.stem import WordNetLemmatizer

In [75]:
polaritytar = tarfile.open("../Data/review_polarity.tar.gz", "r")
polaritytar.extractall('../Data/Polarity_Data')

nrctar = tarfile.open("../Data/NRC-Sentiment-Emotion-Lexicons.tar.gz", 'r')
nrctar.extractall('../Data/NRC_Data')

In [76]:
# lines negative reviews
neglines = []
for nfilename in os.listdir('../Data/Polarity_Data/txt_sentoken/neg'):
    open_file = open(('../Data/Polarity_Data/txt_sentoken/neg/' + nfilename),"r")
    neglines = open_file.readlines()

# lines positive reviews
poslines = []
for pfilename in os.listdir('../Data/Polarity_Data/txt_sentoken/pos'):
    open_file = open(('../Data/Polarity_Data/txt_sentoken/pos/' + pfilename),"r")
    poslines = open_file.readlines()
    

In [77]:
intensity_file = open('../Data/NRC_Data/NRC-Sentiment-Emotion-Lexicons/NRC-Sentiment-Emotion-Lexicons/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt')
intensity_lines = intensity_file.readlines()

In [78]:
lemmatizer = WordNetLemmatizer()

In [79]:
word_emotions = dict()
for line in intensity_lines[1:]:
    features = line.split()
    if features[2] == '1':
        word_emotion = (lemmatizer.lemmatize(features[0]), features[1])
        word_emotions.update({word_emotion})

In [80]:
# tokens positive reviews
poslines_tokens = []
for line in poslines:
    l = []
    for word in line.split():
        l.append(word)
    poslines_tokens.append(l)

# tokens negative reviews
neglines_tokens = []
for line in neglines:
    l = []
    for word in line.split():
        l.append(word)
    neglines_tokens.append(l)

In [81]:
def clean_tokens(tokens):

    cleaned_tokens = []
    for token in tokens:
        if len(token) != 0 and token not in string.punctuation: 
            cleaned_tokens.append(lemmatizer.lemmatize(token.lower()))
    return cleaned_tokens

In [82]:
positive_cleaned_tokens = []
negative_cleaned_tokens = []

for tokens in poslines_tokens:
    positive_cleaned_tokens.append(clean_tokens(tokens))

for tokens in neglines_tokens:
    negative_cleaned_tokens.append(clean_tokens(tokens))

In [83]:
def create_model(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tokens)

In [84]:
positive_tokens_for_model = create_model(positive_cleaned_tokens)
negative_tokens_for_model = create_model(negative_cleaned_tokens)

In [85]:
positive_dataset = [(t,"Positive") for t in positive_tokens_for_model]
negative_dataset = [(t,"Negative") for t in negative_tokens_for_model]


In [86]:
positive_emotions = ['positive', 'anticipation', 'joy', 'surprise', 'trust']
negative_emotions = ['anger', 'disgust', 'fear', 'negative', 'sadness']

In [87]:
pos_to_remove = list()
for (review, sentiment) in positive_dataset:
    for word in review:
        if word in word_emotions and word_emotions[word] in negative_emotions:
            pos_to_remove.append(word)

for (review, sentiment) in positive_dataset:
    for neg_word in pos_to_remove:
        if neg_word in review.keys():
            review.pop(neg_word)


In [88]:
dataset = positive_dataset + negative_dataset


In [89]:
np_dataset = np.array(dataset)

In [90]:
kfold = KFold(n_splits=9, shuffle=True, random_state=1)
nb_mean_accuracy, dt_mean_accuracy, bern_mean_accuracy = list(), list(), list()
nb_mean_precision, dt_mean_precision, bern_mean_precision = list(), list(), list()

for train, test in kfold.split(np_dataset):
    # naive bayes classifier
    nb_classifier = NaiveBayesClassifier.train(np_dataset[train])
    nb_mean_accuracy.append(classify.accuracy(nb_classifier, np_dataset[test]))

    # decitions tree classifier
    dt_classifier = DecisionTreeClassifier.train(np_dataset[train])
    dt_mean_accuracy.append(classify.accuracy(dt_classifier, np_dataset[test]))

    # bernoulli classifier
    bern_classifier = SklearnClassifier(BernoulliNB()).train(np_dataset[train])
    bern_mean_accuracy.append(classify.accuracy(bern_classifier, np_dataset[test]))
    
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    classifiers = [nb_classifier, dt_classifier, bern_classifier]

    for classifier in classifiers:
        for i, (feats, label) in enumerate(np_dataset[test]):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)   

        if classifier == nb_classifier:
            nb_mean_precision.append(precision(refsets['Positive'], testsets['Positive']))
            nb_mean_precision.append(precision(refsets['Negative'], testsets['Negative']))

        elif classifier == dt_classifier:
            dt_mean_precision.append(precision(refsets['Positive'], testsets['Positive']))
            dt_mean_precision.append(precision(refsets['Negative'], testsets['Negative']))

        elif classifier == bern_classifier:
            bern_mean_precision.append(precision(refsets['Positive'], testsets['Positive']))
            bern_mean_precision.append(precision(refsets['Negative'], testsets['Negative']))

# print the mean accuracy across all the folds for each classifier
print("Naive Bayes accuracy:", np.mean(nb_mean_accuracy))
print("Naive Bayes precision:", np.mean(nb_mean_precision))
print('\n')
print("Decision Tree accuracy:", np.mean(dt_mean_accuracy))
print("Decision Tree precision:", np.mean(dt_mean_precision))
print('\n')
print("Bernoulli accuracy:", np.mean(bern_mean_accuracy))
print("Bernoulli precision:", np.mean(bern_mean_precision))



Naive Bayes accuracy: 0.8552188552188551
Naive Bayes precision: 0.8728174603174602


Decision Tree accuracy: 0.7777777777777778
Decision Tree precision: 0.741247795414462


Bernoulli accuracy: 0.8367003367003368
Bernoulli precision: 0.7279982363315696
