In [None]:
from os import listdir
from os.path import isfile, join
import numpy as np
from mosestokenizer import *
from tqdm import tqdm
from joblib import Parallel, delayed
from nltk.stem import WordNetLemmatizer


import warnings
warnings.filterwarnings('ignore')

In [None]:
# AUXILIARY FUNCTIONS

def getFilesInFolder(mypath):
    """Return the paths of the regular files located directly in `mypath`.

    Each returned path is `mypath` joined with the entry name. Entries that
    are not regular files (e.g. sub-directories) are left out. The order is
    whatever `listdir` yields.
    """
    file_paths = []
    for entry in listdir(mypath):
        candidate = join(mypath, entry)
        if isfile(candidate):
            file_paths.append(candidate)
    return file_paths
    

def getReview(path):
    """Read the review stored at `path` and return it tokenized.

    The lines of the file are merged into a single string which is then
    passed to an English `MosesTokenizer`; the tokenizer's result is returned.
    """
    with open(path, 'r') as rev:
        # Strip only the line terminator: slicing with [:-1] would also eat a
        # real character when the last line has no trailing newline.
        lines = [line.rstrip('\n') for line in rev]
    # Join with a space so the last word of one line and the first word of
    # the next are not fused into a single token.
    review = " ".join(lines)
    with MosesTokenizer('en') as tokenize:
        return tokenize(review)

### Reading necessary files

In [None]:
# READING LEXICON

positive_lexicon_path = "lexicon/positive-words.txt"
negative_lexicon_path = "lexicon/negative-words.txt"

positive_words = []
negative_words = []

# Saving each line of the lexicon files into a list, one entry per line.
# Only the line terminator is stripped: slicing with [:-1] would truncate the
# last word whenever the file does not end with a newline.
# NOTE(review): blank or header lines, if the files contain any, are kept as
# entries - confirm this is harmless for the lookup.

with open(positive_lexicon_path, 'r', encoding='ISO-8859-1') as pos:
    for line in pos:
        positive_words.append(line.rstrip('\n'))

with open(negative_lexicon_path, 'r', encoding='ISO-8859-1') as neg:
    for line in neg:
        negative_words.append(line.rstrip('\n'))

In [None]:
# READING REVIEWS

positive_reviews_path = "reviews/pos/"
negative_reviews_path = "reviews/neg/"

positive_reviews = []
negative_reviews = []

# for each positive file
for path in tqdm(getFilesInFolder(positive_reviews_path)):
    positive_reviews.append(getReview(path))

# for each negative file
for path in tqdm(getFilesInFolder(negative_reviews_path)):
    negative_reviews.append(getReview(path))


def _asObjectArray(items):
    """Pack `items` into a 1-D object array holding one item per slot."""
    packed = np.empty(len(items), dtype=object)
    for position, item in enumerate(items):
        packed[position] = item
    return packed


# Reviews have different lengths, so they must live in 1-D object arrays to
# support boolean-mask indexing. Calling np.array() directly on such ragged
# lists raises ValueError on NumPy >= 1.24, hence the explicit construction.
positive_reviews = _asObjectArray(positive_reviews)
negative_reviews = _asObjectArray(negative_reviews)

In [None]:
# RANDOMLY SELECT 400 reviews as TEST SET

# Boolean mask with 600 True (train) and 400 False (test), shuffled by
# sampling all 1000 entries without replacement.
indexes = [True] * 600 + [False] * 400
train_indexes = np.random.choice(indexes, size=1000, replace=False)
test_indexes = ~train_indexes

# The same mask is applied to both classes.
pos_train, neg_train = positive_reviews[train_indexes], negative_reviews[train_indexes]
pos_test, neg_test = positive_reviews[test_indexes], negative_reviews[test_indexes]

In [None]:
# CLASSIFIER BY WORD COUNTS

def classifySentimentByCounting(review):
    """Classify a tokenized review by counting lexicon hits.

    Parameters
    ----------
    review : iterable of tokens
        Each token is looked up in `positive_words` and `negative_words`.

    Returns
    -------
    bool
        True (positive) when the review has at least as many positive hits as
        negative hits, False otherwise. Ties therefore count as positive.
    """
    # Membership tests on the lexicon lists are linear scans repeated for
    # every token; sets give the same answer with constant-time lookups.
    positive_set = set(positive_words)
    negative_set = set(negative_words)

    positives = 0
    negatives = 0

    for word in review:
        # A word present in both lexicons counts as positive only.
        if word in positive_set:
            positives += 1
        elif word in negative_set:
            negatives += 1
    return positives >= negatives

In [None]:
# KEEPING TRACK OF TP, FP, TN and FN, ON THE TEST SET, AND USING THEM FOR THE F-SCORE EVALUATION

pos_results = Parallel(n_jobs=-1)(delayed(classifySentimentByCounting)(review) for review in tqdm(pos_test))
neg_results = Parallel(n_jobs=-1)(delayed(classifySentimentByCounting)(review) for review in tqdm(neg_test))

# Positive reviews: a True prediction is a hit (TP), a False one is a miss (FN).
TP = sum(pos_results)
FN = len(pos_results) - TP

# Negative reviews: a True prediction is a false alarm (FP), a False one is
# a correct rejection (TN).
FP = sum(neg_results)
TN = len(neg_results) - FP


In [None]:
# EVALUATING F-SCORE & ACCURACY
# accuracy: correct decisions over all decisions
accuracy = (TP + TN) / (TP + FP + TN + FN)
# F1 written directly in terms of the confusion counts
f_score = TP / (TP + 0.5 * (FN + FP))
print("Accuracy on test set: {0:.4f}".format(accuracy))
print("F-score on test set: {0:.4f}".format(f_score))

## Running with 10 different splits to obtain a more valid estimate

In [None]:
# EVALUATE THE COUNTING CLASSIFIER ON A GIVEN TRAIN/TEST SPLIT

def runCountingClassification(pos_train, neg_train, pos_test, neg_test):
    """Evaluate the word-counting classifier on one test split.

    Parameters
    ----------
    pos_train, neg_train
        Not used by this function (the counting classifier is not trained);
        kept so the signature matches `runLogisticRegression`.
    pos_test, neg_test
        Tokenized positive / negative reviews to classify.

    Returns
    -------
    (accuracy, f_score)
        Both are also printed.
    """
    # CLASSIFIER BY WORD COUNTS

    # KEEPING TRACK OF TP, FP, TN and FN, ON THE TEST SET, AND USING THEM FOR THE F-SCORE EVALUATION

    pos_results = Parallel(n_jobs=-1)(delayed(classifySentimentByCounting)(review) for review in tqdm(pos_test))
    neg_results = Parallel(n_jobs=-1)(delayed(classifySentimentByCounting)(review) for review in tqdm(neg_test))

    # Positive reviews: True prediction = TP, False prediction = FN.
    TP = sum(pos_results)
    FN = len(pos_results) - TP

    # Negative reviews: True prediction = FP, False prediction = TN.
    FP = sum(neg_results)
    TN = len(neg_results) - FP

    accuracy = (TP+TN) / (TP+TN+FN+FP)
    f_score = TP / (TP + 0.5*(FP+FN))
    print("Accuracy count: \t{:.4f}".format(accuracy))
    print("F1-score: \t{:.4f}".format(f_score))
    return accuracy, f_score

In [None]:
accuracies = []
f_scores = []

for i in range(10):
    print("Run {}: ".format(i), end='')

    # runCountingClassification requires the four split arguments; calling it
    # without them raises TypeError. A fresh 600/400 split is drawn for every
    # run so that the runs actually differ from each other.
    train_mask = np.random.choice([True]*600 + [False]*400, 1000, replace=False)
    test_mask = np.invert(train_mask)

    acc, f1 = runCountingClassification(
        positive_reviews[train_mask], negative_reviews[train_mask],
        positive_reviews[test_mask], negative_reviews[test_mask])
    accuracies.append(acc)
    f_scores.append(f1)

In [None]:
# Mean and standard deviation of the collected scores.
print("Accuracy on test set: {:.4f} +/- {:.4f}".format(*(stat(accuracies) for stat in (np.mean, np.std))))
print("F-score on test set: {:.4f} +/- {:.4f}".format(*(stat(f_scores) for stat in (np.mean, np.std))))

# Task (ii): Logistic Regression

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# ADAPTING the data for the regressor, creating labels

def runLogisticRegression(pos_train, neg_train, pos_test, neg_test):
    """Fit a bag-of-words logistic regression and score it on the test split.

    The training reviews (positives labelled 1, negatives labelled 0) are used
    to fit both the `CountVectorizer` vocabulary and the classifier. The test
    reviews are only transformed and predicted.

    Returns
    -------
    (accuracy, f1)
        Scores on the test split; both are also printed.
    """

    def passthrough(tokens):
        # Reviews are already tokenized, so the vectorizer must not
        # preprocess or re-tokenize them.
        return tokens

    train_docs = np.concatenate([pos_train, neg_train])
    train_labels = [1]*len(pos_train) + [0]*len(neg_train)

    test_docs = np.concatenate([pos_test, neg_test])
    test_labels = [1]*len(pos_test) + [0]*len(neg_test)

    # vocabulary is learned from the training documents only
    bow = CountVectorizer(tokenizer=passthrough, preprocessor=passthrough)
    bow.fit(train_docs)

    model = LogisticRegression(random_state=42)
    model = model.fit(bow.transform(train_docs), train_labels)

    predictions = model.predict(bow.transform(test_docs))
    accuracy = accuracy_score(test_labels, predictions)
    f1 = f1_score(test_labels, predictions)

    print("\tAccuracy on test: {:.4f}".format(accuracy))
    print("\tF1-score on test: {:.4f}".format(f1))

    return accuracy, f1
   

In [None]:
accuracies = []
f_scores = []

for i in range(10):

    print("Run {}: ".format(i))

    # runLogisticRegression requires the four split arguments; calling it
    # without them raises TypeError. A fresh 600/400 split is drawn for every
    # run so that the runs actually differ from each other.
    train_mask = np.random.choice([True]*600 + [False]*400, 1000, replace=False)
    test_mask = np.invert(train_mask)

    acc, f1 = runLogisticRegression(
        positive_reviews[train_mask], negative_reviews[train_mask],
        positive_reviews[test_mask], negative_reviews[test_mask])
    accuracies.append(acc)
    f_scores.append(f1)

print("Accuracy on test set: {:.4f} +/- {:.4f}".format(np.mean(accuracies), np.std(accuracies)))
print("F-score on test set: {:.4f} +/- {:.4f}".format(np.mean(f_scores), np.std(f_scores)))

# Paired permutation test

In [None]:
# Run both classifiers on the same 25 random splits so that their scores can
# be compared pairwise.
accuracy_pairs = []
f1_score_pairs = []

for i in range(25):

    # fresh 600/400 train/test mask, shared by both classes and both models
    indexes = [True]*600+[False]*400
    train_indexes = np.random.choice(indexes, 1000, replace=False)
    test_indexes = np.invert(train_indexes)

    pos_train = positive_reviews[train_indexes]
    neg_train = negative_reviews[train_indexes]


    pos_test = positive_reviews[test_indexes]
    neg_test = negative_reviews[test_indexes]

    print("Counting", i)
    acc_count, f1_count = runCountingClassification(pos_train, neg_train, pos_test, neg_test)
    # Label printed before the run it announces, so the log reads
    # "label, then results" for both models.
    print("Logistic", i)
    acc_log, f1_log = runLogisticRegression(pos_train, neg_train, pos_test, neg_test)
    # column 0: counting classifier, column 1: logistic regression
    accuracy_pairs.append([acc_count, acc_log])
    f1_score_pairs.append([f1_count, f1_log])
    


In [None]:
# Rows are splits; column 0 holds the counting classifier's score and
# column 1 the logistic regression's.
accuracy_pairs = np.array(accuracy_pairs)
f1_score_pairs = np.array(f1_score_pairs)

print(accuracy_pairs.shape)

# scores of the first split
print("F1-score count:\t {:.4f}".format(f1_score_pairs[0][0]))
print("Accuracy count:\t {:.4f}".format(accuracy_pairs[0][0]))

print("F1-score log:\t {:.4f}".format(f1_score_pairs[0][1]))
print("Accuracy log:\t {:.4f}".format(accuracy_pairs[0][1]))

print("\n")

# mean and standard deviation over all splits
print("F1-score count:\t {:.4f} +/- {:.4f}".format(f1_score_pairs[:,0].mean(), f1_score_pairs[:,0].std()))
print("Accuracy count:\t {:.4f} +/- {:.4f}".format(accuracy_pairs[:,0].mean(), accuracy_pairs[:,0].std()))

print("F1-score log:\t {:.4f} +/- {:.4f}".format(f1_score_pairs[:,1].mean(), f1_score_pairs[:,1].std()))
print("Accuracy log:\t {:.4f} +/- {:.4f}".format(accuracy_pairs[:,1].mean(), accuracy_pairs[:,1].std()))


In [None]:
# PAIRED PERMUTATION (SIGN-FLIP) TEST ON THE PER-SPLIT SCORE DIFFERENCES

# per-split differences: column 1 minus column 0 of each pairs array
acc_diff = accuracy_pairs[:,1]-accuracy_pairs[:,0]
f1_diff = f1_score_pairs[:,1]-f1_score_pairs[:,0]

# observed statistics: average difference in accuracy / F1
mean_diff_acc = np.mean(acc_diff)
mean_diff_fscore = np.mean(f1_diff)

n_acc = 0
n_f1 = 0

n_permutations = 1000000

# null hypothesis: they are equal, so the sign of each paired difference is
# arbitrary and can be flipped at random

for i in tqdm(range(n_permutations)):

    # random sign (+1 / -1) for every pair
    shuffle_indexes = np.random.choice([1, -1], len(accuracy_pairs), replace=True)

    # two-sided: count permutations at least as extreme as the observation
    if abs(np.mean(acc_diff*shuffle_indexes)) >= abs(mean_diff_acc):
        n_acc += 1

    if abs(np.mean(f1_diff*shuffle_indexes)) >= abs(mean_diff_fscore):
        n_f1 += 1

# p-value estimate (count + 1) / (permutations + 1): the +1 stands for the
# observed arrangement itself and keeps the estimate strictly positive.
# Adding the mean difference to the count instead is not a valid p-value.
p_val_acc = (n_acc+1)/(n_permutations+1)
p_val_f1 = (n_f1+1)/(n_permutations+1)

print("Accuracy p-value on {} permutations:".format(n_permutations), p_val_acc)
print("F1 score p-value on {} permutations:".format(n_permutations), p_val_f1)