In [None]:
# basically make functions for every feature
# then do the leave one out thing: call all functions but one

In [1]:
import xml.etree.ElementTree as ET
import gzip
import numpy as np
import pandas as pd
import tarfile
import collections
from nltk.tokenize import RegexpTokenizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import *
from nltk.stem.porter import *
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score

In [2]:
# initializing list of uncleaned reviews, list of labels (neg or pos) according to which folder the file is in

list_of_reviews_unclean = []
list_of_labels = []
tar = tarfile.open("../Data/review_polarity.tar.gz", "r:gz")
counter = 0

# looping through folder, 
# putting all review strings in list_of_reviews_unclean & all labels in another list_of_labels
for member in tar.getmembers():
    # checking that it's a normal txt file
    if member.isreg():
        temp_array = (member.name).split("/")
        # appending review string to list_of_reviews
        # adding folder name from file path to list_of_labels
        if (len(temp_array) >= 2):
            list_of_labels.append(temp_array[1])
            f = tar.extractfile(member)
            f_str = f.read().lower().decode("utf-8")
            list_of_reviews_unclean.append(f_str)






In [3]:
# function created to check whether a token is a number
def check_if_num(s):
    if re.search(r'\d', s):
        return True
    return False


In [4]:
print(check_if_num("___"))

False


In [5]:
#  CLEANING and PREPROCESSING THE DATA
#  1. Removing punctuation
#  2. Using PorterStemmer for stemming
#  3. Removing stop words
#  4. Removing words with length <= 2

def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')
list_of_reviews = []

for review in list_of_reviews_unclean:
    word_tokens = tokenizer.tokenize(review)
    # remove punctuation, stemming, removing stop words, removing words with length <= 2
    preprocessed_sentence = [stemmer.stem(w.strip("_")) for w in word_tokens if ((not w.lower() in stop_words) and (len(w)>2) and (check_if_num(w)==False) and (any([c!='_' for c in w]))  )]
    list_of_reviews.append(' '.join(preprocessed_sentence))

    
    
    

In [6]:
## Using CountVectorizer
analyzer=stemmed_words
CountVec = CountVectorizer(ngram_range=(1,1))
X = CountVec.fit_transform(list_of_reviews)
cv_dataframe=pd.DataFrame(X.toarray(), columns=CountVec.get_feature_names_out())
print((CountVec.get_feature_names_out()))

['a_night_at_the_roxburi' 'aaa' 'aaaaaaaaah' ... 'zwigoff' 'zyci'
 'zzzzzzz']


In [7]:
# Function borrowed from TA demo to take as input training and testing vectors and labels

def buildClassifiers(clf, X_train, X_test, y_train, y_test):

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
    precision = precision_score(y_test, y_pred, average="macro", zero_division=0)
    recall = recall_score(y_test, y_pred, average="macro", zero_division=0)

    return f1, precision, recall




In [8]:
########### GAUSSIAN NAIVE BAYES ###########

y = list_of_labels

# split train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y)

# Naive Bayes 
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predict Class
y_pred = classifier.predict(X_test)

# Accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

# Calculating and printing f1, precision, recall, accuracy 
f1, precision, recall = buildClassifiers(GaussianNB(), X_train, X_test, y_train, y_test )
print("GAUSSIAN NAIVE BAYES: ")
print("\tf1: ", f1)
print("\tprecision: ", precision)
print("\trecall: ", recall)
print("\taccuracy: ", accuracy)

GAUSSIAN NAIVE BAYES: 
	f1:  0.635947576451009
	precision:  0.6363927650234704
	recall:  0.6362356179289818
	accuracy:  0.636


In [9]:
########### Decision Tree Classifier ###########

y = list_of_labels

# split train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y)

# Decision Tree Classifier 
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(random_state=0, criterion='gini')
classifier.fit(X_train, y_train)

# Predict Class
y_pred = classifier.predict(X_test)

# Accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

# Calculating and printing f1, precision, recall, accuracy
f1, precision, recall = buildClassifiers(DecisionTreeClassifier(random_state=0, criterion='gini'), X_train, X_test, y_train, y_test )
print("DECISION TREE CLASSIFIER: ")
print("\tf1: ", f1)
print("\tprecision: ", precision)
print("\trecall: ", recall)
print("\taccuracy: ", accuracy)

DECISION TREE CLASSIFIER: 
	f1:  0.617876191886171
	precision:  0.6184463222584982
	recall:  0.6187370650238244
	accuracy:  0.618


In [10]:
########### SGD Classifier ###########

y = list_of_labels

# split train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y)

# SGD Classifier 
from sklearn.linear_model import SGDClassifier
classifier = SGDClassifier(loss = "log_loss")
classifier.fit(X_train, y_train)

# Predict Class
y_pred = classifier.predict(X_test)

# Accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

# Calculating and printing f1, precision, recall, accuracy
f1, precision, recall = buildClassifiers(SGDClassifier(loss = "log_loss"), X_train, X_test, y_train, y_test )
print("SGD CLASSIFIER: ")
print("\tf1: ", f1)
print("\tprecision: ", precision)
print("\trecall: ", recall)
print("\taccuracy: ", accuracy)

SGD CLASSIFIER: 
	f1:  0.8357871801855206
	precision:  0.8363895974715632
	recall:  0.8356142456982794
	accuracy:  0.832


In [11]:
########### Logistic Regression ###########

y = list_of_labels

# split train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y)

# Logistic Regression 
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(solver = "saga")
classifier.fit(X_train, y_train)

# Predict Class
y_pred = classifier.predict(X_test)

# Accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

# Calculating and printing f1, precision, recall, accuracy
f1, precision, recall = buildClassifiers(LogisticRegression(solver = "saga"), X_train, X_test, y_train, y_test )
print("LOGISTIC REGRESSION: ")
print("\tf1: ", f1)
print("\tprecision: ", precision)
print("\trecall: ", recall)
print("\taccuracy: ", accuracy)



LOGISTIC REGRESSION: 
	f1:  0.8238195912614518
	precision:  0.824187627246349
	recall:  0.8236894757903162
	accuracy:  0.824




In [12]:
########### PERCEPTRON ###########

y = list_of_labels

# split train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y)

# Perceptron 
from sklearn.linear_model import Perceptron

classifier = Perceptron()
classifier.fit(X_train, y_train)

# Predict Class
y_pred = classifier.predict(X_test)

# Accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

# Calculating and printing f1, precision, recall, accuracy
f1, precision, recall = buildClassifiers(Perceptron(), X_train, X_test, y_train, y_test )
print("PERCEPTRON: ")
print("\tf1: ", f1)
print("\tprecision: ", precision)
print("\trecall: ", recall)
print("\taccuracy: ", accuracy)

PERCEPTRON: 
	f1:  0.7982927498346001
	precision:  0.7987878787878788
	recall:  0.7979019964872138
	accuracy:  0.8
