In [5]:
import xml.etree.ElementTree as ET
import gzip
import numpy as np
import pandas as pd
import tarfile
import collections
from nltk.tokenize import RegexpTokenizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import *
from nltk.stem.porter import *
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score

In [6]:
list_of_reviews_unclean = []
list_of_labels = []
tar = tarfile.open("../Data/review_polarity.tar.gz", "r:gz")
counter = 0

# looping through folder, 
# putting all review strings in list_of_reviews_unclean & all labels in another list_of_labels
for member in tar.getmembers():
    # checking that it's a normal txt file
    if member.isreg():
        temp_array = (member.name).split("/")
        if (len(temp_array) >= 2):
            list_of_labels.append(temp_array[1])
            f = tar.extractfile(member)
            f_str = f.read().lower().decode("utf-8")
            list_of_reviews_unclean.append(f_str)

In [7]:
def check_if_num(s):
    if re.search(r'\d', s):
        return True
    return False

In [8]:
#  CLEANING and PREPROCESSING THE DATA
#  1. Removing punctuation
#  2. Using PorterStemmer for stemming
#  3. Removing stop words
#  4. Removing words with length <= 2

def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')
list_of_reviews = []

for review in list_of_reviews_unclean:
    word_tokens = tokenizer.tokenize(review)
    preprocessed_sentence = [stemmer.stem(w.strip("_")) for w in word_tokens if ((not w.lower() in stop_words) and (len(w)>2) and (check_if_num(w)==False) and (any([c!='_' for c in w]))  )]
    list_of_reviews.append(' '.join(preprocessed_sentence))




In [9]:
## Using CountVectorizer
analyzer=stemmed_words
CountVec = CountVectorizer(ngram_range=(3,3))
X = CountVec.fit_transform(list_of_reviews)
# print(CountVec.get_feature_names_out())
cv_dataframe=pd.DataFrame(X.toarray(), columns=CountVec.get_feature_names_out())
print((CountVec.get_feature_names_out()))

['a_night_at_the_roxburi left exactli'
 'a_night_at_the_roxburi take alreadi' 'aaa minor leagu' ...
 'zyci master beauti' 'zyci zyci master' 'zzzzzzz critiqu pretti']


In [10]:
# Function borrowed from TA demo to take as input training and testing vectors and labels

def buildClassifiers(clf, X_train, X_test, y_train, y_test):

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
    precision = precision_score(y_test, y_pred, average="macro", zero_division=0)
    recall = recall_score(y_test, y_pred, average="macro", zero_division=0)

    return f1, precision, recall

In [11]:
########### GAUSSIAN NAIVE BAYES ###########

y = list_of_labels

# split train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y)

# Naive Bayes 
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predict Class
y_pred = classifier.predict(X_test)

# Accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

# Calculating and printing f1, precision, recall, accuracy 
f1, precision, recall = buildClassifiers(GaussianNB(), X_train, X_test, y_train, y_test )
print("GAUSSIAN NAIVE BAYES: ")
print("\tf1: ", f1)
print("\tprecision: ", precision)
print("\trecall: ", recall)
print("\taccuracy: ", accuracy)



GAUSSIAN NAIVE BAYES: 
	f1:  0.6979891276085939
	precision:  0.6980285161063193
	recall:  0.698
	accuracy:  0.698


In [12]:
########### SGD Classifier ###########

y = list_of_labels

# split train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y)

# SGD Classifier 
from sklearn.linear_model import SGDClassifier
classifier = SGDClassifier(loss = "log_loss")
classifier.fit(X_train, y_train)

# Predict Class
y_pred = classifier.predict(X_test)

# Accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

# Calculating and printing f1, precision, recall, accuracy
f1, precision, recall = buildClassifiers(SGDClassifier(loss = "log_loss"), X_train, X_test, y_train, y_test )
print("SGD CLASSIFIER: ")
print("\tf1: ", f1)
print("\tprecision: ", precision)
print("\trecall: ", recall)
print("\taccuracy: ", accuracy)



SGD CLASSIFIER: 
	f1:  0.6516300473135005
	precision:  0.6667005540588624
	recall:  0.6574386711686483
	accuracy:  0.634


In [13]:
########### PERCEPTRON ###########

y = list_of_labels

# split train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y)

# Perceptron 
from sklearn.linear_model import Perceptron

classifier = Perceptron()
classifier.fit(X_train, y_train)

# Predict Class
y_pred = classifier.predict(X_test)

# Accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

# Calculating and printing f1, precision, recall, accuracy
f1, precision, recall = buildClassifiers(Perceptron(), X_train, X_test, y_train, y_test )
print("PERCEPTRON: ")
print("\tf1: ", f1)
print("\tprecision: ", precision)
print("\trecall: ", recall)
print("\taccuracy: ", accuracy)

PERCEPTRON: 
	f1:  0.6606060606060606
	precision:  0.6633553362525325
	recall:  0.6604712378017463
	accuracy:  0.664
