In [2]:
import xml.etree.ElementTree as ET
import gzip
import numpy as np
import pandas as pd
import tarfile
import collections
from nltk.tokenize import RegexpTokenizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import *
from nltk.stem.porter import *
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tag import pos_tag
nltk.download('averaged_perceptron_tagger')
from sklearn.metrics import f1_score, precision_score, recall_score

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nehagogate/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
list_of_reviews_unclean = []
list_of_labels = []
tar = tarfile.open("../Data/review_polarity.tar.gz", "r:gz")
counter = 0

# looping through folder, 
# putting all review strings in list_of_reviews_unclean & all labels in another list_of_labels
for member in tar.getmembers():
    # checking that it's a normal txt file
    if member.isreg():
        temp_array = (member.name).split("/")
        if (len(temp_array) >= 2):
            list_of_labels.append(temp_array[1])
            f = tar.extractfile(member)
            f_str = f.read().lower().decode("utf-8")
            list_of_reviews_unclean.append(f_str)



In [4]:
def check_if_num(s):
    if re.search(r'\d', s):
        return True
    return False



In [5]:
#  CLEANING and PREPROCESSING THE DATA
#  1. Removing punctuation
#  2. Using PorterStemmer for stemming
#  3. Removing stop words
#  4. Removing words with length <= 2

def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')
list_of_reviews = []

for review in list_of_reviews_unclean:
    word_tokens = tokenizer.tokenize(review)
    # print(pos_tag(word_tokens))
    # preprocessed_sentence = [stemmer.stem(w.strip("_")) for w in word_tokens if ((not w.lower() in stop_words) and (len(w)>2) and (check_if_num(w)==False) and (any([c!='_' for c in w]))  )]
    preprocessed_sentence = [w[1] for w in (pos_tag(word_tokens))]
    list_of_reviews.append(' '.join(preprocessed_sentence))

# for r in (list_of_reviews[100:900]):
#     print(r,"\n")
    
    
    

In [15]:
## Using CountVectorizer
analyzer=stemmed_words
CountVec = CountVectorizer(ngram_range=(2,2))
X = CountVec.fit_transform(list_of_reviews)
# print(CountVec.get_feature_names_out())
cv_dataframe=pd.DataFrame(X.toarray(), columns=CountVec.get_feature_names_out())
print((CountVec.get_feature_names_out()))

['cc cc' 'cc cd' 'cc dt' 'cc ex' 'cc fw' 'cc in' 'cc jj' 'cc jjr' 'cc jjs'
 'cc md' 'cc nn' 'cc nnp' 'cc nns' 'cc pdt' 'cc prp' 'cc rb' 'cc rbr'
 'cc rbs' 'cc rp' 'cc to' 'cc uh' 'cc vb' 'cc vbd' 'cc vbg' 'cc vbn'
 'cc vbp' 'cc vbz' 'cc wdt' 'cc wp' 'cc wrb' 'cd cc' 'cd cd' 'cd dt'
 'cd ex' 'cd in' 'cd jj' 'cd jjr' 'cd jjs' 'cd md' 'cd nn' 'cd nns'
 'cd pdt' 'cd prp' 'cd rb' 'cd rbr' 'cd rbs' 'cd rp' 'cd to' 'cd vb'
 'cd vbd' 'cd vbg' 'cd vbn' 'cd vbp' 'cd vbz' 'cd wdt' 'cd wp' 'cd wrb'
 'dt cc' 'dt cd' 'dt dt' 'dt ex' 'dt fw' 'dt in' 'dt jj' 'dt jjr' 'dt jjs'
 'dt md' 'dt nn' 'dt nnp' 'dt nnps' 'dt nns' 'dt pdt' 'dt prp' 'dt rb'
 'dt rbr' 'dt rbs' 'dt rp' 'dt to' 'dt vb' 'dt vbd' 'dt vbg' 'dt vbn'
 'dt vbp' 'dt vbz' 'dt wdt' 'dt wp' 'dt wrb' 'ex cc' 'ex cd' 'ex dt'
 'ex ex' 'ex in' 'ex jj' 'ex md' 'ex nn' 'ex nns' 'ex pdt' 'ex prp'
 'ex rb' 'ex rbs' 'ex to' 'ex vb' 'ex vbd' 'ex vbg' 'ex vbn' 'ex vbp'
 'ex vbz' 'ex wdt' 'ex wp' 'ex wrb' 'fw cc' 'fw cd' 'fw dt' 'fw fw'
 'fw in' 'fw jj' 

In [16]:
# Function borrowed from TA demo to take as input training and testing vectors and labels
def buildClassifiers(clf, X_train, X_test, y_train, y_test):

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
    precision = precision_score(y_test, y_pred, average="macro", zero_division=0)
    recall = recall_score(y_test, y_pred, average="macro", zero_division=0)

    return f1, precision, recall

In [17]:
########### GAUSSIAN NAIVE BAYES ###########

y = list_of_labels

# split train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y)

# Naive Bayes 
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predict Class
y_pred = classifier.predict(X_test)

# Accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

# Calculating and printing f1, precision, recall, accuracy
f1, precision, recall = buildClassifiers(GaussianNB(), X_train, X_test, y_train, y_test )
print("GAUSSIAN NAIVE BAYES: ")
print("\tf1: ", f1)
print("\tprecision: ", precision)
print("\trecall: ", recall)
print("\taccuracy: ", accuracy)

GAUSSIAN NAIVE BAYES: 
	f1:  0.5882062263218579
	precision:  0.5909017046100846
	recall:  0.5895097286226318
	accuracy:  0.59


In [18]:
########### Decision Tree Classifier ###########

y = list_of_labels

# split train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y)

# Decision Tree Classifier 
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(random_state=0, criterion='gini')
classifier.fit(X_train, y_train)

# Predict Class
y_pred = classifier.predict(X_test)

# Accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

# Calculating and printing f1, precision, recall, accuracy
f1, precision, recall = buildClassifiers(DecisionTreeClassifier(random_state=0, criterion='gini'), X_train, X_test, y_train, y_test )
print("DECISION TREE CLASSIFIER: ")
print("\tf1: ", f1)
print("\tprecision: ", precision)
print("\trecall: ", recall)
print("\taccuracy: ", accuracy)

DECISION TREE CLASSIFIER: 
	f1:  0.5456928883925534
	precision:  0.5473925299506694
	recall:  0.5476804331362091
	accuracy:  0.546


In [19]:
########### SGD Classifier ###########

y = list_of_labels

# split train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y)

# SGD Classifier 
from sklearn.linear_model import SGDClassifier
classifier = SGDClassifier(loss = "log_loss")
classifier.fit(X_train, y_train)

# Predict Class
y_pred = classifier.predict(X_test)

# Accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

# Calculating and printing f1, precision, recall, accuracy
f1, precision, recall = buildClassifiers(SGDClassifier(loss = "log_loss"), X_train, X_test, y_train, y_test )
print("SGD CLASSIFIER: ")
print("\tf1: ", f1)
print("\tprecision: ", precision)
print("\trecall: ", recall)
print("\taccuracy: ", accuracy)

SGD CLASSIFIER: 
	f1:  0.6177045280774786
	precision:  0.6313920454545454
	recall:  0.6243697478991597
	accuracy:  0.63


In [20]:
########### Logistic Regression ###########

y = list_of_labels

# split train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y)

# Logistic Regression 
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(solver = "saga")
classifier.fit(X_train, y_train)

# Predict Class
y_pred = classifier.predict(X_test)

# Accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

# Calculating and printing f1, precision, recall, accuracy
f1, precision, recall = buildClassifiers(LogisticRegression(solver = "saga"), X_train, X_test, y_train, y_test )
print("LOGISTIC REGRESSION: ")
print("\tf1: ", f1)
print("\tprecision: ", precision)
print("\trecall: ", recall)
print("\taccuracy: ", accuracy)



LOGISTIC REGRESSION: 
	f1:  0.657998631994528
	precision:  0.6580025280404487
	recall:  0.658
	accuracy:  0.654




In [21]:
########### PERCEPTRON ###########

y = list_of_labels

# split train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y)

# Perceptron 
from sklearn.linear_model import Perceptron

classifier = Perceptron()
classifier.fit(X_train, y_train)

# Predict Class
y_pred = classifier.predict(X_test)

# Accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

# Calculating and printing f1, precision, recall, accuracy
f1, precision, recall = buildClassifiers(Perceptron(), X_train, X_test, y_train, y_test )
print("PERCEPTRON: ")
print("\tf1: ", f1)
print("\tprecision: ", precision)
print("\trecall: ", recall)
print("\taccuracy: ", accuracy)

PERCEPTRON: 
	f1:  0.613138321769081
	precision:  0.6483372119210269
	recall:  0.6264720942140297
	accuracy:  0.628
