Logan Tucker
Task 1 : Binary Classification
CS74 
S23

# Prep Phase
    step 1) load in data 
    step 2) classify reviews for each cutoff
    step 3) clean data
    step 4) vectorize

Load in data
***************************************************************************************************************************************************************************

In [51]:
import pandas as pd
import numpy as np

# upload Training.csv and Test.csv 
training = pd.read_csv('Training.csv')
test = pd.read_csv('Test.csv')

training.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,image,style,category
0,1,False,"11 12, 2016",C413C78E10E54C5DB41837889F36C1E8,565D194F38B1CC3F806EE677C61F639C,465E154EC79AFFAB5EB2607198B21433,all of the reviews for this product are fake.,"All fake reviews, beware.",1478908800,2.0,,{'Size:': ' Polaris H4'},automotive
1,1,True,"12 6, 2016",490AE37808EFEE3AF4FE6DEBDEB5A4C8,0D66512A0A7F580523AB996378DF0F14,760C63E8E5E8DC3FAA01878D37BA5678,wrong part. our fault.,One Star,1480982400,,,,automotive
2,1,True,"09 17, 2014",74A9FA5A64449BEE2A2E8E3F62872F0F,A0E45600FF2C5A779CB4314F379C253A,C6E4DD5C1C4EC09E90182644ED6CA9EF,this wire set it really sucks!!!,One Star,1410912000,,,,automotive
3,1,True,"06 11, 2016",EB561158A2829D98B467FE03CC1E45F1,37AB9A82470595E0ACB88BAC48C150EE,F4892A77EA45C52F40AB17ED537EF9FF,"first use, it leaked instantly. even at 5 buck...",One Star,1465603200,,,"{'Color:': ' Clear', 'Style:': ' 45 Degree'}",automotive
4,1,True,"12 23, 2017",5045D801332850D21618DD13A697CD9B,5772FF30428EEB8E0258C1A53CA2EC50,522F0BBFF2B47F1D63FF781A0AB1D079,didn't fit,One Star,1513987200,,,,automotive


Classify Reviews
***************************************************************************************************************************************************************************

In [52]:
def label(rating, cutoff):
    if rating <= cutoff:
        return 0
    else:
        return 1

for cutoff in [1, 2, 3, 4]:
    training['label ' + str(cutoff)] = training['overall'].apply(lambda x: label(x, cutoff))

Clean data
***************************************************************************************************************************************************************************

In [53]:
# account for blank spots
training.fillna('', inplace = True)
test.fillna('', inplace = True)

Vectorize data
***************************************************************************************************************************************************************************

In [54]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

# set up train test split
x1 = training['reviewText']
x2 = training["summary"]
y = training['label 4']

# vectorize the text 
vectorizer1 = TfidfVectorizer(ngram_range = (1, 2))
vectorizer2 = TfidfVectorizer(ngram_range = (1, 2))

# get review text frequency dict
x1 = vectorizer1.fit_transform(x1)
x2 = vectorizer2.fit_transform(x2)

x = hstack((x1, x2))

# create train test split for cross
train_x_tf, test_x_tf, train_y, test_y = train_test_split(x, y, test_size=.1, random_state=16)

# Classifier #1 : Naive bayes
    step 1) train
    step 2) report
    step 3) tune
    step 4) report

In [55]:
# perform naive bayes 
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

nb = MultinomialNB()
nb.fit(train_x_tf, train_y)

# predict the label of the test samples here using the model
test_predicted_nb = nb.predict(test_x_tf)

# get model acuracy score
print("Accuracy: \n")
print(accuracy_score(test_y, test_predicted_nb))

Accuracy: 

0.8574854402192532


PRE - tune report
***************************************************************************************************************************************************************************

In [56]:
from sklearn.metrics import roc_curve, auc, classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt

# confusion matrix
confusion = confusion_matrix(test_y, test_predicted_nb)
print('Confusion Matrix:\n')
print(confusion)

# f1
print('\nMacro F1-score: {:.2f}\n'.format(f1_score(test_y, test_predicted_nb, average='macro')))

# roc auc score
fpr, tpr, t =  roc_curve(test_y, test_predicted_nb)
roc_score = auc(fpr, tpr)
print("ROC AUC:", roc_score)

# classification report
print("\nClassification report: \n")
print(classification_report(test_predicted_nb, test_y))

Confusion Matrix:

[[2360    2]
 [ 414  143]]

Macro F1-score: 0.66

ROC AUC: 0.6279428777304326

Classification report: 

              precision    recall  f1-score   support

           0       1.00      0.85      0.92      2774
           1       0.26      0.99      0.41       145

    accuracy                           0.86      2919
   macro avg       0.63      0.92      0.66      2919
weighted avg       0.96      0.86      0.89      2919



Hypertune
***************************************************************************************************************************************************************************

In [57]:
from sklearn.model_selection import GridSearchCV

param_grid_nb = [{
    "alpha": [0.1, 0.5, 1.0, 1.5, 2.0], 
    "force_alpha": [True, False], 
    "fit_prior": [True, False]
}]

nb_gs = MultinomialNB()

grid_search = GridSearchCV(nb_gs, param_grid_nb, cv=5, scoring="accuracy", return_train_score=True)

# train it
grid_search.fit(train_x_tf, train_y)

# get model with best params
best_params_nb = grid_search.best_params_
print("Best parameters: ")
print(best_params_nb)

# get best score
best_score_nb = grid_search.best_score_
print("\nBest accuracy: ")
print(best_score_nb)

# get predected y of best model
tuned_test_predicted_nb = grid_search.predict(test_x_tf)

Best parameters: 
{'alpha': 0.1, 'fit_prior': False, 'force_alpha': True}

Best accuracy: 
0.8693566806242862


POST - tune report
***************************************************************************************************************************************************************************

In [58]:
# confusion matrix
confusion = confusion_matrix(test_y, tuned_test_predicted_nb)
print('Confusion Matrix: \n')
print(confusion)

# f1
print('\nMacro F1-score: {:.2f}\n'.format(f1_score(test_y, tuned_test_predicted_nb, average='macro')))

# roc auc score
fpr, tpr, t =  roc_curve(test_y, tuned_test_predicted_nb)
roc_score = auc(fpr, tpr)
print("ROC AUC:", roc_score)

# classification report
print("\nClassification report: \n")
print(classification_report(tuned_test_predicted_nb, test_y))

Confusion Matrix: 

[[2136  226]
 [ 187  370]]

Macro F1-score: 0.78

ROC AUC: 0.784295632371921

Classification report: 

              precision    recall  f1-score   support

           0       0.90      0.92      0.91      2323
           1       0.66      0.62      0.64       596

    accuracy                           0.86      2919
   macro avg       0.78      0.77      0.78      2919
weighted avg       0.86      0.86      0.86      2919



# Classifier #2 : Logistic Regression
    step 1) train
    step 2) report
    step 3) tune
    step 4) report

In [59]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty = 'l2', solver = 'saga')
lr.fit(train_x_tf, train_y)

# predict the label of the test samples here using the model
test_predicted_lr = lr.predict(test_x_tf)

# get model acuracy score
print("Accuracy: ")
print(accuracy_score(test_y, test_predicted_lr))

Accuracy: 
0.8910585817060637


PRE - tune report
***************************************************************************************************************************************************************************

In [60]:
# confusion matrix
confusion = confusion_matrix(test_y, test_predicted_lr)
print('Confusion Matrix: \n')
print(confusion)

# f1
print('\nMacro F1-score: {:.2f}\n'.format(f1_score(test_y, test_predicted_lr, average='macro')))

# roc auc score
fpr, tpr, t =  roc_curve(test_y, test_predicted_lr)
roc_score = auc(fpr, tpr)
print("ROC AUC:", roc_score)

# classification report
print("\nClassification report: \n")
print(classification_report(test_predicted_lr, test_y))

Confusion Matrix: 

[[2324   38]
 [ 280  277]]

Macro F1-score: 0.79

ROC AUC: 0.7406094704150241

Classification report: 

              precision    recall  f1-score   support

           0       0.98      0.89      0.94      2604
           1       0.50      0.88      0.64       315

    accuracy                           0.89      2919
   macro avg       0.74      0.89      0.79      2919
weighted avg       0.93      0.89      0.90      2919



Hypertune
***************************************************************************************************************************************************************************

In [61]:
param_grid_lr = [{
    "penalty": ['l1', 'l2'], 
    "fit_intercept": [True, False], 
    "C": [0.1, .9, 1.1, 1, 5, 10], 
    "max_iter": [300, 400, 500, 550]
}]

lr_gs = LogisticRegression()

grid_search = GridSearchCV(lr_gs, param_grid_lr, cv=5, scoring="accuracy", return_train_score=True)

# train it
grid_search.fit(train_x_tf, train_y)

# get model with best params
best_params_lr = grid_search.best_params_
print("Best parameters: ")
print(best_params_lr)

# get best score
best_score_lr = grid_search.best_score_
print("\nBest accuracy: ")
print(best_score_lr)

# get predected y of best model
tuned_test_predicted_lr = grid_search.predict(test_x_tf)

240 fits failed out of a total of 480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/logantucker/anaconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/logantucker/anaconda3/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/logantucker/anaconda3/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties

Best parameters: 
{'C': 10, 'fit_intercept': False, 'max_iter': 300, 'penalty': 'l2'}

Best accuracy: 
0.8963837076513134


POST - tune report
***************************************************************************************************************************************************************************

In [62]:
# confusion matrix
confusion = confusion_matrix(test_y, tuned_test_predicted_lr)
print('Confusion Matrix: \n')
print(confusion)

# f1
print('\nMacro F1-score: {:.2f}\n'.format(f1_score(test_y, tuned_test_predicted_lr, average='macro')))

# roc auc score
fpr, tpr, t =  roc_curve(test_y, tuned_test_predicted_lr)
roc_score = auc(fpr, tpr)
print("ROC AUC:", roc_score)

# classification report
print("\nClassification report: \n")
print(classification_report(tuned_test_predicted_lr, test_y))

Confusion Matrix: 

[[2287   75]
 [ 220  337]]

Macro F1-score: 0.82

ROC AUC: 0.7866370890384408

Classification report: 

              precision    recall  f1-score   support

           0       0.97      0.91      0.94      2507
           1       0.61      0.82      0.70       412

    accuracy                           0.90      2919
   macro avg       0.79      0.87      0.82      2919
weighted avg       0.92      0.90      0.90      2919



# Classifier #3 : Support vector machines
    step 1) train
    step 2) report
    step 3) tune
    step 4) report

In [63]:
from sklearn.svm import LinearSVC

sv = LinearSVC()
sv.fit(train_x_tf, train_y)

# predict the label of the test samples here using the model
test_predicted_sv = sv.predict(test_x_tf)

# get model acuracy score
print("Accuracy: ")
print(accuracy_score(test_y, test_predicted_sv))

Accuracy: 
0.8982528263103803


PRE - tune report
***************************************************************************************************************************************************************************

In [64]:
# confusion matrix
confusion = confusion_matrix(test_y, test_predicted_sv)
print('Confusion Matrix: \n')
print(confusion)

# f1
print('\nMacro F1-score: {:.2f}\n'.format(f1_score(test_y, test_predicted_sv, average='macro')))

# roc auc score
fpr, tpr, t =  roc_curve(test_y, test_predicted_sv)
roc_score = auc(fpr, tpr)
print("ROC AUC:", roc_score)

# classification report
print("\nClassification report: \n")
print(classification_report(test_predicted_sv, test_y))

Confusion Matrix: 

[[2291   71]
 [ 226  331]]

Macro F1-score: 0.81

ROC AUC: 0.7820978326799094

Classification report: 

              precision    recall  f1-score   support

           0       0.97      0.91      0.94      2517
           1       0.59      0.82      0.69       402

    accuracy                           0.90      2919
   macro avg       0.78      0.87      0.81      2919
weighted avg       0.92      0.90      0.90      2919



Hypertune
***************************************************************************************************************************************************************************

In [69]:
param_grid_sv = [{
    "penalty": ['l2'], 
    "loss": ['hinge'], 
    "dual": [True], 
    "C": [0.5, 1, 1.2], 
    "max_iter": [800, 900]
}]
    
sv_gs = LinearSVC()

grid_search = GridSearchCV(sv_gs, param_grid_sv, cv=5, scoring="accuracy", return_train_score=True)

# train it
grid_search.fit(train_x_tf, train_y)

# get model with best params
best_params_sv = grid_search.best_params_
print("Best parameters: ")
print(best_params_sv)

# get best score
best_score_sv = grid_search.best_score_
print("\nBest accuracy: ")
print(best_score_sv)

# get predected y of best model
tuned_test_predicted_sv = grid_search.predict(test_x_tf)

Best parameters: 
{'C': 1, 'dual': True, 'loss': 'hinge', 'max_iter': 800, 'penalty': 'l2'}

Best accuracy: 
0.8954701180053293


POST - tune report
***************************************************************************************************************************************************************************

In [66]:
# confusion matrix
confusion = confusion_matrix(test_y, tuned_test_predicted_sv)
print('Confusion Matrix\n')
print(confusion)

# f1
print('\nMacro F1-score: {:.2f}\n'.format(f1_score(test_y, tuned_test_predicted_sv, average='macro')))

# roc auc score
fpr, tpr, t =  roc_curve(test_y, tuned_test_predicted_sv)
roc_score = auc(fpr, tpr)
print("ROC AUC:", roc_score)

# classification report
print("\nClassification report: \n")
print(classification_report(tuned_test_predicted_sv, test_y))

Confusion Matrix

[[2283   79]
 [ 210  347]]

Macro F1-score: 0.82

ROC AUC: 0.7947670096698627

Classification report: 

              precision    recall  f1-score   support

           0       0.97      0.92      0.94      2493
           1       0.62      0.81      0.71       426

    accuracy                           0.90      2919
   macro avg       0.79      0.87      0.82      2919
weighted avg       0.92      0.90      0.91      2919

