# CS 74 Final Project
## Elizabeth Frey
## Spring 2022

In [133]:
import autograd.numpy as np
from autograd import grad 
import pandas as pd 

%matplotlib inline

In [134]:
train = pd.read_table('Training.csv', dtype=str, header=None, sep=',', usecols=(0,6,7), skiprows=1).fillna('').values
test = pd.read_table('Test.csv', dtype=str, header=None, sep=',', usecols=(5,6), skiprows=1).fillna('').values

In [135]:
# from https://colab.research.google.com/drive/11bLTHJCJ8VBk0rO_7RvVkzwlHRtSXx8f

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

train_df = pd.DataFrame(train)
train_df.columns = ['overall', 'reviewText', 'summary']
test_df = pd.DataFrame(test)
test_df.columns = ['reviewText', 'summary']

vectorizer_detail = TfidfVectorizer(min_df = 20)
train_detail = vectorizer_detail.fit_transform(train_df[['reviewText']].reviewText.tolist())
test_detail = vectorizer_detail.transform(test_df[['reviewText']].reviewText.tolist())

vectorizer_summary = TfidfVectorizer(min_df = 20)
train_summary = vectorizer_summary.fit_transform(train_df[['summary']].summary.tolist())
test_summary = vectorizer_summary.transform(test_df[['summary']].summary.tolist())

train_features = hstack((train_detail, train_summary)).tocsr()
train_label = train_df[['overall']].copy().astype(float)
test_features = hstack((test_detail, test_summary)).tocsr()


#  Binary Classifier

In [136]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split 
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, accuracy_score, confusion_matrix

from sklearn.model_selection import KFold

def evaluate(labels, predictions):
    assert len(labels) == len(predictions), 'labels array and predictions array must be the same length'
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='macro')
    accuracy = accuracy_score(labels, predictions)
    roc_auc = roc_auc_score(labels, predictions)
    cm = confusion_matrix(labels, predictions)
    return accuracy, precision, recall, f1, cm, roc_auc


In [137]:
def kfold(model, updated_label):
    kf = KFold(n_splits=5)

    results = {'Accuracy' : [],
               'Precision' : [],
               'Recall' : [],
               'F1' : [],
               'CM' : [],
               'ROC_AUC Score' : []}

    for train_index, test_index in kf.split(train_features, updated_label):
        x_train = train_features[train_index]
        x_test = train_features[test_index]
        y_train = updated_label[train_index]
        y_test = updated_label[test_index]
        model.fit(x_train, y_train)
        predictions = model.predict(x_test)
        a, p, r, f, cm, roc_auc = evaluate(y_test, predictions)
        results['Accuracy'].append(a)
        results['Precision'].append(p)
        results['Recall'].append(r)
        results['F1'].append(f)
        results['CM'].append(cm)
        results['ROC_AUC Score'].append(roc_auc)
        
    print('\tK Fold F1 average: ', sum(results['F1'])/len(results['F1']))
    print('\tK Fold Accuracy average: ', sum(results['Accuracy'])/len(results['Accuracy']))
    print('\tK Fold Precision average: ', sum(results['Precision'])/len(results['Precision']))
    total_cm = np.array([[0,0],[0,0]])
    for element in results['CM']:
        total_cm = np.add(total_cm, element)
    average_cm = total_cm * 0.2
    print('\tK Fold CM average: ')
    print(f'\t\t{average_cm[0]}')
    print(f'\t\t{average_cm[1]}')
    print('\tK Fold ROC AUC average: ', sum(results['ROC_AUC Score'])/len(results['ROC_AUC Score']))

def model_to_csv(model, updated_label, cutoff):
    model.fit(train_features, updated_label)
    y_pred = model.predict(test_features).astype(int)
    
    out = {'id': range(len(y_pred)), 'predicted': y_pred}
    outdf = pd.DataFrame(out)
    outdf.to_csv(f'{cutoff}.csv', index=False)
    

## Binary Classifier for $<=1$ and $>1$

In [138]:

def binary_classifier_one():
    
    cutoff = 1
    
    print(f'Cutoff: 1')

    updated_label = train_label.overall.copy()
    updated_label.loc[train_label.overall>cutoff] = 1
    updated_label.loc[train_label.overall<=cutoff] = 0
    
    model = LogisticRegression()

    print('Logistic Regression')
    
    kfold(model,updated_label)

    model = Perceptron()

    print('Perceptron')

    kfold(model,updated_label)

    model = LinearSVC()

    print('Linear SVC')

    kfold(model,updated_label)

    best_model = LogisticRegression(solver='liblinear', fit_intercept = False)

    print('Best Model Tuning: Logistic Regression')
    
    kfold(best_model,updated_label)
    
    model_to_csv(best_model, updated_label, cutoff)
    
binary_classifier_one()


Cutoff: 1
Logistic Regression
	K Fold F1 average:  0.6796277679147806
	K Fold Accuracy average:  0.7827646671424212
	K Fold Precision average:  0.7459048693259342
	K Fold CM average: 
		[510.4 681. ]
		[ 587.2 4059.2]
	K Fold ROC AUC average:  0.6625046069868731
Perceptron
	K Fold F1 average:  0.6257507968056133
	K Fold Accuracy average:  0.7217820740837516
	K Fold Precision average:  0.646096328824284
	K Fold CM average: 
		[562.  629.4]
		[ 994.8 3651.6]
	K Fold ROC AUC average:  0.6370942280294017
Linear SVC
	K Fold F1 average:  0.6643599715883497
	K Fold Accuracy average:  0.7562139094128647
	K Fold Precision average:  0.7005318987203962
	K Fold CM average: 
		[565.2 626.2]
		[ 797.  3849.4]
	K Fold ROC AUC average:  0.6618426085579456
Best Model Tuning: Logistic Regression
	K Fold F1 average:  0.6805996598841982
	K Fold Accuracy average:  0.7785166252567832
	K Fold Precision average:  0.7323587661248618
	K Fold CM average: 
		[541.2 650.2]
		[ 642.8 4003.6]
	K Fold ROC AUC average

## Binary Classifier for $<=2$ and $>2$

In [139]:
def binary_classifier_two():
    
    cutoff = 2
    
    print(f'Cutoff: 2')

    updated_label = train_label.overall.copy()
    updated_label.loc[train_label.overall>cutoff] = 1
    updated_label.loc[train_label.overall<=cutoff] = 0
    
    model = LogisticRegression()
    
    print('Logistic Regression')

    kfold(model,updated_label)
    
    model = Perceptron()

    print('Perceptron')

    kfold(model,updated_label)

    model = LinearSVC()

    print('Linear SVC')

    kfold(model,updated_label)

    best_model = LogisticRegression( C=0.3, fit_intercept=False, max_iter=1000)
    print('Best Model Tuning: Logistic Regression')
    
    kfold(best_model,updated_label)

    model_to_csv(best_model, updated_label, cutoff)    
    
binary_classifier_two()

Cutoff: 2
Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


	K Fold F1 average:  0.7197231504207975
	K Fold Accuracy average:  0.7298347601563381
	K Fold Precision average:  0.7219359486001979
	K Fold CM average: 
		[1689.6  693.6]
		[ 883.6 2571. ]
	K Fold ROC AUC average:  0.7244190057260008
Perceptron
	K Fold F1 average:  0.6654991161002977
	K Fold Accuracy average:  0.6772116402181616
	K Fold Precision average:  0.6677924578229741
	K Fold CM average: 
		[1675.   708.2]
		[1176.2 2278.4]
	K Fold ROC AUC average:  0.6757915881756642
Linear SVC
	K Fold F1 average:  0.696626657910855
	K Fold Accuracy average:  0.7079772262368278
	K Fold Precision average:  0.6990062168110781
	K Fold CM average: 
		[1679.8  703.4]
		[1001.4 2453.2]
	K Fold ROC AUC average:  0.7032986340580065
Best Model Tuning: Logistic Regression
	K Fold F1 average:  0.726079198434423
	K Fold Accuracy average:  0.7363094863936062
	K Fold Precision average:  0.7278566800997289
	K Fold CM average: 
		[1679.   704.2]
		[ 835.2 2619.4]
	K Fold ROC AUC average:  0.7302619482258093


## Binary Classifier for $<=3$ and $>3$

In [140]:
def binary_classifier_three():
    
    cutoff = 3
    
    print(f'Cutoff: 3')

    updated_label = train_label.overall.copy()
    updated_label.loc[train_label.overall>cutoff] = 1
    updated_label.loc[train_label.overall<=cutoff] = 0
    
    model = LogisticRegression()

    print('Logistic Regression')
    
    kfold(model,updated_label)

    model = Perceptron()

    print('Perceptron')

    kfold(model,updated_label)

    model = LinearSVC()

    print('Linear SVC')

    kfold(model,updated_label)
    
    best_model = LogisticRegression(class_weight='balanced', max_iter=1000, fit_intercept=False, C=0.3)

    print('Best Model Tuning: Logistic Regression')
    
    kfold(best_model,updated_label)

    model_to_csv(best_model, updated_label, cutoff)
    
binary_classifier_three()


Cutoff: 3
Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


	K Fold F1 average:  0.7528386426928175
	K Fold Accuracy average:  0.7864292848254009
	K Fold Precision average:  0.7817389726844853
	K Fold CM average: 
		[3182.6  373. ]
		[ 873.8 1408.4]
	K Fold ROC AUC average:  0.7507880758651259
Perceptron
	K Fold F1 average:  0.6957996614261592
	K Fold Accuracy average:  0.7311343279570034
	K Fold Precision average:  0.71014242273969
	K Fold CM average: 
		[2952.4  603.2]
		[ 966.4 1315.8]
	K Fold ROC AUC average:  0.6982575866201061
Linear SVC
	K Fold F1 average:  0.7353450663951919
	K Fold Accuracy average:  0.7682035658337913
	K Fold Precision average:  0.7536118243371192
	K Fold CM average: 
		[3076.4  479.2]
		[ 874.  1408.2]
	K Fold ROC AUC average:  0.7354036497547508
Best Model Tuning: Logistic Regression
	K Fold F1 average:  0.7656871587581084
	K Fold Accuracy average:  0.7896842877150837
	K Fold Precision average:  0.7751342213329491
	K Fold CM average: 
		[3027.6  528. ]
		[ 699.8 1582.4]
	K Fold ROC AUC average:  0.7668266944284557


## Binary Classifier for $<=4$ and $>4$

In [141]:
def binary_classifier_four():
    
    cutoff = 4
    
    print(f'Cutoff: 4')

    updated_label = train_label.overall.copy()
    updated_label.loc[train_label.overall>cutoff] = 1
    updated_label.loc[train_label.overall<=cutoff] = 0
    
    model = LogisticRegression()

    print('Logistic Regression')
    
    kfold(model,updated_label)

    model = Perceptron()

    print('Perceptron')

    kfold(model,updated_label)

    model = LinearSVC()

    print('Linear SVC')

    kfold(model,updated_label)
    
    best_model = LogisticRegression(class_weight = {0:.2, 1:.8}, C=0.7)
    print('Best Model Tuning: Logistic Regression')
    
    kfold(best_model,updated_label)

    model_to_csv(best_model, updated_label, cutoff)
    
binary_classifier_four()


Cutoff: 4
Logistic Regression
	K Fold F1 average:  0.748666126381887
	K Fold Accuracy average:  0.8753621435312164
	K Fold Precision average:  0.8504323699466927
	K Fold CM average: 
		[4597.2  112.2]
		[615.4 513. ]
	K Fold ROC AUC average:  0.7143237049479326
Perceptron
	K Fold F1 average:  0.7031473978579879
	K Fold Accuracy average:  0.834080272432486
	K Fold Precision average:  0.7272768547130971
	K Fold CM average: 
		[4328.   381.4]
		[587.2 541.2]
	K Fold ROC AUC average:  0.6975559261394103
Linear SVC
	K Fold F1 average:  0.7380234689596734
	K Fold Accuracy average:  0.8613506130898898
	K Fold Precision average:  0.7952024853978947
	K Fold CM average: 
		[4478.4  231. ]
		[578.4 550. ]
	K Fold ROC AUC average:  0.7170190461514305
Best Model Tuning: Logistic Regression
	K Fold F1 average:  0.7719239336410528
	K Fold Accuracy average:  0.8604268360929848
	K Fold Precision average:  0.7707248471190191
	K Fold CM average: 
		[4271.8  437.6]
		[377.2 751.2]
	K Fold ROC AUC average:

# Multiclass Classifier

In [142]:
from sklearn.calibration import CalibratedClassifierCV

def multiclass_evaluate(labels, predictions):
    assert len(labels) == len(predictions), 'labels array and predictions array must be the same length'
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='macro')
    accuracy = accuracy_score(labels, predictions)
    cm = confusion_matrix(labels, predictions)
    return accuracy, precision, recall, f1, cm

# had to make new kfold because old one didnt use .iloc
def multiclass_kfold(model):
    kf = KFold(n_splits=5)
    
    results = {'Accuracy' : [],
               'Precision' : [],
               'Recall' : [],
               'F1' : [],
               'CM' : [],
               'ROC_AUC Score' : []}

    for train_index, test_index in kf.split(train_features,  np.array(train_label).flatten()):
        x_train = train_features[train_index]
        x_test = train_features[test_index]
        y_train = np.array(train_label).flatten()[train_index]
        y_test = np.array(train_label).flatten()[test_index]
        model.fit(x_train, y_train)
        predictions = model.predict(x_test)
        a, p, r, f, cm = multiclass_evaluate(y_test, predictions)
        if (type(model) == type (LogisticRegression())):
            predict_proba = model.predict_proba(x_test)
            roc_auc = roc_auc_score(y_test, predict_proba, multi_class = 'ovr')
        else:
            calibrated = CalibratedClassifierCV(model)
            calibrated.fit(x_train, y_train)
            predict_proba = calibrated.predict_proba(x_test)
            roc_auc = roc_auc_score(y_test, predict_proba, multi_class = 'ovr')
        results['Accuracy'].append(a)
        results['Precision'].append(p)
        results['Recall'].append(r)
        results['F1'].append(f)
        results['CM'].append(cm)
        results['ROC_AUC Score'].append(roc_auc)
        
    print('\tK Fold F1 average: ', sum(results['F1'])/len(results['F1']))
    print('\tK Fold Accuracy average: ', sum(results['Accuracy'])/len(results['Accuracy']))
    print('\tK Fold Precision average: ', sum(results['Precision'])/len(results['Precision']))
    total_cm = np.zeros((5,5))
    for element in results['CM']:
        total_cm = np.add(total_cm, element)
    average_cm = total_cm * 0.2
    print('\tK Fold CM average: ')
    print(f'\t\t{average_cm[0]}')
    print(f'\t\t{average_cm[1]}')
    print(f'\t\t{average_cm[2]}')
    print(f'\t\t{average_cm[3]}')
    print(f'\t\t{average_cm[4]}')
    print('\tK Fold ROC AUC average: ', sum(results['ROC_AUC Score'])/len(results['ROC_AUC Score']))

def multiclass():
    model = LogisticRegression(multi_class='ovr')
    print('Logistic Regression')
    multiclass_kfold(model)

    model = Perceptron()
    print('Perceptron')
    multiclass_kfold(model)

    model = LinearSVC()
    print('Linear SVC')
    multiclass_kfold(model)
    
    print('Best Model: Logistic Regression')
    best_model = LogisticRegression(multi_class='ovr', class_weight = 'balanced', C=0.2, max_iter=1000)
    multiclass_kfold(best_model)
    model_to_csv(best_model, np.array(train_label).flatten(), 'multiclass')

multiclass()


Logistic Regression
	K Fold F1 average:  0.46658369767293306
	K Fold Accuracy average:  0.46319242704174857
	K Fold Precision average:  0.5130647347530255
	K Fold CM average: 
		[706.  262.6 105.2  61.4  56.2]
		[404.  406.8 230.2 107.6  43.2]
		[237.6 229.4 475.8 178.8  50.8]
		[197.2 122.4 197.  446.6 190.6]
		[173.4  66.6  54.8 164.8 668.8]
	K Fold ROC AUC average:  0.7961020820717541
Perceptron
	K Fold F1 average:  0.40827611691786486
	K Fold Accuracy average:  0.4066976898913577
	K Fold Precision average:  0.4387882987795069
	K Fold CM average: 
		[580.4 260.4 159.6 102.2  88.8]
		[361.4 383.6 220.6 140.   86.2]
		[258.6 207.8 428.6 188.4  89. ]
		[223.4 134.4 175.6 419.8 200.6]
		[195.   95.8  85.  190.8 561.8]
	K Fold ROC AUC average:  0.7444819972312416
Linear SVC
	K Fold F1 average:  0.44178797525430386
	K Fold Accuracy average:  0.43941615204373374
	K Fold Precision average:  0.4778079921828221
	K Fold CM average: 
		[653.4 259.2 132.2  81.4  65.2]
		[380.  399.6 230.6 119.2 

# Clustering

In [143]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import rand_score
from scipy.sparse import hstack


cluster_data = pd.read_table('Test.csv', dtype=str, header=None, sep=',', usecols=(5,6,11), skiprows=1).fillna('').values

cluster_df = pd.DataFrame(cluster_data, columns = ['reviewText', 'summary', 'category'])
y_cluster_df = cluster_df[['category']].copy()

vectorizer_detail = TfidfVectorizer(stop_words='english', min_df = 0.2, max_df = 0.9)
train_detail = vectorizer_detail.fit_transform(cluster_df[['reviewText']].reviewText.tolist())

vectorizer_summary = TfidfVectorizer(stop_words='english', min_df = 0.1, max_df = 0.9)
train_summary = vectorizer_summary.fit_transform(cluster_df[['summary']].summary.tolist())

train_features = hstack((train_detail, train_summary)).tocsr()


label = set(y_cluster_df.category)
num_labels = len(label)
label = np.array(list(label))
indexes = list(range(0,num_labels))
label = label.reshape([num_labels])
y_cluster_df.category.replace(label, indexes, inplace=True)

In [144]:
def clustering():
    kmeans = KMeans(n_clusters=num_labels)
    kmeans.fit(train_features)
    print(f'Silhouette Score: {silhouette_score(train_features, kmeans.labels_)}')
    print(f'Random Score: {rand_score(np.array(y_cluster_df).flatten(), kmeans.labels_)}')
    
clustering()


Silhouette Score: 0.9554411607742243
Random Score: 0.6250878466819787
