# CS 74 Final Project
## Elizabeth Frey
## Spring 2022

In [19]:
import autograd.numpy as np
from autograd import grad 
import pandas as pd 

%matplotlib inline

In [85]:
# Read both CSVs as strings, skipping the first row and keeping only the
# selected column positions; missing cells are replaced with ''.
# The results are plain 2-D arrays (rows x selected columns).
train = (
    pd.read_csv('Training.csv', sep=',', header=None, skiprows=1,
                usecols=[0, 6, 7], dtype=str)
    .fillna('')
    .values
)
test = (
    pd.read_csv('Test.csv', sep=',', header=None, skiprows=1,
                usecols=[5, 6], dtype=str)
    .fillna('')
    .values
)

In [104]:
# from https://colab.research.google.com/drive/11bLTHJCJ8VBk0rO_7RvVkzwlHRtSXx8f

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

# Wrap the raw arrays in labelled frames.
train_df = pd.DataFrame(train, columns=['overall', 'reviewText', 'summary'])
test_df = pd.DataFrame(test, columns=['reviewText', 'summary'])

# TF-IDF over the review body. The vocabulary is fitted on the training
# reviews only and then reused to transform the test reviews.
vectorizer_detail = TfidfVectorizer(sublinear_tf=True, min_df=20)
train_detail = vectorizer_detail.fit_transform(train_df['reviewText'].tolist())
test_detail = vectorizer_detail.transform(test_df['reviewText'].tolist())

# Separate TF-IDF vocabulary for the summary text.
vectorizer_summary = TfidfVectorizer(min_df=20)
train_summary = vectorizer_summary.fit_transform(train_df['summary'].tolist())
test_summary = vectorizer_summary.transform(test_df['summary'].tolist())

# Place the review-body and summary features side by side (CSR so that
# rows can be sliced by index arrays later).
train_features = hstack([train_detail, train_summary]).tocsr()
test_features = hstack([test_detail, test_summary]).tocsr()

# Ratings are read as strings; keep them as a one-column float frame.
train_label = train_df[['overall']].astype(float)



#  Binary Classifier

In [30]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split 
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, accuracy_score, confusion_matrix

from sklearn.model_selection import KFold

def evaluate(labels, predictions):
    """Score a set of predictions against the true labels.

    Args:
        labels: true labels.
        predictions: predicted labels, same length as `labels`.

    Returns:
        Tuple `(accuracy, precision, recall, f1)`; precision, recall and F1
        are computed with `average='macro'`.

    Raises:
        AssertionError: if the two inputs differ in length.
    """
    assert len(labels) == len(predictions), 'labels array and predictions array must be the same length'
    # The fourth element of the returned tuple (support) is not used.
    macro_scores = precision_recall_fscore_support(labels, predictions, average='macro')
    macro_precision, macro_recall, macro_f1 = macro_scores[:3]
    return accuracy_score(labels, predictions), macro_precision, macro_recall, macro_f1


In [31]:
def kfold(model, updated_label):
    """Run 5-fold cross-validation of `model` on the module-level `train_features`.

    Args:
        model: estimator exposing `fit(X, y)` and `predict(X)`; it is refitted
            on every fold, so it is left fitted on the last fold's training split.
        updated_label: labels aligned row-for-row with `train_features`
            (pandas Series, one-column DataFrame, or array-like).

    Returns:
        Dict with keys 'Accuracy', 'Precision', 'Recall' and 'F1', each a list
        holding one score per fold.

    Side effects:
        Prints the mean F1 across folds.
    """
    # KFold yields integer *positions*. Indexing a pandas Series with an
    # integer array is label-based, which only matches positions when the
    # index is the default 0..n-1. Converting to a flat array makes the
    # indexing positional regardless of the index.
    labels = np.array(updated_label).flatten()

    # KFold is constructed without shuffle/random_state arguments.
    kf = KFold(n_splits=5)

    results = {'Accuracy' : [],
               'Precision' : [],
               'Recall' : [],
               'F1' : []}

    for train_index, test_index in kf.split(train_features, labels):
        x_train = train_features[train_index]
        x_test = train_features[test_index]
        y_train = labels[train_index]
        y_test = labels[test_index]
        model.fit(x_train, y_train)
        predictions = model.predict(x_test)
        a, p, r, f = evaluate(y_test, predictions)
        results['Accuracy'].append(a)
        results['Precision'].append(p)
        results['Recall'].append(r)
        results['F1'].append(f)

    print('\tK Fold F1 average: ', sum(results['F1'])/len(results['F1']))
    return results

def model_to_csv(model, updated_label, cutoff):
    """Fit `model` on all training rows and write its test-set predictions to CSV.

    Args:
        model: estimator exposing `fit(X, y)` and `predict(X)`.
        updated_label: labels aligned with the module-level `train_features`.
        cutoff: value used as the output file stem; the file is `<cutoff>.csv`.

    The CSV has two columns: 'id' (0-based row number) and 'predicted'
    (the prediction cast to int).
    """
    model.fit(train_features, updated_label)
    predicted = model.predict(test_features).astype(int)

    submission = pd.DataFrame({'id': range(len(predicted)), 'predicted': predicted})
    submission.to_csv(f'{cutoff}.csv', index=False)
    

## Binary Classifier for $\leq 1$ and $> 1$

In [126]:

def binary_classifier_one():
    """Train, cross-validate and export the classifier for ratings <= 1 vs > 1.

    Ratings above the cutoff are relabelled 1 and the rest 0. The model is
    scored with `kfold` and then refitted on all training data by
    `model_to_csv`, which writes the test predictions to `1.csv`.
    """
    cutoff = 1

    print(f'Cutoff: {cutoff}')

    # Both masks are computed from the original ratings in `train_label`,
    # so the second assignment cannot be affected by the first.
    updated_label = train_label.overall.copy()
    updated_label.loc[train_label.overall>cutoff] = 1
    updated_label.loc[train_label.overall<=cutoff] = 0

    # The 'saga' solver is stochastic; fixing random_state makes the reported
    # F1 and the exported predictions reproducible across runs.
    best_model = LogisticRegression(solver='saga', C=0.5, class_weight='balanced', random_state=0)

    print('Best Model Tuning: Logistic Regression')

    kfold(best_model,updated_label)

    model_to_csv(best_model, updated_label, cutoff)

binary_classifier_one()


Cutoff: 1
Best Model Tuning: Logistic Regression
	K Fold F1 average:  0.6698201194540728


## Binary Classifier for $\leq 2$ and $> 2$

In [124]:
def binary_classifier_two():
    """Train, cross-validate and export the classifier for ratings <= 2 vs > 2.

    Ratings above the cutoff become class 1, the rest class 0. The model is
    scored with `kfold`, then `model_to_csv` refits it on all training data
    and writes the test predictions to `2.csv`.
    """
    cutoff = 2
    print('Cutoff: 2')

    # Build both masks from the untouched ratings before relabelling.
    ratings = train_label.overall
    is_above = ratings > cutoff
    is_at_or_below = ratings <= cutoff

    updated_label = ratings.copy()
    updated_label.loc[is_above] = 1
    updated_label.loc[is_at_or_below] = 0

    best_model = LogisticRegression(class_weight='balanced', C=0.3, max_iter=1000)
    print('Best Model Tuning: Logistic Regression')

    kfold(best_model, updated_label)
    model_to_csv(best_model, updated_label, cutoff)

binary_classifier_two()

Cutoff: 2
Best Model Tuning: Logistic Regression
	K Fold F1 average:  0.7260015273648435


## Binary Classifier for $\leq 3$ and $> 3$

In [123]:
def binary_classifier_three():
    """Train, cross-validate and export the classifier for ratings <= 3 vs > 3.

    Ratings above the cutoff become class 1, the rest class 0. The model is
    scored with `kfold`, then `model_to_csv` refits it on all training data
    and writes the test predictions to `3.csv`.
    """
    cutoff = 3
    print('Cutoff: 3')

    # Build both masks from the untouched ratings before relabelling.
    ratings = train_label.overall
    is_above = ratings > cutoff
    is_at_or_below = ratings <= cutoff

    updated_label = ratings.copy()
    updated_label.loc[is_above] = 1
    updated_label.loc[is_at_or_below] = 0

    best_model = LogisticRegression(C=0.4, max_iter=1000, class_weight='balanced')
    print('Best Model Tuning: Logistic Regression')

    kfold(best_model, updated_label)
    model_to_csv(best_model, updated_label, cutoff)

binary_classifier_three()


Cutoff: 3
Best Model Tuning: Logistic Regression
	K Fold F1 average:  0.7687222680046666


## Binary Classifier for $\leq 4$ and $> 4$

In [15]:
def binary_classifier_four():
    """Train, cross-validate and export the classifier for ratings <= 4 vs > 4.

    Ratings above the cutoff become class 1, the rest class 0. Unlike the
    other cutoffs, this model uses explicit class weights (0.2 for class 0,
    0.8 for class 1). The model is scored with `kfold`, then `model_to_csv`
    refits it on all training data and writes the predictions to `4.csv`.
    """
    cutoff = 4
    print('Cutoff: 4')

    # Build both masks from the untouched ratings before relabelling.
    ratings = train_label.overall
    is_above = ratings > cutoff
    is_at_or_below = ratings <= cutoff

    updated_label = ratings.copy()
    updated_label.loc[is_above] = 1
    updated_label.loc[is_at_or_below] = 0

    class_weights = {0: .2, 1: .8}
    best_model = LogisticRegression(C=0.7, class_weight=class_weights)
    print('Best Model Tuning: Logistic Regression')

    kfold(best_model, updated_label)
    model_to_csv(best_model, updated_label, cutoff)

binary_classifier_four()


Cutoff: 4
Best Model Tuning: Logistic Regression
	K Fold F1 average:  0.7719239336410528


# Multiclass Classifier

In [134]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC

def multiclass_evaluate(labels, predictions):
    """Score multiclass predictions against the true labels.

    The computation is identical to the binary `evaluate` helper (accuracy
    plus macro-averaged precision, recall and F1), so this delegates to it
    rather than repeating the code.

    Args:
        labels: true labels.
        predictions: predicted labels, same length as `labels`.

    Returns:
        Tuple `(accuracy, precision, recall, f1)`.

    Raises:
        AssertionError: if the two inputs differ in length.
    """
    return evaluate(labels, predictions)

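# Separate k-fold helper for the multiclass model: the binary `kfold` does not index labels with `.iloc`, so the labels are converted to a NumPy array here.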
def multiclass_kfold(model):
    """Run 5-fold cross-validation of `model` on the original rating labels.

    Reads the module-level `train_features` and `train_label`.

    Args:
        model: estimator exposing `fit(X, y)` and `predict(X)`; it is refitted
            on every fold.

    Returns:
        Dict with keys 'Accuracy', 'Precision', 'Recall' and 'F1', each a list
        holding one score per fold.

    Side effects:
        Prints the mean F1 across folds.
    """
    # Flatten the one-column label frame once, instead of rebuilding the
    # array three times on every fold.
    labels = np.array(train_label).flatten()

    kf = KFold(n_splits=5)

    results = {'Accuracy' : [],
               'Precision' : [],
               'Recall' : [],
               'F1' : [],}

    for train_index, test_index in kf.split(train_features, labels):
        x_train = train_features[train_index]
        x_test = train_features[test_index]
        y_train = labels[train_index]
        y_test = labels[test_index]
        model.fit(x_train, y_train)
        predictions = model.predict(x_test)
        a, p, r, f = multiclass_evaluate(y_test, predictions)
        results['Accuracy'].append(a)
        results['Precision'].append(p)
        results['Recall'].append(r)
        results['F1'].append(f)

    print('\tK Fold F1 average: ', sum(results['F1'])/len(results['F1']))
    return results

def multiclass():
    """Train, cross-validate and export the classifier over the raw ratings.

    The model is scored with `multiclass_kfold`, then `model_to_csv` refits
    it on all training data and writes the predictions to `multiclass.csv`.
    """
    print('Best Model: Logistic Regression')

    # The 'saga' solver is stochastic; fixing random_state makes the reported
    # F1 and the exported predictions reproducible across runs.
    # NOTE(review): newer scikit-learn releases deprecate the `multi_class`
    # argument -- confirm against the installed version before upgrading.
    best_model = LogisticRegression(multi_class='ovr', solver = 'saga', penalty='elasticnet', l1_ratio=0.2, class_weight = 'balanced', C=0.2, max_iter=1000, random_state=0)
    multiclass_kfold(best_model)

    # Flat array of the original rating labels for the final fit.
    labels = np.array(train_label).flatten()
    model_to_csv(best_model, labels, 'multiclass')

multiclass()


Best Model: Logistic Regression
	K Fold F1 average:  0.48785203903729074


# Clustering

In [17]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import rand_score
from scipy.sparse import hstack


# Load the review text, summary and category columns of Test.csv as strings;
# missing cells are replaced with ''.
cluster_data = pd.read_table('Test.csv', dtype=str, header=None, sep=',', usecols=(5,6,11), skiprows=1).fillna('').values

cluster_df = pd.DataFrame(cluster_data, columns = ['reviewText', 'summary', 'category'])
y_cluster_df = cluster_df[['category']].copy()

# NOTE: the names below (vectorizer_detail, train_detail, vectorizer_summary,
# train_summary, train_features) are rebound here. `kfold`, `multiclass_kfold`
# and `model_to_csv` read `train_features` as a global, so the supervised
# feature cell must be re-run before any classifier cell is run again.
vectorizer_detail = TfidfVectorizer(stop_words='english', min_df = 0.2, max_df = 0.9)
train_detail = vectorizer_detail.fit_transform(cluster_df[['reviewText']].reviewText.tolist())

vectorizer_summary = TfidfVectorizer(stop_words='english', min_df = 0.1, max_df = 0.9)
train_summary = vectorizer_summary.fit_transform(cluster_df[['summary']].summary.tolist())

train_features = hstack((train_detail, train_summary)).tocsr()


# Encode each distinct category as an integer. Sorting makes the
# category -> index mapping the same on every run (iterating a set of
# strings has no stable order).
sorted_categories = sorted(set(y_cluster_df.category))
label = np.array(sorted_categories)
num_labels = len(label)
indexes = list(range(0,num_labels))
category_to_index = dict(zip(sorted_categories, indexes))
# Assign the encoded column explicitly: an in-place `replace` on the
# attribute-accessed column is a chained mutation and is not guaranteed to
# write back to the frame.
y_cluster_df['category'] = y_cluster_df['category'].map(category_to_index)

In [18]:
def clustering():
    """Cluster the review features with k-means and report two quality scores.

    Reads the module-level `train_features`, `num_labels` and `y_cluster_df`.
    One cluster is requested per distinct category.

    Returns:
        Tuple `(silhouette, rand_index)`: the silhouette score of the cluster
        assignment, and the Rand index between the assignment and the encoded
        category labels.

    Side effects:
        Prints both scores.
    """
    # K-means initialisation is random; fixing random_state makes the
    # reported scores reproducible across runs.
    kmeans = KMeans(n_clusters=num_labels, random_state=0)
    kmeans.fit(train_features)

    true_categories = np.array(y_cluster_df).flatten()
    silhouette = silhouette_score(train_features, kmeans.labels_)
    rand_index = rand_score(true_categories, kmeans.labels_)

    print(f'Silhouette Score: {silhouette}')
    # `rand_score` is the Rand index, not a "random" score.
    print(f'Rand Score: {rand_index}')
    return silhouette, rand_index

clustering()


Silhouette Score: 0.9554411607742243
Random Score: 0.6250878466819787
