In [None]:
import autograd.numpy as np
from autograd import grad 
import pandas as pd 
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [3]:
train = pd.read_table('Training.csv', dtype=str, header=None, sep=',', usecols=(0,6,7), skiprows=1).fillna('').values
test = pd.read_table('Test.csv', dtype=str, header=None, sep=',', usecols=(5,6), skiprows=1).fillna('').values

In [4]:
# from https://colab.research.google.com/drive/11bLTHJCJ8VBk0rO_7RvVkzwlHRtSXx8f

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

train_df = pd.DataFrame(train)
train_df.columns = ['overall', 'reviewText', 'summary']
test_df = pd.DataFrame(test)
test_df.columns = ['reviewText', 'summary']

vectorizer_detail = TfidfVectorizer(min_df = 20)
train_detail = vectorizer_detail.fit_transform(train_df[['reviewText']].reviewText.tolist())
test_detail = vectorizer_detail.transform(test_df[['reviewText']].reviewText.tolist())

vectorizer_summary = TfidfVectorizer(min_df = 20)
train_summary = vectorizer_summary.fit_transform(train_df[['summary']].summary.tolist())
test_summary = vectorizer_summary.transform(test_df[['summary']].summary.tolist())

train_features = hstack((train_detail, train_summary)).tocsr()
train_label = train_df[['overall']].copy().astype(float)
test_features = hstack((test_detail, test_summary)).tocsr()


#  Binary Classifier

In [5]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split 
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, accuracy_score, confusion_matrix

from sklearn.model_selection import KFold

def evaluate(labels, predictions):
    assert len(labels) == len(predictions), 'labels array and predictions array must be the same length'
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='macro')
    accuracy = accuracy_score(labels, predictions)
    roc_auc = roc_auc_score(labels, predictions)
    cm = confusion_matrix(labels, predictions)
    return accuracy, precision, recall, f1, cm, roc_auc


In [6]:
def kfold(model, updated_label):
    kf = KFold(n_splits=5)

    results = {'Accuracy' : [],
               'Precision' : [],
               'Recall' : [],
               'F1' : [],
               'CM' : [],
               'ROC_AUC Score' : []}

    for train_index, test_index in kf.split(train_features, updated_label):
        x_train = train_features[train_index]
        x_test = train_features[test_index]
        y_train = updated_label[train_index]
        y_test = updated_label[test_index]
        model.fit(x_train, y_train)
        predictions = model.predict(x_test)
        a, p, r, f, cm, roc_auc = evaluate(y_test, predictions)
        results['Accuracy'].append(a)
        results['Precision'].append(p)
        results['Recall'].append(r)
        results['F1'].append(f)
        results['CM'].append(cm)
        results['ROC_AUC Score'].append(roc_auc)
        
    print('\tK Fold F1 average: ', sum(results['F1'])/len(results['F1']))
    print('\tK Fold Accuracy average: ', sum(results['Accuracy'])/len(results['Accuracy']))
    print('\tK Fold Precision average: ', sum(results['Precision'])/len(results['Precision']))
    total_cm = np.array([[0,0],[0,0]])
    for element in results['CM']:
        total_cm = np.add(total_cm, element)
    average_cm = total_cm * 0.2
    print('\tK Fold CM average: ')
    print(f'\t\t{average_cm[0]}')
    print(f'\t\t{average_cm[1]}')
    print('\tK Fold ROC AUC average: ', sum(results['ROC_AUC Score'])/len(results['ROC_AUC Score']))

def model_to_csv(model, updated_label, cutoff):
    model.fit(train_features, updated_label)
    y_pred = model.predict(test_features).astype(int)
    
    out = {'id': range(len(y_pred)), 'predicted': y_pred}
    outdf = pd.DataFrame(out)
    outdf.to_csv(f'{cutoff}.csv', index=False)
    

In [7]:
a = np.array([[1,2,],[3,4]])
b = np.array([[1,2,],[3,4]])

print(np.add(a,b,a))

[[2 4]
 [6 8]]


## Binary Classifier for $<=1$ and $>1$

In [8]:

def binary_classifier_one():
    
    cutoff = 1
    
    print(f'Cutoff: 1')

    updated_label = train_label.overall.copy()
    updated_label.loc[train_label.overall>cutoff] = 1
    updated_label.loc[train_label.overall<=cutoff] = 0
    
    model = LogisticRegression(solver = 'liblinear')

    print('Logistic Regression')
    
    kfold(model,updated_label)

    model = Perceptron()

    print('Perceptron')

    kfold(model,updated_label)

    model = LinearSVC()

    print('Linear SVC')

    kfold(model,updated_label)
    
    best_model = LogisticRegression(solver = 'liblinear')
    model_to_csv(best_model, updated_label, cutoff)
    
binary_classifier_one()


Cutoff: 1
Logistic Regression
	K Fold F1 average:  0.6795102457800848
	K Fold Accuracy average:  0.7826618863503386
	K Fold Precision average:  0.7457860655903235
	K Fold CM average: 
		[510.4 681. ]
		[ 587.8 4058.6]
	K Fold ROC AUC average:  0.6624421936082514
Perceptron
	K Fold F1 average:  0.6257507968056133
	K Fold Accuracy average:  0.7217820740837516
	K Fold Precision average:  0.646096328824284
	K Fold CM average: 
		[562.  629.4]
		[ 994.8 3651.6]
	K Fold ROC AUC average:  0.6370942280294017
Linear SVC
	K Fold F1 average:  0.6643599715883497
	K Fold Accuracy average:  0.7562139094128647
	K Fold Precision average:  0.7005318987203962
	K Fold CM average: 
		[565.2 626.2]
		[ 797.  3849.4]
	K Fold ROC AUC average:  0.6618426085579456


## Binary Classifier for $<=2$ and $>2$

In [9]:
def binary_classifier_two():
    
    cutoff = 2
    
    print(f'Cutoff: 2')

    updated_label = train_label.overall.copy()
    updated_label.loc[train_label.overall>cutoff] = 1
    updated_label.loc[train_label.overall<=cutoff] = 0
    
    model = LogisticRegression( C=0.3, fit_intercept=False)
    
    print('Logistic Regression')

    kfold(model,updated_label)
    
    model = Perceptron()

    print('Perceptron')

    kfold(model,updated_label)

    model = LinearSVC()

    print('Linear SVC')

    kfold(model,updated_label)

    best_model = LogisticRegression( C=0.3, fit_intercept=False)

    model_to_csv(best_model, updated_label, cutoff)    
    
binary_classifier_two()

Cutoff: 2
Logistic Regression
	K Fold F1 average:  0.726079198434423
	K Fold Accuracy average:  0.7363094863936062
	K Fold Precision average:  0.7278566800997289
	K Fold CM average: 
		[1679.   704.2]
		[ 835.2 2619.4]
	K Fold ROC AUC average:  0.7302619482258093
Perceptron
	K Fold F1 average:  0.6654991161002977
	K Fold Accuracy average:  0.6772116402181616
	K Fold Precision average:  0.6677924578229741
	K Fold CM average: 
		[1675.   708.2]
		[1176.2 2278.4]
	K Fold ROC AUC average:  0.6757915881756642
Linear SVC
	K Fold F1 average:  0.696626657910855
	K Fold Accuracy average:  0.7079772262368278
	K Fold Precision average:  0.6990062168110781
	K Fold CM average: 
		[1679.8  703.4]
		[1001.4 2453.2]
	K Fold ROC AUC average:  0.7032986340580065


## Binary Classifier for $<=3$ and $>3$

In [10]:
def binary_classifier_three():
    
    cutoff = 3
    
    print(f'Cutoff: 3')

    updated_label = train_label.overall.copy()
    updated_label.loc[train_label.overall>cutoff] = 1
    updated_label.loc[train_label.overall<=cutoff] = 0
    
    model = LogisticRegression(solver = 'liblinear')

    print('Logistic Regression')
    
    kfold(model,updated_label)

    model = Perceptron()

    print('Perceptron')

    kfold(model,updated_label)

    model = LinearSVC()

    print('Linear SVC')

    kfold(model,updated_label)
    
    best_model = LogisticRegression(solver = 'liblinear')
    model_to_csv(best_model, updated_label, cutoff)
    
binary_classifier_three()


Cutoff: 3
Logistic Regression
	K Fold F1 average:  0.7529997162180466
	K Fold Accuracy average:  0.7865663180559592
	K Fold Precision average:  0.7819252085073817
	K Fold CM average: 
		[3182.8  372.8]
		[ 873.2 1409. ]
	K Fold ROC AUC average:  0.7509548001096334
Perceptron
	K Fold F1 average:  0.6957996614261592
	K Fold Accuracy average:  0.7311343279570034
	K Fold Precision average:  0.71014242273969
	K Fold CM average: 
		[2952.4  603.2]
		[ 966.4 1315.8]
	K Fold ROC AUC average:  0.6982575866201061
Linear SVC
	K Fold F1 average:  0.7353450663951919
	K Fold Accuracy average:  0.7682035658337913
	K Fold Precision average:  0.7536118243371192
	K Fold CM average: 
		[3076.4  479.2]
		[ 874.  1408.2]
	K Fold ROC AUC average:  0.7354036497547508


## Binary Classifier for $<=4$ and $>4$

In [11]:
def binary_classifier_four():
    
    cutoff = 4
    
    print(f'Cutoff: 4')

    updated_label = train_label.overall.copy()
    updated_label.loc[train_label.overall>cutoff] = 1
    updated_label.loc[train_label.overall<=cutoff] = 0
    
    model = LogisticRegression()

    print('Logistic Regression')
    
    kfold(model,updated_label)

    model = Perceptron()

    print('Perceptron')

    kfold(model,updated_label)

    model = LinearSVC()

    print('Linear SVC')

    kfold(model,updated_label)
    
    best_model = LogisticRegression()
    model_to_csv(best_model, updated_label, cutoff)
    
binary_classifier_four()


Cutoff: 4
Logistic Regression
	K Fold F1 average:  0.748666126381887
	K Fold Accuracy average:  0.8753621435312164
	K Fold Precision average:  0.8504323699466927
	K Fold CM average: 
		[4597.2  112.2]
		[615.4 513. ]
	K Fold ROC AUC average:  0.7143237049479326
Perceptron
	K Fold F1 average:  0.7031473978579879
	K Fold Accuracy average:  0.834080272432486
	K Fold Precision average:  0.7272768547130971
	K Fold CM average: 
		[4328.   381.4]
		[587.2 541.2]
	K Fold ROC AUC average:  0.6975559261394103
Linear SVC
	K Fold F1 average:  0.7380234689596734
	K Fold Accuracy average:  0.8613506130898898
	K Fold Precision average:  0.7952024853978947
	K Fold CM average: 
		[4478.4  231. ]
		[578.4 550. ]
	K Fold ROC AUC average:  0.7170190461514305


# Multiclass Classifier

In [21]:
def multiclass_evaluate(labels, predictions):
    assert len(labels) == len(predictions), 'labels array and predictions array must be the same length'
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='macro')
    accuracy = accuracy_score(labels, predictions)
    cm = confusion_matrix(labels, predictions)
    return accuracy, precision, recall, f1, cm

# had to make new kfold because old one didnt use .iloc
def multiclass_kfold(model):
    kf = KFold(n_splits=5)
    
    results = {'Accuracy' : [],
               'Precision' : [],
               'Recall' : [],
               'F1' : [],
               'CM' : [],
               'ROC_AUC Score' : []}

    for train_index, test_index in kf.split(train_features, train_label):
        x_train = train_features[train_index]
        x_test = train_features[test_index]
        y_train = train_label.iloc[train_index]
        y_test = train_label.iloc[test_index]
        model.fit(x_train, y_train)
        predictions = model.predict(x_test)
        a, p, r, f, cm = multiclass_evaluate(y_test, predictions)
        if (type(model) == type (LogisticRegression())):
            predict_proba = model.predict_proba(x_test)
            roc_auc = roc_auc_score(y_test, predict_proba, multi_class = 'ovr')
        else:
            roc_auc = 0
        results['Accuracy'].append(a)
        results['Precision'].append(p)
        results['Recall'].append(r)
        results['F1'].append(f)
        results['CM'].append(cm)
        results['ROC_AUC Score'].append(roc_auc)
        
    print('\tK Fold F1 average: ', sum(results['F1'])/len(results['F1']))
    print('\tK Fold Accuracy average: ', sum(results['Accuracy'])/len(results['Accuracy']))
    print('\tK Fold Precision average: ', sum(results['Precision'])/len(results['Precision']))
    total_cm = np.zeros((5,5))
    for element in results['CM']:
        total_cm = np.add(total_cm, element)
    average_cm = total_cm * 0.2
    print('\tK Fold CM average: ')
    print(f'\t\t{average_cm[0]}')
    print(f'\t\t{average_cm[1]}')
    print(f'\t\t{average_cm[2]}')
    print(f'\t\t{average_cm[3]}')
    print(f'\t\t{average_cm[4]}')
    print('\tK Fold ROC AUC average: ', sum(results['ROC_AUC Score'])/len(results['ROC_AUC Score']))

def multiclass():
    model = LogisticRegression(multi_class='ovr', class_weight = 'balanced',random_state=1, C=0.45)
    
    multiclass_kfold(model)

    model = Perceptron()
    multiclass_kfold(model)

    model = LinearSVC()
    multiclass_kfold(model)
    
    best_model = LogisticRegression(multi_class='ovr', class_weight = 'balanced',random_state=1, C=0.45)
    model_to_csv(best_model, train_label, 'multiclass')

multiclass()


	K Fold F1 average:  0.47509683012700155
	K Fold Accuracy average:  0.4718603423142687
	K Fold Precision average:  0.5125710576074567
	K Fold CM average: 
		[703.6 261.8 106.6  65.2  54.2]
		[399.4 409.8 236.  108.8  37.8]
		[226.8 223.  494.  180.2  48.4]
		[188.2 115.  201.6 460.8 188.2]
		[164.6  61.2  54.8 161.4 686.4]
	K Fold ROC AUC average:  0.799475131376313
	K Fold F1 average:  0.40827611691786486
	K Fold Accuracy average:  0.4066976898913577
	K Fold Precision average:  0.4387882987795069
	K Fold CM average: 
		[580.4 260.4 159.6 102.2  88.8]
		[361.4 383.6 220.6 140.   86.2]
		[258.6 207.8 428.6 188.4  89. ]
		[223.4 134.4 175.6 419.8 200.6]
		[195.   95.8  85.  190.8 561.8]
	K Fold ROC AUC average:  0.0
	K Fold F1 average:  0.44178797525430386
	K Fold Accuracy average:  0.43941615204373374
	K Fold Precision average:  0.4778079921828221
	K Fold CM average: 
		[653.4 259.2 132.2  81.4  65.2]
		[380.  399.6 230.6 119.2  62.4]
		[248.6 224.  454.6 177.   68.2]
		[209.  130.  188

# Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import rand_score
from scipy.sparse import hstack


cluster_data = pd.read_table('Test.csv', dtype=str, header=None, sep=',', usecols=(5,6,11), skiprows=1).fillna('').values

cluster_df = pd.DataFrame(cluster_data, columns = ['reviewText', 'summary', 'category'])
y_cluster_df = cluster_df[['category']].copy()

vectorizer_detail = TfidfVectorizer(stop_words='english', min_df = 0.2, max_df = 0.9)
train_detail = vectorizer_detail.fit_transform(cluster_df[['reviewText']].reviewText.tolist())

vectorizer_summary = TfidfVectorizer(stop_words='english', min_df = 0.1, max_df = 0.9)
train_summary = vectorizer_summary.fit_transform(cluster_df[['summary']].summary.tolist())

train_features = hstack((train_detail, train_summary)).tocsr()


label = set(y_cluster_df.category)
num_labels = len(label)
label = np.array(list(label))
indexes = list(range(0,num_labels))
label = label.reshape([num_labels])
y_cluster_df.category.replace(label, indexes, inplace=True)

In [None]:
def clustering():
    kmeans = KMeans(n_clusters=num_labels)
    kmeans.fit(train_features)
    print(f'Silhouette Score: {silhouette_score(train_features, kmeans.labels_)}')
    print(f'Random Score: {rand_score(np.array(y_cluster_df).flatten(), kmeans.labels_)}')
    
clustering()


Silhouette Score: 0.9554411607742243
Random Score: 0.6250878466819787
