In [53]:
import autograd.numpy as np
from autograd import grad 
import matplotlib.pyplot as plt
import pandas as pd 
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [54]:
# Training.csv: keep columns 0, 6 and 7 (named overall / reviewText / summary
# in the next cell). The file's own header row is skipped and every value is
# read as a string, with missing cells replaced by ''.
train = (
    pd.read_csv('Training.csv', dtype=str, header=None, usecols=(0, 6, 7), skiprows=1)
    .fillna('')
    .values
)
# Test.csv: keep columns 5 and 6 (named reviewText / summary in the next cell).
test = (
    pd.read_csv('Test.csv', dtype=str, header=None, usecols=(5, 6), skiprows=1)
    .fillna('')
    .values
)

In [55]:
# from https://colab.research.google.com/drive/11bLTHJCJ8VBk0rO_7RvVkzwlHRtSXx8f

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

# Wrap the raw string arrays in labelled frames.
train_df = pd.DataFrame(train, columns=['overall', 'reviewText', 'summary'])
test_df = pd.DataFrame(test, columns=['reviewText', 'summary'])

# TF-IDF over the review body. The vocabulary is fitted on the training text
# only and then reused for the test text; terms appearing in fewer than 20
# training documents are dropped.
vectorizer_detail = TfidfVectorizer(min_df=20)
train_detail = vectorizer_detail.fit_transform(train_df['reviewText'].tolist())
test_detail = vectorizer_detail.transform(test_df['reviewText'].tolist())

# Same treatment for the summary, with its own independent vocabulary.
vectorizer_summary = TfidfVectorizer(min_df=20)
train_summary = vectorizer_summary.fit_transform(train_df['summary'].tolist())
test_summary = vectorizer_summary.transform(test_df['summary'].tolist())

# Final design matrices: body features followed by summary features, stored as
# CSR so rows can be sliced by index arrays later.
train_features = hstack([train_detail, train_summary]).tocsr()
test_features = hstack([test_detail, test_summary]).tocsr()

# Single-column DataFrame ('overall') of labels parsed as floats.
train_label = train_df[['overall']].astype(float)


# Binary Classifier

In [56]:
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
import pandas as pd
import random
from sklearn.model_selection import KFold

def evaluate(labels, predictions):
    """Score ``predictions`` against ``labels``.

    Both arguments must have the same length; an AssertionError is raised
    otherwise.

    Returns a 4-tuple ``(accuracy, precision, recall, f1)`` where precision,
    recall and F1 are macro-averaged over the classes.
    """
    assert len(predictions) == len(labels), 'labels array and predictions array must be the same length'
    macro_scores = precision_recall_fscore_support(labels, predictions, average='macro')
    # The fourth element (support) is not needed.
    macro_precision, macro_recall, macro_f1 = macro_scores[:3]
    return accuracy_score(labels, predictions), macro_precision, macro_recall, macro_f1


In [57]:
def test(model, updated_label, test_size=0.2, random_state=None):
    """Fit ``model`` on a random hold-out split of ``train_features`` and report its scores.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``.
    updated_label : labels aligned row-for-row with the global ``train_features``.
    test_size : fraction of rows held out for evaluation (default 0.2).
    random_state : seed forwarded to ``train_test_split``. The default ``None``
        draws a different split on every call, so the printed F1 varies between
        runs; pass an integer for a reproducible split.

    Returns
    -------
    dict with keys 'Accuracy', 'Precision', 'Recall' and 'F1' for the hold-out
    rows (precision/recall/F1 are macro-averaged by ``evaluate``).

    Side effects: ``model`` is left fitted on the training part of the split,
    and the hold-out F1 is printed.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        train_features, updated_label, test_size=test_size, random_state=random_state
    )

    model.fit(x_train, y_train)
    predict_y = model.predict(x_test)

    # predict() already returns a 1-D array; pass it as-is instead of wrapping
    # it into an (n, 1) column vector.
    a, p, r, f = evaluate(np.array(y_test), predict_y)
    print(f'F1: {f}')

    # Previously these metrics were collected in a dict that was never returned.
    return {'Accuracy': a, 'Precision': p, 'Recall': r, 'F1': f}

# Binary Classifier for $\le 1$ and $> 1$

In [58]:

def binary_classifier_one():
    """Train and evaluate a logistic-regression classifier for ``overall <= 1`` vs ``overall > 1``.

    Steps:
      1. Binarise ``train_label.overall`` (1.0 above the cutoff, 0.0 otherwise).
      2. Print the F1 of a random hold-out split via ``test``.
      3. Print the macro F1 of each of 5 cross-validation folds and their mean.
      4. Refit on all training rows, predict ``test_features`` and write the
         predictions to ``1.csv`` (columns ``id``, ``predicted``).
    """
    cutoff = 1

    print(f'Cutoff: {cutoff}')

    # 1.0 where the rating is above the cutoff, 0.0 otherwise.
    updated_label = (train_label.overall > cutoff).astype(float)

    model = LogisticRegression(solver='liblinear')

    test(model, updated_label)

    # NOTE(review): KFold is unshuffled, so folds are contiguous row ranges.
    # If Training.csv is ordered, fold scores can vary widely; consider
    # shuffle=True with a fixed random_state.
    kf = KFold(n_splits=5)
    fold_f1 = []
    for train_index, test_index in kf.split(train_features, updated_label):
        x_train, x_test = train_features[train_index], train_features[test_index]
        # KFold yields row positions, so select labels positionally with .iloc
        # rather than relying on the Series index happening to be 0..n-1.
        y_train = updated_label.iloc[train_index]
        y_test = updated_label.iloc[test_index]
        model.fit(x_train, y_train)
        predictions = model.predict(x_test)
        _, _, _, f = evaluate(y_test, predictions)
        fold_f1.append(f)
        print(f'   F1: {f}')

    print('K Fold F1 average: ', sum(fold_f1) / len(fold_f1))

    # Final model: refit on every training row before predicting the test set.
    model.fit(train_features, updated_label)
    y_pred = model.predict(test_features).astype(int)

    out = {'id': range(len(y_pred)), 'predicted': y_pred}
    outdf = pd.DataFrame(out)
    outdf.to_csv(f'{cutoff}.csv', index=False)


binary_classifier_one()


Cutoff: 1
F1: 0.776263584470558
   F1: 0.6485889611409403
   F1: 0.40639473863894493
   F1: 0.7528080372608645
   F1: 0.7875538146274232
   F1: 0.8022056772322512
K Fold F1 average:  0.6795102457800848


# Binary Classifier for $\le 2$ and $> 2$

In [59]:
def binary_classifier_two():
    """Train and evaluate a logistic-regression classifier for ``overall <= 2`` vs ``overall > 2``.

    Steps:
      1. Binarise ``train_label.overall`` (1.0 above the cutoff, 0.0 otherwise).
      2. Print the F1 of a random hold-out split via ``test``.
      3. Print the macro F1 of each of 5 cross-validation folds and their mean.
      4. Refit on all training rows, predict ``test_features`` and write the
         predictions to ``2.csv`` (columns ``id``, ``predicted``).
    """
    cutoff = 2

    print(f'Cutoff: {cutoff}')

    # 1.0 where the rating is above the cutoff, 0.0 otherwise.
    updated_label = (train_label.overall > cutoff).astype(float)

    model = LogisticRegression(C=0.3, fit_intercept=False)
    test(model, updated_label)

    # NOTE(review): KFold is unshuffled, so folds are contiguous row ranges.
    # If Training.csv is ordered, fold scores can vary widely; consider
    # shuffle=True with a fixed random_state.
    kf = KFold(n_splits=5)
    fold_f1 = []
    for train_index, test_index in kf.split(train_features, updated_label):
        x_train, x_test = train_features[train_index], train_features[test_index]
        # KFold yields row positions, so select labels positionally with .iloc
        # rather than relying on the Series index happening to be 0..n-1.
        y_train = updated_label.iloc[train_index]
        y_test = updated_label.iloc[test_index]
        model.fit(x_train, y_train)
        predictions = model.predict(x_test)
        _, _, _, f = evaluate(y_test, predictions)
        fold_f1.append(f)
        print(f'   F1: {f}')

    print('K Fold F1 average: ', sum(fold_f1) / len(fold_f1))

    # Final model: refit on every training row before predicting the test set.
    model.fit(train_features, updated_label)
    y_pred = model.predict(test_features).astype(int)

    out = {'id': range(len(y_pred)), 'predicted': y_pred}
    outdf = pd.DataFrame(out)
    outdf.to_csv(f'{cutoff}.csv', index=False)

binary_classifier_two()

Cutoff: 2
F1: 0.8356748899330111
   F1: 0.7982101859929249
   F1: 0.516615684674619
   F1: 0.6357254683998361
   F1: 0.8201364421774707
   F1: 0.8597082109272643
K Fold F1 average:  0.726079198434423


# Binary Classifier for $\le 3$ and $> 3$

In [60]:
def binary_classifier_three():
    """Train and evaluate a logistic-regression classifier for ``overall <= 3`` vs ``overall > 3``.

    Steps:
      1. Binarise ``train_label.overall`` (1.0 above the cutoff, 0.0 otherwise).
      2. Print the F1 of a random hold-out split via ``test``.
      3. Print the macro F1 of each of 5 cross-validation folds and their mean.
      4. Refit on all training rows, predict ``test_features`` and write the
         predictions to ``3.csv`` (columns ``id``, ``predicted``).
    """
    cutoff = 3

    print(f'Cutoff: {cutoff}')

    # 1.0 where the rating is above the cutoff, 0.0 otherwise.
    updated_label = (train_label.overall > cutoff).astype(float)

    model = LogisticRegression()
    test(model, updated_label)

    # NOTE(review): KFold is unshuffled, so folds are contiguous row ranges.
    # If Training.csv is ordered, fold scores can vary widely; consider
    # shuffle=True with a fixed random_state.
    kf = KFold(n_splits=5)
    fold_f1 = []
    for train_index, test_index in kf.split(train_features, updated_label):
        x_train, x_test = train_features[train_index], train_features[test_index]
        # KFold yields row positions, so select labels positionally with .iloc
        # rather than relying on the Series index happening to be 0..n-1.
        y_train = updated_label.iloc[train_index]
        y_test = updated_label.iloc[test_index]
        model.fit(x_train, y_train)
        predictions = model.predict(x_test)
        _, _, _, f = evaluate(y_test, predictions)
        fold_f1.append(f)
        print(f'   F1: {f}')

    print('K Fold F1 average: ', sum(fold_f1) / len(fold_f1))

    # Final model: refit on every training row before predicting the test set.
    model.fit(train_features, updated_label)
    y_pred = model.predict(test_features).astype(int)

    out = {'id': range(len(y_pred)), 'predicted': y_pred}
    outdf = pd.DataFrame(out)
    outdf.to_csv(f'{cutoff}.csv', index=False)

binary_classifier_three()

Cutoff: 3
F1: 0.8584096866187871
   F1: 0.8548864407945396
   F1: 0.639764202460029
   F1: 0.6569790071465034
   F1: 0.7426525264440687
   F1: 0.8699110366189469
K Fold F1 average:  0.7528386426928175


# Binary Classifier for $\le 4$ and $> 4$

In [61]:
def binary_classifier_four():
    """Train and evaluate a logistic-regression classifier for ``overall <= 4`` vs ``overall > 4``.

    Steps:
      1. Binarise ``train_label.overall`` (1.0 above the cutoff, 0.0 otherwise).
      2. Print the F1 of a random hold-out split via ``test``.
      3. Print the macro F1 of each of 5 cross-validation folds and their mean.
      4. Refit on all training rows, predict ``test_features`` and write the
         predictions to ``4.csv`` (columns ``id``, ``predicted``).
    """
    cutoff = 4

    print(f'Cutoff: {cutoff}')

    # 1.0 where the rating is above the cutoff, 0.0 otherwise.
    updated_label = (train_label.overall > cutoff).astype(float)

    model = LogisticRegression()
    test(model, updated_label)

    # NOTE(review): KFold is unshuffled, so folds are contiguous row ranges.
    # If Training.csv is ordered, fold scores can vary widely; consider
    # shuffle=True with a fixed random_state.
    kf = KFold(n_splits=5)
    fold_f1 = []
    for train_index, test_index in kf.split(train_features, updated_label):
        x_train, x_test = train_features[train_index], train_features[test_index]
        # KFold yields row positions, so select labels positionally with .iloc
        # rather than relying on the Series index happening to be 0..n-1.
        y_train = updated_label.iloc[train_index]
        y_test = updated_label.iloc[test_index]
        model.fit(x_train, y_train)
        predictions = model.predict(x_test)
        _, _, _, f = evaluate(y_test, predictions)
        fold_f1.append(f)
        print(f'   F1: {f}')

    print('K Fold F1 average: ', sum(fold_f1) / len(fold_f1))

    # Final model: refit on every training row before predicting the test set.
    model.fit(train_features, updated_label)
    y_pred = model.predict(test_features).astype(int)

    out = {'id': range(len(y_pred)), 'predicted': y_pred}
    outdf = pd.DataFrame(out)
    outdf.to_csv(f'{cutoff}.csv', index=False)

binary_classifier_four()

Cutoff: 4
F1: 0.8102171271913894
   F1: 0.8146964379012483
   F1: 0.6753468882262408
   F1: 0.7565227148796068
   F1: 0.7393690731668783
   F1: 0.757395517735461
K Fold F1 average:  0.748666126381887


In [62]:
# Run the four binary classifiers back to back. Each call retrains its model,
# prints the hold-out F1 and the 5-fold F1 scores, and rewrites its
# '<cutoff>.csv' prediction file. The hold-out F1 can differ from earlier runs
# because the split inside test() is not seeded.
binary_classifier_one()
binary_classifier_two()
binary_classifier_three()
binary_classifier_four()

Cutoff: 1
F1: 0.7839465074868586
   F1: 0.6485889611409403
   F1: 0.40639473863894493
   F1: 0.7528080372608645
   F1: 0.7875538146274232
   F1: 0.8022056772322512
K Fold F1 average:  0.6795102457800848
Cutoff: 2
F1: 0.8309876540776432
   F1: 0.7982101859929249
   F1: 0.516615684674619
   F1: 0.6357254683998361
   F1: 0.8201364421774707
   F1: 0.8597082109272643
K Fold F1 average:  0.726079198434423
Cutoff: 3
F1: 0.8583536501849063
   F1: 0.8548864407945396
   F1: 0.639764202460029
   F1: 0.6569790071465034
   F1: 0.7426525264440687
   F1: 0.8699110366189469
K Fold F1 average:  0.7528386426928175
Cutoff: 4
F1: 0.8067975833481889
   F1: 0.8146964379012483
   F1: 0.6753468882262408
   F1: 0.7565227148796068
   F1: 0.7393690731668783
   F1: 0.757395517735461
K Fold F1 average:  0.748666126381887


# Multiclass Classifier

In [63]:
def multiclass():
    """Train and evaluate a one-vs-rest logistic-regression classifier on the raw ``overall`` labels.

    Prints the macro F1 of each of 5 cross-validation folds and their mean,
    then refits on all training rows, predicts ``test_features`` and writes the
    predictions to ``multiclass.csv`` (columns ``id``, ``predicted``).
    """
    # Use the 1-D label Series: handing the single-column DataFrame to fit()
    # makes scikit-learn flatten a column vector on every call.
    labels = train_label.overall

    model = LogisticRegression(multi_class='ovr', class_weight='balanced', random_state=1, C=0.45)

    # NOTE(review): KFold is unshuffled, so folds are contiguous row ranges.
    # If Training.csv is ordered, fold scores can vary widely; consider
    # shuffle=True with a fixed random_state.
    kf = KFold(n_splits=5)
    fold_f1 = []
    for train_index, test_index in kf.split(train_features, labels):
        x_train, x_test = train_features[train_index], train_features[test_index]
        # KFold yields row positions, hence .iloc.
        y_train = labels.iloc[train_index]
        y_test = labels.iloc[test_index]
        model.fit(x_train, y_train)
        predictions = model.predict(x_test)
        _, _, _, f = evaluate(y_test, predictions)
        fold_f1.append(f)
        print(f'   F1: {f}')

    print('K Fold F1 average: ', sum(fold_f1) / len(fold_f1))

    # Final model: refit on every training row before predicting the test set.
    model.fit(train_features, labels)
    # Labels were parsed as floats, so cast predictions back to integers.
    predict_y = model.predict(test_features).astype(int)
    out = {'id': range(len(predict_y)), 'predicted': predict_y}
    outdf = pd.DataFrame(out)
    outdf.to_csv('multiclass.csv', index=False)

multiclass()

   F1: 0.5339928768792294
   F1: 0.27397658587221635
   F1: 0.47127206908753055
   F1: 0.5096826589162411
   F1: 0.5865599598797906
K Fold F1 average:  0.47509683012700155


# Clustering

In [64]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import rand_score

# Columns 5 and 11 of Test.csv are used as the review text and its category.
cluster_data = pd.read_table('Test.csv', dtype=str, header=None, sep=',', usecols=(5,11), skiprows=1).values

cluster_df = pd.DataFrame(cluster_data, columns = ['reviewText', 'category'])
# Missing review text becomes '' (as in the classification load) so the
# vectorizer never receives a NaN document.
x_cluster_df = cluster_df[['reviewText']].fillna('')
y_cluster_df = cluster_df[['category']].copy()

# Keep only terms that appear in 10%-90% of the reviews, minus English stop words.
vectorizer = TfidfVectorizer(stop_words='english', min_df = 0.1, max_df = 0.9)
# NOTE: this rebinds `train`, which the loading cell uses for the raw training
# array; re-run that cell before re-running the feature-building cell.
train = vectorizer.fit_transform(x_cluster_df.reviewText.tolist())

# Encode each category as an integer. factorize(sort=True) gives a stable,
# alphabetical mapping (set iteration order is not stable across runs) and the
# codes are assigned back explicitly, because a chained
# `y_cluster_df.category.replace(..., inplace=True)` does not update the frame
# under pandas copy-on-write.
# A missing category gets code -1 and is not counted in num_labels.
category_codes, label = pd.factorize(y_cluster_df['category'], sort=True)
label = np.array(list(label))
num_labels = len(label)
indexes = list(range(0,num_labels))
y_cluster_df['category'] = category_codes

In [65]:
def clustering(random_state=None):
    """Cluster the TF-IDF review matrix with k-means and print quality scores.

    The number of clusters is the global ``num_labels``.

    Parameters
    ----------
    random_state : seed forwarded to ``KMeans``. The default ``None`` keeps the
        original unseeded behaviour, so cluster assignments and scores may
        change between runs; pass an integer for reproducible output.

    Prints, in order: the cluster label of every row, the silhouette score of
    the clustering, and the Rand index between the clusters and the category
    column of ``y_cluster_df``.
    """
    kmeans = KMeans(n_clusters=num_labels, random_state=random_state)
    kmeans.fit(train)
    print(kmeans.labels_)
    print(silhouette_score(train, kmeans.labels_))
    # y_cluster_df has a single column; flatten it to a 1-D label vector.
    print(rand_score(np.array(y_cluster_df).flatten(), kmeans.labels_))

clustering()


[0 0 0 ... 1 0 3]
0.5979664135765153
0.6589795263144896
