In [1]:
import autograd.numpy as np
from autograd import grad 
import matplotlib.pyplot as plt
import pandas as pd 
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# Raw inputs, read as strings with the header row skipped.
# Training.csv: columns 0 and 6 are kept (named 'overall' and 'reviewText' in the next cell).
# Test.csv: only column 5 is kept (named 'reviewText' in the next cell).
# NOTE: the name `test` is rebound to a function further down the notebook,
# so this cell and the next one must run before that definition.
train = pd.read_csv('Training.csv', header=None, skiprows=1, usecols=[0, 6], dtype=str).to_numpy()
test = pd.read_csv('Test.csv', header=None, skiprows=1, usecols=[5], dtype=str).to_numpy()

In [3]:
# from https://colab.research.google.com/drive/11bLTHJCJ8VBk0rO_7RvVkzwlHRtSXx8f

from sklearn.feature_extraction.text import TfidfVectorizer

# Wrap the raw arrays in labelled frames; ratings were read as strings and are cast to float.
train_df = pd.DataFrame(data=train, columns=['overall', 'reviewText'])
x_train_df = train_df.loc[:, ['reviewText']].copy()
y_train_df = train_df.loc[:, ['overall']].astype(float)

test_df = pd.DataFrame(data=test, columns=['reviewText'])
x_test_df = test_df.loc[:, ['reviewText']].copy()

# The vocabulary and IDF weights are fitted on the training text only and
# then re-used to transform the test text.
vectorizer = TfidfVectorizer(smooth_idf=False, min_df=20)
train_features = vectorizer.fit_transform(list(x_train_df['reviewText']))
test_features = vectorizer.transform(list(x_test_df['reviewText']))


#  Binary Classifier

In [4]:
from sklearn.metrics import mean_squared_error, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
import pandas as pd
import random
from sklearn.model_selection import KFold

def evaluate(labels, predictions):
    """Score predictions against reference labels.

    Args:
        labels: reference labels.
        predictions: predicted labels; must have the same length as ``labels``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` where precision, recall
        and F1 are macro-averaged.

    Raises:
        AssertionError: if the two inputs differ in length.
    """
    assert len(labels) == len(predictions), 'labels array and predictions array must be the same length'
    # Element 3 (support) of the returned 4-tuple is not needed here.
    macro_scores = precision_recall_fscore_support(labels, predictions, average='macro')
    precision, recall, f1 = macro_scores[:3]
    return accuracy_score(labels, predictions), precision, recall, f1


In [5]:
def test(model, updated_label, test_size=0.2, random_state=None):
    """Fit ``model`` on a random split of the training features and print its hold-out F1.

    NOTE: at module level this definition rebinds the name ``test``, which an
    earlier cell assigned to the raw test-set array. Cells that read that
    array must run before this one.

    Args:
        model: estimator exposing ``fit`` and ``predict``; it is fitted in place.
        updated_label: labels aligned row-for-row with the global ``train_features``.
        test_size: fraction of rows held out for scoring (default 0.2).
        random_state: seed forwarded to ``train_test_split``. The default
            ``None`` keeps the split random on every call; pass an int for a
            reproducible split.

    Returns:
        dict with keys ``'Accuracy'``, ``'Precision'``, ``'Recall'`` and ``'F1'``
        holding the hold-out scores returned by ``evaluate``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        train_features, updated_label, test_size=test_size, random_state=random_state)

    model.fit(x_train, y_train)
    predict_y = model.predict(x_test)

    a, p, r, f = evaluate(y_test, predict_y)
    print(f'F1: {f}')

    # Previously the scores were collected in a local dict and then dropped;
    # returning them lets callers inspect more than the printed F1.
    return {'Accuracy': a, 'Precision': p, 'Recall': r, 'F1': f}

# Binary Classifier for ratings $\le 1$ and $> 1$

In [24]:

def binary_classifier_one():
    """Train, evaluate and export the ``rating > 1`` vs ``rating <= 1`` classifier.

    Steps:
      1. Binarise ``y_train_df.overall`` at the cutoff (1.0 above it, 0.0 otherwise).
      2. Print one random hold-out F1 via ``test``.
      3. Print the macro F1 of each of 5 cross-validation folds and their mean.
      4. Refit on all training rows, predict ``test_features`` and write the
         predictions to ``'1.csv'`` with columns ``id`` and ``predicted``.

    Reads the globals ``y_train_df``, ``train_features`` and ``test_features``.
    Returns ``None``.
    """
    cutoff = 1

    print(f'Cutoff: {cutoff}')

    # 1.0 where the rating exceeds the cutoff, 0.0 where it does not.
    updated_label = (y_train_df.overall > cutoff).astype(float)

    model = LogisticRegression(solver='liblinear')

    test(model, updated_label)

    # KFold is used with its defaults, so folds are contiguous, unshuffled blocks.
    # NOTE(review): if Training.csv is ordered, fold scores can differ a lot;
    # consider shuffling.
    kf = KFold(n_splits=5)
    fold_f1 = []
    for train_index, test_index in kf.split(train_features, updated_label):
        # kf.split yields positional indices, so the Series is indexed with
        # .iloc rather than by label.
        model.fit(train_features[train_index], updated_label.iloc[train_index])
        predictions = model.predict(train_features[test_index])
        _, _, _, f = evaluate(updated_label.iloc[test_index], predictions)
        fold_f1.append(f)
        print(f'   F1: {f}')

    print('K Fold F1 average: ', sum(fold_f1)/len(fold_f1))

    # Final model: fit on every training row, then label the test set.
    model.fit(train_features, updated_label)
    y_pred = model.predict(test_features).astype(int)

    outdf = pd.DataFrame({'id': range(len(y_pred)), 'predicted': y_pred})
    outdf.to_csv(f'{cutoff}.csv', index=False)


binary_classifier_one()


Cutoff: 1
F1: 0.6824957744376137
   F1: 0.5520708723644641
   F1: 0.3267753541562427
   F1: 0.6293711651459004
   F1: 0.683734818000248
   F1: 0.7150723289277475
K Fold F1 average:  0.5814049077189205


# Binary Classifier for ratings $\le 2$ and $> 2$

In [7]:
def binary_classifier_two():
    """Train, evaluate and export the ``rating > 2`` vs ``rating <= 2`` classifier.

    Steps:
      1. Binarise ``y_train_df.overall`` at the cutoff (1.0 above it, 0.0 otherwise).
      2. Print one random hold-out F1 via ``test``.
      3. Print the macro F1 of each of 5 cross-validation folds and their mean.
      4. Refit on all training rows, predict ``test_features`` and write the
         predictions to ``'2.csv'`` with columns ``id`` and ``predicted``.

    Reads the globals ``y_train_df``, ``train_features`` and ``test_features``.
    Returns ``None``.
    """
    cutoff = 2

    print(f'Cutoff: {cutoff}')

    # 1.0 where the rating exceeds the cutoff, 0.0 where it does not.
    updated_label = (y_train_df.overall > cutoff).astype(float)

    model = LogisticRegression(C=0.3, fit_intercept=False)

    test(model, updated_label)

    # KFold is used with its defaults, so folds are contiguous, unshuffled blocks.
    # NOTE(review): if Training.csv is ordered, fold scores can differ a lot;
    # consider shuffling.
    kf = KFold(n_splits=5)
    fold_f1 = []
    for train_index, test_index in kf.split(train_features, updated_label):
        # kf.split yields positional indices, so the Series is indexed with
        # .iloc rather than by label.
        model.fit(train_features[train_index], updated_label.iloc[train_index])
        predictions = model.predict(train_features[test_index])
        _, _, _, f = evaluate(updated_label.iloc[test_index], predictions)
        fold_f1.append(f)
        print(f'   F1: {f}')

    print('K Fold F1 average: ', sum(fold_f1)/len(fold_f1))

    # Final model: fit on every training row, then label the test set.
    model.fit(train_features, updated_label)
    y_pred = model.predict(test_features).astype(int)

    outdf = pd.DataFrame({'id': range(len(y_pred)), 'predicted': y_pred})
    outdf.to_csv(f'{cutoff}.csv', index=False)


binary_classifier_two()

Cutoff: 2
F1: 0.7729558342536497
   F1: 0.7256232122049562
   F1: 0.4134966025497604
   F1: 0.4976236322513602
   F1: 0.7469722496589832
   F1: 0.8018710047157971
K Fold F1 average:  0.6371173402761714


# Binary Classifier for ratings $\le 3$ and $> 3$

In [8]:
def binary_classifier_three():
    """Train, evaluate and export the ``rating > 3`` vs ``rating <= 3`` classifier.

    Steps:
      1. Binarise ``y_train_df.overall`` at the cutoff (1.0 above it, 0.0 otherwise).
      2. Print one random hold-out F1 via ``test``.
      3. Print the macro F1 of each of 5 cross-validation folds and their mean.
      4. Refit on all training rows, predict ``test_features`` and write the
         predictions to ``'3.csv'`` with columns ``id`` and ``predicted``.

    Reads the globals ``y_train_df``, ``train_features`` and ``test_features``.
    Returns ``None``.
    """
    cutoff = 3

    print(f'Cutoff: {cutoff}')

    # 1.0 where the rating exceeds the cutoff, 0.0 where it does not.
    updated_label = (y_train_df.overall > cutoff).astype(float)

    model = LogisticRegression()

    test(model, updated_label)

    # KFold is used with its defaults, so folds are contiguous, unshuffled blocks.
    # NOTE(review): if Training.csv is ordered, fold scores can differ a lot;
    # consider shuffling.
    kf = KFold(n_splits=5)
    fold_f1 = []
    for train_index, test_index in kf.split(train_features, updated_label):
        # kf.split yields positional indices, so the Series is indexed with
        # .iloc rather than by label.
        model.fit(train_features[train_index], updated_label.iloc[train_index])
        predictions = model.predict(train_features[test_index])
        _, _, _, f = evaluate(updated_label.iloc[test_index], predictions)
        fold_f1.append(f)
        print(f'   F1: {f}')

    print('K Fold F1 average: ', sum(fold_f1)/len(fold_f1))

    # Final model: fit on every training row, then label the test set.
    model.fit(train_features, updated_label)
    y_pred = model.predict(test_features).astype(int)

    outdf = pd.DataFrame({'id': range(len(y_pred)), 'predicted': y_pred})
    outdf.to_csv(f'{cutoff}.csv', index=False)


binary_classifier_three()

Cutoff: 3
F1: 0.7947057542619012
   F1: 0.7732696709960016
   F1: 0.5604721230622371
   F1: 0.5313699749669165
   F1: 0.6413807052455052
   F1: 0.8001013346836648
K Fold F1 average:  0.6613187617908651


# Binary Classifier for ratings $\le 4$ and $> 4$

In [9]:
def binary_classifier_four():
    """Train, evaluate and export the ``rating > 4`` vs ``rating <= 4`` classifier.

    Steps:
      1. Binarise ``y_train_df.overall`` at the cutoff (1.0 above it, 0.0 otherwise).
      2. Print one random hold-out F1 via ``test``.
      3. Print the macro F1 of each of 5 cross-validation folds and their mean.
      4. Refit on all training rows, predict ``test_features`` and write the
         predictions to ``'4.csv'`` with columns ``id`` and ``predicted``.

    Reads the globals ``y_train_df``, ``train_features`` and ``test_features``.
    Returns ``None``.
    """
    cutoff = 4

    print(f'Cutoff: {cutoff}')

    # 1.0 where the rating exceeds the cutoff, 0.0 where it does not.
    updated_label = (y_train_df.overall > cutoff).astype(float)

    model = LogisticRegression()

    test(model, updated_label)

    # KFold is used with its defaults, so folds are contiguous, unshuffled blocks.
    # NOTE(review): if Training.csv is ordered, fold scores can differ a lot;
    # consider shuffling.
    kf = KFold(n_splits=5)
    fold_f1 = []
    for train_index, test_index in kf.split(train_features, updated_label):
        # kf.split yields positional indices, so the Series is indexed with
        # .iloc rather than by label.
        model.fit(train_features[train_index], updated_label.iloc[train_index])
        predictions = model.predict(train_features[test_index])
        _, _, _, f = evaluate(updated_label.iloc[test_index], predictions)
        fold_f1.append(f)
        print(f'   F1: {f}')

    print('K Fold F1 average: ', sum(fold_f1)/len(fold_f1))

    # Final model: fit on every training row, then label the test set.
    model.fit(train_features, updated_label)
    y_pred = model.predict(test_features).astype(int)

    outdf = pd.DataFrame({'id': range(len(y_pred)), 'predicted': y_pred})
    outdf.to_csv(f'{cutoff}.csv', index=False)


binary_classifier_four()

Cutoff: 4
F1: 0.7133234796921968
   F1: 0.6824148230725517
   F1: 0.5825008972328026
   F1: 0.6170903063408123
   F1: 0.642882884723689
   F1: 0.6041408915952837
K Fold F1 average:  0.625805960593028


In [10]:
# Run the four threshold classifiers back to back, in cutoff order 1..4.
for run_classifier in (
    binary_classifier_one,
    binary_classifier_two,
    binary_classifier_three,
    binary_classifier_four,
):
    run_classifier()

Cutoff: 1
F1: 0.7013603847195488
   F1: 0.5876850768972053
   F1: 0.3292186325676981
   F1: 0.6335575653665068
   F1: 0.6925964957685498
   F1: 0.7240181902720602
K Fold F1 average:  0.5934151921744041
Cutoff: 2
F1: 0.7737099612841978
   F1: 0.7256232122049562
   F1: 0.4134966025497604
   F1: 0.4976236322513602
   F1: 0.7469722496589832
   F1: 0.8018710047157971
K Fold F1 average:  0.6371173402761714
Cutoff: 3
F1: 0.7963829432161642
   F1: 0.7732696709960016
   F1: 0.5604721230622371
   F1: 0.5313699749669165
   F1: 0.6413807052455052
   F1: 0.8001013346836648
K Fold F1 average:  0.6613187617908651
Cutoff: 4
F1: 0.7127361619293686
   F1: 0.6824148230725517
   F1: 0.5825008972328026
   F1: 0.6170903063408123
   F1: 0.642882884723689
   F1: 0.6041408915952837
K Fold F1 average:  0.625805960593028


# Multiclass Classifier

In [11]:
def multiclass():
    """Train, cross-validate and export a one-vs-rest classifier over the raw ratings.

    Prints the macro F1 of each of 5 cross-validation folds and their mean,
    then refits on all training rows, predicts ``test_features`` and writes
    the predictions to ``'multiclass.csv'`` with columns ``id`` and
    ``predicted``.

    Reads the globals ``y_train_df``, ``train_features`` and ``test_features``.
    Returns ``None``.
    """
    # NOTE(review): recent scikit-learn releases deprecate the `multi_class`
    # argument - confirm against the installed version.
    model = LogisticRegression(multi_class='ovr', class_weight='balanced', random_state=1, C=0.45)

    # Use the 1-D rating column rather than the one-column DataFrame, so the
    # estimator receives labels in the shape it expects instead of relying on
    # an implicit (and here silenced) conversion.
    labels = y_train_df.overall

    # KFold is used with its defaults, so folds are contiguous, unshuffled blocks.
    kf = KFold(n_splits=5)
    fold_f1 = []
    for train_index, test_index in kf.split(train_features, labels):
        # kf.split yields positional indices, hence .iloc.
        model.fit(train_features[train_index], labels.iloc[train_index])
        predictions = model.predict(train_features[test_index])
        _, _, _, f = evaluate(labels.iloc[test_index], predictions)
        fold_f1.append(f)
        print(f'   F1: {f}')

    print('K Fold F1 average: ', sum(fold_f1)/len(fold_f1))

    # Final model: fit on every training row, then label the test set.
    model.fit(train_features, labels)
    predict_y = model.predict(test_features).astype(int)
    outdf = pd.DataFrame({'id': range(len(predict_y)), 'predicted': predict_y})
    outdf.to_csv('multiclass.csv', index=False)


multiclass()

   F1: 0.3887387211013644
   F1: 0.15224709699722375
   F1: 0.29854666903095567
   F1: 0.36808813723612044
   F1: 0.45312090543406025
K Fold F1 average:  0.3321483059599449


# Clustering

In [99]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import rand_score

# Reload Test.csv, this time keeping column 5 (review text) and column 11
# (used below as the reference category for each review).
cluster_data = pd.read_csv('Test.csv', dtype=str, header=None, usecols=[5, 11], skiprows=1).values

cluster_df = pd.DataFrame(cluster_data, columns=['reviewText', 'category'])
x_cluster_df = cluster_df[['reviewText']].copy()
y_cluster_df = cluster_df[['category']].copy()

# NOTE: this rebinds `vectorizer` and `train`, which earlier cells used for the
# classification vectorizer and the raw training array. `clustering()` reads
# this new `train`; the classification cells must be re-run from the top
# before being used again.
vectorizer = TfidfVectorizer(stop_words='english', min_df=0.1, max_df=0.9)
train = vectorizer.fit_transform(x_cluster_df.reviewText.tolist())

# Sort the distinct categories so the category -> integer mapping is the same
# on every run (iterating a set of strings is not order-stable across runs).
# Assumes every row has a category (no missing values) - TODO confirm.
label = np.array(sorted(y_cluster_df.category.unique()))
num_labels = len(label)
indexes = list(range(num_labels))

# Assign through the DataFrame instead of an in-place replace on the
# attribute-accessed Series, which is a chained update and does not reach
# `y_cluster_df` when pandas Copy-on-Write is active.
category_to_index = dict(zip(label.tolist(), indexes))
y_cluster_df['category'] = y_cluster_df['category'].map(category_to_index)

6


In [109]:
def clustering(random_state=None):
    """Cluster the TF-IDF review vectors with k-means and print quality scores.

    Fits ``KMeans`` with ``num_labels`` clusters on the global ``train``
    matrix, then prints, in order: the cluster assignment of every row, the
    silhouette score of that assignment, and its Rand index against the
    integer categories in ``y_cluster_df``.

    Args:
        random_state: seed forwarded to ``KMeans``. The default ``None``
            leaves initialisation unseeded, so results can vary between
            runs; pass an int for reproducible clusters.

    Returns:
        None.
    """
    true_labels = np.array(y_cluster_df).flatten()

    kmeans = KMeans(n_clusters=num_labels, random_state=random_state)
    kmeans.fit(train)

    print(kmeans.labels_)
    print(silhouette_score(train, kmeans.labels_))
    print(rand_score(true_labels, kmeans.labels_))


# Seeded so that the printed scores are reproducible on a fresh run.
clustering(random_state=0)


[0 0 0 ... 2 0 5]
0.5979664135765153
0.6589795263144896
