# Import Data

In [None]:
import pandas as pd

def _split_pairs(cell):
    """Drop the first and last two characters of `cell` and split on '), ('.

    Presumably the cell looks like "[(i, v), (j, w)]", giving raw "i, v"
    strings -- TODO confirm against the CSV files.
    """
    return cell[2:-2].split('), (')


def _split_vector(cell):
    """Drop the first and last character of `cell` and split on ','.

    Presumably the cell looks like "[a, b, c]" -- TODO confirm against the CSV files.
    """
    return cell[1:-1].split(',')


def _load_features(path, parser, column):
    """Read one CSV, parse its 'tweet' column with `parser`, rename it to `column`."""
    frame = pd.read_csv(path, header = 0, converters={'tweet': parser})
    return frame.rename(columns={'tweet': column})


tfidf_train = _load_features("train_tfidf.csv", _split_pairs, 'tfidf')
tfidf_dev = _load_features("dev_tfidf.csv", _split_pairs, 'tfidf')
tfidf_test = _load_features("test_tfidf.csv", _split_pairs, 'tfidf')

word_count_train = _load_features("train_count.csv", _split_pairs, 'word_count')
word_count_dev = _load_features("dev_count.csv", _split_pairs, 'word_count')
word_count_test = _load_features("test_count.csv", _split_pairs, 'word_count')

glove_train = _load_features("train_glove.csv", _split_vector, 'glove')
glove_dev = _load_features("dev_glove.csv", _split_vector, 'glove')
glove_test = _load_features("test_glove.csv", _split_vector, 'glove')

# Labels are read from the word-count frames.
y_train = word_count_train['sentiment']
y_dev = word_count_dev['sentiment']
y_test = word_count_test['sentiment']

# Tweet ids of the test split, used when writing prediction files.
test_id = word_count_test['tweet_id']


In [None]:
# Build `vocab_list`: the words of vocab.txt ordered by their numeric id.
# Each non-blank line is "<word>\t<id>".
vocab_set = []

# The context manager closes the file even if parsing fails part-way.
with open('vocab.txt', 'r') as vocab_file:
    vocab_lines = vocab_file.readlines()

for line in vocab_lines:
    line = line.strip()
    if not line:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        continue
    fields = line.split('\t')
    word = fields[0]
    word_id = int(fields[1])
    vocab_set.append((word_id, word))

# Tuples sort by id first, so this orders the vocabulary by id.
vocab_set.sort()

# NOTE(review): later used as positional feature names, which assumes the ids
# are the contiguous column indices 0..N-1 -- confirm against vocab.txt.
vocab_list = [word for (_, word) in vocab_set]

# Prepare Data (To Array)

In [None]:
def toTupleArray(data, type):
    """Convert raw "index,value" strings into (index, value) tuples.

    Args:
        data: iterable of instances; each instance is an iterable of strings
            of the form "index,value" (surrounding whitespace is ignored).
        type: callable applied to the value part, e.g. ``int`` or ``float``.
            The name shadows the builtin ``type`` inside this function; it is
            kept unchanged so existing callers keep working.

    Returns:
        A list with one inner list per instance, holding
        ``(int(index), type(value))`` tuples. Blank entries are skipped, so an
        instance without any features becomes ``[]``.
    """
    data_clean = []
    for instance in data:
        instance_new = []
        for word in instance:
            word = word.strip()
            if not word:
                # Splitting an empty feature list yields [''], which would
                # otherwise crash on int('').
                continue
            parts = word.split(",")
            instance_new.append((int(parts[0]), type(parts[1])))
        data_clean.append(instance_new)
    return data_clean

def toArray(data):
    """Convert every entry of every instance to ``float``.

    Args:
        data: iterable of instances, each an iterable of values accepted by
            ``float()`` (e.g. numeric strings).

    Returns:
        A list of lists of floats with the same nesting as ``data``.
    """
    return [[float(value) for value in row] for row in data]


# Take the feature column of every split and convert it from raw strings to
# plain Python lists: (index, count) / (index, weight) tuples for the sparse
# representations, floats for the GloVe vectors.
w_train, w_dev, w_test = [
    toTupleArray(frame['word_count'], int)
    for frame in (word_count_train, word_count_dev, word_count_test)
]

t_train, t_dev, t_test = [
    toTupleArray(frame['tfidf'], float)
    for frame in (tfidf_train, tfidf_dev, tfidf_test)
]

g_train, g_dev, g_test = [
    toArray(frame['glove'])
    for frame in (glove_train, glove_dev, glove_test)
]


# Vectorize Data (for TF-IDF and Word Count)

In [None]:
def vectorize5000(array, n_features=5000):
    """Expand sparse (index, value) instances into dense fixed-width rows.

    Args:
        array: sequence of instances; each instance is an iterable of entries
            whose element 0 is a column index and element 1 is the value.
        n_features: width of every output row. Defaults to 5000, the width
            this function always used.

    Returns:
        A list of ``len(array)`` independent rows of length ``n_features``,
        zero everywhere except at the listed indices.

    Raises:
        IndexError: if an index is >= ``n_features``.
    """
    matrix = [[0] * n_features for _ in range(len(array))]
    for row, instance in zip(matrix, array):
        for entry in instance:
            row[entry[0]] = entry[1]
    return matrix
        
# Densify the sparse representations split by split: TF-IDF first, then the
# word counts.
t_train, t_dev, t_test = [
    vectorize5000(split) for split in (t_train, t_dev, t_test)
]

w_train, w_dev, w_test = [
    vectorize5000(split) for split in (w_train, w_dev, w_test)
]

# Normalize Data (for Glove)

In [None]:
import numpy as np

In [None]:
# Standardise the GloVe features (z-score). Mean and standard deviation come
# from the training split only and are then applied to train, dev and test.
n_train = len(g_train)
n_dev = len(g_dev)
n_features = len(g_train[0])

# Column-wise view of the training data: features_instances[j] holds the value
# of feature j for every training instance.
features_instances = [
    [instance[feature_index] for instance in g_train]
    for feature_index in range(n_features)
]

#find mean and standard deviation of train data
mean = [sum(feature) / len(feature) for feature in features_instances]
stdev = [np.std(feature) for feature in features_instances]

# A feature that is constant over the training split has a standard deviation
# of 0; dividing by it would fill the column with nan/inf. Scale such features
# by 1 instead so they simply become 0.
stdev = [sd if sd > 0 else 1.0 for sd in stdev]

g_full = g_train + g_dev + g_test

#standardise using the mean and standard deviation of train data
g_norm = [
    [
        (instance[feature_index] - mean[feature_index]) / stdev[feature_index]
        for feature_index in range(n_features)
    ]
    for instance in g_full
]

# Cut the concatenated list back into the three splits; the sizes were
# captured before any of the names was reassigned.
g_train = g_norm[:n_train]
g_dev = g_norm[n_train:n_train + n_dev]
g_test = g_norm[n_train + n_dev:]

# Feature Sets    

As there are only 3 features, the wrapper method will be used for feature selection. There are 7 possible combinations of features: word_count only (w), tfidf only (t), glove only (g), word_count & tfidf (wt), word_count & glove (wg), tfidf & glove (tg), and word_count, tfidf & glove (wtg).

As word_count and tfidf are likely to be correlated, combinations with them together are excluded. 
Thus, the following feature sets will be tested:
1. word_count only (w)
2. tfidf only (t)
3. glove only (g)
4. word_count & glove (wg)
5. tfidf & glove (tg)

In [None]:
# All three representations must describe the same number of instances in
# every split before they can be concatenated row by row.
assert len(w_train) == len(t_train) == len(g_train)
assert len(w_dev) == len(t_dev) == len(g_dev)
assert len(w_test) == len(t_test) == len(g_test)

# Rows are plain Python lists, so "+" appends the GloVe values to the
# word-count / TF-IDF columns of the same instance.
wg_train = [sparse + dense for sparse, dense in zip(w_train, g_train)]
tg_train = [sparse + dense for sparse, dense in zip(t_train, g_train)]

wg_dev = [sparse + dense for sparse, dense in zip(w_dev, g_dev)]
tg_dev = [sparse + dense for sparse, dense in zip(t_dev, g_dev)]

wg_test = [sparse + dense for sparse, dense in zip(w_test, g_test)]
tg_test = [sparse + dense for sparse, dense in zip(t_test, g_test)]

# Report (instances, features) for every feature set and split.
for label, split in (
    ("word_count only train shape: ", w_train),
    ("word_count only dev shape: ", w_dev),
    ("word_count only test shape: ", w_test),
    ("tfidf only train shape: ", t_train),
    ("tfidf only dev shape: ", t_dev),
    ("tfidf only test shape: ", t_test),
    ("glove only train shape: ", g_train),
    ("glove only dev shape: ", g_dev),
    ("glove only test shape: ", g_test),
    ("word_count and glove train shape: ", wg_train),
    ("word_count and glove dev shape: ", wg_dev),
    ("word_count and glove test shape: ", wg_test),
    ("tfidf and glove train shape: ", tg_train),
    ("tfidf and glove dev shape: ", tg_dev),
    ("tfidf and glove test shape: ", tg_test),
):
    print(label, (len(split), len(split[0])))


# To Save as CSV

In [None]:
import csv
def savePrediction(fileName, prediction):
    """Write test-set predictions to a two-column CSV file.

    Args:
        fileName: path of the CSV file to create (overwritten if it exists).
        prediction: sequence of predicted labels, one per entry of the
            module-level ``test_id`` and in the same order.

    Raises:
        AssertionError: if ``prediction`` and ``test_id`` differ in length.
    """
    assert len(test_id) == len(prediction)
    # newline="" is required by the csv module; without it every row is
    # followed by a blank line on platforms that translate "\n".
    with open(fileName, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["tweet_id", "sentiment"])
        # Pair ids and labels by position; indexing test_id with 0..n-1 would
        # be a label lookup on a pandas Series and fail for a non-default index.
        writer.writerows(zip(test_id, prediction))

# Baseline

In [None]:
from collections import Counter
from random import random
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

<b> Baseline 1: Weighted Random Baseline </b>

In [None]:
from random import Random

# Weighted random baseline: draw each dev label at random with the class
# frequencies observed in the training labels.
count_instances_train = len(y_train)
count_instances_dev = len(y_dev)
baseline_weight = Counter(y_train)

pos_prob = baseline_weight["pos"]/count_instances_train
neu_prob = baseline_weight["neu"]/count_instances_train
neg_prob = baseline_weight["neg"]/count_instances_train

# The three classes must cover every training label. Compare with a tolerance:
# a sum of three float ratios is not guaranteed to be exactly 1.
assert abs((pos_prob + neu_prob + neg_prob) - 1) < 1e-9

# Dedicated, seeded generator so the baseline scores are reproducible on every
# run without touching the global random state.
baseline_rng = Random(0)

random_baseline = []

for _ in range(count_instances_dev):
    prob = baseline_rng.random()
    # [0, pos) -> pos, [pos, pos + neu) -> neu, everything above -> neg
    if prob < pos_prob:
        random_baseline.append("pos")
    elif prob < (pos_prob + neu_prob):
        random_baseline.append("neu")
    else:
        random_baseline.append("neg")

# scikit-learn metrics take (y_true, y_pred). Macro precision and recall are
# not symmetric, so passing the predictions first swaps the two numbers.
random_baseline_acc = accuracy_score(y_dev, random_baseline)
random_baseline_pre = precision_score(y_dev, random_baseline, average='macro')
random_baseline_rec = recall_score(y_dev, random_baseline, average='macro')
print("Random Baseline accuracy: ", random_baseline_acc )
print("Random Baseline precision: ", random_baseline_pre )
print("Random Baseline recall: ", random_baseline_rec )

<b> Baseline 2: One-R Baseline </b>

In [None]:
# Baseline that always predicts the single most frequent training label.
one_r, _one_r_count = baseline_weight.most_common(1)[0]
print(baseline_weight)
print(one_r)

one_r_baseline = [one_r] * count_instances_dev

# Accuracy only counts matching positions, so the argument order does not
# affect this score.
one_r_baseline_acc = accuracy_score(one_r_baseline, y_dev)
print("One-R Baseline accuracy: ", one_r_baseline_acc)

# 1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import eli5

<b>1.1 Logistic Regression on Word Count only</b>

In [None]:
w_lr = LogisticRegression(solver="saga", max_iter=1000)

w_lr.fit(w_train,y_train)

In [None]:
w_lr_dev = w_lr.predict(w_dev)

In [None]:
w_lr_dev_acc = accuracy_score(w_lr_dev, y_dev)
w_lr_dev_pre = precision_score(w_lr_dev, y_dev, average="macro")
w_lr_dev_rec = recall_score(w_lr_dev, y_dev, average="macro")
print("Logistic Regression using Word Count accuracy: ", w_lr_dev_acc)
print("Logistic Regression using Word Count precision: ", w_lr_dev_pre)
print("Logistic Regression using Word Count recall: ", w_lr_dev_rec)

In [None]:
w_lr_test = w_lr.predict(w_test)
savePrediction("w_lr.csv", w_lr_test)

<b>1.2 Logistic Regression on TF-IDF only</b>

In [None]:
t_lr = LogisticRegression(solver="saga", max_iter=1000)

t_lr.fit(t_train,y_train)

In [None]:
t_lr_dev = t_lr.predict(t_dev)

In [None]:
t_lr_dev_acc = accuracy_score(t_lr_dev, y_dev)
t_lr_dev_pre = precision_score(t_lr_dev, y_dev, average="macro")
t_lr_dev_rec = recall_score(t_lr_dev, y_dev, average="macro")
print("Logistic Regression using TF-IDF accuracy: ", t_lr_dev_acc)
print("Logistic Regression using TF-IDF precision: ", t_lr_dev_pre)
print("Logistic Regression using TF-IDF recall: ", t_lr_dev_rec)

In [None]:
eli5.show_weights(estimator=t_lr, 
                  feature_names= list(vocab_list),
                 top=(50, 5))

In [None]:
t_lr_test = t_lr.predict(t_test)
savePrediction("t_lr.csv", t_lr_test)

<b> 1.3 Logistic Regression on Glove only </b>

In [None]:
g_lr = LogisticRegression(solver="saga", max_iter=1000)

g_lr.fit(g_train,y_train)

In [None]:
g_lr_dev = g_lr.predict(g_dev)

In [None]:
g_lr_dev_acc = accuracy_score(g_lr_dev, y_dev)
g_lr_dev_pre = precision_score(g_lr_dev, y_dev, average="macro")
g_lr_dev_rec = recall_score(g_lr_dev, y_dev, average="macro")
print("Logistic Regression using Glove accuracy: ", g_lr_dev_acc )
print("Logistic Regression using Glove precision: ", g_lr_dev_pre )
print("Logistic Regression using Glove recall: ", g_lr_dev_rec )

In [None]:
g_lr_test = g_lr.predict(g_test)
savePrediction("g_lr.csv", g_lr_test)

<b> 1.4 Logistic Regression on Word Count and Glove </b>

In [None]:
wg_lr = LogisticRegression(solver="saga", max_iter=1000)

wg_lr.fit(wg_train,y_train)

In [None]:
wg_lr_dev = wg_lr.predict(wg_dev)

In [None]:
wg_lr_dev_acc = accuracy_score(wg_lr_dev, y_dev)
wg_lr_dev_pre = precision_score(wg_lr_dev, y_dev, average="macro")
wg_lr_dev_rec = recall_score(wg_lr_dev, y_dev, average="macro")
print("Logistic Regression using TF-IDF and Glove accuracy: ", wg_lr_dev_acc)
print("Logistic Regression using TF-IDF and Glove precision: ", wg_lr_dev_pre)
print("Logistic Regression using TF-IDF and Glove recall: ", wg_lr_dev_rec )

In [None]:
wg_lr_test = wg_lr.predict(wg_test)
savePrediction("wg_lr.csv", wg_lr_test)

<b> 1.5 Logistic Regression on TF-IDF and Glove </b>

In [None]:
tg_lr = LogisticRegression(solver="saga", max_iter=1000)

tg_lr.fit(tg_train,y_train)

In [None]:
tg_lr_dev = tg_lr.predict(tg_dev)

In [None]:
tg_lr_dev_acc = accuracy_score(tg_lr_dev, y_dev)
tg_lr_dev_pre = precision_score(tg_lr_dev, y_dev, average="macro")
tg_lr_dev_rec = recall_score(tg_lr_dev, y_dev, average="macro")
print("Logistic Regression using TF-IDF and Glove accuracy: ", tg_lr_dev_acc)
print("Logistic Regression using TF-IDF and Glove precision: ", tg_lr_dev_pre)
print("Logistic Regression using TF-IDF and Glove recall: ", tg_lr_dev_rec)

In [None]:
tg_lr_test = tg_lr.predict(tg_test)
savePrediction("tg_lr.csv", tg_lr_test)

# 2. SVM

In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

<b> 2.1 SVM on Word Count only </b>

In [None]:
w_svm = LinearSVC(max_iter=1000)

w_svm.fit(w_train, y_train)

In [None]:
w_svm_dev = w_svm.predict(w_dev)

In [None]:
w_svm_dev_acc = accuracy_score(w_svm_dev, y_dev)
w_svm_dev_pre = precision_score(w_svm_dev, y_dev, average="macro")
w_svm_dev_rec = recall_score(w_svm_dev, y_dev, average="macro")

print("SVM using Word Count accuracy: ", w_svm_dev_acc )
print("SVM using Word Count precision: ", w_svm_dev_pre )
print("SVM using Word Count recall: ", w_svm_dev_rec )

In [None]:
w_svm_test = w_svm.predict(w_test)
savePrediction("w_svm.csv", w_svm_test)

<b> 2.2 SVM on TF-IDF only </b>

In [None]:
t_svm = LinearSVC(max_iter=1000)

t_svm.fit(t_train, y_train)

In [None]:
t_svm_dev = t_svm.predict(t_dev)

In [None]:
t_svm_dev_acc = accuracy_score(t_svm_dev, y_dev)
t_svm_dev_pre = precision_score(t_svm_dev, y_dev, average="macro")
t_svm_dev_rec = recall_score(t_svm_dev, y_dev, average="macro")
print("SVM using TF-IDF accuracy: ", t_svm_dev_acc)
print("SVM using TF-IDF precision: ", t_svm_dev_pre)
print("SVM using TF-IDF recall: ", t_svm_dev_rec )

In [None]:
t_svm_test = t_svm.predict(t_test)
savePrediction("t_svm.csv", t_svm_test)

<b> 2.3 SVM on Glove only </b>

In [None]:
g_svm = LinearSVC(max_iter=1000)

g_svm.fit(g_train, y_train)

In [None]:
g_svm_dev = g_svm.predict(g_dev)

In [None]:
g_svm_dev_acc = accuracy_score(g_svm_dev, y_dev)
g_svm_dev_pre = precision_score(g_svm_dev, y_dev, average="macro")
g_svm_dev_rec = recall_score(g_svm_dev, y_dev, average="macro")
print("SVM using Glove accuracy: ", g_svm_dev_acc)
print("SVM using Glove precision: ", g_svm_dev_pre)
print("SVM using Glove recall: ", g_svm_dev_rec)

In [None]:
g_svm_test = g_svm.predict(g_test)
savePrediction("g_svm.csv", g_svm_test)

<b> 2.4 SVM on Word Count and Glove </b>

In [None]:
wg_svm = LinearSVC(max_iter=1000)

wg_svm.fit(wg_train, y_train)

In [None]:
wg_svm_dev = wg_svm.predict(wg_dev)

In [None]:
wg_svm_dev_acc = accuracy_score(wg_svm_dev, y_dev)
wg_svm_dev_pre = precision_score(wg_svm_dev, y_dev, average="macro")
wg_svm_dev_rec = recall_score(wg_svm_dev, y_dev, average="macro")
print("SVM using Word Count and Glove accuracy: ", wg_svm_dev_acc)
print("SVM using Word Count and Glove precisiony: ", wg_svm_dev_pre)
print("SVM using Word Count and Glove recall: ", wg_svm_dev_rec)

In [None]:
wg_svm_test = wg_svm.predict(wg_test)
savePrediction("wg_svm.csv", wg_svm_test)

<b> 2.5 SVM on TF-IDF and Glove </b>

In [None]:
tg_svm = LinearSVC(max_iter=1000)

tg_svm.fit(tg_train, y_train)

In [None]:
tg_svm_dev = tg_svm.predict(tg_dev)

In [None]:
tg_svm_dev_acc = accuracy_score(tg_svm_dev, y_dev)
tg_svm_dev_pre = precision_score(tg_svm_dev, y_dev, average="macro")
tg_svm_dev_rec = recall_score(tg_svm_dev, y_dev, average="macro")
print("SVM using TF-IDF and Glove accuracy: ", tg_svm_dev_acc)
print("SVM using TF-IDF and Glove precision: ", tg_svm_dev_pre)
print("SVM using TF-IDF and Glove recall: ", tg_svm_dev_rec)

In [None]:
tg_svm_test = tg_svm.predict(tg_test)
savePrediction("tg_svm.csv", tg_svm_test)

# 3. Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

<b> 3.1.a Multinomial Naive Bayes on Word Count only </b>

In [None]:
w_mnb = MultinomialNB()

w_mnb.fit(w_train, y_train)

In [None]:
w_mnb_dev = w_mnb.predict(w_dev)

In [None]:
w_mnb_dev_acc = accuracy_score(w_mnb_dev, y_dev)
w_mnb_dev_pre = precision_score(w_mnb_dev, y_dev, average="macro")
w_mnb_dev_rec = recall_score(w_mnb_dev, y_dev, average="macro")
print("Multinomial Naive Bayes using Word Count accuracy: ", w_mnb_dev_acc)
print("Multinomial Naive Bayes using Word Count precision: ", w_mnb_dev_pre)
print("Multinomial Naive Bayes using Word Count recall: ", w_mnb_dev_rec)

In [None]:
w_mnb_test = w_mnb.predict(w_test)
savePrediction("w_mnb.csv", w_mnb_test)

<b> 3.1.b Gaussian Naive Bayes on Word Count only </b>

In [None]:
w_gnb = GaussianNB()

w_gnb.fit(w_train, y_train)

In [None]:
w_gnb_dev = w_gnb.predict(w_dev)

In [None]:
w_gnb_dev_acc = accuracy_score(w_gnb_dev, y_dev)
w_gnb_dev_pre = precision_score(w_gnb_dev, y_dev, average="macro")
w_gnb_dev_rec = recall_score(w_gnb_dev, y_dev, average="macro")
print("Gaussian Naive Bayes using Word Count accuracy: ", w_gnb_dev_acc)
print("Gaussian Naive Bayes using Word Count precision: ", w_gnb_dev_pre)
print("Gaussian Naive Bayes using Word Count recall: ", w_gnb_dev_rec)

<b> 3.2.a Multinomial Naive Bayes on TF-IDF only </b>

In [None]:
t_mnb = MultinomialNB()

t_mnb.fit(t_train, y_train)

In [None]:
t_mnb_dev = t_mnb.predict(t_dev)

In [None]:
t_mnb_dev_acc = accuracy_score(t_mnb_dev, y_dev)
t_mnb_dev_pre = precision_score(t_mnb_dev, y_dev, average="macro")
t_mnb_dev_rec = recall_score(t_mnb_dev, y_dev, average="macro")
print("Multinomial Naive Bayes using TF-IDF accuracy: ", t_mnb_dev_acc)
print("Multinomial Naive Bayes using TF-IDF precision: ", t_mnb_dev_pre)
print("Multinomial Naive Bayes using TF-IDF recall: ", t_mnb_dev_rec)

In [None]:
t_mnb_test = t_mnb.predict(t_test)
savePrediction("t_mnb.csv", t_mnb_test)

<b> 3.2.b. Gaussian Naive Bayes on TF-IDF only </b>

In [None]:
t_gnb = GaussianNB()

t_gnb.fit(t_train, y_train)

In [None]:
t_gnb_dev = t_gnb.predict(t_dev)

In [None]:
t_gnb_dev_acc = accuracy_score(t_gnb_dev, y_dev)
t_gnb_dev_pre = precision_score(t_gnb_dev, y_dev, average="macro")
t_gnb_dev_rec = recall_score(t_gnb_dev, y_dev, average="macro")
print("Gaussian Naive Bayes using TF-IDF accuracy: ", t_gnb_dev_acc)
print("Gaussian Naive Bayes using TF-IDF precision: ", t_gnb_dev_pre)
print("Gaussian Naive Bayes using TF-IDF recall: ", t_gnb_dev_rec)

In [None]:
t_gnb_test = t_gnb.predict(t_test)
savePrediction("t_gnb.csv", t_gnb_test)

<b> 3.3 Gaussian Naive Bayes on Glove only </b>

In [None]:
g_gnb = GaussianNB()

g_gnb.fit(g_train, y_train)

In [None]:
g_gnb_dev = g_gnb.predict(g_dev)

In [None]:
g_gnb_dev_acc = accuracy_score(g_gnb_dev, y_dev)
g_gnb_dev_pre = precision_score(g_gnb_dev, y_dev, average="macro")
g_gnb_dev_rec = recall_score(g_gnb_dev, y_dev, average="macro")
print("Gaussian Naive Bayes using Glove accuracy: ", g_gnb_dev_acc)
print("Gaussian Naive Bayes using Glove precision: ", g_gnb_dev_pre)
print("Gaussian Naive Bayes using Glove recall: ", g_gnb_dev_rec)

In [None]:
g_gnb_test = g_gnb.predict(g_test)
savePrediction("g_gnb.csv", g_gnb_test)

<b> 3.4 Gaussian Naive Bayes on Word Count and Glove </b>

In [None]:
wg_gnb = GaussianNB()

wg_gnb.fit(wg_train, y_train)

In [None]:
wg_gnb_dev = wg_gnb.predict(wg_dev)

In [None]:
wg_gnb_dev_acc = accuracy_score(wg_gnb_dev, y_dev)
wg_gnb_dev_pre = precision_score(wg_gnb_dev, y_dev, average="macro")
wg_gnb_dev_rec = recall_score(wg_gnb_dev, y_dev, average="macro")
print("Gaussian Naive Bayes using Word Count and Glove accuracy: ", wg_gnb_dev_acc)
print("Gaussian Naive Bayes using Word Count and Glove precision: ", wg_gnb_dev_pre)
print("Gaussian Naive Bayes using Word Count and Glove recall: ", wg_gnb_dev_rec)

In [None]:
wg_gnb_test = wg_gnb.predict(wg_test)
savePrediction("wg_gnb.csv", wg_gnb_test)

<b> 3.5 Gaussian Naive Bayes on TF-IDF and Glove </b>

In [None]:
tg_gnb = GaussianNB()

tg_gnb.fit(tg_train, y_train)

In [None]:
tg_gnb_dev = tg_gnb.predict(tg_dev)

In [None]:
tg_gnb_dev_acc = accuracy_score(tg_gnb_dev, y_dev)
tg_gnb_dev_pre = precision_score(tg_gnb_dev, y_dev, average="macro")
tg_gnb_dev_rec = recall_score(tg_gnb_dev, y_dev, average="macro")
print("Gaussian Naive Bayes using TF-IDF and Glove accuracy: ", tg_gnb_dev_acc)
print("Gaussian Naive Bayes using TF-IDF and Glove precision: ", tg_gnb_dev_pre)
print("Gaussian Naive Bayes using TF-IDF and Glove recall: ", tg_gnb_dev_rec)

In [None]:
tg_gnb_test = tg_gnb.predict(tg_test)
savePrediction("tg_gnb.csv", tg_gnb_test)

# Parameter Tuning

Given the limitation in computing power, hyperparameter tuning will only be completed on three of the best performing models.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

<b> SVM </b>

In [None]:
# Search space for the linear SVM: 3 values of C x 3 iteration caps, each
# candidate scored with 3-fold cross-validation.
param_grid = [
    {
        'C': [0.1, 1, 10],
        'max_iter': [500, 1000, 2500],
    },
]

svm = LinearSVC()

svm_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    cv=3,
    verbose=10,
    error_score='raise',  # fail loudly if any candidate fit errors
)

<b> Tuned SVM on TF-IDF and Glove </b>

In [None]:
tg_svm_search = svm_search.fit(tg_train,y_train)

In [None]:
print('------ SVM on TF-IDF and Glove ------')
print('Best Score: ', tg_svm_search.best_score_)
print('Best Hyperparameters: ', tg_svm_search.best_params_)

In [None]:
print(pd.DataFrame(tg_svm_search.cv_results_))

From the results above, the best score is obtained with C = 0.1 and max_iter = 500.

In [None]:
tg_svm_tuned = LinearSVC(C=0.1,max_iter = 500)
tg_svm_tuned.fit(tg_train,y_train)

In [None]:
tg_svm_tuned_dev = tg_svm_tuned.predict(tg_dev)

In [None]:
tg_svm_tuned_dev_acc = accuracy_score(tg_svm_tuned_dev, y_dev)
tg_svm_tuned_dev_pre = precision_score(tg_svm_tuned_dev, y_dev, average="macro")
tg_svm_tuned_dev_rec = recall_score(tg_svm_tuned_dev, y_dev, average="macro")
print("Tuned SVM using TF-IDF and Glove accuracy: ", tg_svm_tuned_dev_acc)
print("Tuned SVM using TF-IDF and Glove precision: ", tg_svm_tuned_dev_pre)
print("Tuned SVM using TF-IDF and Glove recall: ", tg_svm_tuned_dev_rec)

In [None]:
tg_svm_tuned_test = tg_svm_tuned.predict(tg_test)

savePrediction("tg_svm_tuned.csv",tg_svm_tuned_test)

<b> Tuned SVM on Word Count and Glove </b>

In [None]:
wg_svm_search = svm_search.fit(wg_train,y_train)

In [None]:
print('------ SVM on Word Count and Glove ------')
print('Best Score: ', wg_svm_search.best_score_)
print('Best Hyperparameters: ', wg_svm_search.best_params_)

In [None]:
print(pd.DataFrame(wg_svm_search.cv_results_))

From the results above, the best score is obtained with C = 0.1; max_iter = 500, 1000 and 2500 all produce the same score.

In [None]:
wg_svm_tuned = LinearSVC(C=0.1,max_iter = 1000)
wg_svm_tuned.fit(wg_train,y_train)

In [None]:
wg_svm_tuned_dev = wg_svm_tuned.predict(wg_dev)

In [None]:
wg_svm_tuned_dev_acc = accuracy_score(wg_svm_tuned_dev, y_dev)
wg_svm_tuned_dev_pre = precision_score(wg_svm_tuned_dev, y_dev, average="macro")
wg_svm_tuned_dev_rec = recall_score(wg_svm_tuned_dev, y_dev, average="macro")
print("Tuned SVM using Word Count and Glove accuracy: ", wg_svm_tuned_dev_acc)
print("Tuned SVM using Word Count and Glove precision: ", wg_svm_tuned_dev_pre)
print("Tuned SVM using Word Count and Glove recall: ", wg_svm_tuned_dev_rec)

In [None]:
wg_svm_tuned_test = wg_svm_tuned.predict(wg_test)

savePrediction("wg_svm_tuned.csv",wg_svm_tuned_test)

<b> Logistic Regression </b>

In [None]:
lr = LogisticRegression(solver="saga")

param_grid = [    
    {'C': [0.1, 1, 10],
    'max_iter' : [500,1000,2500],
    }
]

lr_search = GridSearchCV(lr, param_grid = param_grid, verbose=10, cv=3, error_score='raise')

wg_lr_search = lr_search.fit(wg_train,y_train)

In [None]:
print('------ Logistic Regression on Word Count and Glove ------')
print('Best Score: ', wg_lr_search.best_score_)
print('Best Hyperparameters: ', wg_lr_search.best_params_)
print("")

In [None]:
print(pd.DataFrame(wg_lr_search.cv_results_))

From the results above, the best score is obtained with C = 0.1 and max_iter = 2500.

In [None]:
wg_lr_tuned = LogisticRegression(solver="saga",C=0.1,max_iter = 2500)
wg_lr_tuned.fit(wg_train,y_train)

In [None]:
wg_lr_tuned_dev = wg_lr_tuned.predict(wg_dev)

In [None]:
wg_lr_tuned_dev_acc = accuracy_score(wg_lr_tuned_dev, y_dev)
wg_lr_tuned_dev_pre = precision_score(wg_lr_tuned_dev, y_dev, average="macro")
wg_lr_tuned_dev_rec = recall_score(wg_lr_tuned_dev, y_dev, average="macro")
print("Tuned Logistic Regression using Word Count and Glove accuracy: ", wg_lr_tuned_dev_acc)
print("Tuned Logistic Regression using Word Count and Glove precision: ", wg_lr_tuned_dev_pre)
print("Tuned Logistic Regression using Word Count and Glove recall: ", wg_lr_tuned_dev_rec)

In [None]:
wg_lr_tuned_test = wg_lr_tuned.predict(wg_test)

savePrediction("wg_lr_tuned.csv",wg_lr_tuned_test)