# Setup

In [None]:
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
import csv
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from statistics import mean

# Utils

In [None]:
def get_word_embedding(w):
    """Return the vector stored for word `w` in the module-level `keyed_vector`.

    Words that are missing from the vocabulary (lookup raises KeyError)
    yield None instead of an exception.
    """
    try:
        return keyed_vector[w]
    except KeyError:
        return None

In [None]:
def get_sentence_embedding(word_embeddings):
    """Average the word vectors of one sentence into a single sentence vector.

    Args:
        word_embeddings: iterable of word vectors; ``None`` entries (words
            without an embedding) are ignored.

    Returns:
        The element-wise mean of the non-``None`` vectors, or ``None`` when
        there is no vector left to average (empty input or only ``None``s).
    """
    filtered = [arr for arr in word_embeddings if arr is not None]
    if not filtered:
        # The callers in this file drop unusable sentences with
        # `x is not None`; the former scalar -np.inf sentinel passed that
        # filter and ended up mixed in with the real feature vectors.
        return None
    return np.mean(filtered, axis=0)

In [None]:
def get_embeddings(x):
    """Split `x` on whitespace and look up each token with get_word_embedding.

    Returns a list with one entry per token, in order; an entry is None
    when get_word_embedding found no vector for that token.
    """
    return [get_word_embedding(word) for word in x.split()]

# Dataset import

In [None]:
# Load the three dataset splits (train / validation / test) as DataFrames,
# in that order.
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../dataset/prep_train.csv",
        "../dataset/prep_valid.csv",
        "../dataset/prep_test.csv",
    )
)

# Unigrams

In [None]:
# Word2Vec is trained on token lists, so split every training statement on
# whitespace first.
unigrams_corpus = train.STATEMENT.apply(lambda x: x.split()).tolist()
# Train 300-dimensional embeddings on the training statements; min_count=1
# keeps every token that occurs at least once.
# Only the trained vectors (`.wv`, a KeyedVectors instance) are kept: with the
# gensim 4 API used here (`vector_size=`) the Word2Vec model itself cannot be
# indexed by word, while get_word_embedding does `keyed_vector[w]`. This also
# matches the pre-trained sections, which bind `keyed_vector` to KeyedVectors.
keyed_vector = Word2Vec(unigrams_corpus, min_count=1, workers=4, vector_size=300).wv

In [None]:
# Represent every statement by one vector: the average of the embeddings of
# its words under the current `keyed_vector` (the unigram Word2Vec model).
X_train = [get_sentence_embedding(x) for x in train.STATEMENT.apply(get_embeddings)]
X_val = [get_sentence_embedding(x) for x in val.STATEMENT.apply(get_embeddings)]
Y_train = train.LABEL
Y_val = val.LABEL
# Keep only rows whose sentence embedding is an actual vector, dropping the
# matching label as well. A statement without any embeddable word yields a
# placeholder (None or a scalar such as -np.inf) rather than an array;
# `is not None` alone lets a scalar placeholder through, and one such row
# makes the feature matrix ragged.
X_train, Y_train = zip(*[(x, y) for x, y in zip(X_train, Y_train) if isinstance(x, np.ndarray)])
X_val, Y_val = zip(*[(x, y) for x, y in zip(X_val, Y_val) if isinstance(x, np.ndarray)])

## SVM

In [None]:
# SVM sweep settings: each kernel listed here is fitted and scored
# `n_iterations` times by the evaluation cell below.
n_iterations = 5
kernel = [
    "linear",
    "poly",
    "rbf",
    "sigmoid",
]

In [None]:
# SVM sweep on the unigram embeddings: for every kernel, fit a
# StandardScaler + SVC pipeline `n_iterations` times on the training vectors,
# score it on the validation vectors, then print and store the mean score as
# one "kernel, score" CSV row.
with open('unigrams_svm.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for k in kernel:
        accs = []
        for _ in range(n_iterations):
            # fit() returns the fitted pipeline, so build, fit and bind in one step.
            clf = make_pipeline(StandardScaler(), SVC(kernel=k)).fit(X_train, Y_train)
            accs.append(clf.score(X_val, Y_val))
        acc = mean(accs)
        print(f"{k}, {acc}")
        writer.writerow([k, acc])

## Random Forest

In [None]:
# Random-forest sweep settings: forest sizes 50, 100, ..., 250, each fitted
# and scored `n_iterations` times by the evaluation cell below.
n_iterations = 5
n_estimators = list(range(50, 251, 50))

In [None]:
# Random-forest sweep on the unigram embeddings: for every forest size, fit
# a new RandomForestClassifier `n_iterations` times, score each fit on the
# validation vectors, then print and store the mean score as one
# "n_estimators, score" CSV row.
with open('unigrams_rf.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for n in n_estimators:
        accs = []
        for _ in range(n_iterations):
            # fit() returns the fitted estimator, so build, fit and bind in one step.
            clf = RandomForestClassifier(max_depth=None, n_estimators=n).fit(X_train, Y_train)
            accs.append(clf.score(X_val, Y_val))
        acc = mean(accs)
        print(f"{n}, {acc}")
        writer.writerow([n, acc])

# Google News

In [None]:
# Rebind the embedding lookup used by get_word_embedding to the pre-trained
# Google News vectors, read from a binary word2vec-format file.
keyed_vector = KeyedVectors.load_word2vec_format(
    'PreTrainedCorpus/GoogleNews/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [None]:
# Represent every statement by one vector: the average of the embeddings of
# its words under the current `keyed_vector` (the Google News vectors).
X_train = [get_sentence_embedding(x) for x in train.STATEMENT.apply(get_embeddings)]
X_val = [get_sentence_embedding(x) for x in val.STATEMENT.apply(get_embeddings)]
Y_train = train.LABEL
Y_val = val.LABEL
# Keep only rows whose sentence embedding is an actual vector, dropping the
# matching label as well. A statement without any embeddable word yields a
# placeholder (None or a scalar such as -np.inf) rather than an array;
# `is not None` alone lets a scalar placeholder through, and one such row
# makes the feature matrix ragged.
X_train, Y_train = zip(*[(x, y) for x, y in zip(X_train, Y_train) if isinstance(x, np.ndarray)])
X_val, Y_val = zip(*[(x, y) for x, y in zip(X_val, Y_val) if isinstance(x, np.ndarray)])

## SVM

In [None]:
# SVM sweep settings: each kernel listed here is fitted and scored
# `n_iterations` times by the evaluation cell below.
n_iterations = 5
kernel = [
    "linear",
    "poly",
    "rbf",
    "sigmoid",
]

In [None]:
# SVM sweep on the Google News embeddings: for every kernel, fit a
# StandardScaler + SVC pipeline `n_iterations` times on the training vectors,
# score it on the validation vectors, then print and store the mean score as
# one "kernel, score" CSV row.
with open('svm.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for k in kernel:
        accs = []
        for _ in range(n_iterations):
            # fit() returns the fitted pipeline, so build, fit and bind in one step.
            clf = make_pipeline(StandardScaler(), SVC(kernel=k)).fit(X_train, Y_train)
            accs.append(clf.score(X_val, Y_val))
        acc = mean(accs)
        print(f"{k}, {acc}")
        writer.writerow([k, acc])

## Random Forest

In [None]:
# Random-forest sweep settings: each forest size listed here is fitted and
# scored `n_iterations` times by the evaluation cell below.
n_iterations = 5
n_estimators = [
    250,
    300,
    400,
]

In [None]:
# Random-forest sweep on the Google News embeddings: for every forest size,
# fit a new RandomForestClassifier `n_iterations` times, score each fit on
# the validation vectors, then print and store the mean score as one
# "n_estimators, score" CSV row.
with open('rf2.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for n in n_estimators:
        accs = []
        for _ in range(n_iterations):
            # fit() returns the fitted estimator, so build, fit and bind in one step.
            clf = RandomForestClassifier(max_depth=None, n_estimators=n).fit(X_train, Y_train)
            accs.append(clf.score(X_val, Y_val))
        acc = mean(accs)
        print(f"{n}, {acc}")
        writer.writerow([n, acc])

# GloVe

In [None]:
# Rebind the embedding lookup used by get_word_embedding to the pre-trained
# GloVe vectors. The file is plain text and has no word2vec header line,
# hence binary=False and no_header=True.
keyed_vector = KeyedVectors.load_word2vec_format(
    'PreTrainedCorpus/Glove/glove.840B.300d.txt',
    binary=False,
    no_header=True,
)

In [None]:
# Represent every statement by one vector: the average of the embeddings of
# its words under the current `keyed_vector` (the GloVe vectors).
X_train = [get_sentence_embedding(x) for x in train.STATEMENT.apply(get_embeddings)]
X_val = [get_sentence_embedding(x) for x in val.STATEMENT.apply(get_embeddings)]
Y_train = train.LABEL
Y_val = val.LABEL
# Keep only rows whose sentence embedding is an actual vector, dropping the
# matching label as well. A statement without any embeddable word yields a
# placeholder (None or a scalar such as -np.inf) rather than an array;
# `is not None` alone lets a scalar placeholder through, and one such row
# makes the feature matrix ragged.
X_train, Y_train = zip(*[(x, y) for x, y in zip(X_train, Y_train) if isinstance(x, np.ndarray)])
X_val, Y_val = zip(*[(x, y) for x, y in zip(X_val, Y_val) if isinstance(x, np.ndarray)])

## SVM

In [None]:
# SVM sweep settings: each kernel listed here is fitted and scored
# `n_iterations` times by the evaluation cell below.
n_iterations = 5
kernel = [
    "linear",
    "poly",
    "rbf",
    "sigmoid",
]

In [None]:
# SVM sweep on the GloVe embeddings: for every kernel, fit a
# StandardScaler + SVC pipeline `n_iterations` times on the training vectors,
# score it on the validation vectors, then print and store the mean score as
# one "kernel, score" CSV row.
with open('glove_svm.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for k in kernel:
        accs = []
        for _ in range(n_iterations):
            # fit() returns the fitted pipeline, so build, fit and bind in one step.
            clf = make_pipeline(StandardScaler(), SVC(kernel=k)).fit(X_train, Y_train)
            accs.append(clf.score(X_val, Y_val))
        acc = mean(accs)
        print(f"{k}, {acc}")
        writer.writerow([k, acc])

## Random Forest

In [None]:
# Random-forest sweep settings: forest sizes 50, 100, ..., 250, each fitted
# and scored `n_iterations` times by the evaluation cell below.
n_iterations = 5
n_estimators = list(range(50, 251, 50))

In [None]:
# Random-forest sweep on the GloVe embeddings: for every forest size, fit a
# new RandomForestClassifier `n_iterations` times, score each fit on the
# validation vectors, then print and store the mean score as one
# "n_estimators, score" CSV row.
with open('glove_rf.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for n in n_estimators:
        accs = []
        for _ in range(n_iterations):
            # fit() returns the fitted estimator, so build, fit and bind in one step.
            clf = RandomForestClassifier(max_depth=None, n_estimators=n).fit(X_train, Y_train)
            accs.append(clf.score(X_val, Y_val))
        acc = mean(accs)
        print(f"{n}, {acc}")
        writer.writerow([n, acc])