# Glove feature vectors

instead of training a model on one hot encoded feature vectors, we can encode the answers of the model in a feature vector using the GloVe dataset.

To start first load a glove model and store in the `../data/glove/...txt` file.

In [10]:
import pickle
import numpy as np
import pandas as pd
import csv
from tqdm import tqdm
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [2]:
with open("../data/glove/glove.42B.300d.txt", "rb") as f:
    words = pd.read_table(f, sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)

In [3]:
with open("../data/probed/train_filtered.pkl", "rb") as f:
    _, y_train = pickle.load(f)

with open("../data/probed/validation_filtered.pkl", "rb") as f:
    _, y_validation = pickle.load(f)

with open("../data/probed/train_raw_answers.pkl", "rb") as f:
    train_raw_answers = pickle.load(f)

with open("../data/probed/validation_raw_answers.pkl", "rb") as f:
    validation_raw_answers  = pickle.load(f)

In [4]:
def vec(w):
    return words.loc[w].values

In [5]:
def create_glove_vector(inp):
    result = []
    for row in tqdm(inp):
        vector = [] 
        for answer in row:
            answer_str = answer[1]
            try:
                vector.append(vec(answer_str))
            except KeyError:
                vector.append(np.zeros(300))

        result.append(np.concatenate(vector))
    return result

data = create_glove_vector(train_raw_answers)

100%|██████████| 9505/9505 [33:14<00:00,  4.77it/s]


In [6]:
data_vector = np.array(data)

In [7]:
classifier = MLPClassifier(learning_rate='adaptive', 
                           max_iter=10000, hidden_layer_sizes=(100, 30))
classifier.fit(data_vector, y_train)

MLPClassifier(hidden_layer_sizes=(100, 30), learning_rate='adaptive',
              max_iter=10000)

In [8]:
data_valid = create_glove_vector(validation_raw_answers)
pred = classifier.predict(data_valid)

100%|██████████| 2716/2716 [09:13<00:00,  4.90it/s]


In [9]:
print(classification_report(y_validation, pred, zero_division=0))

               precision    recall  f1-score   support

     anecdote       0.40      0.29      0.34       458
   assumption       0.75      0.89      0.81      1826
common-ground       0.00      0.00      0.00        66
        other       0.20      0.07      0.10        30
   statistics       0.33      0.24      0.28        66
    testimony       0.59      0.35      0.44       219
        title       0.63      0.33      0.44        51

     accuracy                           0.69      2716
    macro avg       0.41      0.31      0.34      2716
 weighted avg       0.64      0.69      0.65      2716



In [11]:
classifier_svc = SVC()
classifier_svc.fit(data_vector, y_train)
pred = classifier_svc.predict(data_valid)
print(classification_report(y_validation, pred, zero_division=0))

               precision    recall  f1-score   support

     anecdote       0.49      0.14      0.22       458
   assumption       0.71      0.96      0.82      1826
common-ground       0.00      0.00      0.00        66
        other       0.00      0.00      0.00        30
   statistics       0.67      0.03      0.06        66
    testimony       0.80      0.33      0.47       219
        title       1.00      0.33      0.50        51

     accuracy                           0.71      2716
    macro avg       0.52      0.26      0.29      2716
 weighted avg       0.66      0.71      0.64      2716

