# HW 1 entry
by Lara

In [1]:
import numpy as np
import pandas as pd
import scipy.spatial.distance as d

import os
DATA_HOME = os.path.join('data', 'wordrelatedness')

import vsm

In [2]:
# only needed to get the vocabulary!
gigawin5_df = pd.read_csv("data/vsmdata/giga_window5-scaled.csv.gz", index_col=0)

In [4]:
dev_df = pd.read_csv(
    os.path.join(DATA_HOME, "cs224u-wordrelatedness-dev.csv"))

In [10]:
# and load this one to make sure we have the entire vocabulary
test_df = pd.read_csv(
    os.path.join(DATA_HOME, "cs224u-wordrelatedness-test-unlabeled.csv"))

dev_vocab = list(set(dev_df.word1.values) | set(dev_df.word2.values))
test_vocab = list(set(test_df.word1.values) | set(test_df.word2.values))

In [6]:
# I use GloVe embeddings trained on 840B tokens (bigger training set that the GloVe embeddings included in the course data)
# there are words missing but none are in the dev or test set.

from torch_autoencoder import TorchAutoencoder
import torch.nn as nn

embeddings_dict = {}
# downloaded from https://nlp.stanford.edu/data/glove.840B.300d.zip
with open("data/glove.840B.300d.txt", 'r') as f:
    for line in f:
        values = line.split()
        word = ''.join(values[:-300])
        vector = np.asarray(values[-300:], "float32")
        embeddings_dict[word] = vector
glove_df = pd.DataFrame(embeddings_dict).transpose()

In [17]:
# must drop some words from our vocabulary because GloVe didn't tokenize the same way and doesn't have them
keep = gigawin5_df.drop(['):', ');', ':(', ':/', "america\'s", "aren\'t", "children\'s", "city\'s", "company\'s", "couldn\'t", 
                    "family\'s", "friend\'s", "hadn\'t",  '..', ':D',
                    "hasn\'t", "haven\'t", "he\'d", "he\'ll", "he\'s", "here\'s", "husband\'s", "i\'d", "i\'ll", "i\'m", 
                    "i\'ve", "isn\'t", "it\'ll", "let\'s", "mcdonald\'s", "men\'s", "mother\'s", "people\'s", "she\'s", 
                    "shouldn\'t", "son\'s", "sunday\'s", "there\'s", "they\'d", "they\'ll", "they\'re", "they\'ve", "today\'s", 
                    "wasn\'t", "we\'d", "we\'ll", "we\'re", "we\'ve", "weren\'t", "what\'s", "who\'s", "wife\'s", "women\'s", 
                    "won\'t", "world\'s", "would\'ve", "wouldn\'t", "year\'s", "you\'d", "you\'ve",
                     "can\'t", "didn\'t", "doesn\'t", "don\'t", "it\'s", "that\'s", "you\'ll", "you\'re"]).index

True True


In [21]:
# to regain most of the words that weren't in GloVe in our vocabulary:

mapping = {#'):', ');', ':(', ':/', 
           "aren\'t": 'arent', "couldn\'t": 'couldnt', "hadn\'t": 'hadnt',  "can\'t": 'cannot', 
                     #'..', ':D',
           "hasn\'t": 'hasnt', "haven\'t": 'havent', 
           "isn\'t": 'isnt', 
           "shouldn\'t": 'shouldnt', "they\'d": 'theyd', "they\'ll": 'theyll', "they\'re": 'theyre', "they\'ve": 'theyve', 
           "wasn\'t": 'wasnt',  
           "weren\'t": 'werent', 
           "would\'ve": 'wouldve', 
           "wouldn\'t": 'wouldnt', 
           "you\'d": 'youd', "you\'ve": 'youve',
           "didn\'t": 'didnt', "doesn\'t": 'doesnt', 
           "you\'ll": 'youll', "you\'re": 'youre'
}

combine_possessive = ["america\'s", "children\'s", "city\'s", "company\'s", "family\'s", "friend\'s",
                 "here\'s", "husband\'s", "mcdonald\'s", "men\'s", "mother\'s", "people\'s", "today\'s", 
                "son\'s", "sunday\'s", "wife\'s", "women\'s", "world\'s", "year\'s"]

combine_is = ["he\'s",  "it\'s", "she\'s",  "there\'s", "what\'s", "who\'s",  "that\'s"]

combine_generic = {"he\'d": ['he', 'would'], "he\'ll": ['he', 'will'],   "i\'d": ['i', 'would'], 
                   "i\'ll": ['i', 'will'], "i\'m": ['i', 'am'], "i\'ve": ['i', 'have'], 
                   "it\'ll": ['it', 'will'], "let\'s": ['let', 'us'], 
                   "we\'d": ['we', 'would'], "we\'ll": ['we', 'will'], "we\'re": ['we', 'are'],
                   "we\'ve": ['we', 'have'], "won\'t": ['will', 'not'], "don\'t": ['do', 'not']
}
emb_mapped = glove_df.loc[mapping.values()]
emb_mapped.index = mapping.keys()

emb_poss = glove_df.loc[[c.split("\'s")[0] for c in combine_possessive]] + glove_df.loc["'s"].values
emb_poss.index = combine_possessive

emb_is = glove_df.loc[[c.split("\'s")[0] for c in combine_is]] + glove_df.loc["is"].values
emb_is.index = combine_is

emb_comb = [glove_df.loc[word[0]] + glove_df.loc[word[1]] for word in combine_generic.values()]
emb_comb = pd.DataFrame(data=emb_comb, index=combine_generic.keys())

emb = pd.concat([glove_df.loc[keep], emb_mapped, emb_poss, emb_is, emb_comb], axis=0)

In [20]:
np.isin(dev_vocab, emb.index).all(), np.isin(test_vocab, emb.index).all()

(True, True)

In [22]:
# autoencode the dimension down to 250

x = emb.values
x = x/x.std(axis=0)
ae = TorchAutoencoder(hidden_dim=250, max_iter=10000, 
                      hidden_activation=nn.Sigmoid()).fit(x)
ae_df = pd.DataFrame(ae, index=emb.index)

Finished epoch 1 of 10000; error is 6.241227388381958

<generator object Module.parameters at 0x175ae2d60>


Stopping after epoch 1053. Training loss did not improve more than tol=1e-05. Final error is 0.16253660805523396.

In [23]:
# and evaluate:
df, rho = vsm.word_relatedness_evaluation(dev_df, ae_df, distfunc=d.correlation)
print(rho)

0.7736322316655473


In [19]:
# in hindsight, the lost words were never missed, and it's worth seeing if the model does better still without them
print(np.isin(dev_vocab, keep).all(), np.isin(test_vocab, keep).all())
emb = glove_df.loc[keep]

In [24]:
x = emb.values
x = x/x.std(axis=0)
ae = TorchAutoencoder(hidden_dim=250, max_iter=10000, 
                      hidden_activation=nn.Sigmoid()).fit(x)
ae_df = pd.DataFrame(ae, index=emb.index)

Finished epoch 1 of 10000; error is 6.234529912471771

<generator object Module.parameters at 0x152e30f90>


Stopping after epoch 1069. Training loss did not improve more than tol=1e-05. Final error is 0.16211374662816525.

In [25]:
df, rho = vsm.word_relatedness_evaluation(dev_df, ae_df, distfunc=d.correlation)
print(rho)

0.7739386387717364


In [None]:
# more or less the same, phew