# Pretrained Embedding

based on GloVe (https://nlp.stanford.edu/projects/glove/)

In [None]:
import numpy as np
import torch
import torch.nn as nn
import bcolz
import pickle
import os

# Directory prefix for the GloVe text file and the cached artifacts.
# Keep the trailing slash: file names are appended directly to this string.
glove_path = './glove/'

Load the dataset, create a dictionary that maps each word to its index and a list of the vector representations, and save them to disk for future use.
You need to download the dataset from Stanford first (link above).

In [None]:
# Build the on-disk cache once: the word list, the word -> row-index map and
# the (n_words, 50) vector matrix.
words_file = f'{glove_path}6B.50_words.pkl'
idx_file = f'{glove_path}6B.50_idx.pkl'
vectors_dir = f'{glove_path}6B.50.dat'

# Rebuild whenever ANY artifact is missing; all three are needed to load the
# embeddings again later.
if not (os.path.isfile(words_file)
        and os.path.isfile(idx_file)
        and os.path.exists(vectors_dir)):
    words = []
    word2idx = {}
    rows = []

    # Each line is expected to hold a word followed by its vector components.
    with open(f'{glove_path}glove.6B.50d.txt', 'rb') as f:
        for raw_line in f:
            line = raw_line.decode().split()
            word = line[0]
            word2idx[word] = len(words)
            words.append(word)
            # np.float was removed in NumPy 1.24; np.float64 is the same dtype.
            rows.append(np.array(line[1:]).astype(np.float64))

    # Derive the row count from the data instead of hard-coding 400000.
    vectors = bcolz.carray(np.array(rows).reshape((len(words), 50)),
                           rootdir=vectors_dir, mode='w')
    vectors.flush()

    # Context managers make sure the pickle files are flushed and closed.
    with open(words_file, 'wb') as f:
        pickle.dump(words, f)
    with open(idx_file, 'wb') as f:
        pickle.dump(word2idx, f)

Load the representations and create the `glove` dictionary that maps words to their representations.

In [None]:
# Load the cached artifacts from disk. Slicing with [:] pulls the stored
# vectors out of the bcolz container so they can be indexed row by row below.
vectors = bcolz.open(f'{glove_path}6B.50.dat')[:]

# NOTE: unpickling can execute arbitrary code; only load cache files that you
# generated yourself.
with open(f'{glove_path}6B.50_words.pkl', 'rb') as f:
    words = pickle.load(f)
with open(f'{glove_path}6B.50_idx.pkl', 'rb') as f:
    word2idx = pickle.load(f)

# Map every word to its embedding vector via its row index.
glove = {w: vectors[word2idx[w]] for w in words}

In [None]:
# Sanity check: show the vector stored for one sample word.
print(glove['politician'])

In [None]:
def glove_weights_matrix(target_vocab, embeddings=None, emb_dim=50):
    """Build an embedding weight matrix for a vocabulary.

    Row ``i`` of the result holds the vector for ``target_vocab[i]``. Words
    missing from the embedding lookup get a random vector drawn from a normal
    distribution with standard deviation 0.6. The number of such words is
    printed.

    Args:
        target_vocab: List of words; its order defines the row order.
        embeddings: Mapping from word to vector of length ``emb_dim``.
            Defaults to the module-level ``glove`` dictionary.
        emb_dim: Length of each embedding vector. Defaults to 50.

    Returns:
        A NumPy array of shape ``(len(target_vocab), emb_dim)``.
    """
    if embeddings is None:
        # Resolved at call time so the function can be defined before the
        # embeddings are loaded.
        embeddings = glove

    weights_matrix = np.zeros((len(target_vocab), emb_dim))
    unknown_words = 0

    for i, word in enumerate(target_vocab):
        try:
            vector = embeddings[word]
        except KeyError:
            # Out-of-vocabulary word: fall back to a random initialisation.
            vector = np.random.normal(scale=0.6, size=(emb_dim, ))
            unknown_words += 1
        weights_matrix[i] = vector

    print(f'{unknown_words} unknown words where added as random vector')
    return weights_matrix