# Sec 2: Building GloVe

## 2.1 Setup

We're going to run GloVe on the text of the Stanford Sentiment Treebank (SST) training set. Usually these methods are run on extremely large corpora, but we're using this here to make sure that you can train a reasonable model without waiting for hours or days. 

First, let's load the data as before. For our purposes, we won't need the labels or any of the test and dev data.

In [1]:
# Directory holding the SST tree files; train.txt is read from here below.
sst_home = '../data/trees'

import re

def load_sst_data(path):
    """Read an SST tree file and return its sentences as plain text.

    Every line of the file is treated as one parenthesised tree. The
    bracket/label markup is removed with a regex, and the first character
    of what remains (a leftover leading space for well-formed lines) is
    dropped.

    Args:
        path: Location of the tree file to read.

    Returns:
        A list with one ``{'text': ...}`` dict per line of the file, in
        file order.
    """
    with open(path) as f:
        stripped_lines = (re.sub(r'\s*(\(\d)|(\))\s*', '', raw) for raw in f)
        # Materialise inside the `with` so the file is still open while the
        # generator above is being consumed.
        return [{'text': stripped[1:]} for stripped in stripped_lines]
     
# Only the training split is loaded; each entry is a dict with a 'text' key.
training_set = load_sst_data(sst_home + '/train.txt')

For speed, we're only using the 250 most common words. Extract co-occurrence counts from the corpus:

In [2]:
import collections
import numpy as np

# Number of tokens on each side of a target word that count as its context
# (default for extract_cooccurrences' amount_of_context).
context_window = 4
# Vocabulary size: only the top_k most frequent words are kept.
top_k = 250

def tokenize(string):
    """Lowercase ``string`` and split it into tokens on runs of whitespace."""
    return string.lower().split()

# Count every token of the training text, keep the top_k most frequent words
# as the vocabulary, and build lookup tables in both directions.
word_counter = collections.Counter()
for example in training_set:
    word_counter.update(tokenize(example['text']))
vocabulary = [word for word, _ in word_counter.most_common(top_k)]
index_to_word_map = dict(enumerate(vocabulary))
word_to_index_map = {word: index for index, word in index_to_word_map.items()}

def extract_cooccurrences(dataset, word_map, amount_of_context=context_window):
    """Count windowed co-occurrences between vocabulary words.

    Args:
        dataset: Iterable of examples, each a dict with a ``'text'`` string.
        word_map: The vocabulary. Either a dict mapping each word to its
            row/column index (indices must lie in ``range(len(word_map))``),
            or a sequence of unique words whose positions are used as the
            indices.
        amount_of_context: Number of tokens on each side of the target
            that count as context.

    Returns:
        A ``(cooccurrences, nonzero_pairs)`` tuple: a square float array of
        counts indexed ``[target_index][context_index]``, and a list of the
        ``(target_index, context_index)`` pairs with a nonzero count, in no
        particular order.
    """
    # Use the vocabulary that was passed in instead of module-level globals,
    # so the function works for any vocabulary the caller supplies.
    if isinstance(word_map, dict):
        word_index = word_map
    else:
        word_index = {word: index for index, word in enumerate(word_map)}
    num_words = len(word_map)

    cooccurrences = np.zeros((num_words, num_words))
    nonzero_pairs = set()
    for example in dataset:
        # Out-of-vocabulary tokens become None: they still occupy a slot in
        # the window (positions are not compacted) but are never counted.
        indices = [word_index.get(word) for word in tokenize(example['text'])]
        for position, target in enumerate(indices):
            if target is None:
                continue
            start = max(0, position - amount_of_context)
            stop = min(len(indices), position + amount_of_context + 1)
            for context_position in range(start, stop):
                context = indices[context_position]
                # A word is not its own context.
                if context_position == position or context is None:
                    continue
                cooccurrences[target, context] += 1.0
                nonzero_pairs.add((target, context))
    return cooccurrences, list(nonzero_pairs)
                
# Count co-occurrences over the training set; nonzero_pairs lists the
# (target, context) index pairs that were observed at least once.
cooccurrences, nonzero_pairs = extract_cooccurrences(training_set, vocabulary)

### Batchify data

In [None]:
import random

def batch_iter(nonzero_pairs, cooccurrences, batch_size):
    """Yield shuffled mini-batches of co-occurrence entries, forever.

    The pairs are visited in a random order. Once fewer than ``batch_size``
    unvisited pairs remain, the order is reshuffled and a new pass begins,
    so the trailing partial batch of a pass is never emitted.

    Args:
        nonzero_pairs: Sequence of ``(i, j)`` index pairs.
        cooccurrences: Container indexable by an ``(i, j)`` pair.
        batch_size: Number of pairs per batch.

    Yields:
        ``[counts, word_i, word_j]``: three parallel lists holding, for each
        pair in the batch, its co-occurrence entry, ``i`` and ``j``.
    """
    total = len(nonzero_pairs)
    permutation = list(range(total))
    random.shuffle(permutation)

    offset = 0
    while True:
        if offset > total - batch_size:
            # Not enough pairs left for a full batch: start another epoch.
            offset = 0
            random.shuffle(permutation)
        chosen = [nonzero_pairs[p] for p in permutation[offset:offset + batch_size]]
        counts = [cooccurrences[pair] for pair in chosen]
        word_i = [pair[0] for pair in chosen]
        word_j = [pair[1] for pair in chosen]
        yield [counts, word_i, word_j]
        offset += batch_size
        

## 2.2 Evaluation metric

To be frank, a GloVe model trained on such a small dataset and vocabulary won't be spectacular, so we won't bother with a full-fledged similarity or analogy evaluation. Instead, we'll use the simple scoring function below, which grades the model on how well it captures ten easy/simple similarity comparisons. The function returns a score between 0 and 10. Random embeddings can be expected to get a score of 5.

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

def similarity(model, word_one, word_two):
    """Return the cosine similarity between the embeddings of two words.

    Args:
        model: Object exposing ``get_embeddings(index)``; the value it
            returns must support ``reshape(1, -1)``.
        word_one: First word; must be a key of ``word_to_index_map``.
        word_two: Second word; must be a key of ``word_to_index_map``.

    Returns:
        The similarity as a Python float.

    Raises:
        KeyError: If either word is not in ``word_to_index_map``.
    """
    vec_one = model.get_embeddings(word_to_index_map[word_one]).reshape(1, -1)
    vec_two = model.get_embeddings(word_to_index_map[word_two]).reshape(1, -1)
    # cosine_similarity returns a 1x1 matrix. Pick the single element
    # explicitly: calling float() on an array with ndim > 0 is deprecated
    # in recent NumPy releases.
    return float(cosine_similarity(vec_one, vec_two)[0, 0])

def score(model):
    """Grade ``model`` on ten fixed word-similarity comparisons.

    Each comparison names an anchor word, a word expected to be close to it
    and a word expected to be farther away. A point is earned when the
    anchor is strictly more similar to the former than to the latter.

    Args:
        model: The model handed to ``similarity`` for every comparison.

    Returns:
        The number of comparisons passed, an integer from 0 to 10.
    """
    # (anchor, expected-closer word, expected-farther word)
    comparisons = (
        ('a', 'an', 'documentary'),
        ('in', 'of', 'picture'),
        ('action', 'thriller', 'end'),
        ('films', 'movies', 'good'),
        ('film', 'movie', 'movies'),
        ('script', 'plot', 'dialogue'),
        ('character', 'human', 'young'),
        ('``', "''", 'quite'),
        ('funny', 'entertaining', 'while'),
        ('good', 'great', 'minutes'),
    )
    points = 0
    for anchor, near, far in comparisons:
        if similarity(model, anchor, near) > similarity(model, anchor, far):
            points += 1
    return points

Once you've built and trained the model, you can evaluate it by calling `score(model)`.

## 2.3 Implement and train

There's some starter code below for training a PyTorch model. **Fill it out to create an implementation of GloVe, then train it on the SST training set.**
Try not to modify any of the starter code.

### 2.3.1 Model

In [11]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

class Glove(nn.Module):
    """Starter skeleton for a GloVe embedding model.

    The sections marked "Your code goes here" / "And here" are left for the
    exercise. Only the scaffolding around them is provided.
    """

    def __init__(self, embedding_dim, vocab_size, batch_size):
        """Set up the model.

        Args:
            embedding_dim: Size of each embedding vector (not used by the
                skeleton itself).
            vocab_size: Number of words in the vocabulary (not used by the
                skeleton itself).
            batch_size: Training batch size (not used by the skeleton
                itself).
        """
        super(Glove, self).__init__()
        # Cache for the combined embeddings; stays None until
        # add_embeddings() fills it in.
        self.word_embeddings = None

        # Your code goes here.

    def forward(self):
        """Compute the model output for a batch (to be implemented).

        Extend the signature with whatever inputs the implementation needs.
        """
        # And here.
        return

    def init_weights(self):
        """Initialise the model parameters (to be implemented)."""
        # And here.

    def add_embeddings(self):
        """Combine the two embedding sets and cache the result.

        The implementation should store W_emb = W + W^tilde in
        ``self.word_embeddings``.

        Returns:
            The value of ``self.word_embeddings``.
        """
        # And here.
        return self.word_embeddings

    def get_embeddings(self, index):
        """Return row ``index`` of the combined embeddings.

        The combined embeddings are computed via ``add_embeddings()`` first
        if they have not been cached yet.
        """
        if self.word_embeddings is None:
            self.add_embeddings()
        return self.word_embeddings[index, :]

### 2.3.2 Training Loop

In [12]:
def training_loop(batch_size, num_epochs, model, optim, data_iter, xmax, alpha):
    """Train ``model`` on batches from ``data_iter``, reporting every 25 epochs.

    An "epoch" here is ``len(training_set) // batch_size`` steps (at least
    one). The loss computation itself is left for the exercise.

    Args:
        batch_size: Batch size; used here only to derive the number of steps
            per epoch.
        num_epochs: Training continues while the epoch counter is
            ``<= num_epochs``.
        model: Model to train; must provide ``train()``, ``zero_grad()`` and
            ``add_embeddings()``, and be accepted by ``score``.
        optim: Optimizer whose ``step()`` is called after every backward
            pass.
        data_iter: Iterator yielding ``[counts, words, co_words]`` batches.
        xmax: Not used by the starter code; available to the loss code.
        alpha: Not used by the starter code; available to the loss code.
    """
    step = 0
    epoch = 0
    losses = []
    # At least one step per epoch, so the modulo below cannot divide by zero
    # when batch_size exceeds the number of training examples.
    total_batches = max(1, len(training_set) // batch_size)
    while epoch <= num_epochs:
        model.train()
        counts, words, co_words = next(data_iter)
        words_var = torch.LongTensor(words)
        co_words_var = torch.LongTensor(co_words)

        model.zero_grad()

        # Your code goes here.
        # It must bind `loss` to a scalar tensor for the lines below.

        # .item() extracts the Python number from a 0-dim tensor; indexing
        # it with [0] is rejected by current PyTorch.
        losses.append(loss.item())
        loss.backward()
        # Step the optimizer that was passed in, not a module-level global.
        optim.step()

        if step % total_batches == 0:
            epoch += 1
            if epoch % 25 == 0:
                # Refresh the model's combined embeddings before scoring.
                model.add_embeddings()
                # np.mean already averages over every recorded step, so it
                # is reported as-is.
                print("Epoch:", epoch, "Avg Loss:", np.mean(losses), "Score:", score(model))

        step += 1

### 2.3.3 Train a working model


You should use the following commands to train the model for at least 2000 steps. If your model works, it will converge to a score of 10 within that many steps. You need to complete the previous code block before you can run this!

In [None]:
# Hyperparameters for the training run.
embedding_dim = 10
vocab_size = len(vocabulary)
batch_size = 1024
learning_rate = 1.0
num_epochs = 2000
# alpha and xmax are passed through to training_loop for the loss code
# (presumably the GloVe weighting-function parameters; confirm in your
# implementation).
alpha = 0.75
xmax = 50

# Build the model, initialise its weights, and set up the optimizer and the
# batch generator over the nonzero co-occurrence pairs.
glove = Glove(embedding_dim, vocab_size, batch_size)
glove.init_weights()
optimizer = torch.optim.Adadelta(glove.parameters(), lr=learning_rate)
data_iter = batch_iter(nonzero_pairs, cooccurrences, batch_size)

training_loop(batch_size, num_epochs, glove, optimizer, data_iter, xmax, alpha)