## Set 5
## 3. Word2Vec **Principles**

#### Preparation


The following function may be useful for loading the necessary data.

In [2]:
import requests

url_dict = {
    'dr_seuss.txt': 'https://caltech-cs155.s3.us-east-2.amazonaws.com/sets/set5/data/dr_seuss.txt',
    'P3CHelpers.py': 'https://caltech-cs155.s3.us-east-2.amazonaws.com/sets/set5/code/P3CHelpers.py'
}

def download_file(file_path):
    url = url_dict[file_path]
    print('Start downloading...')
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(file_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024 * 1024):
                f.write(chunk)
    print('Complete')

In [3]:
# Download files
download_file('dr_seuss.txt')
download_file('P3CHelpers.py')

Start downloading...
Complete
Start downloading...
Complete


In [4]:
import numpy as np
from P3CHelpers import *
import torch
import torch.nn as nn
import torch.optim as optim

#### Problem D:
Fill in the generate_traindata and find_most_similar_pairs functions.

In [5]:
def get_word_repr(word_to_index, word):
    """
    Returns one-hot-encoded feature representation of the specified word given
    a dictionary mapping words to their one-hot-encoded index.

    Arguments:
        word_to_index: Dictionary mapping words to their corresponding index
                       in a one-hot-encoded representation of our corpus.

        word:          Word whose feature representation we wish to compute.

    Returns:
        feature_representation:     Feature representation of the passed-in word.
    """
    unique_words = word_to_index.keys()
    # Return a vector that's zero everywhere besides the index corresponding to <word>
    feature_representation = np.zeros(len(unique_words))
    feature_representation[word_to_index[word]] = 1
    return feature_representation

def generate_traindata(word_list, word_to_index, window_size=4):
    """
    Generates training data for Skipgram model.

    Arguments:
        word_list:     Sequential list of words (strings).
        word_to_index: Dictionary mapping words to their corresponding index
                       in a one-hot-encoded representation of our corpus.

        window_size:   Size of Skipgram window. Defaults to 2
                       (use the default value when running your code).

    Returns:
        (trainX, trainY):     A pair of matrices (trainX, trainY) containing training
                              points (one-hot-encoded vectors) and their corresponding output_word
                              (also one-hot-encoded vectors)

    """
    trainX = []
    trainY = []
    for i, word in enumerate(word_list):
      feature_rep = get_word_repr(word_to_index, word)

      window = [i+j for j in range(-window_size, window_size+1) if (i+j >= 0) and (i+j < len(word_list)) and (j != 0)]

      for j in window:
          context = word_list[j]
          context_feature = get_word_repr(word_to_index, context)

          trainX.append(feature_rep)
          trainY.append(context_feature)

    return np.array(trainX), np.array(trainY)

In [6]:
from typing import Sized
def find_most_similar_pairs(filename, num_latent_factors):
    """
    Find the most similar pairs from the word embeddings computed from
    a body of text

    Arguments:
        filename:           Text file to read and train embeddings from
        num_latent_factors: The number of latent factors / the size of the embedding
    """
    # Load in a list of words from the specified file; remove non-alphanumeric characters
    # and make all chars lowercase.
    sample_text = load_word_list(filename)

    # Create word dictionary
    word_to_index = generate_onehot_dict(sample_text)
    print("Textfile contains %s unique words"%len(word_to_index))
    # Create training data
    trainX, trainY = generate_traindata(sample_text, word_to_index)
    # vocab_size = number of unique words in our text file. Will be useful
    # when adding layers to your neural network
    vocab_size = len(word_to_index)
    num_epochs = 10

    model = nn.Sequential(
        nn.Linear(in_features=vocab_size,
                  out_features=num_latent_factors),
        nn.Linear(in_features=num_latent_factors,
                  out_features=vocab_size),
    )

    optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)
    #CrossEntropyLoss implicitly applies softmax to its inputs
    loss_fn = nn.CrossEntropyLoss()
    model.train()

    data_train = torch.from_numpy(np.array([trainX , trainY])).float()
    train_loader = torch.utils.data.DataLoader(data_train, batch_size=256, shuffle=True)

    for epoch in range(num_epochs):
      for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data)
        loss = loss_fn(output, target)
        loss.backward()
        optimizer.step()

    model.eval()

    for i, p in enumerate(model.parameters()):
      p.requires_grad = False
      print(list(model.parameters())[i].size())

    weights = list(model.parameters())[2]

    # Find and print most similar pairs
    similar_pairs = most_similar_pairs(weights, word_to_index)
    for pair in similar_pairs[:30]:
        print(pair)

### Problem E-H:
Run your model on drseuss.txt and answer questions from E through H.

In [7]:
find_most_similar_pairs('dr_seuss.txt', 10)

Textfile contains 308 unique words
torch.Size([10, 308])
torch.Size([10])
torch.Size([308, 10])
torch.Size([308])
Pair(in, made), Similarity: 0.958769
Pair(made, in), Similarity: 0.958769
Pair(some, go), Similarity: 0.91432124
Pair(go, some), Similarity: 0.91432124
Pair(clark, time), Similarity: 0.9114461
Pair(time, clark), Similarity: 0.9114461
Pair(saw, some), Similarity: 0.90133107
Pair(kind, make), Similarity: 0.9003661
Pair(make, kind), Similarity: 0.9003661
Pair(yell, play), Similarity: 0.89308
Pair(play, yell), Similarity: 0.89308
Pair(two, are), Similarity: 0.8898352
Pair(are, two), Similarity: 0.8898352
Pair(you, or), Similarity: 0.88949037
Pair(or, you), Similarity: 0.88949037
Pair(eleven, kite), Similarity: 0.8882519
Pair(kite, eleven), Similarity: 0.8882519
Pair(with, gump), Similarity: 0.8843775
Pair(gump, with), Similarity: 0.8843775
Pair(mouse, hair), Similarity: 0.881595
Pair(hair, mouse), Similarity: 0.881595
Pair(bird, put), Similarity: 0.88140696
Pair(put, bird), Sim