### Demonstrates how to train word2vec embeddings on Project Gutenberg books

In [1]:
import numpy as np
import torch
from torch import nn
import word2vec_ralated as w2v
import get_data as gd
import math
import pickle
torch.manual_seed(42)

<torch._C.Generator at 0x7f930808c7b0>

### Get the `torch.utils.data.DataLoader` and `vocab.Vocab` objects

In [2]:
# Obtain the batch loader and the vocabulary from the project's `get_data` helper,
# reading "gutenberg_books.txt" with num_books=2.
# NOTE(review): the recorded output shows this call asking a y/n question on stdin
# when the file already exists, which blocks an unattended "Restart & Run All" —
# confirm whether get_data offers a non-interactive option.
loader, vocab = gd.get_iter_and_vocab("gutenberg_books.txt", num_books=2)

A file with that name already exists, if truncate is True I will overwrite it. Continue [y/n]:n


In [3]:
# Pull a single batch from the loader and report the shape of each of its four tensors.
for centers, contexts_and_negatives, coefficients, mask_pads in loader:
    for tensor in (centers, contexts_and_negatives, coefficients, mask_pads):
        print(tensor.shape)
    break

torch.Size([512, 1])
torch.Size([512, 48])
torch.Size([512, 48])
torch.Size([512, 48])


### Set up the model

In [4]:
# Show one raw dataset item. The recorded output is a 3-tuple (int, 2-item list, 10-item list) —
# presumably (center id, context ids, negative-sample ids); TODO confirm against get_data.
print(loader.dataset[0])

(156, [1306, 582], [57, 15, 17, 2957, 12031, 2885, 3956, 2976, 1388, 4])


In [5]:
class EmbeddingsModel(nn.Module):
    """Two embedding tables scored against each other with a batched dot product.

    One table is looked up with the center-word indices and the other with the
    context/negative indices; both have ``vocab_size`` rows of ``embed_size``
    floats. Extra keyword arguments are forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, vocab_size, embed_size, **kwargs):
        super().__init__(**kwargs)
        self.embed_center = nn.Embedding(vocab_size, embed_size)
        self.embed_context = nn.Embedding(vocab_size, embed_size)

    def forward(self, centers, contexts_and_negatives, coefficients):
        """Return the coefficient-weighted center/context dot products.

        Args:
            centers: integer index tensor of shape (batch, 1).
            contexts_and_negatives: integer index tensor of shape (batch, n).
            coefficients: tensor of shape (batch, n) multiplied element-wise
                into the scores (presumably +1 for true contexts and -1 for
                negative samples — TODO confirm against get_data).

        Returns:
            Tensor of shape (batch, 1, n).
        """
        center_vecs = self.embed_center(centers)                  # (batch, 1, embed)
        context_vecs = self.embed_context(contexts_and_negatives)  # (batch, n, embed)
        scores = torch.bmm(center_vecs, context_vecs.transpose(1, 2))  # (batch, 1, n)
        # unsqueeze(1) lines the (batch, n) coefficients up with the (batch, 1, n) scores.
        return scores * coefficients.unsqueeze(1)

In [6]:
# One embedding row per vocabulary entry, each 100 floats wide.
vocab_size = len(vocab)
embed_size = 100
model = EmbeddingsModel(vocab_size, embed_size)

In [7]:
def log_sigmoid(x):
    """Return ``log(sigmoid(x))`` element-wise, computed in a numerically stable way.

    The naive form ``log(1 / (1 + exp(-x)))`` overflows for large negative
    inputs: ``exp(-x)`` becomes ``inf``, the quotient becomes 0 and the result
    is ``-inf`` (with NaN gradients). PyTorch's built-in ``logsigmoid`` avoids
    that and yields approximately ``x`` in that regime.

    Args:
        x: floating-point tensor of any shape.

    Returns:
        Tensor with the same shape as ``x``.
    """
    return torch.nn.functional.logsigmoid(x)

In [8]:
class EmbeddingLoss(nn.Module):
    """Masked sum of negative log-sigmoid scores.

    The result is summed over every element of the batch; it is not averaged.
    """

    def __init__(self):
        super().__init__()

    def forward(self, prods, mask_pads):
        """Compute the loss for one batch.

        Args:
            prods: score tensor of shape (batch, 1, n), as produced by the model.
            mask_pads: tensor of shape (batch, n) multiplied into the per-element
                log-sigmoid values (presumably 1 for real entries and 0 for
                padding — TODO confirm against get_data).

        Returns:
            Scalar tensor: minus the sum of the masked log-sigmoid values.
        """
        # Insert a singleton axis so the mask broadcasts against (batch, 1, n).
        mask = mask_pads.unsqueeze(1)
        masked_log_probs = log_sigmoid(prods) * mask
        return - masked_log_probs.sum()

In [9]:
# Loss object handed to train_loop as `loss_fn`.
loss_fn = EmbeddingLoss()

In [10]:
# Adam over all model parameters (both embedding tables), learning rate 5e-4.
optim = torch.optim.Adam(model.parameters(), lr=5e-4)

In [11]:
def train_loop(model, loss_fn, optim, loader):
    """Run one training epoch over ``loader`` and return the mean batch loss.

    For each batch the model is evaluated, the loss is computed and one
    optimizer step is taken. The current batch loss is printed roughly five
    times per epoch (every ``len(loader) // 5`` batches, at least every batch).

    Args:
        model: module called as ``model(centers, contexts_and_negatives, coefficients)``.
        loss_fn: callable taking ``(prods, mask_pads)`` and returning a scalar tensor.
        optim: optimizer holding the model's parameters.
        loader: sized iterable yielding 4-tuples
            ``(centers, contexts_and_negatives, coefficients, mask_pads)``.

    Returns:
        float: the sum of the per-batch losses divided by the number of
        batches, or NaN when ``loader`` is empty.
    """
    model.train()
    batches = len(loader)
    # Logging interval; max(1, ...) keeps the modulo valid for loaders shorter than 5 batches.
    fifth = max(1, batches // 5)
    tot_loss = 0.
    for batch, (centers, contexts_and_negatives, coefficients,
                mask_pads) in enumerate(loader, start=1):
        prods = model(centers, contexts_and_negatives, coefficients)
        loss = loss_fn(prods, mask_pads)
        # Read the scalar once; it is used for both logging and accumulation.
        loss_value = loss.item()
        if batch % fifth == 0:
            print(f"train_loss: {loss_value:.5f}\tprogress: {batch}/{batches}")
        tot_loss += loss_value
        optim.zero_grad()
        loss.backward()
        optim.step()
    # Previously the accumulated total was discarded; expose it as an epoch-level metric.
    return tot_loss / batches if batches else float("nan")

In [12]:
# Train for 50 epochs: print a banner, run one pass over the loader, then print a blank gap.
for epoch in range(50):
    banner = f"Epoch {epoch+1}:\n-------------------------"
    print(banner)
    train_loop(model, loss_fn, optim, loader)
    print('\n')

Epoch 1:
-------------------------


  Variable._execution_engine.run_backward(


train_loss: 38919.75391	progress: 13/68
train_loss: 38238.70312	progress: 26/68
train_loss: 39641.30078	progress: 39/68
train_loss: 40672.30469	progress: 52/68
train_loss: 39984.49609	progress: 65/68


Epoch 2:
-------------------------
train_loss: 39401.93750	progress: 13/68
train_loss: 39378.27344	progress: 26/68
train_loss: 40067.01172	progress: 39/68
train_loss: 38317.08594	progress: 52/68
train_loss: 38314.89453	progress: 65/68


Epoch 3:
-------------------------
train_loss: 37429.77344	progress: 13/68
train_loss: 38819.75000	progress: 26/68
train_loss: 36880.53516	progress: 39/68
train_loss: 36576.88672	progress: 52/68
train_loss: 37050.13672	progress: 65/68


Epoch 4:
-------------------------
train_loss: 36849.45703	progress: 13/68
train_loss: 38280.48438	progress: 26/68
train_loss: 39131.79297	progress: 39/68
train_loss: 37650.18359	progress: 52/68
train_loss: 39036.22656	progress: 65/68


Epoch 5:
-------------------------
train_loss: 34131.85938	progress: 13/68
train_loss: 

train_loss: 18478.85352	progress: 52/68
train_loss: 19124.76172	progress: 65/68


Epoch 36:
-------------------------
train_loss: 17832.91797	progress: 13/68
train_loss: 19020.40430	progress: 26/68
train_loss: 17453.09375	progress: 39/68
train_loss: 18083.60352	progress: 52/68
train_loss: 18200.64453	progress: 65/68


Epoch 37:
-------------------------
train_loss: 17701.82617	progress: 13/68
train_loss: 17023.02734	progress: 26/68
train_loss: 16636.59961	progress: 39/68
train_loss: 17788.19531	progress: 52/68
train_loss: 17313.57812	progress: 65/68


Epoch 38:
-------------------------
train_loss: 17060.52344	progress: 13/68
train_loss: 17634.94922	progress: 26/68
train_loss: 17503.92383	progress: 39/68
train_loss: 16613.54102	progress: 52/68
train_loss: 16418.41406	progress: 65/68


Epoch 39:
-------------------------
train_loss: 16413.30664	progress: 13/68
train_loss: 16686.37305	progress: 26/68
train_loss: 15735.02734	progress: 39/68
train_loss: 16218.89746	progress: 52/68
train_lo

In [14]:
# np.save("embeds_v1", model.embed_center.weight.data.numpy())