# Word2Vec with PyTorch

**Places to Look:**
- [Original Paper](https://arxiv.org/abs/1301.3781)
- [Medium Article 1](https://medium.com/@bijil.subhash/code-walkthrough-of-word2vec-pytorch-implementation-3a9ca0ad55a7)
- [Medium Article 2](https://towardsdatascience.com/word2vec-with-pytorch-implementing-original-paper-2cd7040120b0)
- [Github Repo](https://github.com/OlgaChernytska/word2vec-pytorch)


## NOTES TO SELF:

- Should have a discussion about the difficulty of handling unseen words in
  novel situations. Find research papers discussing this.



## OUTLINE OF FINAL DOC:

- Introduction
- Data
- Data Preparation
- Building the Network
  - Architecture & Hyperparameters
- Validating the Performance
- References

In [12]:
import json
import numpy as np
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

from functools import partial
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator


In [13]:
# NOTE: If running this locally in an environment that has not yet setup nltk,
# need to run this code here.
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')

In [14]:

# Step 0: How big is this dataset? How big is this corpus?

data = pd.read_csv('./wiki_movie_plots_deduped.csv')

data.shape, data.columns

((34886, 8),
 Index(['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast',
        'Genre', 'Wiki Page', 'Plot'],
       dtype='object'))

## PyTorch Pre-Processor / Tokenizer

In [15]:
train_mask = np.random.rand(len(data)) < 0.8
train_data = data[train_mask]["Plot"]
val_data = data[~train_mask]["Plot"]


In [16]:
CBOW_N_WORDS = 4
MAX_SEQUENCE_LENGTH = 256

def collate_cbow(batch, text_pipeline):
    batch_input = []
    batch_output = []

    for text in batch:
        text_tokens_ids = text_pipeline(text)

        if len(text_tokens_ids) < CBOW_N_WORDS * 2 + 1:
            continue

        if MAX_SEQUENCE_LENGTH:
            text_tokens_ids = text_tokens_ids[:MAX_SEQUENCE_LENGTH]

        for idx in range(len(text_tokens_ids) - CBOW_N_WORDS * 2):
            token_id_sequence = text_tokens_ids[idx : (idx + CBOW_N_WORDS * 2 + 1)]
            # Pop the word in the middle of the sequence.
            output = token_id_sequence.pop(CBOW_N_WORDS)
            input_ = token_id_sequence
            batch_input.append(input_)
            batch_output.append(output)

    batch_input = torch.tensor(batch_input, dtype=torch.long)
    batch_output = torch.tensor(batch_output, dtype=torch.long)

    return batch_input, batch_output



In [17]:
def build_dataloader(data_iter, batch_size):
    MIN_WORD_FREQUENCY=1

    tokenizer = get_tokenizer("basic_english", language="en")

    vocab = build_vocab_from_iterator(
        map(tokenizer, data_iter),
        specials=["<unk>"],
        min_freq=MIN_WORD_FREQUENCY
    )

    vocab.set_default_index(vocab["<unk>"])

    def text_pipeline(x):
        return vocab(tokenizer(x))

    dataloader = DataLoader(
        data_iter,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=partial(collate_cbow, text_pipeline=text_pipeline)
    )

    return dataloader, vocab



In [18]:

EMBED_MAX_NORM = 1

class CBOWModel(nn.Module):
    def __init__(self, vocab_size: int, embedding_dim: int):
        super(CBOWModel, self).__init__()

        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            max_norm=EMBED_MAX_NORM
        )

        self.linear = nn.Linear(
            in_features=embedding_dim,
            out_features=vocab_size
        )


    def forward(self, inputs_):
        x = self.embeddings(inputs_)
        x = x.mean(axis=1)
        x = self.linear(x)
        return x



In [19]:
BATCH_SIZE=20

train_dataloader, vocab = build_dataloader(data_iter=train_data.values, batch_size=BATCH_SIZE)

val_dataloader, _ = build_dataloader(data_iter=val_data.values, batch_size=BATCH_SIZE)


In [22]:

# Code taken from here:
# https://github.com/OlgaChernytska/word2vec-pytorch/blob/main/utils/trainer.py

class Trainer:

    def __init__(
        self,
        model,
        epochs,
        train_dataloader,
        train_steps,
        val_dataloader,
        val_steps,
        criterion,
        optimizer,
        lr_scheduler
    ):
        self.model = model
        self.epochs = epochs
        self.train_dataloader = train_dataloader
        self.train_steps = train_steps
        self.val_dataloader = val_dataloader
        self.val_steps = val_steps
        self.criterion = criterion
        self.optimizer = optimizer
        self.lr_scheduler = lr_scheduler

        self.loss = {"train": [], "val": []}


    def train(self):
        for epoch in range(self.epochs):
            self._train_epoch()
            self._validate_epoch()

            print(
                "Epoch: {}/{}, Train Loss={:.5f}, Val Loss={:.5f}".format(
                    epoch + 1,
                    self.epochs,
                    self.loss["train"][-1],
                    self.loss["val"][-1]
                )
            )

            self.lr_scheduler.step()

    
    def _train_epoch(self):
        self.model.train()

        running_loss = []

        for i, batch_data in enumerate(self.train_dataloader, 1):
            print("STARTING BATCH:", i)
            inputs = batch_data[0]
            labels = batch_data[1]

            self.optimizer.zero_grad()
            outputs = self.model(inputs)
            loss = self.criterion(outputs, labels)
            loss.backward()
            self.optimizer.step()

            running_loss.append(loss.item())

            if i == self.train_steps:
                break

        epoch_loss = np.mean(running_loss)
        self.loss["train"].append(epoch_loss)

    def _validate_epoch(self):
        self.model.eval()
        running_loss = []

        with torch.no_grad():
            for i, batch_data in enumerate(self.val_dataloader, 1):
                inputs = batch_data[0]
                labels = batch_data[1]

                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)

                running_loss.append(loss.item())

                if i == self.val_steps:
                    break

        epoch_loss = np.mean(running_loss)
        self.loss["val"].append(epoch_loss)
        

In [23]:
EPOCHS=2
LR=0.01

model = CBOWModel(vocab_size=len(vocab), embedding_dim=10)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

lr_lambda = lambda epoch: (EPOCHS - epoch) / EPOCHS
lr_scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda, verbose=True)

trainer = Trainer(
    model=model,
    epochs=EPOCHS,
    train_dataloader=train_dataloader,
    train_steps=10,
    val_dataloader=val_dataloader,
    val_steps=10,
    criterion=criterion,
    optimizer=optimizer,
    lr_scheduler=lr_scheduler
)

trainer.train()
print("Training finished")


    

Adjusting learning rate of group 0 to 1.0000e-02.
STARTING BATCH: 1
STARTING BATCH: 2
STARTING BATCH: 3
STARTING BATCH: 4
STARTING BATCH: 5
STARTING BATCH: 6
STARTING BATCH: 7
STARTING BATCH: 8
STARTING BATCH: 9
STARTING BATCH: 10
Epoch: 1/2, Train Loss=11.86478, Val Loss=11.66135
Adjusting learning rate of group 0 to 5.0000e-03.
STARTING BATCH: 1
STARTING BATCH: 2
STARTING BATCH: 3
STARTING BATCH: 4
STARTING BATCH: 5
STARTING BATCH: 6
STARTING BATCH: 7
STARTING BATCH: 8
STARTING BATCH: 9
STARTING BATCH: 10
Epoch: 2/2, Train Loss=11.68298, Val Loss=11.41915
Adjusting learning rate of group 0 to 0.0000e+00.
Training finished
