# Word2Vec with PyTorch

**Places to Look:**
- [Original Paper](https://arxiv.org/abs/1301.3781)
- [Medium Article 1](https://medium.com/@bijil.subhash/code-walkthrough-of-word2vec-pytorch-implementation-3a9ca0ad55a7)
- [Medium Article 2](https://towardsdatascience.com/word2vec-with-pytorch-implementing-original-paper-2cd7040120b0)


## NOTES TO SELF:

- Should have a discussion about the difficulty of handling unseen words in
  novel situations. Find research papers discussing this.



## OUTLINE OF FINAL DOC:

- Introduction
- Data
- Data Preparation
- Building the Network
  - Architecture & Hyperparameters
- Validating the Performance
- References

In [41]:
import json
import numpy as np
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

from functools import partial
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator


In [2]:
# NOTE: If running this locally in an environment where nltk has not been set
# up yet, uncomment and run the code below first.
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')

In [3]:

# Step 0: How big is this dataset? How big is this corpus?
# Load the movie-plot CSV (expected in the current working directory) into a
# DataFrame. The "Plot" column is the free text used everywhere below.

data = pd.read_csv('./wiki_movie_plots_deduped.csv')

# Displayed as the cell output: (row count, column count) and column labels.
data.shape, data.columns

((34886, 8),
 Index(['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast',
        'Genre', 'Wiki Page', 'Plot'],
       dtype='object'))

## NLTK Pre-Processing / Tokenizer

In [4]:
# import re

# Taken from here:
# https://www.regular-expressions.info/floatingpoint.html
# FLOATING_POINT_REGEX = "(^[-+]?[0-9]*\.?[0-9]+$)|(^[-+]?[0-9]*\.?$)"
# stops = set(stopwords.words('english'))

# def preprocess_token(token):
#     return token.lower()


# def is_valid_token(token):
#     return token not in stops and re.match(FLOATING_POINT_REGEX, token) is None

# word_tokenize(data['Plot'].iloc[:5])

# corpus = set()
# tokenized = []

# for sentence in data['Plot'].head(100):
#     tokenized_sentence = [preprocess_token(token) for token in word_tokenize(sentence.replace("-", " ")) if is_valid_token(token)]

#     corpus.update(tokenized_sentence)
#     tokenized.append(tokenized_sentence)




## PyTorch Pre-Processor / Tokenizer

In [5]:
# Passed as `min_freq` when building the vocabulary: tokens that occur fewer
# than this many times are left out and will later map to "<unk>".
MIN_WORD_FREQUENCY=10

# torchtext's built-in "basic_english" tokenizer.
tokenizer = get_tokenizer("basic_english", language="en")

# Tokenize every plot lazily and build the vocabulary from the token stream.
# NOTE(review): this uses ALL plots, i.e. it runs before the train/validation
# split further down, so validation text also contributes to the vocabulary.
vocab = build_vocab_from_iterator(
    map(tokenizer, data["Plot"]),
    specials=["<unk>"],
    min_freq=MIN_WORD_FREQUENCY
)

# Out-of-vocabulary look-ups resolve to the id of the "<unk>" special token.
vocab.set_default_index(vocab["<unk>"])

# NOTE: despite the label, `len(vocab)` is the vocabulary size (number of
# distinct tokens kept, including "<unk>"), not the number of tokens in the
# corpus.
f"Corpus Size: {len(vocab)}"

'Corpus Size: 40878'

In [6]:
# Randomly assign each row to the training set with probability 0.8; the
# remaining rows form the validation set.
# NOTE(review): no random seed is set in the visible cells, so the split
# changes from run to run.
train_mask = np.random.rand(len(data)) < 0.8
# Boolean-mask selection keeps the original row labels, so both Series have
# gaps in their integer index (label != position).
train_data = data[train_mask]["Plot"]
val_data = data[~train_mask]["Plot"]


In [7]:
def text_pipeline(sentence):
    """Turn a raw sentence into vocabulary ids.

    The sentence is split with the module-level `tokenizer` and the resulting
    tokens are looked up in the module-level `vocab`.
    """
    tokens = tokenizer(sentence)
    token_ids = vocab(tokens)
    return token_ids



In [35]:
# import math

# text_pipeline("Hello World, my name is flalalala")

# N = 100000
# LOG_N = 10
# BATCH_SIZE=50

# first_n = data["Plot"].head(N)
# batch_count = math.ceil(len(first_n) / BATCH_SIZE)

# for i in range(batch_count):
#     start = i * BATCH_SIZE
#     end = start + BATCH_SIZE
#     batch = first_n.iloc[start:end]
#     input_, output = collate_cbow(batch, text_pipeline)

#     if i % LOG_N == 0:
#         print(f"[{i+1}/{batch_count}]: {input_.shape}, {output.shape}")


In [36]:
# Some statistics on our sentence / tokenized words.

# TODO: compute the longest plot, in tokens. For now this cell only reports
# the vocabulary size (`len(vocab)`).
f"Corpus Size == {len(vocab)}"

'Corpus Size == 40878'

In [37]:
# Number of context words taken on EACH side of the target word.
CBOW_N_WORDS = 4
# Texts are truncated to this many token ids before windows are extracted.
# A falsy value (0 / None) disables truncation.
MAX_SEQUENCE_LENGTH = 256


def collate_cbow(batch, text_pipeline):
    """Collate raw texts into CBOW (context, target) training pairs.

    Every text is converted to token ids with `text_pipeline`. A window of
    `2 * CBOW_N_WORDS + 1` ids is then slid over the (possibly truncated)
    sequence; the middle id of each window is the target and the remaining
    `2 * CBOW_N_WORDS` ids are the context.

    Args:
        batch: iterable of raw texts (whatever `text_pipeline` accepts).
        text_pipeline: callable mapping one text to a sequence of token ids.

    Returns:
        A tuple `(batch_input, batch_output)` of `torch.long` tensors:
        `batch_input` has shape `(n_windows, 2 * CBOW_N_WORDS)` and
        `batch_output` has shape `(n_windows,)`. Texts shorter than one full
        window are skipped; if every text is skipped, `n_windows` is 0 and
        `batch_input` still has two dimensions.
    """
    context_size = CBOW_N_WORDS * 2
    window_size = context_size + 1

    contexts = []
    targets = []

    for text in batch:
        text_tokens_ids = text_pipeline(text)

        # Not enough tokens to form even a single window.
        if len(text_tokens_ids) < window_size:
            continue

        if MAX_SEQUENCE_LENGTH:
            text_tokens_ids = text_tokens_ids[:MAX_SEQUENCE_LENGTH]

        for start in range(len(text_tokens_ids) - window_size + 1):
            window = list(text_tokens_ids[start : start + window_size])
            # Remove the middle word: it is the prediction target, and what
            # is left in `window` is its context.
            targets.append(window.pop(CBOW_N_WORDS))
            contexts.append(window)

    # `torch.tensor([])` is 1-D, so an all-skipped batch would otherwise yield
    # shape (0,) instead of (0, context_size). The explicit reshape keeps the
    # rank stable for downstream code and is a no-op for non-empty batches.
    batch_input = torch.tensor(contexts, dtype=torch.long).reshape(
        len(contexts), context_size
    )
    batch_output = torch.tensor(targets, dtype=torch.long)

    return batch_input, batch_output



In [38]:
# Number of plots (not context windows) per batch; `collate_cbow` expands each
# plot into many (context, target) pairs.
BATCH_SIZE = 20

# `train_data` / `val_data` are boolean-masked pandas Series, so their integer
# index has gaps. DataLoader indexes a map-style dataset with positions
# 0..len-1, and `Series[int]` on an integer index is a LABEL lookup: passing
# the Series directly raises KeyError for missing labels and never visits rows
# whose label is >= len. Converting to plain lists makes position == index.
train_dataloader = DataLoader(
    train_data.tolist(),
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=partial(collate_cbow, text_pipeline=text_pipeline),
)

valid_dataloader = DataLoader(
    val_data.tolist(),
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=partial(collate_cbow, text_pipeline=text_pipeline),
)


In [39]:

# Size of each word vector.
EMBED_DIMENSION = 300
# Passed to nn.Embedding as `max_norm`.
EMBED_MAX_NORM = 1


class CBOWModel(nn.Module):
    """Continuous bag-of-words model: embed, average, project to the vocab.

    Args:
        vocab_size: number of rows in the embedding table and number of
            output logits.
    """

    def __init__(self, vocab_size: int):
        super().__init__()

        # Word-vector table; its weights are the embeddings being learned.
        self.embeddings = nn.Embedding(
            vocab_size, EMBED_DIMENSION, max_norm=EMBED_MAX_NORM
        )
        # Maps an averaged context vector to one logit per vocabulary entry.
        self.linear = nn.Linear(EMBED_DIMENSION, vocab_size)

    def forward(self, inputs_):
        """Return vocabulary logits for each row of context token ids.

        `inputs_` is looked up in the embedding table, the embeddings are
        averaged over dimension 1, and the result goes through the linear
        layer.
        """
        context_vectors = self.embeddings(inputs_)
        averaged_context = context_vectors.mean(dim=1)
        return self.linear(averaged_context)



In [40]:

# Code taken from here:
# https://github.com/OlgaChernytska/word2vec-pytorch/blob/main/utils/trainer.py

class Trainer:
    """Run the train/validate loop for a model and persist its artifacts.

    Args:
        model: the `nn.Module` to train; it is moved to `device`.
        epochs: number of epochs `train()` runs.
        train_dataloader: iterable yielding `(inputs, labels)` batches.
        train_steps: stop each training epoch after this many batches. A
            value that never equals a 1-based batch counter (e.g. `None` or
            0) means the whole loader is consumed.
        val_dataloader: iterable yielding `(inputs, labels)` batches.
        val_steps: same as `train_steps`, for validation.
        checkpoint_frequency: save a checkpoint every this many epochs; a
            falsy value disables checkpointing.
        criterion: callable `criterion(outputs, labels)` returning the loss.
        optimizer: optimizer updating `model`'s parameters.
        lr_scheduler: scheduler whose `step()` is called once per epoch.
        device: device that the model and every batch are moved to.
        model_dir: existing directory that checkpoints, the final model and
            the loss history are written to.
        model_name: stored on the instance; not used by this class itself.
    """

    def __init__(
        self,
        model,
        epochs,
        train_dataloader,
        train_steps,
        val_dataloader,
        val_steps,
        checkpoint_frequency,
        criterion,
        optimizer,
        lr_scheduler,
        device,
        model_dir,
        model_name
    ):
        self.model = model
        self.epochs = epochs
        self.train_dataloader = train_dataloader
        self.train_steps = train_steps
        self.val_dataloader = val_dataloader
        self.val_steps = val_steps
        self.criterion = criterion
        self.optimizer = optimizer
        self.checkpoint_frequency = checkpoint_frequency
        self.lr_scheduler = lr_scheduler
        self.device = device
        self.model_dir = model_dir
        self.model_name = model_name

        # Mean loss per epoch; appended to by the epoch methods and read by
        # train() / save_loss().
        self.loss = {"train": [], "val": []}

        # Every batch is moved to `device`, so the model has to live there
        # too or the forward pass fails with a device mismatch.
        self.model.to(self.device)

    def train(self):
        """Train for `epochs` epochs, validating and logging after each."""
        for epoch in range(self.epochs):
            self._train_epoch()
            self._validate_epoch()

            print(
                "Epoch: {}/{}, Train Loss={:.5f}, Val Loss={:.5f}".format(
                    epoch + 1,
                    self.epochs,
                    self.loss["train"][-1],
                    self.loss["val"][-1]
                )
            )

            self.lr_scheduler.step()

            if self.checkpoint_frequency:
                self._save_checkpoint(epoch)

    def _train_epoch(self):
        """Run one optimisation pass and record the mean batch loss."""
        self.model.train()

        running_loss = []

        for i, batch_data in enumerate(self.train_dataloader, 1):
            inputs = batch_data[0].to(self.device)
            labels = batch_data[1].to(self.device)

            self.optimizer.zero_grad()
            outputs = self.model(inputs)
            loss = self.criterion(outputs, labels)
            loss.backward()
            self.optimizer.step()

            running_loss.append(loss.item())

            # `i` is 1-based, so this stops after exactly `train_steps`
            # batches.
            if i == self.train_steps:
                break

        # float() keeps the history as plain Python floats for json.dump.
        epoch_loss = float(np.mean(running_loss))
        self.loss["train"].append(epoch_loss)

    def _validate_epoch(self):
        """Evaluate without gradients and record the mean batch loss."""
        self.model.eval()
        running_loss = []

        with torch.no_grad():
            for i, batch_data in enumerate(self.val_dataloader, 1):
                inputs = batch_data[0].to(self.device)
                labels = batch_data[1].to(self.device)

                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)

                running_loss.append(loss.item())

                if i == self.val_steps:
                    break

        epoch_loss = float(np.mean(running_loss))
        self.loss["val"].append(epoch_loss)

    # Backward-compatible alias: the method was previously (mis)named
    # `_validate_epochs` while train() called `_validate_epoch`.
    _validate_epochs = _validate_epoch

    def _save_checkpoint(self, epoch):
        """Save the model if the 1-based epoch number is a multiple of
        `checkpoint_frequency`.

        Args:
            epoch: zero-based epoch index.
        """
        epoch_num = epoch + 1

        if epoch_num % self.checkpoint_frequency == 0:
            file_name = "checkpoint_{}.pt".format(str(epoch_num).zfill(3))
            # Write inside `model_dir`, not the current working directory.
            model_path = os.path.join(self.model_dir, file_name)
            torch.save(self.model, model_path)

    def save_model(self):
        """Save the whole model object to `<model_dir>/model.pt`."""
        model_path = os.path.join(self.model_dir, "model.pt")
        torch.save(self.model, model_path)

    def save_loss(self):
        """Write the per-epoch loss history to `<model_dir>/loss.json`."""
        loss_path = os.path.join(self.model_dir, "loss.json")
        with open(loss_path, "w") as fp:
            json.dump(self.loss, fp)
        

In [None]:
EPOCHS = 2
LR = 0.01

# NOTE(review): placeholder output location and name -- adjust as needed.
MODEL_DIR = "./weights/cbow"
MODEL_NAME = "cbow"

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the target device BEFORE creating the optimizer, since
# the Trainer moves every batch to `device`.
model = CBOWModel(vocab_size=len(vocab)).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)

# Scales the learning rate linearly from LR towards 0 across the epochs;
# the Trainer calls `lr_scheduler.step()` once per epoch.
lr_scheduler = optim.lr_scheduler.LambdaLR(
    optimizer, lr_lambda=lambda epoch: (EPOCHS - epoch) / EPOCHS
)

# The Trainer writes into `model_dir` and does not create it itself.
os.makedirs(MODEL_DIR, exist_ok=True)

# Trainer requires all of these arguments; passing only `model` raises
# TypeError.
trainer = Trainer(
    model=model,
    epochs=EPOCHS,
    train_dataloader=train_dataloader,
    train_steps=None,  # None never equals the batch counter: full pass
    val_dataloader=valid_dataloader,
    val_steps=None,
    checkpoint_frequency=None,  # falsy: no intermediate checkpoints
    criterion=criterion,
    optimizer=optimizer,
    lr_scheduler=lr_scheduler,
    device=device,
    model_dir=MODEL_DIR,
    model_name=MODEL_NAME,
)

# TODO: HERE I AM, BUILDING THE TRAINING PROGRAM!


    