# Assignment 1.3: Naive word2vec (40 points)

This task can be formulated very simply. Follow this [paper](https://arxiv.org/pdf/1411.2738.pdf) and implement word2vec like a two-layer neural network with matrices $W$ and $W'$. One matrix projects words to low-dimensional 'hidden' space and the other - back to high-dimensional vocabulary space.

![word2vec](https://i.stack.imgur.com/6eVXZ.jpg)

You can use TensorFlow/PyTorch and code from your previous task.

## Results of this task: (30 points)
 * trained word vectors (mention somewhere, how long it took to train)
 * plotted loss (so we can see that it has converged)
 * function to map token to corresponding word vector
 * beautiful visualizations (PCE, T-SNE), you can use TensorBoard and play with your vectors in 3D (don't forget to add screenshots to the task)

## Extra questions: (10 points)
 * Intrinsic evaluation: you can find datasets [here](http://download.tensorflow.org/data/questions-words.txt)
 * Extrinsic evaluation: you can use [these](https://medium.com/@dataturks/rare-text-classification-open-datasets-9d340c8c508e)

Also, you can find any other datasets for quantitative evaluation.

Again. It is **highly recommended** to read this [paper](https://arxiv.org/pdf/1411.2738.pdf)

Example of visualization in tensorboard:
https://projector.tensorflow.org

Example of 2D visualisation:

![2dword2vec](https://www.tensorflow.org/images/tsne.png)

In [0]:
# TODO CrossEntropy пофиксить
# Допилить лосс функцию.
# Сделать функцию, которая вытаскивает вектор по нидексу слова.

In [0]:
from collections import Counter
from tqdm import tqdm_notebook

import numpy as np
import spacy
from spacy.symbols import ORTH

spacy_en = spacy.load('en')
spacy_en.tokenizer.add_special_case("don't", [{ORTH: "do"}, {ORTH: "not"}])
spacy_en.tokenizer.add_special_case("didn't", [{ORTH: "did"}, {ORTH: "not"}]) #adding special case so that tokenizer("""don't""") != 'do'

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [0]:
!wget http://mattmahoney.net/dc/text8.zip
!unzip text8.zip

--2020-02-24 20:50:03--  http://mattmahoney.net/dc/text8.zip
Resolving mattmahoney.net (mattmahoney.net)... 67.195.197.75
Connecting to mattmahoney.net (mattmahoney.net)|67.195.197.75|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31344016 (30M) [application/zip]
Saving to: ‘text8.zip’


2020-02-24 20:50:18 (2.13 MB/s) - ‘text8.zip’ saved [31344016/31344016]

Archive:  text8.zip
  inflating: text8                   


In [0]:
unk_token = '<unk>'
pad_token = '<pad>'
BATCH_SIZE = 64
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Batch and Loader stuff

In [0]:
# Opening data
with open('text8', encoding='utf-8') as f:
    text_original = f.read()
print(text_original[:100])

 anarchism originated as a term of abuse first used against early working class radicals including t


In [0]:
# Preprocessing stuff
def tokenizer(text):
    """
    return: list of lemmas (without punctuation and numbers)
    """
    return [tok.text for tok in spacy_en.tokenizer(text) if tok.text.isalpha()]

tokens = tokenizer(text_original)
print(len(tokens), len(set(tokens)))
print(tokens[:10])

17008373 253830
['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


In [0]:
class Batcher(Dataset):
    """
    Preprocessed list of tokens passed  here
    """

    def __init__(self, tokens, vocab_size):

        super().__init__()

        self.tokens = tokens
        self.tokens_freq = []
        self.vocab_size = vocab_size

        self.word2index = {}
        self.index2word = {}

        print('Initial length of tokens: {}'.format(self.vocab_size))

        self.build_vocab(min_freq=5)
        self.numericalization()
        self.x, self.y = self.cbow_batching(batch_size=64, window_size=4)

    def __len__(self):        
        return self.x.shape[0]

    def __getitem__(self, idx):
        
        x = self.x[idx]
        #x = torch.FloatTensor(x) # преобразуем в тензор с флоат величинами
        y = self.y[idx]        
        return x, y
    
    
    def build_vocab(self, min_freq = 10):
        """
        builds vocab (self.tokens_freq) from self.tokens
        param: min_freq (int) - minimum frequency for token in list to get to vocab
        """
        counter = Counter(self.tokens)
        mask = list(map(lambda x: x[1] > min_freq, counter.items()))
        self.tokens_freq = np.array(list(counter.items()))[mask]
        self.tokens_freq = list(map(lambda x: x[0], self.tokens_freq)) + [unk_token] + [pad_token]
        self.vocab_size = len(self.tokens_freq)

        print('After building vocab, vocab_size: {}'.format(self.vocab_size))

    def numericalization(self):
        """
        creates word2index and index2word, replaces not frequent tokens with 'unk' token
        """
        self.word2index = {word : ind for ind, word in enumerate(self.tokens_freq)}
        self.index2word = {value : key for key, value in self.word2index.items()}



        self.tokens = [self.word2index[token] if token in self.word2index else self.word2index[unk_token] 
                       for token in self.tokens]

        print('Numeralization done. Example of self.tokens: {}'.format(self.tokens[:10]))          
        

    def cbow_batching(self, batch_size, window_size):
        """
        adds pad_token, creates batches
        """
        
        self.tokens = [self.word2index[pad_token]] * window_size + self.tokens + [self.word2index[pad_token]] * window_size
        x_batches = []
        y_batches = []

        for i in np.arange(window_size, len(self.tokens)-window_size):
            y_batches.append(self.tokens[i])

            context = self.tokens[i-window_size:i] + self.tokens[i+1:i+1+window_size]
            x_batches.append(context)
        x_batches = np.array(x_batches)
        y_batches = np.array(y_batches)

        try:
            x_batches = x_batches.reshape((-1, batch_size,2*window_size))
            y_batches = y_batches.reshape((-1,batch_size))
        except Exception:
            print('Could not reshape directly so deleted something')
            total = len(y_batches)
            x_batches = x_batches[:-(total % batch_size),:]
            y_batches = y_batches[:-(total % batch_size)]

            x_batches = x_batches.reshape((-1, batch_size,2*window_size))
            y_batches = y_batches.reshape((-1,batch_size))

        return x_batches, y_batches

    def to_pytorch(self, x_batches, y_batches):
        x_batches = torch.from_numpy(x_batches)
        y_batches = torch.from_numpy(y_batches)
        return x_batches, y_batches

In [0]:
batcher=Batcher(tokens=tokens, vocab_size=len(tokens))
batcher.x, batcher.y = batcher.to_pytorch(batcher.x, batcher.y)

Initial length of tokens: 17008373
After building vocab, vocab_size: 63632
Numeralization done. Example of self.tokens: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Could not reshape directly so deleted something


In [0]:
batcher.x.size(), batcher.y.size()

(torch.Size([265755, 64, 8]), torch.Size([265755, 64]))

In [0]:
x, y = batcher.x, batcher.y
x[0][4], y[0][4]

(tensor([0, 1, 2, 3, 5, 6, 7, 8]), tensor(4))

In [0]:
#loader =  DataLoader(batcher, batch_size=BATCH_SIZE, shuffle=True)
train, test = torch.utils.data.random_split(batcher, [int(batcher.x.size()[0]*0.8), int(batcher.x.size()[0]*0.2)])

In [0]:
train_loader = DataLoader(train,  shuffle=True) #batch_size = 1, потому что уже внутри batch.x и batch.y есть разделение на батчи
test_loader = DataLoader(test, shuffle=True)

In [0]:
for x, y in train_loader:
  print(x.size(), y.size())
  break

torch.Size([1, 64, 8]) torch.Size([1, 64])


# Model

trained word vectors (mention somewhere, how long it took to train)

plotted loss (so we can see that it has converged)

In [0]:
class MyModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(MyModel, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=batcher.word2index['<pad>'])
        self.relu = torch.nn.ReLU()
        
        self.fc1 = nn.Linear(embed_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, vocab_size)
        
    def forward(self, x):    

        x = x.squeeze(0)
        print(x.size())
        x = self.embedding(x)
        print(x.size())
        x = self.fc1(x)
        print(x.size())
        x = self.relu(x)
        x = self.fc2(x)
        print(x.size())

        return x

In [0]:
torch.cuda.empty_cache()

num_epochs = 10

model = MyModel(vocab_size=batcher.vocab_size, embed_size=100, hidden_size=256)
model = model.to(DEVICE)

optimizer = optim.Adam(model.parameters())
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)
criterion = nn.CrossEntropyLoss(ignore_index=batcher.word2index['<pad>'])
criterion.cuda()

CrossEntropyLoss()

In [0]:
for x, y in train_loader:
  print(model(x).size(), y.squeeze(0).unsqueeze(1).size())
  break #посмотрите внимательно на размерность

torch.Size([64, 8])
torch.Size([64, 8, 100])
torch.Size([64, 8, 256])
torch.Size([64, 8, 63632])
torch.Size([64, 8, 63632]) torch.Size([64, 1])


In [0]:
len(batcher.word2index), batcher.vocab_size, len(train_loader)

(63632, 63632, 212604)

In [0]:
def train_epoch(data_iter, model, criterion):
    total_loss = 0
    data_iter = tqdm_notebook(data_iter, total=len(train_loader))
    counter = 0
    for x,y in train_loader:

        y =  y.squeeze(0)
        
        out = model.forward(x)
        print(y.size())
        loss = criterion(out, y).item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)

        optimizer.step()
        #optimizer.zero_grad()
        
        total_loss += loss
        data_iter.set_postfix(loss = loss.item())
        counter +=1
        
    total_loss /= counter
    return total_loss

def valid_epoch(data_iter, model, criterion):
    total_loss = 0
    data_iter = tqdm_notebook(data_iter, total=len(test_loader))
    counter = 0
    for x,y in test_loader:

        y = y.squeeze(0)
        
        out = model.forward(x)
        loss = criterion(out, y).item()
        
        total_loss += loss
        data_iter.set_postfix(loss = loss)
        counter +=1
        
    total_loss /= counter
    return total_loss

for epoch in range(num_epochs):
    model.train()
    loss = train_epoch(train_loader, model, criterion).item()
    print('train', loss)
    
    model.eval()
    with torch.no_grad():
        loss = valid_epoch(test_loader, model, criterion)
        scheduler.step(loss)
        print('valid', loss)

HBox(children=(IntProgress(value=0, max=212604), HTML(value='')))

torch.Size([64, 8])


RuntimeError: ignored