In [1]:
!pip install tsundoku



In [2]:
!pip install sklearn torch tqdm nltk lazyme ansi requests gensim



In [3]:
# before start the kernel, change the environment variable $NLTK_DATA (linux) or %NLTK_DATA% (windows) to the desired data path
!python -m nltk.downloader -d %NLTK_DATA% movie_reviews punkt popular

import nltk
# or
# nltk.download("popular")
# nltk.download("punkt")
# nltk.download("movie_reviews")

[nltk_data] Downloading package movie_reviews to
[nltk_data]     e:/dataset/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to e:/dataset/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     e:/dataset/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     e:/dataset/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     e:/dataset/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     e:/dataset/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_dat

In [4]:
from IPython.display import display, Markdown, Latex
from tsundoku.word2vec_hints import *

# Overview


- <a href="#section-3-0">**3.0. Data Preparation**</a>
  - <a href="#section-3-0-1">3.0.1. *Vocabulary*</a>
    - <a href="#section-3-0-1-a"> Pet Peeve: using `gensim`</a>
  - <a href="#section-3-0-2">3.0.2. *Dataset*</a>  (<a href="#section-3-0-2-hints">Hints</a>)
    - <a href="#section-3-0-2-return-dict">Return `dict` in `__getitem__()`</a>
    - <a href="#section-3-0-2-labeleddata">Try `LabeledDataset`</a>
<br><br>
- <a href="#section-3-1">**3.1. Word2Vec from Scratch**</a>
  - <a href="#section-3-1-1">3.1.1. *CBOW*</a>
  - <a href="#section-3-1-2">3.1.2. *Skipgram*</a>
  - <a href="#section-3-1-3">3.1.3. *Word2Vec Dataset*</a> (<a href="#section-3-1-3-hint">Hints</a>)
  - <a href="#section-3-1-4-hint">3.1.4. *Train a CBOW model*</a>
    - <a href="#section-3-1-4-fill-cbow">The CBOW model</a>
    - <a href="#section-3-1-4-train-cbow">Train the model (*for real*)</a>
    - <a href="#section-3-1-4-evaluate-cbow">Evaluate the model</a>
    - <a href="#section-3-1-4-load-model">Load model at specific epoch</a>
  - <a href="#section-3-1-5">3.1.5. *Train a Skipgram model*</a>
    - <a href="#section-3-1-5-forward">Take a closer look at `forward()`</a>
    - <a href="#section-3-1-5-train">Train the model (*for real*)</a>
    - <a href="section-3-1-5-evaluate">Evaluate the model</a>
  - <a href="#section-3-1-6">3.1.6. *Loading Pre-trained Embeddings*</a>
    - <a href="#section-3-1-6-vocab">Override the Embedding vocabulary</a>
    - <a href="#section-3-1-6-pretrained">Override the Embedding weights</a>
    - <a href="#section-3-1-6-eval-skipgram">Evaluate on the Skipgram task</a>
    - <a href="#section-3-1-6-eval-cbow">Evaluate on the CBOW task</a>
    - <a href="#section-3-1-6-unfreeze-finetune">Unfreeeze and finetune</a>
    - <a href="#section-3-1-6-reval-cbow">Re-evaluate on the CBOW task</a>
<br><br>


<a id="section-3-0"></a>
# 3.0. Data Preparation

Before we train our own embeddings, lets first understand how to read text data into pytorch.
The native pytorch way to load datasets is to use the `torch.utils.data.Dataset` object.

There are already several other libraries that help with loading text datasets, e.g. 

 - FastAI https://docs.fast.ai/text.data.html
 - AllenNLP https://allenai.github.io/allennlp-docs/api/allennlp.data.dataset.html
 - Torch Text https://github.com/pytorch/text#data
 - Texar https://texar.readthedocs.io/en/latest/code/data.html#id4 
 - SpaCy https://github.com/explosion/thinc
 

But to truly understand and use it for the custom datasets you'll see at work, lets learn it the native way.

<a id="section-3-0-1"></a>
## 3.0.1  Vocabulary

Given a text, the first thing to do is to build a vocabulary (i.e. a dictionary of unique words) and assign an index to each unique word.

In [5]:
import random
from itertools import chain

from tqdm import tqdm
from gensim.corpora import Dictionary

import torch
from torch import nn, optim, tensor, autograd
from torch.nn import functional as F
from torch.utils.data import Dataset
import numpy as np

from functools import partial
from torch.utils.data import Dataset, DataLoader
from torch import functional as F


In [6]:
try: # Use the default NLTK tokenizer.
    from nltk import word_tokenize, sent_tokenize 
    # Testing whether it works. 
    # Sometimes it doesn't work on some machines because of setup issues.
    print("It's working: {}".format(word_tokenize(sent_tokenize("This is a foobar sentence. Yes it is.")[0])))
except Exception as ex: # Use a naive sentence tokenizer and toktok.
    print(ex)
    print("something is wrong...")
    import re
    from nltk.tokenize import ToktokTokenizer
    # See https://stackoverflow.com/a/25736515/610569
    sent_tokenize = lambda x: re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)
    # Use the toktok tokenizer that requires no dependencies.
    toktok = ToktokTokenizer()
    word_tokenize = word_tokenize = toktok.tokenize


It's working: ['This', 'is', 'a', 'foobar', 'sentence', '.']


In [7]:

text = """Language users never choose words randomly, and language is essentially
non-random. Statistical hypothesis testing uses a null hypothesis, which
posits randomness. Hence, when we look at linguistic phenomena in corpora, 
the null hypothesis will never be true. Moreover, where there is enough
data, we shall (almost) always be able to establish that it is not true. In
corpus studies, we frequently do have enough data, so the fact that a relation 
between two phenomena is demonstrably non-random, does not support the inference 
that it is not arbitrary. We present experimental evidence
of how arbitrary associations between word frequencies and corpora are
systematically non-random. We review literature in which hypothesis testing 
has been used, and show how it has often led to unhelpful or misleading results.""".lower()

tokenized_text = [word_tokenize(sent) for sent in sent_tokenize(text)]
print("tokenized_text.len: {}".format(len(tokenized_text)))
for i, each in enumerate(tokenized_text):
    print("sentense[{}].len: {}".format(i, len(each)))

    
uniq_tokens = set(chain(*tokenized_text))
print("uniq_tokens.len: {}".format(len(uniq_tokens)))

vocab = {}   # Assign indices to every word.
idx2tok = {} # Also keep an dict of index to words.
for i, token in enumerate(uniq_tokens):
    vocab[token] = i
    idx2tok[i] = token

tokenized_text.len: 7
sentense[0].len: 13
sentense[1].len: 12
sentense[2].len: 19
sentense[3].len: 24
sentense[4].len: 35
sentense[5].len: 17
sentense[6].len: 24
uniq_tokens.len: 87


In [8]:
vocab

{'unhelpful': 0,
 'when': 1,
 'which': 2,
 'relation': 3,
 ',': 4,
 'will': 5,
 'word': 6,
 'studies': 7,
 'systematically': 8,
 'where': 9,
 'arbitrary': 10,
 'moreover': 11,
 'always': 12,
 'users': 13,
 '(': 14,
 'non-random': 15,
 'phenomena': 16,
 'so': 17,
 'inference': 18,
 'the': 19,
 'fact': 20,
 'essentially': 21,
 'demonstrably': 22,
 'statistical': 23,
 'has': 24,
 'not': 25,
 'able': 26,
 'randomly': 27,
 ')': 28,
 'at': 29,
 'does': 30,
 'and': 31,
 'shall': 32,
 'hypothesis': 33,
 'corpus': 34,
 'look': 35,
 'how': 36,
 'do': 37,
 '.': 38,
 'been': 39,
 'review': 40,
 'is': 41,
 'establish': 42,
 'linguistic': 43,
 'have': 44,
 'language': 45,
 'support': 46,
 'literature': 47,
 'frequently': 48,
 'null': 49,
 'associations': 50,
 'uses': 51,
 'randomness': 52,
 'there': 53,
 'present': 54,
 'true': 55,
 'choose': 56,
 'frequencies': 57,
 'show': 58,
 'often': 59,
 'be': 60,
 'evidence': 61,
 'almost': 62,
 'used': 63,
 'experimental': 64,
 'we': 65,
 'testing': 66,
 'in

In [9]:
# Retrieve the index of the word 'corpora'
vocab['corpora']

86

In [10]:
# The indexed representation of the first sentence.

sent0 = tokenized_text[0]

[vocab[token] for token in sent0] 

[45, 13, 68, 56, 79, 27, 4, 31, 45, 41, 21, 15, 38]

<a id="section-3-0-1-a"></a>

### Pet Peeve (Gensim)

I (Liling) don't really like to write my own vectorizer the `gensim` has functions that are optimized for such operations. In fact, I've written a [whole preprocessing pipeline library for me to use for language modelling and machine translation purposes](https://github.com/alvations/komorebi/blob/master/komorebi/text.py) =)

Using `gensim`, I would have written the above as such:

In [8]:
from gensim.corpora.dictionary import Dictionary

In [7]:
vocab = Dictionary(tokenized_text)

NameError: name 'tokenized_text' is not defined

In [12]:
# Note the key-value order is different of gensim from the native Python's
dict(vocab.items())

{0: ',',
 1: '.',
 2: 'and',
 3: 'choose',
 4: 'essentially',
 5: 'is',
 6: 'language',
 7: 'never',
 8: 'non-random',
 9: 'randomly',
 10: 'users',
 11: 'words',
 12: 'a',
 13: 'hypothesis',
 14: 'null',
 15: 'posits',
 16: 'randomness',
 17: 'statistical',
 18: 'testing',
 19: 'uses',
 20: 'which',
 21: 'at',
 22: 'be',
 23: 'corpora',
 24: 'hence',
 25: 'in',
 26: 'linguistic',
 27: 'look',
 28: 'phenomena',
 29: 'the',
 30: 'true',
 31: 'we',
 32: 'when',
 33: 'will',
 34: '(',
 35: ')',
 36: 'able',
 37: 'almost',
 38: 'always',
 39: 'data',
 40: 'enough',
 41: 'establish',
 42: 'it',
 43: 'moreover',
 44: 'not',
 45: 'shall',
 46: 'that',
 47: 'there',
 48: 'to',
 49: 'where',
 50: 'arbitrary',
 51: 'between',
 52: 'corpus',
 53: 'demonstrably',
 54: 'do',
 55: 'does',
 56: 'fact',
 57: 'frequently',
 58: 'have',
 59: 'inference',
 60: 'relation',
 61: 'so',
 62: 'studies',
 63: 'support',
 64: 'two',
 65: 'are',
 66: 'associations',
 67: 'evidence',
 68: 'experimental',
 69: 'fr

In [13]:
vocab.token2id['corpora']

23

In [14]:
vocab.doc2idx(sent0)

[6, 10, 7, 3, 11, 9, 0, 2, 6, 5, 4, 8, 1]

The "indexed form" of the tokens in the sentence forms the ***vectorized*** input to the `nn.Embedding` layer in PyTorch.

<a id="section-3-0-2"></a>

# 3.0.2 Dataset

Lets try creating a `torch.utils.data.Dataset` object.

In [9]:
from torch.utils.data import Dataset, DataLoader

class Text(Dataset):
    def __init__(self, tokenized_texts):
        """
        :param tokenized_texts: Tokenized text.
        :type tokenized_texts: list(list(str))
        """
        self.sents = tokenized_texts
        self.vocab = Dictionary(tokenized_texts)

    def __getitem__(self, index):
        """
        Each data input, in this case, is a non-padded vectorised sentence (indices of tokens).
        Because of non-padding, each X has different length
        
        :param index: Index to the data point.
        :type index: int
        """
        # Hint: You want to return a vectorized sentence here.
        return {'x': self.vectorize(self.sents[index])}

    def vectorize(self, tokens):
        """
        :param tokens: Tokens that should be vectorized. 
        :type tokens: list(str)
        """
        # See https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.doc2idx 
        return self.vocab.doc2idx(tokens)
    
    def unvectorize(self, indices):
        """
        :param indices: Converts the indices back to tokens.
        :type tokens: list(int)
        """
        return [self.vocab[i] for i in indices]

<a id="section-3-0-2-hints"></a>
## Hints to the above cell

In [16]:
# Option 1: To see the hint and partial code for the cell above, uncomment the following line.
#hint_dataset_vectorize()
#code_text_dataset_vectorize()

# Option 2: "I give up just, run the code for me" 
# Uncomment the next two lines, if you really gave up... 
#full_code_text_dataset_vectorize()
#from tsundoku.word2vec import Text


In [17]:
text_dataset = Text(tokenized_text)

In [18]:
text_dataset[0] # First sentence.

{'x': [6, 10, 7, 3, 11, 9, 0, 2, 6, 5, 4, 8, 1]}

<a id="section-3-0-2-return-dict"></a>

### Return `dict` in `__getitem__()`

This is nice if we're just representing sentences/documents by their indices but when we're doing machine learning, we usually have `X` and `Y`. 

If we have labels for the each sentence, we can also put it into to `__getitem__()` by having it return a dictionary.

In [10]:
from torch.utils.data import Dataset, DataLoader

class LabeledText(Dataset):
    def __init__(self, tokenized_texts, labels):
        """
        :param tokenized_texts: Tokenized text.
        :type tokenized_texts: list(list(str))
        """
        self.sents = tokenized_texts
        self.labels = labels # Sentence level labels.
        self.vocab = Dictionary(self.sents)

    def __getitem__(self, index):
        """
        The primary entry point for PyTorch datasets.
        This is were you access the specific data row you want.
        
        Each data input, in this case, is a non-padded vectorised sentence (indices of tokens), and its label Y.
        Because of non-padding, each X has different length
        
        :param index: Index to the data point.
        :type index: int
        """
        return {
            'x': self.vectorize(self.sents[index]), 
            'y': self.labels[index]
        }

    def vectorize(self, tokens):
        """
        :param tokens: Tokens that should be vectorized. 
        :type tokens: list(str)
        """
        # See https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.doc2idx 
        return self.vocab.doc2idx(tokens)
    
    def unvectorize(self, indices):
        """
        :param indices: Converts the indices back to tokens.
        :type tokens: list(int)
        """
        return [self.vocab[i] for i in indices]

<a id="section-3-0-2-labeleddata"></a>

### Lets try the `LabeledDataset` on a movie review corpus

In [11]:
from nltk.corpus import movie_reviews

In [21]:
documents = []
labels = []

for fileid in tqdm(movie_reviews.fileids()):
    label = fileid.split('/')[0]
    doc = word_tokenize(movie_reviews.open(fileid).read())
    documents.append(doc)
    labels.append(label)

100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:06<00:00, 290.93it/s]


In [22]:
print("len(review docs): {}".format(len(documents)))
print("len(docs[0]) i.e. tokens: {}".format(len(documents[0])))
print("len(labels): {}".format(len(labels)))
print("first 5 labels: {}".format(labels[:5]))

len(review docs): 2000
len(docs[0]) i.e. tokens: 841
len(labels): 2000
first 5 labels: ['neg', 'neg', 'neg', 'neg', 'neg']


In [23]:
labeled_dataset = LabeledText(documents, labels)

In [24]:
print(labeled_dataset[0])  # First review in the data.

{'x': [243, 17, 314, 294, 77, 140, 307, 20, 68, 237, 6, 97, 34, 299, 98, 8, 302, 135, 167, 33, 22, 8, 226, 220, 297, 145, 87, 6, 60, 158, 136, 74, 307, 262, 157, 165, 153, 179, 6, 34, 149, 214, 8, 333, 2, 297, 82, 18, 326, 297, 204, 34, 19, 280, 19, 124, 230, 8, 8, 8, 79, 17, 20, 199, 204, 129, 297, 294, 133, 296, 311, 225, 20, 322, 75, 164, 6, 60, 245, 169, 165, 20, 322, 46, 234, 8, 337, 168, 333, 188, 304, 253, 33, 108, 148, 226, 307, 345, 6, 272, 163, 132, 37, 122, 337, 42, 307, 59, 297, 201, 6, 196, 341, 348, 152, 34, 290, 4, 185, 156, 1, 195, 5, 6, 60, 300, 38, 142, 34, 46, 328, 220, 189, 28, 315, 220, 122, 6, 34, 301, 128, 173, 86, 208, 276, 304, 226, 76, 8, 302, 263, 307, 150, 293, 304, 246, 209, 72, 6, 60, 113, 169, 295, 8, 277, 333, 38, 297, 248, 341, 297, 204, 18, 331, 6, 170, 186, 247, 168, 296, 169, 2, 271, 309, 172, 8, 169, 282, 221, 19, 216, 19, 60, 299, 95, 167, 304, 19, 116, 19, 342, 165, 337, 347, 6, 40, 33, 43, 194, 6, 150, 215, 164, 333, 2, 141, 225, 8, 300, 38, 96, 

In [25]:
print(labeled_dataset[0]['x'])  # First review in vectorized index format.

[243, 17, 314, 294, 77, 140, 307, 20, 68, 237, 6, 97, 34, 299, 98, 8, 302, 135, 167, 33, 22, 8, 226, 220, 297, 145, 87, 6, 60, 158, 136, 74, 307, 262, 157, 165, 153, 179, 6, 34, 149, 214, 8, 333, 2, 297, 82, 18, 326, 297, 204, 34, 19, 280, 19, 124, 230, 8, 8, 8, 79, 17, 20, 199, 204, 129, 297, 294, 133, 296, 311, 225, 20, 322, 75, 164, 6, 60, 245, 169, 165, 20, 322, 46, 234, 8, 337, 168, 333, 188, 304, 253, 33, 108, 148, 226, 307, 345, 6, 272, 163, 132, 37, 122, 337, 42, 307, 59, 297, 201, 6, 196, 341, 348, 152, 34, 290, 4, 185, 156, 1, 195, 5, 6, 60, 300, 38, 142, 34, 46, 328, 220, 189, 28, 315, 220, 122, 6, 34, 301, 128, 173, 86, 208, 276, 304, 226, 76, 8, 302, 263, 307, 150, 293, 304, 246, 209, 72, 6, 60, 113, 169, 295, 8, 277, 333, 38, 297, 248, 341, 297, 204, 18, 331, 6, 170, 186, 247, 168, 296, 169, 2, 271, 309, 172, 8, 169, 282, 221, 19, 216, 19, 60, 299, 95, 167, 304, 19, 116, 19, 342, 165, 337, 347, 6, 40, 33, 43, 194, 6, 150, 215, 164, 333, 2, 141, 225, 8, 300, 38, 96, 6, 300

In [26]:
print(labeled_dataset[0]['y'])  # Label of the first review in the data. 

neg


<a id="section-3-1"></a>

# 3.1 Word2Vec Training

Word2Vec has two training variants:

 - **Continuous Bag of Words (CBOW)**: Predict center word from (bag of) context words.
 - **Skip-grams**: Predict context words given center word.
  
Visually, they look like this:



<img src="https://lilianweng.github.io/lil-log/assets/images/word2vec-cbow.png" width="500" align="left">

Fig. 1. The skip-gram model. Both the input vector xx and the output yy are one-hot encoded word representations. <br>The hidden layer is the word embedding of size NN.



<img src="https://lilianweng.github.io/lil-log/assets/images/word2vec-skip-gram.png" width="500" align="left">

Fig. 2. The CBOW model. Word vectors of multiple context words are averaged to get a fixed-length vector as in the hidden layer. Other symbols have the same meanings as in Fig 1.

(Pretty network images above are from [https://lilianweng.github.io](https://lilianweng.github.io/lil-log/2017/10/15/learning-word-embedding.html#context-based-continuous-bag-of-words-cbow))

<a id="section-3-1-1"></a>

## 3.1.1. CBOW

CBOW windows through the sentence and picks out the center word as the `Y` and the surrounding context words as the inputs `X`. 


CBOW uses the neighbouring 4 tokens to predict the center token, i.e. every input X has dim of 4, and y has dim of 1.

input lookup matrix X shape: (4, V)
embedding matrix W shape: (V, N)   V: vocabulary size, N: embedding size
intermediate layer H = input lookup matrix X multiply embedding matrix W, in shape (4, N), then flatten to (1, 4*N)
intermediate layer H then multiplys Matrix W' to product softmax output


In [12]:
from lazyme import per_window, per_chunk

xx =[1,2,3,4,5]
print("use per_window to create windows for {}".format(xx))
print(list(per_window(xx, n=2)))
print(list(per_window(xx, n=3)))
print(list(per_window(xx, n=10)))
print()
print("use per_chunk to create windows for {}".format(xx))
print(list(per_chunk(xx, n=2)))
print(list(per_chunk(xx, n=3)))
print(list(per_chunk(xx, n=4)))
print(list(per_chunk(xx, n=10)))

use per_window to create windows for [1, 2, 3, 4, 5]
[(1, 2), (2, 3), (3, 4), (4, 5)]
[(1, 2, 3), (2, 3, 4), (3, 4, 5)]
[]

use per_chunk to create windows for [1, 2, 3, 4, 5]
[(1, 2), (3, 4), (5, None)]
[(1, 2, 3), (4, 5, None)]
[(1, 2, 3, 4), (5, None, None, None)]
[(1, 2, 3, 4, 5, None, None, None, None, None)]


In [13]:
def per_window(sequence, n=1):
    """
    From http://stackoverflow.com/q/42220614/610569
        >>> list(per_window([1,2,3,4], n=2))
        [(1, 2), (2, 3), (3, 4)]
        >>> list(per_window([1,2,3,4], n=3))
        [(1, 2, 3), (2, 3, 4)]
    """
    start, stop = 0, n
    seq = list(sequence)
    while stop <= len(seq):
        yield seq[start:stop]
        start += 1
        stop += 1

def cbow_iterator(tokens, window_size):
    n = window_size * 2 + 1
    for window in per_window(tokens, n):
        target = window.pop(window_size)
        yield window, target   # X = window ; Y = target. 


In [29]:
xx =[1,2,3,4,5]
print("use per_window to create windows for {}".format(xx))
print(list(per_window(xx, n=2)))


use per_window to create windows for [1, 2, 3, 4, 5]
[[1, 2], [2, 3], [3, 4], [4, 5]]


In [30]:
sent0 = ['language', 'users', 'never', 'choose', 'words', 'randomly', ',', 
         'and', 'language', 'is', 'essentially', 'non-random', '.']

In [31]:
list(cbow_iterator(sent0, 2)) 

[(['language', 'users', 'choose', 'words'], 'never'),
 (['users', 'never', 'words', 'randomly'], 'choose'),
 (['never', 'choose', 'randomly', ','], 'words'),
 (['choose', 'words', ',', 'and'], 'randomly'),
 (['words', 'randomly', 'and', 'language'], ','),
 (['randomly', ',', 'language', 'is'], 'and'),
 ([',', 'and', 'is', 'essentially'], 'language'),
 (['and', 'language', 'essentially', 'non-random'], 'is'),
 (['language', 'is', 'non-random', '.'], 'essentially')]

In [32]:
list(cbow_iterator(sent0, 3)) 

[(['language', 'users', 'never', 'words', 'randomly', ','], 'choose'),
 (['users', 'never', 'choose', 'randomly', ',', 'and'], 'words'),
 (['never', 'choose', 'words', ',', 'and', 'language'], 'randomly'),
 (['choose', 'words', 'randomly', 'and', 'language', 'is'], ','),
 (['words', 'randomly', ',', 'language', 'is', 'essentially'], 'and'),
 (['randomly', ',', 'and', 'is', 'essentially', 'non-random'], 'language'),
 ([',', 'and', 'language', 'essentially', 'non-random', '.'], 'is')]

<a id="section-3-1-2"></a>

## 3.1.2. Skipgram

Skipgram training windows through the sentence and pictures out the center word as the input `X` and the context words as the outputs `Y`, additionally, it will randommly sample words not in the window as **negative samples**.

In [14]:
def skipgram_iterator(tokens, window_size):
    n = window_size * 2 + 1 
    for i, window in enumerate(per_window(tokens, n)):
        target = window.pop(window_size)
        # Generate positive samples.
        for context_word in window:
            yield target, context_word, 1
        # Generate negative samples.
        for _ in range(n-1):
            leftovers = tokens[:i] + tokens[i+n:]
            yield target, random.choice(leftovers), 0

In [34]:
list(skipgram_iterator(sent0, 2))

[('never', 'language', 1),
 ('never', 'users', 1),
 ('never', 'choose', 1),
 ('never', 'words', 1),
 ('never', 'and', 0),
 ('never', 'essentially', 0),
 ('never', 'essentially', 0),
 ('never', '.', 0),
 ('choose', 'users', 1),
 ('choose', 'never', 1),
 ('choose', 'words', 1),
 ('choose', 'randomly', 1),
 ('choose', '.', 0),
 ('choose', 'is', 0),
 ('choose', 'language', 0),
 ('choose', ',', 0),
 ('words', 'never', 1),
 ('words', 'choose', 1),
 ('words', 'randomly', 1),
 ('words', ',', 1),
 ('words', 'language', 0),
 ('words', 'non-random', 0),
 ('words', 'essentially', 0),
 ('words', 'language', 0),
 ('randomly', 'choose', 1),
 ('randomly', 'words', 1),
 ('randomly', ',', 1),
 ('randomly', 'and', 1),
 ('randomly', 'language', 0),
 ('randomly', 'non-random', 0),
 ('randomly', 'language', 0),
 ('randomly', 'never', 0),
 (',', 'words', 1),
 (',', 'randomly', 1),
 (',', 'and', 1),
 (',', 'language', 1),
 (',', 'non-random', 0),
 (',', 'language', 0),
 (',', 'choose', 0),
 (',', 'never', 0),

## Cut-away: What is `partial`?

The [`functools.partial`](https://docs.python.org/3.7/library/functools.html#functools.partial) function in Python is a mechanism to overload a function with preset arguments. 

For example:

In [15]:
from nltk import ngrams

# Generates bigrams
list(ngrams('this is a sentence'.split(), n=2))

[('this', 'is'), ('is', 'a'), ('a', 'sentence')]

In [16]:
from functools import partial

# You can create a new function that "preset" the `n` argument, e.g.
bigrams = partial(ngrams, n=2)
trigrams = partial(ngrams, n=3)

In [37]:
list(trigrams('this is a sentence'.split()))

[('this', 'is', 'a'), ('is', 'a', 'sentence')]

In [38]:
list(bigrams('this is a sentence'.split()))

[('this', 'is'), ('is', 'a'), ('a', 'sentence')]

<a id="section-3-1-3"></a>

## 3.1.3 Word2Vec Dataset

Now that we know what are the inputs `X` and outputs `Y` of the Word2Vec task. 

Lets put everything together and modify the `Dataset` so that `__getitem__` retrieves CBOW or Skipgram formats.

In [17]:

class Word2VecText(Dataset):
    def __init__(self, tokenized_texts, window_size=2, variant="cbow"):
        """
        :param tokenized_texts: Tokenized text.
        :type tokenized_texts: list(list(str))
        """
        self.sents = tokenized_texts
        self._len = len(self.sents)
        self.vocab = Dictionary(self.sents)
        self.window_size = window_size
        self.variant = variant
        if variant.lower() == 'cbow':
            self._iterator = partial(self.cbow_iterator, window_size=self.window_size)
        elif variant.lower() == 'skipgram':
            self._iterator = partial(self.skipgram_iterator, window_size=self.window_size)

    def __getitem__(self, index):
        """
        The primary entry point for PyTorch datasets.
        This is were you access the specific data row you want.
        
        Each item is a list of windows for a sentence/document. 
        Each window is composed of a dictionary of x = window vector, y = target value at the center of the window.
        CBOW e.g.
        [
            {'x': [x1,x2,x4,x5], 'y': x3},
            {'x': [x2,x3,x5,x6], 'y': x4},
            ...
        ]
        
        SkipGram e.g.
        [
            {'x': (x1, x3), 'y': 1},
            {'x': (x2, x3), 'y': 1},
            {'x': (x4, x3), 'y': 1},
            {'x': (x5, x3), 'y': 1},
            {'x': (x9, x3), 'y': 0},
            {'x': (x10, x3), 'y': 0},
            ...
        ]
        

        :param index: Index to the data point.
        :type index: int
        """
        vectorized_sent = self.vectorize(self.sents[index])
        return list(self._iterator(vectorized_sent))

    def __len__(self):
        return self._len

    def vectorize(self, tokens):
        """
        :param tokens: Tokens that should be vectorized.
        :type tokens: list(str)
        """
        # See https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.doc2idx
        return self.vocab.doc2idx(tokens)

    def unvectorize(self, indices):
        """
        :param indices: Converts the indices back to tokens.
        :type tokens: list(int)
        """
        return [self.vocab[i] for i in indices]

    def cbow_iterator(self, sent_tokens, window_size):
        n = window_size * 2 + 1
        for window in per_window(sent_tokens, n):
            window = list(window)
            target = window.pop(window_size)
            yield {"x": window, "y": target}   # X = window ; Y = target. 

    def skipgram_iterator(self, sent_tokens, window_size):
        n = window_size * 2 + 1 
        for i, window in enumerate(per_window(sent_tokens, n)):
            window = list(window)
            target = window.pop(window_size)
            # Generate positive samples.
            for context_word in window:
                yield {"x": (target, context_word), "y": 1}
            # Generate negative samples.
            for _ in range(n-1):
                leftovers = sent_tokens[:i] + sent_tokens[i+n:]
                yield {"x": (target, random.choice(leftovers)), "y": 0}

<a id="section-3-1-3-hint"></a>
## Hints for the cell above.

In [40]:
# Option 1: To see the hint and partial code for the cell above, uncomment the following line.
##hint_word2vec_dataset()

# Option 2: "I give up just, run the code for me" 
# Uncomment the next two lines, if you really gave up... 
##full_code_word2vec_dataset()
##from tsundoku.word2vec import Word2VecText


<a id="section-3-1-4-hint"></a>

## 3.1.4. Train a CBOW model

### Lets Get Some Data

Lets take Kilgarriff (2005) , "Language is never ever, ever random". 

In [18]:
import os
import requests
import io #codecs


# Text version of https://kilgarriff.co.uk/Publications/2005-K-lineer.pdf
if os.path.isfile('language-never-random.txt'):
    with io.open('language-never-random.txt', encoding='utf8') as fin:
        text = fin.read()
else:
    url = "https://gist.githubusercontent.com/alvations/53b01e4076573fea47c6057120bb017a/raw/b01ff96a5f76848450e648f35da6497ca9454e4a/language-never-random.txt"
    text = requests.get(url).content.decode('utf8')
    with io.open('language-never-random.txt', 'w', encoding='utf8') as fout:
        fout.write(text)

tokenized_text = [list(map(str.lower, word_tokenize(sent))) for sent in sent_tokenize(text)]
window_size = 2
w2v_dataset = Word2VecText(tokenized_text, window_size=window_size, variant='cbow')
print(w2v_dataset[0])
print()
w2v_dataset_skipgram = Word2VecText(tokenized_text, window_size=window_size, variant='skipgram')
print(w2v_dataset_skipgram[0])

[{'x': [10, 8, 0, 7], 'y': 11}, {'x': [8, 11, 7, 0], 'y': 0}, {'x': [11, 0, 0, 7], 'y': 7}, {'x': [0, 7, 7, 0], 'y': 0}, {'x': [7, 0, 0, 13], 'y': 7}, {'x': [0, 7, 13, 3], 'y': 0}, {'x': [7, 0, 3, 9], 'y': 13}, {'x': [0, 13, 9, 2], 'y': 3}, {'x': [13, 3, 2, 10], 'y': 9}, {'x': [3, 9, 10, 15], 'y': 2}, {'x': [9, 2, 15, 11], 'y': 10}, {'x': [2, 10, 11, 5], 'y': 15}, {'x': [10, 15, 5, 16], 'y': 11}, {'x': [15, 11, 16, 14], 'y': 5}, {'x': [11, 5, 14, 0], 'y': 16}, {'x': [5, 16, 0, 4], 'y': 14}, {'x': [16, 14, 4, 10], 'y': 0}, {'x': [14, 0, 10, 8], 'y': 4}, {'x': [0, 4, 8, 6], 'y': 10}, {'x': [4, 10, 6, 12], 'y': 8}, {'x': [10, 8, 12, 1], 'y': 6}]

[{'x': (11, 10), 'y': 1}, {'x': (11, 8), 'y': 1}, {'x': (11, 0), 'y': 1}, {'x': (11, 7), 'y': 1}, {'x': (11, 3), 'y': 0}, {'x': (11, 11), 'y': 0}, {'x': (11, 7), 'y': 0}, {'x': (11, 0), 'y': 0}, {'x': (0, 8), 'y': 1}, {'x': (0, 11), 'y': 1}, {'x': (0, 7), 'y': 1}, {'x': (0, 0), 'y': 1}, {'x': (0, 9), 'y': 0}, {'x': (0, 6), 'y': 0}, {'x': (0, 6), 

In [42]:
print(text[:1000])

                       Language is never, ever, ever, random

                                                               ADAM KILGARRIFF




Abstract
Language users never choose words randomly, and language is essentially
non-random. Statistical hypothesis testing uses a null hypothesis, which
posits randomness. Hence, when we look at linguistic phenomena in cor-
pora, the null hypothesis will never be true. Moreover, where there is enough
data, we shall (almost) always be able to establish that it is not true. In
corpus studies, we frequently do have enough data, so the fact that a rela-
tion between two phenomena is demonstrably non-random, does not sup-
port the inference that it is not arbitrary. We present experimental evidence
of how arbitrary associations between word frequencies and corpora are
systematically non-random. We review literature in which hypothesis test-
ing has been used, and show how it has often led to unhelpful or mislead-
ing results.
Keywords: 쎲쎲쎲

1. Int

In [43]:
# Sanity check, lets take a look at the data.
print(tokenized_text[0])

['language', 'is', 'never', ',', 'ever', ',', 'ever', ',', 'random', 'adam', 'kilgarriff', 'abstract', 'language', 'users', 'never', 'choose', 'words', 'randomly', ',', 'and', 'language', 'is', 'essentially', 'non-random', '.']


In [44]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [20]:
from lazyme import color_str

def visualize_predictions(x, y, prediction, vocab, window_size, unk='<unk>'):
    left = ' '.join([vocab.get(int(_x), '<unk>') for _x in x[:window_size]])
    right = ' '.join([vocab.get(int(_x), '<unk>') for _x in x[window_size:]])
    target = vocab.get(int(y), '<unk>')

    if not prediction:
        predicted_word = '______'
    else:
        predicted_word = vocab.get(int(prediction), '<unk>') 
    print(color_str(target, 'green'), '\t' if len(target) > 6 else '\t\t', 
          left, color_str(predicted_word, 'green' if target == predicted_word else 'red'), right)
    



<a id="section-3-1-4-cbow-model"></a>

## The CBOW Model

<img src="https://lilianweng.github.io/lil-log/assets/images/word2vec-cbow.png" width="500" align="left">


In [21]:
sent_idx = 10
window_size = 2
w2v_dataset = Word2VecText(tokenized_text, window_size=window_size, variant='cbow')
print(' '.join(w2v_dataset.sents[sent_idx]))
for w2v_io in w2v_dataset[sent_idx]:
    context, target = w2v_io['x'], w2v_io['y']
    context, target = tensor(context).to(device), tensor(target).to(device)
    visualize_predictions(context, target, None, w2v_dataset.vocab, window_size)

the bulk of linguistic questions concern the dis- tinction between a and m. a linguistic account of a phenomenon gen- erally gives us reason to view the relation between , for example , a verb ’ s syntax and its semantics , as motivated rather than arbitrary .


NameError: name 'device' is not defined

(Image from https://lilianweng.github.io/lil-log/2017/10/15/learning-word-embedding.html)

In [22]:
import torch
from torch import nn, optim, tensor, autograd
from torch.nn import functional as F

class CBOW(nn.Module):
    def __init__(self, vocab_size, embd_size, context_size, hidden_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embd_size)
        self.linear1 = nn.Linear(2*context_size*embd_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, vocab_size)
        
    def forward(self, inputs):
        # vocab_size: V
        # embed_size: E
        # hidden_size: H
        
        # shape: (, 4) ~~> (4, V) ==> via x weights:(V, E) ==>  (4, E)
        embedded = self.embeddings(inputs)
        
        # shape: (4, E) ==> via flatten ==> (1, 4*E)
        flatten_embed = embedded.view((1, -1))
        
        # shape: (1, 4*E) ==> via x weights:(4*E, H) ==> (1, H)
        hid = F.relu(self.linear1(flatten_embed)) 
        
        # shape: (1, H) ==> via x weights:(H, V) ==> (1, V)
        out = self.linear2(hid)
        
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


## Lets take a closer look from the inputs to the first `nn.Linear`

Cos after it reach the first `nn.Linear` it's just the same as our multi-layered perceptron example =)

In [47]:
w2v_dataset[0]

[{'x': [10, 8, 0, 7], 'y': 11},
 {'x': [8, 11, 7, 0], 'y': 0},
 {'x': [11, 0, 0, 7], 'y': 7},
 {'x': [0, 7, 7, 0], 'y': 0},
 {'x': [7, 0, 0, 13], 'y': 7},
 {'x': [0, 7, 13, 3], 'y': 0},
 {'x': [7, 0, 3, 9], 'y': 13},
 {'x': [0, 13, 9, 2], 'y': 3},
 {'x': [13, 3, 2, 10], 'y': 9},
 {'x': [3, 9, 10, 15], 'y': 2},
 {'x': [9, 2, 15, 11], 'y': 10},
 {'x': [2, 10, 11, 5], 'y': 15},
 {'x': [10, 15, 5, 16], 'y': 11},
 {'x': [15, 11, 16, 14], 'y': 5},
 {'x': [11, 5, 14, 0], 'y': 16},
 {'x': [5, 16, 0, 4], 'y': 14},
 {'x': [16, 14, 4, 10], 'y': 0},
 {'x': [14, 0, 10, 8], 'y': 4},
 {'x': [0, 4, 8, 6], 'y': 10},
 {'x': [4, 10, 6, 12], 'y': 8},
 {'x': [10, 8, 12, 1], 'y': 6}]

In [48]:
# Lets take a look at the first output.

x, y = w2v_dataset[0][0]['x'],  w2v_dataset[0][0]['y']

x = tensor(x)
y = autograd.Variable(tensor(y, dtype=torch.long))
# eqv to y = tensor(y, dtype=torch.long, requires_grad=True)
print(x)
print(y)

tensor([10,  8,  0,  7])
tensor(11)


In [49]:
embd_size = 16
emb = nn.Embedding(len(w2v_dataset.vocab), embd_size)
emb.state_dict()

OrderedDict([('weight',
              tensor([[ 0.7153,  1.7479, -0.6117,  ...,  1.0815, -1.7000, -0.6705],
                      [ 0.3753,  0.3314,  0.0829,  ...,  1.0841, -0.8687, -2.0626],
                      [ 0.8162,  1.1755, -1.8758,  ...,  0.2869,  1.5779, -0.2956],
                      ...,
                      [ 0.7069,  0.9629,  1.3972,  ..., -1.6224, -1.4650, -1.1592],
                      [-0.0302, -1.2826, -1.6638,  ...,  0.2575,  0.7669,  1.5370],
                      [-1.1147, -1.0100, -1.1918,  ..., -0.8127, -0.0152, -1.4089]]))])

In [50]:
print(emb.state_dict()['weight'].shape)
emb.state_dict()['weight']

torch.Size([1388, 16])


tensor([[ 0.7153,  1.7479, -0.6117,  ...,  1.0815, -1.7000, -0.6705],
        [ 0.3753,  0.3314,  0.0829,  ...,  1.0841, -0.8687, -2.0626],
        [ 0.8162,  1.1755, -1.8758,  ...,  0.2869,  1.5779, -0.2956],
        ...,
        [ 0.7069,  0.9629,  1.3972,  ..., -1.6224, -1.4650, -1.1592],
        [-0.0302, -1.2826, -1.6638,  ...,  0.2575,  0.7669,  1.5370],
        [-1.1147, -1.0100, -1.1918,  ..., -0.8127, -0.0152, -1.4089]])

In [51]:
print(emb(x).shape)
print(emb(x))

torch.Size([4, 16])
tensor([[-0.3253, -0.2608,  0.2228, -1.4508,  1.0638,  0.1867, -0.3235,  0.5421,
         -1.1007, -2.0327, -0.4239, -1.2057,  0.7376,  1.0613, -0.8919,  1.2731],
        [ 0.4798,  0.5355,  0.0711, -0.5138,  1.5813, -2.1590,  0.4064, -0.0569,
         -0.3886,  0.5179,  0.9036,  0.2085, -1.1335,  0.5126, -1.0820, -1.0251],
        [ 0.7153,  1.7479, -0.6117, -1.4266,  0.8099,  1.6911,  0.6661, -1.0740,
         -1.3789, -1.7196, -0.8043,  0.1148, -1.8357,  1.0815, -1.7000, -0.6705],
        [ 0.0192,  0.8208,  1.2063,  0.0301,  0.6522,  0.2502,  0.9660,  0.6276,
         -1.5130, -1.1698,  0.8151, -0.9970,  0.8429,  0.6015,  0.3918, -0.0344]],
       grad_fn=<EmbeddingBackward>)


In [52]:
print(emb(x).view(1, -1).shape)
emb(x).view(1, -1) # view() = reshape()

torch.Size([1, 64])


tensor([[-0.3253, -0.2608,  0.2228, -1.4508,  1.0638,  0.1867, -0.3235,  0.5421,
         -1.1007, -2.0327, -0.4239, -1.2057,  0.7376,  1.0613, -0.8919,  1.2731,
          0.4798,  0.5355,  0.0711, -0.5138,  1.5813, -2.1590,  0.4064, -0.0569,
         -0.3886,  0.5179,  0.9036,  0.2085, -1.1335,  0.5126, -1.0820, -1.0251,
          0.7153,  1.7479, -0.6117, -1.4266,  0.8099,  1.6911,  0.6661, -1.0740,
         -1.3789, -1.7196, -0.8043,  0.1148, -1.8357,  1.0815, -1.7000, -0.6705,
          0.0192,  0.8208,  1.2063,  0.0301,  0.6522,  0.2502,  0.9660,  0.6276,
         -1.5130, -1.1698,  0.8151, -0.9970,  0.8429,  0.6015,  0.3918, -0.0344]],
       grad_fn=<ViewBackward>)

In [53]:
hidden_size = 100
lin1 = nn.Linear(len(x)*embd_size, hidden_size)
print(lin1.state_dict())

OrderedDict([('weight', tensor([[-0.0506,  0.0021,  0.1154,  ..., -0.0767, -0.0202, -0.0624],
        [ 0.0939, -0.0023,  0.0396,  ...,  0.0817,  0.1098, -0.1113],
        [-0.1063,  0.1238,  0.0366,  ..., -0.1244, -0.0652, -0.0574],
        ...,
        [ 0.1086,  0.0232,  0.0480,  ...,  0.0869,  0.0604,  0.0619],
        [-0.1002, -0.0821,  0.0834,  ...,  0.0251, -0.0598,  0.1157],
        [ 0.0528, -0.0539,  0.0341,  ...,  0.0835, -0.0745,  0.0463]])), ('bias', tensor([ 5.1154e-02,  3.7107e-02,  1.0507e-01,  6.9028e-03,  1.1038e-01,
        -2.6856e-02, -2.9821e-02, -5.7821e-02, -1.1637e-02,  4.2438e-02,
         1.0412e-02,  6.3235e-02,  1.4381e-02, -1.2058e-01, -4.0786e-02,
        -2.5160e-02, -4.4839e-02, -1.0301e-01,  8.9467e-02,  1.2248e-01,
        -1.0799e-01, -4.5749e-02, -3.0971e-02, -1.1941e-01,  3.1865e-02,
        -8.2862e-02, -6.5516e-02, -3.9175e-04, -4.4868e-02, -5.2012e-02,
         1.2923e-02,  2.4054e-02,  3.5355e-02,  5.7571e-02,  2.6998e-03,
         9.4156e-02,

In [54]:
print(lin1.state_dict()['weight'].shape)
print(lin1.state_dict()['weight'])

torch.Size([100, 64])
tensor([[-0.0506,  0.0021,  0.1154,  ..., -0.0767, -0.0202, -0.0624],
        [ 0.0939, -0.0023,  0.0396,  ...,  0.0817,  0.1098, -0.1113],
        [-0.1063,  0.1238,  0.0366,  ..., -0.1244, -0.0652, -0.0574],
        ...,
        [ 0.1086,  0.0232,  0.0480,  ...,  0.0869,  0.0604,  0.0619],
        [-0.1002, -0.0821,  0.0834,  ...,  0.0251, -0.0598,  0.1157],
        [ 0.0528, -0.0539,  0.0341,  ...,  0.0835, -0.0745,  0.0463]])


In [55]:
print(lin1(emb(x).view(1, -1)).shape)
lin1(emb(x).view(1, -1))

torch.Size([1, 100])


tensor([[ 0.2689,  0.1491, -0.2944, -0.8299,  0.6464,  0.2149,  0.8135, -1.1905,
          1.1258,  0.8768,  0.0539, -0.6027,  0.3818,  0.4208,  0.5270, -0.8406,
         -0.7669, -0.2208, -0.5445,  0.3252, -0.1530,  0.3448,  0.1566,  0.5688,
          0.4407,  0.2625, -0.3181,  0.5758,  0.6550, -0.3085, -0.0901, -1.3622,
          0.3786, -0.9632,  0.2392, -0.3284, -1.2036,  0.3415, -0.5742,  0.8493,
         -0.2941,  0.4608,  0.1220,  0.4874, -0.5853,  0.5183, -0.0247, -0.8890,
         -1.3457, -0.3019,  0.0433,  0.3913,  0.8927,  0.8925, -0.3889, -0.5671,
         -0.4037, -0.6404,  0.1642, -1.4540, -0.7041,  0.3827,  0.6824,  0.1639,
          0.5283,  0.2287,  0.1899, -0.0336, -0.1974,  0.4755,  0.0246, -0.9769,
          0.3483, -0.7468, -1.1082, -0.2819, -0.8269, -0.4429, -0.6402, -0.2782,
          0.2357,  0.4178,  0.2248, -0.1553,  0.6363, -0.6831, -0.0275, -0.8776,
         -0.5874,  1.0407,  0.0969, -0.1305, -1.4278, -0.5536, -0.0446, -0.7551,
         -0.2387,  0.4420,  

In [56]:
relu = nn.ReLU()
print(relu(lin1(emb(x).view(1, -1))).shape)
relu(lin1(emb(x).view(1, -1)))

torch.Size([1, 100])


tensor([[0.2689, 0.1491, 0.0000, 0.0000, 0.6464, 0.2149, 0.8135, 0.0000, 1.1258,
         0.8768, 0.0539, 0.0000, 0.3818, 0.4208, 0.5270, 0.0000, 0.0000, 0.0000,
         0.0000, 0.3252, 0.0000, 0.3448, 0.1566, 0.5688, 0.4407, 0.2625, 0.0000,
         0.5758, 0.6550, 0.0000, 0.0000, 0.0000, 0.3786, 0.0000, 0.2392, 0.0000,
         0.0000, 0.3415, 0.0000, 0.8493, 0.0000, 0.4608, 0.1220, 0.4874, 0.0000,
         0.5183, 0.0000, 0.0000, 0.0000, 0.0000, 0.0433, 0.3913, 0.8927, 0.8925,
         0.0000, 0.0000, 0.0000, 0.0000, 0.1642, 0.0000, 0.0000, 0.3827, 0.6824,
         0.1639, 0.5283, 0.2287, 0.1899, 0.0000, 0.0000, 0.4755, 0.0246, 0.0000,
         0.3483, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2357,
         0.4178, 0.2248, 0.0000, 0.6363, 0.0000, 0.0000, 0.0000, 0.0000, 1.0407,
         0.0969, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.4420, 0.1143,
         0.0000]], grad_fn=<ReluBackward0>)

In [57]:
lin2 = nn.Linear(hidden_size, len(w2v_dataset.vocab))
print(lin2.state_dict()['weight'].shape)
lin2.state_dict()['weight']

torch.Size([1388, 100])


tensor([[-0.0433, -0.0022,  0.0363,  ...,  0.0138, -0.0349, -0.0199],
        [-0.0478,  0.0824, -0.0099,  ...,  0.0979,  0.0941, -0.0290],
        [-0.0352,  0.0961, -0.0827,  ...,  0.0174,  0.0876,  0.0253],
        ...,
        [ 0.0594,  0.0685,  0.0564,  ..., -0.0227, -0.0058,  0.0499],
        [-0.0168,  0.0030,  0.0279,  ...,  0.0057,  0.0448,  0.0037],
        [ 0.0339, -0.0923,  0.0680,  ..., -0.0628,  0.0379, -0.0388]])

In [58]:
h_x = relu(lin1(emb(x).view(1, -1)))
print(lin2(h_x).shape)
lin2(h_x)

torch.Size([1, 1388])


tensor([[ 0.0038,  0.1834,  0.1476,  ..., -0.3712, -0.0427,  0.0289]],
       grad_fn=<AddmmBackward>)

In [59]:
softmax = nn.LogSoftmax(dim=1)
softmax(lin2(h_x)).detach().numpy().tolist()

[[-7.252162933349609,
  -7.072550296783447,
  -7.108374118804932,
  -7.20729398727417,
  -7.617002964019775,
  -7.630112171173096,
  -7.162570476531982,
  -6.918042182922363,
  -7.005789279937744,
  -7.161597728729248,
  -7.170647621154785,
  -7.653318881988525,
  -7.321627140045166,
  -7.838001251220703,
  -7.375617504119873,
  -7.493913650512695,
  -7.183315753936768,
  -7.60685920715332,
  -7.1636881828308105,
  -7.343142986297607,
  -7.628304958343506,
  -7.375884056091309,
  -7.088252544403076,
  -7.174910545349121,
  -7.156351089477539,
  -7.492038249969482,
  -6.958305358886719,
  -7.8140106201171875,
  -7.453192710876465,
  -7.362573146820068,
  -7.268645286560059,
  -7.407510757446289,
  -7.3799357414245605,
  -7.364015579223633,
  -7.284485816955566,
  -7.174019813537598,
  -7.6030592918396,
  -7.623497486114502,
  -7.058132171630859,
  -7.456703186035156,
  -7.244445323944092,
  -7.475480079650879,
  -7.380141735076904,
  -7.039546489715576,
  -7.495331287384033,
  -7.069267

In [60]:
# Select the index with highest softmax probabilities
# See https://pytorch.org/docs/stable/torch.html#torch.max
torch.max(softmax(lin2(h_x)), 1)

torch.return_types.max(
values=tensor([-6.5896], grad_fn=<MaxBackward0>),
indices=tensor([1241]))

<a id="section-3-1-4-train-cbow"></a>

# Now, we train the CBOW model for real.

In [23]:
# First we split the data into training and testing.
from sklearn.model_selection import train_test_split

tokenized_text_train, tokenized_text_test = train_test_split(tokenized_text, test_size=0.1, random_state=42)
len(tokenized_text_train), len(tokenized_text_test)

(211, 24)

In [62]:
### Hint: Click here to go back up to see the CBOW model

In [24]:
import torch
from torch import nn, optim, tensor, autograd
from torch.nn import functional as F

class CBOW(nn.Module):
    def __init__(self, vocab_size, embd_size, context_size, hidden_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embd_size)
        self.linear1 = nn.Linear(2*context_size*embd_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, vocab_size)
        
    def forward(self, inputs):
        # Put the input context indices into the embeddings
        # then squeeze it into a single dimension vector with tensor.view((1,-1))
        embedded = self.embeddings(inputs).view((1, -1))
        # Put the embedding input through linear layer,
        # then an activation function to create the hidden layer.
        hid = F.relu(self.linear1(embedded))
        # Put the hidden layer through a second linear layer,
        out = self.linear2(hid)
        # then a last layer activation function to generate
        # pobabilities, hint https://pytorch.org/docs/stable/nn.html#torch.nn.functional.log_softmax
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


In [32]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [64]:
embd_size = 100
learning_rate = 0.003
hidden_size = 100
window_size = 2


# Initialize the dataset.
w2v_dataset = Word2VecText(tokenized_text_train, window_size=window_size, variant='cbow')
vocab_size = len(w2v_dataset.vocab)

criterion = nn.NLLLoss()
# Hint: the CBOW model object you've created.
model = CBOW(vocab_size, embd_size, window_size, hidden_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

losses = []

model = nn.DataParallel(model) # copy data across multiple GPUs

num_epochs = 10
for _e in tqdm(range(num_epochs)):
    epoch_loss = []
    epoch_counter = 0
    for sent_idx in range(w2v_dataset._len):
        for w2v_io in w2v_dataset[sent_idx]:
            # Zero gradient.
            optimizer.zero_grad()
            # Retrieve the inputs and outputs.
            x, y = w2v_io['x'], w2v_io['y']
            x = tensor(x).to(device)
            y = autograd.Variable(tensor(y, dtype=torch.long)).to(device)
            # Calculate the log probability of the context embeddings.
            logprobs = model(x)
            # This unsqueeze thing is really a feature/bug... -_-
            loss = criterion(logprobs, y.unsqueeze(0)) 
            loss.backward()
            optimizer.step()
            epoch_loss.append(float(loss))
            
            if epoch_counter % 100 == 0:
                print(loss)
            epoch_counter = epoch_counter + 1
            
    # Save model after every epoch.
    torch.save(model.state_dict(), 'cbow_checkpoint_{}.pt'.format(_e))
    losses.append(sum(epoch_loss)/len(epoch_loss))



  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

cuda
tensor(7.2880, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.2481, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0015, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.2383, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.4868, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(13.2742, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.4893, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(17.4191, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(8.0035, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.4675, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.6830, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.9785, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.9846, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0008, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(9.1475, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.4502, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.8355, device='cuda:0', grad_fn=<NllLossB

 10%|████████▎                                                                          | 1/10 [00:15<02:23, 16.00s/it]

tensor(4.5573, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.4972, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.2016, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.1085, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.3786, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.7970, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.7664, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.8191, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.7413, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.6256, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.8382, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.0935, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.1851, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.6616, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0029, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.7957, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8017, device='cuda:0', grad_fn=<NllLossBackward

 20%|████████████████▌                                                                  | 2/10 [00:31<02:07, 15.98s/it]

tensor(6.7726, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.5196, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0203, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.4982, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.4213, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.4631, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.4438, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(8.2101, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.5788, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.0100, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.6491, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0031, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.7385, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.8456, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(9.6321e-05, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.3378, device='cuda:0', grad_fn=<NllLossBack

 30%|████████████████████████▉                                                          | 3/10 [00:48<01:52, 16.10s/it]

tensor(6.8801, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.8718, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.7388, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0301, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.4844, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.3913, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.7721, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.4260, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(10.0202, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.5772, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.1359, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.7035, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0073, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.8136, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.8948, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0051, device='cuda:0', grad_fn=<NllLossBackwar

 40%|█████████████████████████████████▏                                                 | 4/10 [01:03<01:35, 15.92s/it]

tensor(4.5444, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.9136, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.8711, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0029, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.3632, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.1580, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.2308, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.7648, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.8772, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.6004, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.2301, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.4371, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.1653, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.8878, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.9490, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.2635, device='cuda:0', grad_fn=<NllLossBackward

 50%|█████████████████████████████████████████▌                                         | 5/10 [01:20<01:20, 16.09s/it]

tensor(4.0037, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.9473, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.9635, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0141, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.3728, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.0261, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.1292, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.9493, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.6239, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.2980, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.7419, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0269, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.9509, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0., device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.9946, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0., device='cuda:0', grad_fn=<NllLossBackward>)
tenso

 60%|█████████████████████████████████████████████████▊                                 | 6/10 [01:35<01:03, 15.90s/it]

tensor(4.3745, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.9769, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(8.0393, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0073, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.2938, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.7382, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0186, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.4577, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(8.0136, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.6565, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.3611, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.6481, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0034, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(8.0113, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(9.5367e-06, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(8.0408, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0., device='cuda:0', grad_fn=<NllLossBackward

 70%|██████████████████████████████████████████████████████████                         | 7/10 [01:52<00:48, 16.01s/it]

tensor(4.2959, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.0032, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(8.1050, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.1176, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.2557, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.8627, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.2376, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(8.0688, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.6892, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.4140, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.7368, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0213, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(8.0639, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.9073e-06, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(8.0861, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0., device='cuda:0', grad_fn=<NllLossBackward

 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [02:06<00:31, 15.70s/it]

tensor(4.2312, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.0650, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(8.1565, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0308, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.2090, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(1.5575, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.6294e-06, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.0809, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(8.1130, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.7168, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.4554, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.7077, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0042, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(8.1072, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0022, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(8.1250, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0., device='cuda:0', grad_fn=<NllLossBackward

 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [02:26<00:16, 16.93s/it]

tensor(7.0959, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(8.1968, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0141, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(5.1631, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(2.1679, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0014, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(3.3257, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(8.1538, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.7416, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(7.4919, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(4.6641, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(8.1446, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0., device='cuda:0', grad_fn=<NllLossBackward>)
tensor(8.1593, device='cuda:0', grad_fn=<NllLossBackward>)
tensor(0., device='cuda:0', grad_fn=<NllLossBackward>)
tensor(6.3901, device='cuda:0', grad_fn=<NllLossBackward>)
tenso

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [02:43<00:00, 16.39s/it]


In [26]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
sns.set_style("darkgrid")
sns.set(rc={'figure.figsize':(12, 8)})

plt.plot(losses)
plt.show()

<a id="section-3-1-4-evaluate-cbow"></a>

# Apply and Evaluate the CBOW Model 

In [66]:
w2v_dataset.vocab[0]

','

In [27]:
from lazyme import color_str

true_positive = 0
all_data = 0
# Iterate through the test sentences. 
for sent in tokenized_text_test:
    # Extract all the CBOW contexts (X) and targets (Y)
    for w2v_io in w2v_dataset._iterator(w2v_dataset.vectorize(sent)):
        # Retrieve the inputs and outputs.
        x = tensor(w2v_io['x']).to(device)
        y = tensor(w2v_io['y']).to(device)
        if -1 in x: # Skip unknown words.
            continue
            
        with torch.no_grad():
            # Remember how to get the best prediction output? 
            # Hint: https://pytorch.org/docs/stable/torch.html#torch.max
            _, prediction =  torch.max(model(x), 1)
        true_positive += int(prediction) == int(y)
        visualize_predictions(x, y, prediction, w2v_dataset.vocab, window_size=window_size)
        all_data += 1

NameError: name 'device' is not defined

In [68]:
torch.max(model(x), 1).values
#model(x)

tensor([-1.9573], device='cuda:0', grad_fn=<MaxBackward0>)

In [69]:
print('Accuracy:', true_positive/all_data)

Accuracy: 0.15319148936170213


<a id="section-3-1-4-load-model"></a>

# Go back to the 5th Epoch

In [70]:
model_5 = CBOW(vocab_size, embd_size, window_size, hidden_size)
model_5 = torch.nn.DataParallel(model_5)
model_5.load_state_dict(torch.load('cbow_checkpoint_5.pt'))
model_5.eval()

DataParallel(
  (module): CBOW(
    (embeddings): Embedding(1303, 100)
    (linear1): Linear(in_features=400, out_features=100, bias=True)
    (linear2): Linear(in_features=100, out_features=1303, bias=True)
  )
)

In [71]:

true_positive = 0
all_data = 0
# Iterate through the test sentences. 
for sent in tokenized_text_test:
    # Extract all the CBOW contexts (X) and targets (Y)
    for w2v_io in w2v_dataset._iterator(w2v_dataset.vectorize(sent)):
        # Retrieve the inputs and outputs.
        x = tensor(w2v_io['x']).to(device)
        y = tensor(w2v_io['y']).to(device)
        
        if -1 in x: # Skip unknown words.
            continue
            
        with torch.no_grad():
            _, prediction =  torch.max(model_5(x), 1)
        true_positive += int(prediction) == int(y)
#         print(prediction)
        visualize_predictions(x, y, prediction, w2v_dataset.vocab, window_size=window_size)
        all_data += 1

[92mis[0m 		 the problem [91min[0m essentially this
[92messentially[0m 	 problem is [91m______[0m this :
[92mthis[0m 		 is essentially [91m______[0m : if
[92m:[0m 		 essentially this [91m______[0m if a
[92mif[0m 		 this : [91m______[0m a word
[92ma[0m 		 : if [91m______[0m word (
[92mword[0m 		 if a [91m______[0m ( or
[92m([0m 		 a word [91m______[0m or bigram
[92mor[0m 		 word ( [91m______[0m bigram ,
[92mbigram[0m 		 ( or [91m______[0m , or
[92m<unk>[0m 		 , or [91m______[0m , or
[92m<unk>[0m 		 , or [91m______[0m etc .
[92mis[0m 		 the web [91m______[0m a vast
[92ma[0m 		 web is [91mthe[0m vast re-
[92mvast[0m 		 is a [91m______[0m re- source
[92mre-[0m 		 a vast [91m______[0m source for
[92msource[0m 		 vast re- [91m______[0m for many
[92mthe[0m 		 is that [91m______[0m association is
[92massociation[0m 	 that the [91m______[0m is random
[92mis[0m 		 the association [91m______[0m random ,
[92mrandom[0

[92mand[0m 		 generated , [91m______[0m the hypothesis
[92mthe[0m 		 , and [91m______[0m hypothesis test
[92mhypothesis[0m 	 and the [91m______[0m test con-
[92m<unk>[0m 		 test con- [91mis[0m the fact
[92m<unk>[0m 		 cases are [91m______[0m in section
[92m<unk>[0m 		 of linguistic [91m______[0m concern the
[92m<unk>[0m 		 the dis- [91m______[0m between a
[92m<unk>[0m 		 a and [91m______[0m a linguistic
[92m<unk>[0m 		 a linguistic [91m______[0m of a
[92m<unk>[0m 		 reason to [91m______[0m the relation
[92mbetween[0m 	 the relation [91m______[0m , for
[92m,[0m 		 relation between [91m______[0m for example
[92mfor[0m 		 between , [91m______[0m example ,
[92mexample[0m 	 , for [91m______[0m , a
[92m,[0m 		 for example [91mthe[0m a verb
[92ma[0m 		 example , [91mever[0m verb ’
[92mverb[0m 		 , a [91m______[0m ’ s
[92m’[0m 		 a verb [92m’[0m s syntax
[92ms[0m 		 verb ’ [91mis[0m syntax and
[92msyntax[0m 		 ’ s [

In [72]:
print('Accuracy:', true_positive/all_data)

Accuracy: 0.13617021276595745


In [73]:
model(x).shape[1] == len(w2v_dataset.vocab)

True

# [optional] How to Handle Unknown Words? 

This is not the best way to handle unknown words, but we can simply assign an index for unknown words.

**Hint:** Ensure that you have `gensim` version >= 3.7.0 first. Otherwise this part of the code won't work. 

Try in your Python environment installation:

```
python -m pip install -U pip
python -m pip install -U gensim>=3.7.0
```

Or within the jupyter notebook:

In [28]:
import gensim
gensim.__version__

'3.8.1'

In [75]:
!python -m pip install -U pip
!python -m pip install -U gensim>=3.7.0

Requirement already up-to-date: pip in d:\apps\anaconda3\envs\torch-nlp\lib\site-packages (19.3.1)


To check version of `gensim` after installation:

In [76]:
vocab = Dictionary(['this is a foo bar sentence'.split()])
dict(vocab.items())

{0: 'a', 1: 'bar', 2: 'foo', 3: 'is', 4: 'sentence', 5: 'this'}

In [77]:
# See https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.patch_with_special_tokens
vocab = Dictionary(['this is a foo bar sentence'.split()])

try:
    special_tokens = {'<pad>': 0, '<unk>': 1}
    vocab.patch_with_special_tokens(special_tokens)
except: # If gensim is not 3.7.0
    pass
    
dict(vocab.items())

{6: 'a',
 7: 'bar',
 2: 'foo',
 3: 'is',
 4: 'sentence',
 5: 'this',
 0: '<pad>',
 1: '<unk>'}

# [optional] Lets Rewrite the `Word2VecText` Object

Now with the (i) unknown word patch in the vocabulary as well as (ii) `skipgram_iterator`

In [29]:
class Word2VecText(Dataset):
    def __init__(self, tokenized_texts, window_size, variant):
        """
        :param tokenized_texts: Tokenized text.
        :type tokenized_texts: list(list(str))
        """
        self.sents = tokenized_texts
        self._len = len(self.sents)
        
        # Add the unknown word patch here.
        self.vocab = Dictionary(self.sents)
        try:
            special_tokens = {'<pad>': 0, '<unk>': 1}
            self.vocab.patch_with_special_tokens(special_tokens)
        except:
            pass
        
        self.window_size = window_size
        self.variant = variant
        if variant.lower() == 'cbow':
            self._iterator = partial(self.cbow_iterator, window_size=self.window_size)
        elif variant.lower() == 'skipgram':
            self._iterator = partial(self.skipgram_iterator, window_size=self.window_size)

    def __getitem__(self, index):
        """
        The primary entry point for PyTorch datasets.
        This is were you access the specific data row you want.
        
        :param index: Index to the data point.
        :type index: int
        """
        vectorized_sent = self.vectorize(self.sents[index])
        
        return list(self._iterator(vectorized_sent))

    def __len__(self):
        return self._len
    
    def vectorize(self, tokens):
        """
        :param tokens: Tokens that should be vectorized. 
        :type tokens: list(str)
        """
        # See https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.doc2idx 
        return self.vocab.doc2idx(tokens, unknown_word_index=1)
    
    def unvectorize(self, indices):
        """
        :param indices: Converts the indices back to tokens.
        :type tokens: list(int)
        """
        return [self.vocab[i] for i in indices]
    
    def cbow_iterator(self, tokens, window_size):
        n = window_size * 2 + 1
        for window in per_window(tokens, n):
            target = window.pop(window_size)
            yield {'x': window, 'y': target}   # X = window ; Y = target. 
            
    def skipgram_iterator(self, tokens, window_size):
        n = window_size * 2 + 1 
        for i, window in enumerate(per_window(tokens, n)):
            focus = window.pop(window_size)
            # Generate positive samples.
            for context_word in window:
                yield {'x': (focus, context_word), 'y':1}
            # Generate negative samples.
            for _ in range(n-1):
                leftovers = tokens[:i] + tokens[i+n:]
                if leftovers:
                    yield {'x': (focus, random.choice(leftovers)), 'y':0}
                

<a id="section-3-1-5"></a>

# Lets try the skipgram task

In [30]:
class SkipGram(nn.Module):
    def __init__(self, vocab_size, embd_size):
        super(SkipGram, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embd_size)
    
    def forward(self, x_focus, x_context):
        # vocab_size: V
        # embed_size: N
        # hidden_size: H
        
        # shape: (, 1) ~~> (1, V) ==> via x weights:(V, N) ==>  (1, N)
        embed_focus = self.embeddings(x_focus).view((1, -1))
        
        # shape: (, 1) ~~> (1, V) ==> via x weights:(V, N) ==>  (1, N)
        embed_context = self.embeddings(x_context).view((1, -1))
        
        # See https://pytorch.org/docs/stable/torch.html#torch.t
        # shape: (1, N), (1, N) ==> via (1,N) x T(1,N) ==> (, 1)
        dotprod_score = torch.mm(embed_focus, torch.t(embed_context))
        log_probs = F.logsigmoid(dotprod_score)
        return log_probs

<a id="section-3-1-5-foward"></a>

# Take a closer look at what's in the `forward()`

In [80]:
xx1 = torch.rand(1,20)
xx2 = torch.rand(1,20)

xx1_numpy = xx1.detach().numpy()
xx2_numpy = xx2.detach().numpy()

In [81]:
print(xx1_numpy.shape)
print(xx2_numpy.T.shape)
print(np.dot(xx1_numpy, xx2_numpy.T))

(1, 20)
(20, 1)
[[6.1949725]]


In [82]:
print(xx1.shape)
print(torch.t(xx2).shape) 

print(torch.mm(xx1, torch.t(xx2))) # 
print(torch.mm(torch.t(xx1), xx2).shape) # 

torch.Size([1, 20])
torch.Size([20, 1])
tensor([[6.1950]])
torch.Size([20, 20])


<a id="section-3-1-5-train"></a>

# Train a Skipgram model (for real)

In [33]:
embd_size = 100
learning_rate = 0.03
hidden_size = 300
window_size = 3

# Initialize the dataset.
w2v_skipgram_dataset = Word2VecText(tokenized_text_train, window_size=3, variant='skipgram')
vocab_size = len(w2v_skipgram_dataset.vocab)

criterion = nn.MSELoss()
# Use the Skipgram object
model = SkipGram(vocab_size, embd_size).to(device)
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

losses = []

model = nn.DataParallel(model)

num_epochs = 6
for _e in tqdm(range(num_epochs)):
    epoch_loss = []
    epoch_count = 0
    for sent_idx in range(w2v_skipgram_dataset._len):
        for w2v_io in w2v_skipgram_dataset[sent_idx]:
            # Retrieve the inputs and outputs.
            x_focus, x_context = w2v_io['x']
            x_focus, x_context = tensor(x_focus).to(device).view((1,1)), tensor(x_context).to(device).view((1,1))
            y = autograd.Variable(tensor(w2v_io['y'], dtype=torch.float)).to(device)
            # Zero gradient.
            model.zero_grad()
            # Calculate the log probability of the context embeddings.
            logprobs = model(x_focus, x_context)
            # This unsqueeze thing is really a feature/bug... -_-
            loss = criterion(logprobs, y.unsqueeze(0)) 
            loss.backward()
            optimizer.step()
            epoch_loss.append(float(loss))
            
            if epoch_count % 100 == 0:
                print(loss)
            epoch_count += 1
            
    torch.save(model.state_dict(), 'skipgram_checkpoint_{}.pt'.format(_e))
    losses.append(sum(epoch_loss)/len(epoch_loss))



  return F.mse_loss(input, target, reduction=self.reduction)


tensor(316.7239, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(1., device='cuda:0', grad_fn=<MseLossBackward>)
tensor(0., device='cuda:0', grad_fn=<MseLossBackward>)
tensor(1., device='cuda:0', grad_fn=<MseLossBackward>)
tensor(1., device='cuda:0', grad_fn=<MseLossBackward>)
tensor(0., device='cuda:0', grad_fn=<MseLossBackward>)
tensor(1., device='cuda:0', grad_fn=<MseLossBackward>)
tensor(1., device='cuda:0', grad_fn=<MseLossBackward>)
tensor(0., device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackwa

tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBac

tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBac

tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBac

 17%|██████████████                                                                      | 1/6 [01:16<06:22, 76.42s/it]

tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBac

tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBac

tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBac

tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBac

 33%|████████████████████████████                                                        | 2/6 [02:32<05:05, 76.41s/it]

tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBac

tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBac

tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBac

tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBac

 50%|██████████████████████████████████████████                                          | 3/6 [03:48<03:48, 76.26s/it]

tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBac

tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBac

tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBac

tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBac

 67%|████████████████████████████████████████████████████████                            | 4/6 [05:04<02:32, 76.25s/it]

tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBac

tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBac

tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBac

tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBac

 83%|██████████████████████████████████████████████████████████████████████              | 5/6 [06:21<01:16, 76.29s/it]

tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBac

tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBac

tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBac

tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)
tensor(nan, device='cuda:0', grad_fn=<MseLossBac

100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [07:35<00:00, 75.98s/it]

tensor(nan, device='cuda:0', grad_fn=<MseLossBackward>)





In [38]:
all_data

5076

<a id="section-3-1-5-evaluate"></a>

# Evaluate the model on the skipgram task

In [34]:

true_positive = 0
all_data = 0
# Iterate through the test sentences. 
for sent in tokenized_text_test:
    # Extract all the CBOW contexts (X) and targets (Y)
    for w2v_io in w2v_skipgram_dataset._iterator(w2v_skipgram_dataset.vectorize(sent)):
        model.zero_grad()
        # Retrieve the inputs and outputs.
        x1, x2 = w2v_io['x']
        x1, x2 = tensor(x1).view((1,1)).to(device), tensor(x2).view((1,1)).to(device)
        y = w2v_io['y']
        _, prediction =  torch.max(model(x1, x2), 1)    
        true_positive += int(prediction) == int(y)
        all_data += 1

In [35]:
print('Accuracy:', true_positive/all_data)

Accuracy: 0.5


## Download the Collobert and Weston SENNA Embeddings


If you're on a Mac or Linux, you can use the `!` bang commands in the next cell to get the data.

```
!pip install kaggle
!mkdir -p .kaggle
!echo '{"username":"natgillin","key":"54ae95ab760b52c3307ed4645c6c9b5d"}' > .kaggle/kaggle.json
!chmod 600 .kaggle/kaggle.json
!kaggle datasets download -d alvations/vegetables-senna-embeddings --force -p ./
```

If you're on windows go to https://www.kaggle.com/alvations/vegetables-senna-embeddings and download the data files. 

What's most important are the 
 - `.txt` file that contains the vocabulary list
 - `.npy` file that contains the binarized numpy array
 
The rows of the numpy array corresponds to the vocabulary in the order from the `.txt` file.

<a id="section-3-1-6-vocab"></a>


## 3.1.6. Loading Pre-trained Embeddings

Lets overwrite the `Word2VecText` object with the pretrained embeddings. 

Most important thing is the overwrite the `Dictionary` from `gensim` with the vocabulary of the pre-trained embeddings, as such:

```python
        # Loads the pretrained keys. 
        with open('senna.wiki-reuters.lm2.50d.txt') as fin:
            pretrained_keys = {line.strip():i for i, line in enumerate(fin)}
        self.vocab = Dictionary({})
        self.vocab.token2id = pretrained_keys
```


In [46]:
pretrained_dir = "D:/projects/tsundoku-master/completed/"

class Word2VecText(Dataset):
    def __init__(self, tokenized_texts, window_size, variant):
        """
        :param tokenized_texts: Tokenized text.
        :type tokenized_texts: list(list(str))
        """
        self.sents = tokenized_texts
        self._len = len(self.sents)
        
        # Loads the pretrained keys. 
        with open(pretrained_dir + 'senna.wiki-reuters.lm2.50d.txt') as fin:
            pretrained_keys = {line.strip():i for i, line in enumerate(fin)}
        self.vocab = Dictionary({})
        self.vocab.token2id = pretrained_keys
        
        self.window_size = window_size
        self.variant = variant
        if variant.lower() == 'cbow':
            self._iterator = partial(self.cbow_iterator, window_size=self.window_size)
        elif variant.lower() == 'skipgram':
            self._iterator = partial(self.skipgram_iterator, window_size=self.window_size)

    def __getitem__(self, index):
        """
        The primary entry point for PyTorch datasets.
        This is were you access the specific data row you want.
        
        :param index: Index to the data point.
        :type index: int
        """
        vectorized_sent = self.vectorize(self.sents[index])
        
        return list(self._iterator(vectorized_sent))

    def __len__(self):
        return self._len
    
    def vectorize(self, tokens):
        """
        :param tokens: Tokens that should be vectorized. 
        :type tokens: list(str)
        """
        # See https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.doc2idx 
        return self.vocab.doc2idx(tokens, unknown_word_index=-1)
    
    def unvectorize(self, indices):
        """
        :param indices: Converts the indices back to tokens.
        :type tokens: list(int)
        """
        return [self.vocab[i] for i in indices]
    
    def cbow_iterator(self, tokens, window_size):
        n = window_size * 2 + 1
        for window in per_window(tokens, n):
            target = window.pop(window_size)
            yield {'x': window, 'y': target}   # X = window ; Y = target. 
            
    def skipgram_iterator(self, tokens, window_size):
        n = window_size * 2 + 1 
        for i, window in enumerate(per_window(tokens, n)):
            focus = window.pop(window_size)
            # Generate positive samples.
            for context_word in window:
                yield {'x': (focus, context_word), 'y':1}
            # Generate negative samples.
            for _ in range(n-1):
                leftovers = tokens[:i] + tokens[i+n:]
                if leftovers:
                    yield {'x': (focus, random.choice(leftovers)), 'y':0}
                

<a id="section-3-1-6-pretrained"></a>

## Override the embeddings layer with the pre-trained weights.

In PyTorch, the weights of the `nn.Embedding` object can be easily overwritten with `from_pretrained` function, see https://pytorch.org/docs/stable/nn.html#embedding

In [55]:
class SkipGram(nn.Module):
    def __init__(self, pretrained_npy):
        super(SkipGram, self).__init__()
        self.embeddings = nn.Embedding.from_pretrained(pretrained_npy)
    
    def forward(self, focus, context):
        # Put the index of the focus word into the embedding layer.
        embed_focus = self.embeddings(focus).view((1, -1))
        # Put the index of the context word into the embedding layer.
        embed_context = self.embeddings(context).view((1, -1))
        # See https://pytorch.org/docs/stable/torch.html#torch.t
        # Do a matrix multiplication between the focus and context embedding
        score = torch.mm(embed_focus, torch.t(embed_context))
        # Then put it through a log sigmoid activation function
        # so that the output is between (log(0), log(1))
        log_probs = F.logsigmoid(score)
        return log_probs

In [56]:
np.load(pretrained_dir + 'senna.wiki-reuters.lm2.50d.npy')

array([[-1.03682 ,  1.77856 , -0.693547, ..., -0.10278 , -0.36428 ,
        -0.64853 ],
       [-2.19067 ,  1.16642 , -1.91385 , ...,  0.870654, -0.33808 ,
        -0.41957 ],
       [ 1.16672 ,  0.811884, -0.115492, ..., -0.104843,  2.26862 ,
         1.21729 ],
       ...,
       [-0.483488,  2.00359 ,  0.186266, ..., -0.114528,  1.50755 ,
        -1.25606 ],
       [ 0.201604,  1.15796 ,  0.888882, ..., -1.28183 ,  0.465847,
        -1.57974 ],
       [-0.238824,  0.443876,  0.290836, ..., -0.802705, -0.318169,
        -1.4733  ]])

In [59]:
w2v_skipgram_dataset = Word2VecText(tokenized_text_train, window_size=window_size, variant='skipgram')
pretrained_npy = torch.tensor(np.load(pretrained_dir + 'senna.wiki-reuters.lm2.50d.npy'))
pretrained_model = SkipGram(pretrained_npy).to(device)

<a id="section-3-1-6-eval-skipgram"></a>
## Test Pretrained Embeddings on the Skipgram Task

In [60]:
true_positive = 0
all_data = 0
# Iterate through the test sentences. 
for sent in tokenized_text_test:
    # Extract all the CBOW contexts (X) and targets (Y)
    for w2v_io in w2v_skipgram_dataset._iterator(w2v_skipgram_dataset.vectorize(sent)):
        pretrained_model.zero_grad()
        # Retrieve the inputs and outputs.
        x1, x2 = w2v_io['x']
        if -1 in (x1, x2): # Skip unknown words.
            continue
        x1, x2 = tensor(x1).to(device), tensor(x2).to(device)
        y = w2v_io['y']
        with torch.no_grad():
            logprobs = pretrained_model(x1, x2)
            _, prediction =  torch.max(logprobs, 1)    
        true_positive += int(prediction) == int(y)
        all_data += 1

In [62]:
with open(pretrained_dir + 'senna.wiki-reuters.lm2.50d.txt') as fin:
    pretrained_keys = {line.strip():i for i, line in enumerate(fin)}

In [63]:
print('Accuracy:', true_positive/all_data)

Accuracy: 0.5001212121212121


<a id="section-3-1-6-eval-cbow"></a>
## Test Pretrained Embeddings on the CBOW Task

In [64]:
class CBOW(nn.Module):
    def __init__(self, pretrained_npy, context_size, hidden_size):
        super(CBOW, self).__init__()
        vocab_size, embd_size = list(pretrained_npy.shape)
        self.embeddings = nn.Embedding.from_pretrained(pretrained_npy)
        self.linear1 = nn.Linear(2*context_size*embd_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, vocab_size)
        
    def forward(self, inputs):
        embedded = self.embeddings(inputs).float().view((1, -1))
        hid = F.relu(self.linear1(embedded))
        out = self.linear2(hid)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


In [67]:
window_size = 5
w2v_dataset = Word2VecText(tokenized_text_train, window_size=window_size, variant='cbow')
hidden_size = 300
pretrained_cbow_model = CBOW(pretrained_npy, window_size, hidden_size).to(device)

In [68]:

true_positive = 0
all_data = 0
# Iterate through the test sentences. 
for sent in tokenized_text_test:
    # Extract all the CBOW contexts (X) and targets (Y)
    for w2v_io in w2v_dataset._iterator(w2v_dataset.vectorize(sent)):
        # Retrieve the inputs and outputs.
        x = tensor(w2v_io['x']).to(device)
        y = tensor(w2v_io['y']).to(device)
        
        if -1 in x: # Skip unknown words.
            continue
        with torch.no_grad():
            _, prediction =  torch.max(pretrained_cbow_model(x), 1)
        true_positive += int(prediction) == int(y)
        visualize_predictions(x, y, prediction, w2v_dataset.vocab, window_size=window_size)
        all_data += 1

[92m:[0m 		 the problem is essentially this [91mmuresan[0m if a word ( or
[92mre-[0m 		 the web is a vast [91mmladen[0m source for many languages .
[92mrandom[0m 		 is that the association is [91mabiomed[0m , arbitrary , motivated or
[92m,[0m 		 that the association is random [91mmotorcade[0m arbitrary , motivated or pre-
[92m<unk>[0m 		 arbitrary , motivated or pre- [91maccounts[0m ( r , a ,
[92minevitably[0m 	 however , their methods are [91munwrapped[0m noisy , suffering , for
[92mnoisy[0m 		 , their methods are inevitably [91mreasonable[0m , suffering , for example
[92m,[0m 		 their methods are inevitably noisy [91mhm[0m suffering , for example ,
[92msuffering[0m 	 methods are inevitably noisy , [91mheadrests[0m , for example , from
[92m,[0m 		 are inevitably noisy , suffering [91mrehearsals[0m for example , from just
[92mfor[0m 		 inevitably noisy , suffering , [91mfarr[0m example , from just those
[92mexample[0m 	 noisy , suffering , 

[92mour[0m 		 random , we know that [91minvoicing[0m corpora are not randomly generated
[92mcorpora[0m 	 , we know that our [91mwicht[0m are not randomly generated ,
[92mare[0m 		 we know that our corpora [91mtraditional[0m not randomly generated , and
[92mnot[0m 		 know that our corpora are [91mtartarstan[0m randomly generated , and the
[92mrandomly[0m 	 that our corpora are not [91mamyrlin[0m generated , and the hypothesis
[92mgenerated[0m 	 our corpora are not randomly [91mworshipping[0m , and the hypothesis test
[92mthe[0m 		 gives us reason to view [91mefan[0m relation between , for example
[92mrelation[0m 	 us reason to view the [91msaur[0m between , for example ,
[92mbetween[0m 	 reason to view the relation [91mtld[0m , for example , a
[92m,[0m 		 to view the relation between [91mtap[0m for example , a verb
[92m<unk>[0m 		 for example , a verb [91mprospekt[0m s syntax and its semantics
[92m,[0m 		 s syntax and its semantics [91mdemo

In [None]:
print('Accuracy:', true_positive/all_data)

<a id="section-3-1-6-unfreeze-finetune"></a>
## Unfreeze the Embedddings and Tune it on the CBOW Task

In [None]:
class CBOW(nn.Module):
    def __init__(self, pretrained_npy, context_size, hidden_size):
        super(CBOW, self).__init__()
        vocab_size, embd_size = list(pretrained_npy.shape)
        # See https://pytorch.org/docs/stable/nn.html#torch.nn.Embedding.from_pretrained
        # Note the `freeze=False`, by default if you use `nn.Embedding.from_pretrained(),
        # `freeze` is set to True
        self.embeddings = nn.Embedding.from_pretrained(pretrained_npy, freeze=False)
        self.linear1 = nn.Linear(2*context_size*embd_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, vocab_size)
        
    def forward(self, inputs):
        embedded = self.embeddings(inputs).float().view((1, -1))
        hid = F.relu(self.linear1(embedded))
        out = self.linear2(hid)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


In [None]:
window_size = 2 
w2v_dataset = Word2VecText(tokenized_text_train, window_size=window_size, variant='cbow')
hidden_size = 300
pretrained_cbow_model = CBOW(pretrained_npy, window_size, hidden_size)

In [None]:
learning_rate = 0.003
criterion = nn.NLLLoss()
optimizer = optim.SGD(pretrained_cbow_model.parameters(), lr=learning_rate)

losses = []

model = nn.DataParallel(pretrained_cbow_model)

num_epochs = 100
for _e in tqdm(range(num_epochs)):
    epoch_loss = []
    for sent_idx in range(w2v_dataset._len):
        for w2v_io in w2v_dataset[sent_idx]:
            # Retrieve the inputs and outputs.
            x = tensor(w2v_io['x']).to(device)
            y = autograd.Variable(tensor(w2v_io['y'], dtype=torch.long)).to(device)
            
            if -1 in x or int(y) == -1:
                continue
            # Zero gradient.
            model.zero_grad()
            # Calculate the log probability of the context embeddings.
            logprobs = pretrained_cbow_model(x)
            # This unsqueeze thing is really a feature/bug... -_-
            loss = criterion(logprobs, y.unsqueeze(0)) 
            loss.backward()
            optimizer.step()
            epoch_loss.append(float(loss))
    # Save model after every epoch.
    torch.save(model.state_dict(), 'cbow_finetuning_checkpoint_{}.pt'.format(_e))
    losses.append(sum(epoch_loss)/len(epoch_loss))



<a id="section-3-1-6-reval-cbow"></a>

## Re-Test Pretrained Embeddings on the CBOW Task

In [None]:

true_positive = 0
all_data = 0
# Iterate through the test sentences. 
for sent in tokenized_text_test:
    # Extract all the CBOW contexts (X) and targets (Y)
    for w2v_io in w2v_dataset._iterator(w2v_dataset.vectorize(sent)):
        # Retrieve the inputs and outputs.
        x = tensor(w2v_io['x']).to(device)
        y = tensor(w2v_io['y']).to(device)
        
        if -1 in x: # Skip unknown words.
            continue
        with torch.no_grad():
            _, prediction =  torch.max(pretrained_cbow_model(x), 1)
        true_positive += int(prediction) == int(y)
        visualize_predictions(x, y, prediction, w2v_dataset.vocab, window_size=window_size)
        all_data += 1

In [None]:
print('Accuracy:', true_positive/all_data)