In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import re

import nltk
from nltk.tokenize import word_tokenize

## Continous Bag of Words (CBoW)

Let's take a look at the following sentence:
'I am happy because I am learning'.

- 𝑐𝑜𝑛𝑡𝑒𝑥𝑡=[𝐼,𝑎𝑚,𝑏𝑒𝑐𝑎𝑢𝑠𝑒,𝐼]
- 𝑡𝑎𝑟𝑔𝑒𝑡=ℎ𝑎𝑝𝑝𝑦

For a window size of C=2

In [2]:
corpus = [
    'I am happy beacause i am learning',
    'Hello this is a wonderful world',
    'The capital of India is Delhi',
    'The capital of England is London',
    'I am sad because i did not win the match',
    'He is a good person',
    'She offered me a job',
    'I celebrated diwali and was happy'
]

WINDOW_SIZE=2
BATCH_SIZE=2

In [21]:
def tokenize_corpus(data):
    return [d.lower().split() for d in data]

corpus_token = tokenize_corpus(corpus)

corpus_token

[['i', 'am', 'happy', 'beacause', 'i', 'am', 'learning'],
 ['hello', 'this', 'is', 'a', 'wonderful', 'world'],
 ['the', 'capital', 'of', 'india', 'is', 'delhi'],
 ['the', 'capital', 'of', 'england', 'is', 'london'],
 ['i', 'am', 'sad', 'because', 'i', 'did', 'not', 'win', 'the', 'match'],
 ['he', 'is', 'a', 'good', 'person'],
 ['she', 'offered', 'me', 'a', 'job'],
 ['i', 'celebrated', 'diwali', 'and', 'was', 'happy']]

In [4]:
def generate_vocab(corpus):
    vocab=[]
    for sentence in corpus_token:
        for token in sentence:
            if token not in vocab:
                vocab.append(token)
    
    return vocab, len(vocab)

vocab, len_vocab = generate_vocab(corpus_token)

print(f'Total vocabulary size: {len_vocab}')
print(vocab)

Total vocabulary size: 35
['i', 'am', 'happy', 'beacause', 'learning', 'hello', 'this', 'is', 'a', 'wonderful', 'world', 'the', 'capital', 'of', 'india', 'delhi', 'england', 'london', 'sad', 'because', 'did', 'not', 'win', 'match', 'he', 'good', 'person', 'she', 'offered', 'me', 'job', 'celebrated', 'diwali', 'and', 'was']


In [5]:
word2indx = { w:idx for idx, w in enumerate(vocab)}
indx2word = { idx:w for idx, w in enumerate(vocab)}

print(word2indx['world'])
print(indx2word[10])

10
world


In [6]:
def one_hot_encoding(word_indx, vocab):
    '''
        return one hot encoding of the word indices
    '''
    vocab_len = len(vocab)
    one_hot_content = []
    
    for idx in word_indx:
        word_padd = np.zeros(vocab_len)
        word_padd[idx] = 1  
        one_hot_content.append(word_padd)
        
    return one_hot_content
    

In [34]:
def feature_extraction(corpus_token, window_size, vocab):
    '''
     Get the middle word and its corresponding context words based on window size
    '''
    X_list = []
    y_list = []
    
    for sentences in corpus_token:
        indices = [word2indx[word] for word in sentences]
        
        for i in range(len(sentences)):
            start_indx = i - window_size
            end_indx = i + window_size + 1
            center_indx = i
            
            context_words = indices[start_indx:i] + indices[center_indx+1:end_indx]
            center_word = [indices[center_indx]]
            
            num_context_words = len(context_words)
            oh_context_words = one_hot_encoding(context_words, vocab)
            oh_center_word = one_hot_encoding(center_word, vocab)
            
            X = sum(oh_context_words)/num_context_words
            y = np.array(oh_center_word).reshape(len(vocab),)
            
            #yield X, y
            X_list.append(X)
            y_list.append(y)
            
    return X_list, y_list

In [39]:
X, y = feature_extraction(corpus_token, WINDOW_SIZE, vocab)

print(f'The shape of X: {X[1].shape} and the shape of y: {y[1].shape}')

The shape of X: (35,) and the shape of y: (35,)


In [40]:
class CBOWDataset(Dataset):
    
    def __init__(self, data, window_size, vocab):
        self.data = data
        self.window_size = window_size
        self.vocab = vocab
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        x, y = feature_extraction(self.data, self.window_size, self.vocab)
        
        X_transformed = torch.FloatTensor(x[index])
        y_transformed = torch.LongTensor(y[index])
        
        return X_transformed, y_transformed

In [43]:
cbowds = CBOWDataset(corpus_token, WINDOW_SIZE, vocab)

for x, y in cbowds:
    print(x, y)

tensor([0.0000, 0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]) tensor([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0.0000, 0.0000, 0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]) tensor([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0.5000, 0.2500, 0.0000, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.

In [48]:
## batch loader

cbow_dataloader = DataLoader(cbowds, shuffle=False, batch_size=2)

In [72]:
for b, dl in enumerate(cbow_dataloader):
    break
    
print(dl[0].shape)
print(dl[1].shape)

torch.Size([2, 35])
torch.Size([2, 35])


In [58]:
class CBOWEmbeddingModel(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, output_dim)
        
    def forward(self, x):
        x = self.fc1(x)
        
        return nn.functional.log_softmax(x)

In [64]:
model = CBOWEmbeddingModel(10,20)
model

CBOWEmbeddingModel(
  (fc1): Linear(in_features=10, out_features=20, bias=True)
)

In [65]:
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [66]:
epochs = 100

acc_loss = []

for epoch in range(epochs):
    
    for b, (x, y) in enumerate(cbow_dataloader):
        
        y_predict = model(x)
        losses = criterion(y_predict, y)
        
        if b%10 == 0:
            print(f'Epoch: {epoch} \t Batch:{b} \t Loss:{losses:10.8f}')
            
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
            
    acc_loss.append(losses)

RuntimeError: size mismatch, m1: [2 x 35], m2: [10 x 20] at /Users/distiller/project/conda/conda-bld/pytorch_1595629449223/work/aten/src/TH/generic/THTensorMath.cpp:41