In [173]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

import re

import nltk
from nltk.tokenize import word_tokenize

## Continous Bag of Words (CBoW)

Let's take a look at the following sentence:
'I am happy because I am learning'.

- 𝑐𝑜𝑛𝑡𝑒𝑥𝑡=[𝐼,𝑎𝑚,𝑏𝑒𝑐𝑎𝑢𝑠𝑒,𝐼]
- 𝑡𝑎𝑟𝑔𝑒𝑡=ℎ𝑎𝑝𝑝𝑦

For a window size of C=2

In [174]:
corpus = [
    'I am happy beacause i am learning',
    'Hello this is a wonderful world',
    'The capital of India is Delhi',
    'The capital of England is London',
    'I am sad because i did not win the match',
    'He is a good person',
    'She offered me a job',
    'I celebrated diwali and was happy'
]

WINDOW_SIZE=2
BATCH_SIZE=2

In [175]:
def tokenize_corpus(data):
    return [d.lower().split() for d in data]

corpus_token = tokenize_corpus(corpus)

corpus_token

[['i', 'am', 'happy', 'beacause', 'i', 'am', 'learning'],
 ['hello', 'this', 'is', 'a', 'wonderful', 'world'],
 ['the', 'capital', 'of', 'india', 'is', 'delhi'],
 ['the', 'capital', 'of', 'england', 'is', 'london'],
 ['i', 'am', 'sad', 'because', 'i', 'did', 'not', 'win', 'the', 'match'],
 ['he', 'is', 'a', 'good', 'person'],
 ['she', 'offered', 'me', 'a', 'job'],
 ['i', 'celebrated', 'diwali', 'and', 'was', 'happy']]

In [176]:
def generate_vocab(corpus):
    vocab=[]
    for sentence in corpus_token:
        for token in sentence:
            if token not in vocab:
                vocab.append(token)
    
    return vocab, len(vocab)

vocab, len_vocab = generate_vocab(corpus_token)

print(f'Total vocabulary size: {len_vocab}')
print(vocab)

Total vocabulary size: 35
['i', 'am', 'happy', 'beacause', 'learning', 'hello', 'this', 'is', 'a', 'wonderful', 'world', 'the', 'capital', 'of', 'india', 'delhi', 'england', 'london', 'sad', 'because', 'did', 'not', 'win', 'match', 'he', 'good', 'person', 'she', 'offered', 'me', 'job', 'celebrated', 'diwali', 'and', 'was']


In [177]:
word2indx = { w:idx for idx, w in enumerate(vocab)}
indx2word = { idx:w for idx, w in enumerate(vocab)}

print(word2indx['world'])
print(indx2word[10])

10
world


In [178]:
def one_hot_encoding(word_indx, vocab):
    '''
        return one hot encoding of the word indices
    '''
    vocab_len = len(vocab)
    one_hot_content = []
    
    for idx in word_indx:
        word_padd = np.zeros(vocab_len)
        word_padd[idx] = 1  
        one_hot_content.append(word_padd)
        
    return one_hot_content
    

In [259]:
def feature_extraction(corpus_token, window_size, vocab):
    '''
     Get the middle word and its corresponding context words based on window size
    '''
    
    for sentences in corpus_token:
        indices = [word2indx[word] for word in sentences]
        
        for i in range(len(sentences)):
            start_indx = i - window_size
            end_indx = i + window_size + 1
            center_indx = i
            
            context_words = indices[start_indx:i] + indices[center_indx+1:end_indx]
            center_word = [indices[center_indx]]
            
            num_context_words = len(context_words)
            oh_context_words = one_hot_encoding(context_words, vocab)
            oh_center_word = one_hot_encoding(center_word, vocab)
            
            X = sum(oh_context_words)/num_context_words
            y = np.array(oh_center_word).reshape(len(vocab),)
            
            yield X, y
            

In [260]:
X, y = next(feature_extraction(corpus_token, WINDOW_SIZE, vocab))

print(f'The shape of X: {X.shape} and the shape of y: {y.shape}')

The shape of X: (35,) and the shape of y: (35,)


In [306]:
for i,(a,b) in enumerate(feature_extraction(corpus_token, WINDOW_SIZE, vocab)):
    if i<2:
        print(i,'\t\t',a)

0 		 [0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
1 		 [0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]


In [307]:
X

array([0. , 0.5, 0.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ])

In [308]:
def generate_batch(corpus, window_size, vocab, batch_size=2):
    all_contents_X = []
    all_contents_y = []
    batch_X = []
    batch_y = []
    
    for X,y in feature_extraction(corpus, window_size, vocab):
        all_contents_X.append(X)
        all_contents_y.append(y)
    
    for i, items in enumerate(all_contents_X):
        while len(batch_X) < batch_size:
            batch_X.append(all_contents_X[i])
            batch_y.append(all_contents_y[i])
        else:
            yield np.array(batch_X).T, np.array(batch_y).T
            batch_X = []
            batch_y = []

In [309]:
X1, y1 = next(generate_batch(corpus_token, WINDOW_SIZE, vocab, BATCH_SIZE))

print(f'The shape of X: {X1.shape} and the shape of y: {y1.shape}')

The shape of X: (35, 2) and the shape of y: (35, 2)


In [310]:
X1

array([[0. , 0. ],
       [0.5, 0.5],
       [0.5, 0.5],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ],
       [0. , 0. ]])

In [311]:
y1

array([[1., 1.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.]])

In [288]:
class CBOWEmbeddingModel(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, output_dim)
        
    def forward(self, x):
        x = self.log_softmax(self.fc1(x))
        return x

In [289]:
model = CBOWEmbeddingModel(10,20)
model

CBOWEmbeddingModel(
  (fc1): Linear(in_features=10, out_features=20, bias=True)
)

In [290]:
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [232]:
epochs = 100

acc_loss = []

for epoch in range(epochs):
    #print(epoch)
    pass