# Continuous Bag of Words (CBOW)

© Data Trainers LLC. GPL v 3.0.

**Author:** Axel Sirota


In this notebook we will train a CBOW word embedding model from scratch on a well-known dataset: the Yelp reviews dataset. The dataset is hosted on Dropbox, and the cell that downloads the file is already written for you.

Take it easy and pay attention to the model, how easy it is to define, and the nuances of iterating over the corpus when generating the dataset.

You can run this lab either locally or in Colab.

- To run in Colab, just go to `https://colab.research.google.com`, sign in, and upload this notebook. Colab has GPU access for free.
- To run locally, just run `jupyter notebook` and open the notebook in this lab. You will first need to install the requirements in `requirements.txt`.

Follow the instructions. Good luck!



In [None]:
!pip install --upgrade  textblob gensim pytorch-nlp

In [None]:
import multiprocessing
import torch
import torch.nn as nn
import torch.optim as optim
import itertools
import sys
from textblob import TextBlob, Word
import numpy as np
import random
import os
import pandas as pd
import gensim
import warnings
import nltk

embedding_dim = 50
epochs=100


def set_seeds_and_trace():
  """Seed every random number generator this notebook relies on.

  Python's ``random``, NumPy and PyTorch are all seeded with 42 so that
  weight initialisation and any sampling are repeatable between runs.
  """
  # NOTE: PYTHONHASHSEED is only read when an interpreter starts, so setting
  # it here affects child processes, not the already-running interpreter.
  os.environ['PYTHONHASHSEED'] = '0'
  np.random.seed(42)
  random.seed(42)
  # torch keeps its own RNG, independent of the two seeded above; without
  # this call the model's initial weights differ on every run.
  torch.manual_seed(42)


set_seeds_and_trace()
warnings.filterwarnings('ignore')
nltk.download('punkt')
def textblob_tokenizer(x):
  """Return the word tokens of the text ``x`` via TextBlob's ``words`` property."""
  return TextBlob(x).words

In [None]:
%%writefile get_data.sh
# Download the Yelp reviews CSV unless a non-empty copy is already present.
if [ ! -s yelp.csv ]; then
  # The URL is quoted so the shell never treats '?' as a glob character.
  # wget -O creates the output file even when the download fails, so remove
  # the partial file on error; otherwise the guard above would skip the
  # download on every later run.
  wget -O yelp.csv 'https://www.dropbox.com/s/xds4lua69b7okw8/yelp.csv?dl=0' || { rm -f yelp.csv; exit 1; }
fi

In [None]:
!bash get_data.sh

In [None]:
# Load the raw reviews downloaded by get_data.sh.
path = './yelp.csv'
yelp = pd.read_csv(path)

# Keep only the two extreme ratings: 1-star (worst) and 5-star (best) reviews.
yelp_best_worst = yelp[(yelp['stars'] == 1) | (yelp['stars'] == 5)]

# X holds the review texts; y is the binary label (1 star -> 0, 5 stars -> 1).
X = yelp_best_worst['text']
y = yelp_best_worst['stars'].map({1: 0, 5: 1})

In [None]:
# Create corpus of sentences such that the sentence has more than 3 words
corpus = [None]

At this point we have a list (any iterable will do) of sentences that are longer than 3 words; filtering out very short entries like this is standard practice. Now we must build (fit) the `LabelEncoder` on the corpus in order to map each word to an ID, and then use it to convert the corpus from lists of words into lists of identifiers.


In [None]:
import itertools
from torchnlp.encoders import LabelEncoder

# Again, use the LabelEncoder to create the tokenizer and fit it.
ids_from_words = None

print(f'Before the tokenizer: {corpus[:1]}')

#Now use the same "trained" tokenizer to convert the corpus from words to IDs with the batch_encode method
tokenized_corpus = None

print(f'After the tokenizer: {tokenized_corpus[:1]}')

In [None]:
vocab_size = len(ids_from_words.vocab)



In [None]:
print(f'First 5 corpus items are {tokenized_corpus[:5]}')
print(f'Length of corpus is {len(tokenized_corpus)}')



In [None]:
type(tokenized_corpus)

In [None]:
def ids_from_text(text):
  """Encode ``text`` by delegating to ``ids_from_words.batch_encode``.

  NOTE(review): ``text`` is presumably an iterable of tokens known to the
  encoder -- confirm against how ``ids_from_words`` is fitted.
  """
  encoded = ids_from_words.batch_encode(text)
  return encoded

def text_from_ids(ids):
  """Decode ``ids`` by delegating to ``ids_from_words.batch_decode``.

  This is the inverse direction of ``ids_from_text``.
  """
  decoded = ids_from_words.batch_decode(ids)
  return decoded

In [None]:
def pad_sequence_of_tokens(x, maxlen, unk_token='UNK'):
  """Return a copy of ``x`` right-padded with ``unk_token`` up to ``maxlen``.

  Args:
    x: Sequence of tokens. It is never modified.
    maxlen: Minimum length of the returned list.
    unk_token: Filler appended until the list reaches ``maxlen``.

  Returns:
    A new list. Sequences that already have ``maxlen`` or more items are
    returned in full; nothing is ever truncated.
  """
  # Copy first so the caller's sequence is left untouched.
  padded = list(x)
  # A non-positive repeat count yields an empty list, so no guard is needed.
  padded.extend([unk_token] * (maxlen - len(padded)))
  return padded

In [None]:
# Build the CBOW training rows: for every word of every text, the ids of the
# surrounding context window followed by the id of the word itself.
def create_context_target_pairs(texts, context_size):
    """Turn texts into rows of context ids plus target id for CBOW training.

    Args:
        texts: Iterable of strings; each one is split on whitespace.
        context_size: Number of words taken on each side of the target word.

    Returns:
        A list with one 1-D ``torch.long`` tensor per word. Each tensor holds
        the context ids, padded to ``2 * context_size`` entries, followed by
        the target id in the last position.
    """
    # Every row must have the same width so the rows can be stacked later;
    # the full window is context_size words on each side of the target.
    window = 2 * context_size
    # NOTE(review): assumes every token, including the filler token added by
    # pad_sequence_of_tokens, is present in token_to_index -- confirm against
    # how the encoder is fitted.
    token_to_index = ids_from_words.token_to_index
    data = []
    for text in texts:
        tokens = text.split()
        for i, word in enumerate(tokens):
            start = max(0, i - context_size)
            end = min(len(tokens), i + context_size + 1)
            neighbours = [tokens[j] for j in range(start, end) if j != i]
            context = pad_sequence_of_tokens(neighbours, maxlen=window)
            row = [token_to_index[w] for w in context]
            row.append(token_to_index[word])
            # Ids index an embedding table and serve as NLLLoss targets, so
            # they must be integer (long) values rather than float32.
            data.append(torch.tensor(row, dtype=torch.long))
    return data

Notice now in a sample how we construct X and y to predict words

In [None]:
data = create_context_target_pairs(corpus[:500], 2)  # only the first 500 corpus entries, with 2 context words per side, to keep RAM usage manageable

In [None]:
# Stack the tensors to create a 2D tensor
data = None

In [None]:
#Set X, and y

X = None
y = None

Now comes the core part: defining the model. Let's add an `Embedding` layer (which maps word ids to vectors of size `embedding_dim`, 50 in this notebook), an average over the embeddings of the context words, a hidden `Linear` layer, and a final `Linear` layer that scores every vocabulary word as the candidate center word. This is classic CBOW.


In [None]:
class CBOW(nn.Module):
    """Continuous Bag of Words model (exercise skeleton).

    The three layers are left as ``None`` placeholders to be filled in;
    ``forward`` already wires them together.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        super().__init__()
        # TODO: the embedding layer -- which dimensions should it have?
        self.embeddings = None
        # TODO: linear layer acting as the hidden layer -- pick a hidden size.
        self.linear1 = None
        # TODO: final linear layer predicting the center word -- how many
        # output neurons should it have?
        self.linear2 = None

    def forward(self, inputs):
        """Return log-probabilities computed from the averaged embeddings.

        Assumes ``inputs`` holds word ids with the context words along
        dim 1 -- TODO confirm against the tensors built for training.
        """
        # Averaging over dim=1 merges the context words into one vector per
        # example; this is the key step of the model.
        context_mean = torch.mean(self.embeddings(inputs), dim=1)
        hidden = torch.relu(self.linear1(context_mean))
        scores = self.linear2(hidden)
        return torch.log_softmax(scores, dim=1)


In [None]:
def train_cbow(X, y, model, loss_function, optimizer, epochs):
    """Training loop skeleton for the CBOW model (exercise).

    The bare ``None`` lines and the ``None`` assignments mark the steps that
    still have to be written. Until Step 3 is completed ``loss`` stays
    ``None`` and the ``loss.item()`` call below fails.

    Args:
        X: Model inputs -- presumably the context word ids; confirm against
            the cell that sets ``X``.
        y: Targets -- presumably the center word ids; confirm against the
            cell that sets ``y``.
        model: The model to train; it is returned at the end.
        loss_function: Callable used in Step 3 to compute the loss.
        optimizer: Optimizer used in Steps 1 and 4.
        epochs: Number of iterations of the outer loop.

    Returns:
        The ``model`` object that was passed in.
    """
    for epoch in range(epochs):
        # Reset on every epoch, so the value printed below is this epoch's loss.
        total_loss = 0

        # Step 1. Recall that torch *accumulates* gradients. Before each new
        # pass you need to zero out the gradients left over from the last one.
        None

        # Step 2. Run the forward pass, getting log probabilities for the center word
        log_probs = None

        # Step 3. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = None

        # Step 4. Do the backward pass and update the parameters
        None
        None

        total_loss += loss.item()
        # Print progress on every 10th epoch
        if (epoch + 1) % 10 == 0:
            print('Epoch: {}, Loss: {:.4f}'.format(epoch + 1, total_loss))
    return model

In [None]:
# Words taken on each side of the target word; this matches the value 2 passed
# to create_context_target_pairs when the training data was built.
context_size=2
embedding_dim=50
vocab_size = len(ids_from_words.vocab)
# The third argument is the total number of context words per example
# (context_size on each side).
model = CBOW(vocab_size, embedding_dim, context_size * 2)
# NLLLoss expects log-probabilities, which is what CBOW.forward returns
# (it ends with log_softmax).
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)


In [None]:
trained_model = train_cbow(X, y, model, loss_function, optimizer, epochs=1)

In [None]:
import gensim
from gensim.models.keyedvectors import KeyedVectors

embeddings = trained_model.embeddings.weight.data.cpu().numpy()

# Now, we need to save these embeddings in a format that gensim can understand
# For that, we will use the KeyedVectors instance in gensim

# Instantiate the KeyedVectors with the correct size
kv = KeyedVectors(vector_size=embeddings.shape[1])






In [None]:
# Add the vectors and their corresponding words to the KeyedVectors instance
kv.add_vectors(ids_from_words.index_to_token, embeddings)

In [None]:
kv.most_similar(positive=['gasoline'])

In [None]:
kv.most_similar(negative=['apple'])