# CBOW model trained on 20000 lieues sous les mers

## Needed libraries

You will need the following new libraries:

-   `spacy` for tokenizing
-   `gensim` for cosine similarities (use `gensim>=4.0.0`)

You will also need to download the spaCy rules for tokenizing French text.

``` bash
python -m spacy download fr_core_news_sm
```

In [1]:
import numpy as np
import torch
from torch import nn
import torch.optim as optim

import spacy
from gensim.models.keyedvectors import KeyedVectors

## Tokenizing the corpus

In [61]:
# Load the small French spaCy pipeline; only its tokenizer is applied to
# the raw text of the novel.
spacy_fr = spacy.load("fr_core_news_sm")
with open("data/20_000_lieues_sous_les_mers.txt", "r", encoding="utf-8") as f:
    document = spacy_fr.tokenizer(f.read())

# Keep the tokens that are purely alphabetic or purely numeric, lower-case
# them, and truncate the result to its first 10000 entries.
tokens = [
    tok.text.lower()
    for tok in document if tok.is_alpha or tok.is_digit
][:10000]

# Vocabulary: `idx2tok` lists every distinct token exactly once and
# `tok2idx` is the inverse mapping (token -> position in `idx2tok`).
# Sorting the set gives the same index assignment on every run; the
# iteration order of a bare set of strings changes between interpreter
# sessions because of hash randomization.
idx2tok = sorted(set(tokens))
tok2idx = {token: i for i, token in enumerate(idx2tok)}

In [63]:
# Number of tokens kept after filtering (at most 10000 because of the slice
# applied when `tokens` is built)
len(tokens)

10000

## The continuous bag of words model

In [None]:
class CBOW(nn.Module):
    """Continuous bag-of-words model.

    The embeddings of the context words are averaged and the average is
    projected (without bias) onto one score per vocabulary entry.
    """

    def __init__(self, vocab_size, embedding_size):
        """Create the embedding table and the bias-free output projection.

        Args:
            vocab_size: number of distinct tokens.
            embedding_size: dimension of each token embedding.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

        # One `embedding_size`-dimensional vector per vocabulary entry
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_size
        )
        # Maps an averaged context embedding to `vocab_size` scores
        self.U_transpose = nn.Linear(
            in_features=embedding_size, out_features=vocab_size, bias=False
        )

    def forward(self, context):
        """Score every vocabulary entry as center word of each context.

        Args:
            context: integer tensor of context-word indexes, one row per
                example (`batch_size` x number of context words).

        Returns:
            Tensor of size `batch_size` x `vocab_size` holding unnormalized
            scores.
        """
        # Look up the embeddings, average them over the context axis
        # (dim 1), then project the average onto the vocabulary.
        return self.U_transpose(self.embeddings(context).mean(dim=1))


# Model dimensions: one output score per distinct token, 32-dimensional
# embeddings
EMBEDDING_SIZE = 32
VOCAB_SIZE = len(idx2tok)

# Instantiate the continuous bag-of-words model
cbow = CBOW(vocab_size=VOCAB_SIZE, embedding_size=EMBEDDING_SIZE)

# Use the first CUDA device when one is available, the CPU otherwise
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
cbow.to(device)

CBOW(
  (embeddings): Embedding(2806, 32)
  (U_transpose): Linear(in_features=32, out_features=2806, bias=False)
)

## Preparing the data

In [127]:
# Sliding-window generator turning a token list into (center, context)
# training pairs
def ngrams_iterator(token_list, ngrams, vocab=None):
    """Generate (center, context) index pairs from successive N-grams.

    A window of `ngrams` consecutive tokens slides over `token_list` one
    position at a time. In each window the token at position `ngrams // 2`
    is the center word and the remaining tokens form the context.

    Args:
        token_list: sequence of tokens.
        ngrams: window length (center word included); must be >= 1.
        vocab: optional mapping from token to integer index. When omitted,
            the module-level `tok2idx` mapping is used.

    Yields:
        Tuples `(center, context)` where `center` is the index of the
        center word and `context` is an int64 Numpy array with the indexes
        of the other `ngrams - 1` words, in their original order. Nothing
        is yielded when `token_list` is shorter than `ngrams`.

    Raises:
        ValueError: if `ngrams` is smaller than 1 (raised when iteration
            starts, since this is a generator).
        KeyError: if a token of a window is missing from the vocabulary.
    """
    if ngrams < 1:
        raise ValueError(f"ngrams must be a positive integer, got {ngrams}")

    # Fall back to the notebook-level mapping when none is passed in
    token_to_index = tok2idx if vocab is None else vocab
    center_pos = ngrams // 2

    for start in range(len(token_list) - ngrams + 1):
        idxs = [token_to_index[tok] for tok in token_list[start:start + ngrams]]

        # Remove the center element; what is left in `idxs` is the context
        center = idxs.pop(center_pos)

        # Yield the context as a Numpy array so that Pytorch converts it to
        # a Tensor when batching. The dtype is pinned so it does not depend
        # on the platform default (or fall back to float for an empty
        # context).
        yield center, np.array(idxs, dtype=np.int64)


# Window length of the n-grams and mini-batch size
NGRAMS = 5
BATCH_SIZE = 512

# Materialize every (center, context) pair in a list and wrap that list in a
# DataLoader that serves shuffled mini-batches
ngrams = list(ngrams_iterator(tokens, NGRAMS))
data = torch.utils.data.DataLoader(
    dataset=ngrams,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

In [66]:
# Peek at the first six raw tokens produced by the tokenizer (before
# filtering and lower-casing)
document[:6]

  VINGT MILLE LIEUES
  SOUS

In [67]:
# Print the context words of the first n-gram; each entry of `ngrams` is a
# (center, context) pair, so `ngrams[0][1]` holds the context indexes
for i in ngrams[0][1]:
    print(idx2tok[i])

vingt
mille
sous
les


In [68]:
# Print the center word of the first n-gram (`ngrams[0][0]` is its index)
print(idx2tok[ngrams[0][0]])

lieues


## Learn CBOW model

In [69]:
# Cross-entropy between the scores returned by the model and the index of
# the true center word
ce_loss = nn.CrossEntropyLoss()

# Adam updates every parameter of the CBOW model with a learning rate of 0.01
optimizer = optim.Adam(params=cbow.parameters(), lr=0.01)

In [130]:
EPOCHS = 20
for epoch in range(1, EPOCHS + 1):
    # Running sum of the mini-batch losses of this epoch, kept as a plain
    # Python float
    total_loss = 0.0
    for i, (center, context) in enumerate(data):
        center, context = center.to(device), context.to(device)

        # Reset the gradients of the computational graph
        optimizer.zero_grad()

        # Forward pass: one score per vocabulary entry for each example
        UT_ebar = cbow(context)

        # Cross-entropy loss between the scores and the true center words
        loss = ce_loss(UT_ebar, center)

        # Backward pass to compute gradients of each parameter
        loss.backward()

        # Gradient descent step according to the chosen optimizer
        optimizer.step()

        # `.item()` extracts the scalar as a Python number, so the running
        # sum does not hold on to tensors
        total_loss += loss.item()

        if i % 20 == 0:
            loss_avg = total_loss / (i + 1)
            print(
                f"Epoch ({epoch}/{EPOCHS}), batch: ({i}/{len(data)}), loss: {loss_avg}"
            )

    # Print average loss after each epoch
    loss_avg = total_loss / len(data)
    print(f"{epoch}/{EPOCHS} loss {loss_avg:.2f}")

    # Qualitative check, only when `predict_center_word` has already been
    # defined (it appears further down in the notebook). The lookup is
    # explicit rather than a caught NameError so that a misspelled name in
    # the call itself cannot be silently ignored.
    if "predict_center_word" in globals():
        left_words = ["le", "capitaine"]
        right_words = ["me", "dit"]
        try:
            word = predict_center_word(cbow, *left_words, *right_words)[0]
        except KeyError as err:
            # A context word is missing from the vocabulary: skip the
            # preview instead of aborting training
            print(f"Skipping prediction: {err} is not in the vocabulary")
        else:
            print(" ".join(left_words + [word] + right_words))

<built-in method size of Tensor object at 0x7f2cddd86210>
Epoch (1/20), batch: (0/20), loss: 7.9730963706970215
<built-in method size of Tensor object at 0x7f2cddd69d60>
<built-in method size of Tensor object at 0x7f2cddac7f70>
<built-in method size of Tensor object at 0x7f2cddd69d60>
<built-in method size of Tensor object at 0x7f2cddd62ee0>
<built-in method size of Tensor object at 0x7f2cdda612c0>
<built-in method size of Tensor object at 0x7f2cddac7f70>
<built-in method size of Tensor object at 0x7f2cddd84e60>
<built-in method size of Tensor object at 0x7f2cc8557ca0>
<built-in method size of Tensor object at 0x7f2cde60c820>
<built-in method size of Tensor object at 0x7f2cddd84e60>
<built-in method size of Tensor object at 0x7f2cc539e580>
<built-in method size of Tensor object at 0x7f2cc539e580>
<built-in method size of Tensor object at 0x7f2cc84f4910>
<built-in method size of Tensor object at 0x7f2cc8557ca0>
<built-in method size of Tensor object at 0x7f2cddac7f70>
<built-in method s

## Prediction functions

Now that the model is trained, we can give it a context it has never seen
and see which center word it predicts.

In [121]:
def predict_center_word_idx(cbow, *context_words_idx, k=10):
    """Return the indexes of the k best center words for a context.

    Args:
        cbow: model mapping a batch of context indexes to one score per
            vocabulary entry.
        *context_words_idx: integer indexes of the context words.
        k: number of candidates to return; clamped to the number of scores
            returned by the model.

    Returns:
        Integer tensor of size 1 x k with the indexes of the highest-scoring
        center words, best first. These are indexes, not tokens.
    """
    # Run on the device the model lives on instead of relying on a global;
    # a model without parameters is evaluated on the CPU.
    first_param = next(cbow.parameters(), None)
    if first_param is not None:
        model_device = first_param.device
    else:
        model_device = torch.device("cpu")

    # Create a fake minibatch containing just one example
    fake_minibatch = torch.tensor(
        context_words_idx, device=model_device
    ).unsqueeze(0)

    # Forward propagate through the model; no gradients are needed for
    # prediction
    with torch.no_grad():
        score_center = cbow(fake_minibatch)

    # `torch.topk` raises when k exceeds the number of scores, so clamp it
    k = min(k, score_center.shape[-1])

    # Retrieve top k-best indexes using `torch.topk`
    _, best_idxs = torch.topk(score_center, k=k)

    return best_idxs


def predict_center_word(cbow, *context_words, k=10):
    """Return the k best center words given context words.

    Args:
        cbow: model passed through to `predict_center_word_idx`.
        *context_words: context tokens; each must be a key of `tok2idx`.
        k: number of candidate center words to return.

    Returns:
        List of tokens, best candidate first.

    Raises:
        KeyError: if a context word is not in the vocabulary.
    """
    idxs = [tok2idx[tok] for tok in context_words]

    # Drop only the batch axis: a bare `squeeze()` would also drop the
    # candidate axis when k == 1, leaving a 0-d tensor that cannot be
    # iterated. `tolist()` yields plain Python ints for indexing `idx2tok`.
    best = predict_center_word_idx(cbow, *idxs, k=k).squeeze(0).tolist()

    return [idx2tok[idx] for idx in best]

In [122]:
# Ask for the 10 best center words (default `k`) given two left and two
# right context words. The commented-out calls are other contexts to try.
predict_center_word(cbow, "vingt", "mille", "sous", "les")
# predict_center_word(cbow, "mille", "lieues", "les", "mers")
# predict_center_word(cbow, "le", "capitaine", "fut", "le")
# predict_center_word(cbow, "le", "commandant", "fut", "le")

['lieues',
 'pieds',
 'six',
 'illustration',
 'chances',
 'obscurité',
 'trois',
 'parages',
 'kilogrammes',
 'longitude']

## Testing the embedding

We use the `gensim` library to easily compute the most similar words in
the embedding we just learned. Use `gensim>=4.0.0`.

In [123]:
# Empty gensim vector store with the same dimension as the embeddings
m = KeyedVectors(vector_size=EMBEDDING_SIZE)

# Register one vector per token: row i of the embedding matrix belongs to
# `idx2tok[i]`. The weights are detached from the graph and moved to the
# CPU before the conversion to Numpy.
m.add_vectors(
    keys=idx2tok,
    weights=cbow.embeddings.weight.detach().cpu().numpy(),
)

You can now look up the most similar words for, e.g., “lieues”, “mers” or
“professeur”. The list `words_decreasing_freq` contains the tokens sorted
by decreasing frequency, which helps to pick frequent tokens to test.

In [124]:
# Distinct tokens together with their number of occurrences in `tokens`
unique, freq = np.unique(tokens, return_counts=True)

# Positions that sort the counts in increasing order, reversed so that the
# most frequent token comes first
idxs = np.argsort(freq)[::-1]

# (token, count) pairs by decreasing frequency
words_decreasing_freq = list(zip(unique[idxs], freq[idxs]))

In [125]:
# Words whose learned embeddings are the most similar to that of "lieues"
m.most_similar('lieues')

[('pression', 0.5873574018478394),
 ('tonnes', 0.5782575011253357),
 ('saluai', 0.5681819915771484),
 ('cents', 0.5399743914604187),
 ('risque', 0.5307956337928772),
 ('muni', 0.5231773257255554),
 ('rocher', 0.5148242115974426),
 ('atmosphères', 0.5128464102745056),
 ('chevalier', 0.5099048614501953),
 ('pieds', 0.5009294152259827)]