<a href="https://colab.research.google.com/github/luchorivera/Hands-On-Data-Analysis-with-Pandas/blob/master/CBOW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://subscription.packtpub.com/book/data/9781789802740/5/ch05lvl1sec20/exploring

Hands-On Natural Language Processing with PyTorch 1.x
By Thomas Dop
July 2020

In [1]:
import torch
import torch.nn as nn
import numpy as np

In [2]:
text = """How that personage haunted my dreams, I need scarcely tell you. On
stormy nights, when the wind shook the four corners of the house and
the surf roared along the cove and up the cliffs, I would see him in a
thousand forms, and with a thousand diabolical expressions. Now the leg
would be cut off at the knee, now at the hip, now he was a monstrous
kind of a creature who had never had but the one leg, and that in the
middle of his body. To see him leap and run and pursue me over hedge and
ditch was the worst of nightmares. And altogether I paid pretty dear for
my monthly fourpenny piece, in the shape of these abominable fancies"""

text = text.replace(',','').replace('.','').lower().split()

In [3]:
print(text)

['how', 'that', 'personage', 'haunted', 'my', 'dreams', 'i', 'need', 'scarcely', 'tell', 'you', 'on', 'stormy', 'nights', 'when', 'the', 'wind', 'shook', 'the', 'four', 'corners', 'of', 'the', 'house', 'and', 'the', 'surf', 'roared', 'along', 'the', 'cove', 'and', 'up', 'the', 'cliffs', 'i', 'would', 'see', 'him', 'in', 'a', 'thousand', 'forms', 'and', 'with', 'a', 'thousand', 'diabolical', 'expressions', 'now', 'the', 'leg', 'would', 'be', 'cut', 'off', 'at', 'the', 'knee', 'now', 'at', 'the', 'hip', 'now', 'he', 'was', 'a', 'monstrous', 'kind', 'of', 'a', 'creature', 'who', 'had', 'never', 'had', 'but', 'the', 'one', 'leg', 'and', 'that', 'in', 'the', 'middle', 'of', 'his', 'body', 'to', 'see', 'him', 'leap', 'and', 'run', 'and', 'pursue', 'me', 'over', 'hedge', 'and', 'ditch', 'was', 'the', 'worst', 'of', 'nightmares', 'and', 'altogether', 'i', 'paid', 'pretty', 'dear', 'for', 'my', 'monthly', 'fourpenny', 'piece', 'in', 'the', 'shape', 'of', 'these', 'abominable', 'fancies']


In [4]:
corpus = set(text)
corpus_length = len(corpus)

word_dict = {}
inverse_word_dict = {}

for i, word in enumerate(corpus):
    word_dict[word] = i
    inverse_word_dict[i] = word

data = []

for i in range(2, len(text) - 2):
    sentence = [text[i-2], text[i-1],
               text[i+1], text[i+2]]
    target = text[i]
    data.append((sentence, target))
    
print(data[3])

(['haunted', 'my', 'i', 'need'], 'dreams')


In [5]:
embedding_length = 20

class CBoW(torch.nn.Module):

    def __init__(self, corpus_length, embedding_dim):
        super(CBoW, self).__init__()
        
        self.embeddings = nn.Embedding(corpus_length, embedding_dim)

        self.linear1 = nn.Linear(embedding_dim, 64)
        self.linear2 = nn.Linear(64, corpus_length)
        
        self.activation_function1 = nn.ReLU()
        self.activation_function2 = nn.LogSoftmax(dim = -1)

    def forward(self, inputs):
        embeds = sum(self.embeddings(inputs)).view(1,-1)
        out = self.linear1(embeds)
        out = self.activation_function1(out)
        out = self.linear2(out)
        out = self.activation_function2(out)
        return out

    def get_word_emdedding(self, word):
        word = torch.LongTensor([word_dict[word]])
        return self.embeddings(word).view(1,-1)

In [6]:
model = CBoW(corpus_length, embedding_length)

loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

def make_sentence_vector(sentence, word_dict):
    idxs = [word_dict[w] for w in sentence]
    return torch.tensor(idxs, dtype=torch.long)

print(make_sentence_vector(['stormy','nights','when','the'], word_dict))

tensor([13, 78, 61, 68])


In [7]:
for epoch in range(100):
    epoch_loss = 0
    for sentence, target in data:
        model.zero_grad()
        sentence_vector = make_sentence_vector(sentence, word_dict)  
        log_probs = model(sentence_vector)
        loss = loss_function(log_probs, torch.tensor([word_dict[target]], dtype=torch.long))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.data
    print('Epoch: '+str(epoch)+', Loss: ' + str(epoch_loss.item()))

Epoch: 0, Loss: 543.7525634765625
Epoch: 1, Loss: 481.09637451171875
Epoch: 2, Loss: 437.6916809082031
Epoch: 3, Loss: 399.732177734375
Epoch: 4, Loss: 363.3277282714844
Epoch: 5, Loss: 326.68975830078125
Epoch: 6, Loss: 289.6161804199219
Epoch: 7, Loss: 252.344482421875
Epoch: 8, Loss: 216.13104248046875
Epoch: 9, Loss: 181.6922607421875
Epoch: 10, Loss: 149.91403198242188
Epoch: 11, Loss: 122.03185272216797
Epoch: 12, Loss: 98.26849365234375
Epoch: 13, Loss: 78.60071563720703
Epoch: 14, Loss: 62.97425079345703
Epoch: 15, Loss: 50.74811553955078
Epoch: 16, Loss: 41.1762809753418
Epoch: 17, Loss: 33.832183837890625
Epoch: 18, Loss: 28.221006393432617
Epoch: 19, Loss: 23.857149124145508
Epoch: 20, Loss: 20.491724014282227
Epoch: 21, Loss: 17.829341888427734
Epoch: 22, Loss: 15.712153434753418
Epoch: 23, Loss: 13.987968444824219
Epoch: 24, Loss: 12.581228256225586
Epoch: 25, Loss: 11.39668083190918
Epoch: 26, Loss: 10.401822090148926
Epoch: 27, Loss: 9.558734893798828
Epoch: 28, Loss: 8.

In [8]:
def get_predicted_result(input, inverse_word_dict):
    index = np.argmax(input)
    return inverse_word_dict[index]

def predict_sentence(sentence):
    sentence_split = sentence.replace('.','').lower().split()
    sentence_vector = make_sentence_vector(sentence_split, word_dict)
    prediction_array = model(sentence_vector).data.numpy()
    print('Preceding Words: {}\n'.format(sentence_split[:2]))
    print('Predicted Word: {}\n'.format(get_predicted_result(prediction_array[0], inverse_word_dict)))
    print('Following Words: {}\n'.format(sentence_split[2:]))

predict_sentence('to see leap and')

Preceding Words: ['to', 'see']

Predicted Word: him

Following Words: ['leap', 'and']



In [9]:
print(model.get_word_emdedding('leap'))

tensor([[ 0.0706, -0.2392,  0.3957,  0.5724, -0.4644, -0.0312, -1.9006,  0.6281,
         -1.5343, -0.6428, -0.6508,  0.2963, -0.9421, -0.0606,  0.1141, -0.0499,
         -1.0135, -0.6442, -1.4822, -0.0357]], grad_fn=<ViewBackward>)


In [10]:
predict_sentence('a monstrous of a')

Preceding Words: ['a', 'monstrous']

Predicted Word: kind

Following Words: ['of', 'a']

