# NLP

## Classificando documentos usando BoW

Primeiro vamos dar uma revisada no softmax pra ver se entedemos bem:

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

# Softmax is also in torch.nn.functional
data = torch.randn(5)
print(data)
print(F.softmax(data, dim=0))
print(F.softmax(data, dim=0).sum())  # Sums to 1 because it is a distribution!
print(F.log_softmax(data, dim=0))  # theres also log_softmax
print(F.softmax(data, dim=0).log())

tensor([ 0.6614,  0.2669,  0.0617,  0.6213, -0.4519])
tensor([0.2847, 0.1919, 0.1563, 0.2735, 0.0935])
tensor(1.)
tensor([-1.2563, -1.6507, -1.8559, -1.2963, -2.3695])
tensor([-1.2563, -1.6507, -1.8559, -1.2963, -2.3695])


In [80]:
data = [("O Henrico é um cara legal".split(), "PORTUGUESE"),
        ("Give it to me".split(), "ENGLISH"),
        ("O Bolsonaro é uma pessoa especial".split(), "PORTUGUESE"),
        ("No it is not a good idea to get lost at sea".split(), "ENGLISH")]

test_data = [("O ronaldinho gaúcho é um cara misterioso .".split(), "PORTUGUESE"),
             ("it is lost on me".split(), "ENGLISH")]

# word_to_ix maps each word in the vocab to a unique integer, which will be its
# index into the Bag of words vector
word_to_ix = {}
label_to_ix = {"PORTUGUESE": 0, "ENGLISH": 1}

for sent, _ in data + test_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
print(word_to_ix)

VOCAB_SIZE = len(word_to_ix)
NUM_LABELS = len(label_to_ix)


class BoWClassifier(nn.Module):  # inheriting from nn.Module!

    def __init__(self, num_labels, vocab_size):
        # calls the init function of nn.Module.  Dont get confused by syntax,
        # just always do it in an nn.Module
        super().__init__()

        # Define the parameters that you will need.  In this case, we need A and b,
        # the parameters of the affine mapping.
        # Torch defines nn.Linear(), which provides the affine map.
        # Make sure you understand why the input dimension is vocab_size
        # and the output is num_labels!
        self.linear = nn.Linear(vocab_size, 100)
        self.conv_1d = nn.Conv1d(1, 50, 5)
        self.output = nn.Linear(96*50, num_labels)
        # NOTE! The non-linearity log softmax does not have parameters! So we don't need
        # to worry about that here

    def forward(self, bow_vec):
        # Pass the input through the linear layer,
        # then pass that through log_softmax.
        # Many non-linearities and other functions are in torch.nn.functional
        print('input:', bow_vec.shape)
        
        # (bs, doc_size) -> (bs, 100)  
        x = self.linear(bow_vec)
        x = torch.relu(x)
#         print('linear:', x.shape)

        # (bs, 100) -> (bs, 1, 100) 
        x = x.unsqueeze(1)
#         print('linear:', x.shape)
        
        # (bs, 1, 100) -> (bs, 96, 50) 
        x = self.conv_1d(x)
#         print('conv:', x.shape)
        x = torch.sigmoid(x)
        
        # (bs, 96, 50)  -> (bs, 96*50) 
        x = x.view(1, -1)
#         print('flatted', x.shape)
        
        # (bs, 96*50)  -> (bs, 2) 
        x = self.output(x)
#         print('output', x.shape)
        return F.log_softmax(x, dim=1)


def make_bow_vector(sentence, word_to_ix):
    vec = torch.zeros(len(word_to_ix))
    for word in sentence:
        vec[word_to_ix[word]] += 1
    return vec.view(1, -1)


def make_target(label, label_to_ix):
    return torch.LongTensor([label_to_ix[label]])

{'O': 0, 'Henrico': 1, 'é': 2, 'um': 3, 'cara': 4, 'legal': 5, 'Give': 6, 'it': 7, 'to': 8, 'me': 9, 'Bolsonaro': 10, 'uma': 11, 'pessoa': 12, 'especial': 13, 'No': 14, 'is': 15, 'not': 16, 'a': 17, 'good': 18, 'idea': 19, 'get': 20, 'lost': 21, 'at': 22, 'sea': 23, 'ronaldinho': 24, 'gaúcho': 25, 'misterioso': 26, '.': 27, 'on': 28}


In [29]:
print(make_bow_vector(data[0][0], word_to_ix))
print(make_target(data[0][1], label_to_ix))

tensor([[1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
tensor([0])


In [76]:
model = BoWClassifier(NUM_LABELS, VOCAB_SIZE)

# the model knows its parameters.  The first output below is A, the second is b.
# Whenever you assign a component to a class variable in the __init__ function
# of a module, which was done with the line
# self.linear = nn.Linear(...)
# Then through some Python magic from the PyTorch devs, your module
# (in this case, BoWClassifier) will store knowledge of the nn.Linear's parameters
# for param in model.parameters():
#     print(param)

# To run the model, pass in a BoW vector
# Here we don't need to train, so the code is wrapped in torch.no_grad()
# with torch.no_grad():
#     sample = data[1]
#     bow_vector = make_bow_vector(sample[0], word_to_ix)
#     log_probs = model(bow_vector)
# #     print(log_probs.exp())

Nossa entrada é um bag of words, então 

    doc_0 = [count(v_0), count(v_1), count(v_2), ..., count(v_n)]
    doc_1 = [count(v_0), count(v_1), count(v_2), ..., count(v_n)]
    
Nossa rede neural é uma camda linear seguida por um log_softmax (é uma regressão logística!): 

    net = log_softmax(linear(doc))

Bora treinar uma pouco essa rede

In [77]:
# Print the matrix column corresponding to "cara"
print(next(model.parameters())[:, word_to_ix["cara"]])

loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Usually you want to pass over the training data several times.
# 100 is much bigger than on a real data set, but real datasets have more than
# two instances.  Usually, somewhere between 5 and 30 epochs is reasonable.
for epoch in range(100):
    for sentence, label in data:
        # Step 1. Remember that PyTorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Make our BOW vector and also we must wrap the target in a
        # Tensor as an integer. For example, if the target is PORTUGUESE, then
        # we wrap the integer 0. The loss function then knows that the 0th
        # element of the log probabilities is the log probability
        # corresponding to PORTUGUESE
        bow_vec = make_bow_vector(sentence, word_to_ix)
        target = make_target(label, label_to_ix)

        # Step 3. Run our forward pass.
        log_probs = model(bow_vec)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()

tensor([ 0.1158,  0.0266, -0.1454, -0.1183,  0.0824,  0.0964, -0.0587,  0.1547,
        -0.0247, -0.1376, -0.1316, -0.1065, -0.0853, -0.1436,  0.0178,  0.0670,
         0.1633,  0.1156,  0.0214, -0.0359, -0.1628, -0.1722,  0.1088,  0.1614,
         0.0828, -0.0490,  0.0283, -0.1709, -0.1197, -0.0069,  0.0068,  0.1753,
        -0.1133,  0.0627, -0.0942,  0.0265, -0.0919, -0.1255,  0.1599, -0.0419,
        -0.0641,  0.1540, -0.1528,  0.1412,  0.0783,  0.0560, -0.0193, -0.1119,
         0.1042,  0.0335,  0.1507, -0.1181,  0.1835,  0.1532,  0.0682,  0.1418,
         0.1370,  0.0172, -0.1805, -0.1003,  0.1508,  0.1235, -0.0754,  0.0786,
        -0.0759,  0.1337,  0.1521,  0.1123, -0.1392, -0.0181, -0.0108, -0.1767,
        -0.0256, -0.0802, -0.1432,  0.1590, -0.0405, -0.0938,  0.0795,  0.1531,
         0.0783,  0.0219,  0.1157, -0.0415, -0.0862,  0.0917,  0.0542, -0.1832,
         0.1057,  0.0323,  0.0185,  0.0044, -0.0622,  0.1230,  0.1404,  0.0220,
        -0.0044, -0.1165, -0.0451, -0.14

conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Siz

input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torc

conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Siz

input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torc

input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torc

linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output to

conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
input: torch.Size([1, 29])
linear: torch.Siz

In [78]:
test_data

[(['O', 'ronaldinho', 'gaúcho', 'é', 'um', 'cara', 'misterioso', '.'],
  'PORTUGUESE'),
 (['it', 'is', 'lost', 'on', 'me'], 'ENGLISH')]

In [79]:
with torch.no_grad():
    print('EVAL:')
    for instance, label in test_data:
        bow_vec = make_bow_vector(instance, word_to_ix)
        log_probs = model(bow_vec)
        print(torch.argmax(log_probs))

# Index corresponding to Portuguese goes up, English goes down!
print(next(model.parameters())[:, word_to_ix["cara"]])

EVAL:
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
tensor(0)
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])
tensor(1)
tensor([ 0.1403,  0.0765, -0.1410,  0.0054,  0.1467,  0.0964, -0.0587,  0.0905,
        -0.0247, -0.1312, -0.0811,  0.0043, -0.0376, -0.1154,  0.0855,  0.0670,
         0.2005,  0.1902,  0.0642, -0.0359, -0.0708, -0.0598,  0.2556,  0.2551,
         0.1599, -0.1070,  0.0283, -0.1709, -0.0661, -0.0069,  0.0068,  0.1753,
        -0.0231,  0.1047, -0.0838,  0.0330, -0.0919, -0.1255,  0.2241, -0.0419,
         0.0148,  0.1437, -0.1528,  0.1412,  0.1680,  0.1605, -0.0193, -0.1119,
         0.1922,  0.0335,  0.1507, -0.1181,  0.2724,  0.1532,  0.0682,  0.2386,
         0.1370,  0.0172, -0.1805, -0.0592,  0.2622,  0.2096, -0.0

In [99]:
print(list(word_to_ix.keys()))
tmp = make_bow_vector('O legal is not good .'.split(), word_to_ix)
out = model(tmp)
inv_label_to_ix = dict(zip(label_to_ix.values(), label_to_ix.keys()))
inv_label_to_ix[out.argmax().item()]

['O', 'Henrico', 'é', 'um', 'cara', 'legal', 'Give', 'it', 'to', 'me', 'Bolsonaro', 'uma', 'pessoa', 'especial', 'No', 'is', 'not', 'a', 'good', 'idea', 'get', 'lost', 'at', 'sea', 'ronaldinho', 'gaúcho', 'misterioso', '.', 'on']
input: torch.Size([1, 29])
linear: torch.Size([1, 100])
linear: torch.Size([1, 1, 100])
conv: torch.Size([1, 50, 96])
flatted torch.Size([1, 4800])
output torch.Size([1, 2])


'ENGLISH'

## Modelo de língua com a bíblia sagrada


In [105]:
# a biblia tá armazenada em data/biblia-sagrada-pt.txt
!ls ../../data/

biblia-sagrada-pt.txt


In [106]:
import re
from nltk import ToktokTokenizer

tokenizer = ToktokTokenizer()

def read_biblia(fpath):
    text = []
    with open(fpath, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip().lower()
            if line:
                text.append(line)
    return text

dataset = read_biblia('../../data/biblia-sagrada-pt.txt')
print(dataset[:10])

['bíblia sagrada', 'tradução: joão ferreira de almeida', 'edição revista e corrigida', 'antigo testamento', 'gênesis', 'gênesis 1', '1 no princípio criou deus os céus e a terra.', '2 a terra era sem forma e vazia; e havia trevas sobre a face do abismo, mas o espírito de deus pairava sobre a face das águas.', '3 disse deus: haja luz. e houve luz.', '4 viu deus que a luz era boa; e fez separação entre a luz e as trevas.']


In [107]:
dataset = dataset[3:]
print(dataset[:10])

['antigo testamento', 'gênesis', 'gênesis 1', '1 no princípio criou deus os céus e a terra.', '2 a terra era sem forma e vazia; e havia trevas sobre a face do abismo, mas o espírito de deus pairava sobre a face das águas.', '3 disse deus: haja luz. e houve luz.', '4 viu deus que a luz era boa; e fez separação entre a luz e as trevas.', '5 e deus chamou à luz dia, e às trevas noite. e foi a tarde e a manhã, o dia primeiro.', '6 e disse deus: haja um firmamento no meio das águas, e haja separação entre águas e águas.', '7 fez, pois, deus o firmamento, e separou as águas que estavam debaixo do firmamento das que estavam por cima do firmamento. e assim foi.']


In [120]:
tokenized_dataset = []
for text in dataset:
    tokenized_dataset.extend(['<bos>'] + tokenizer.tokenize(text) + ['<eos>'])
print(tokenized_dataset[:30])

['<bos>', 'antigo', 'testamento', '<eos>', '<bos>', 'gênesis', '<eos>', '<bos>', 'gênesis', '1', '<eos>', '<bos>', '1', 'no', 'princípio', 'criou', 'deus', 'os', 'céus', 'e', 'a', 'terra', '.', '<eos>', '<bos>', '2', 'a', 'terra', 'era', 'sem']


Bora implementar um modelo simples de trigramas:

In [222]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 3

# build a list of tuples.  Each tuple is ([ word_i-2, word_i-1 ], target word)
trigrams = [(
                [tokenized_dataset[i], tokenized_dataset[i + 1]], 
                 tokenized_dataset[i + 2]
            )
            for i in range(len(tokenized_dataset) - 2)]
# print the first 3, just so you can see what they look like
print(trigrams[:3])

[(['<bos>', 'antigo'], 'testamento'), (['antigo', 'testamento'], '<eos>'), (['testamento', '<eos>'], '<bos>')]


In [226]:
from collections import Counter
vocab = Counter(tokenized_dataset).most_common(100000)
word_to_ix = {word: i+1 for i, (word, _) in enumerate(vocab)}

print('Dataset size:', len(trigrams))
print('Vocab size:', len(vocab))  # Como diminuir?

Dataset size: 923673
Vocab size: 30462


In [186]:
class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_size=30):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_size)
        self.linear2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        x = embeds.view(embeds.shape[0], -1)
        out = F.relu(self.linear1(x))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [227]:
tigrams_batches = []
batch_size = 64
batch = []
for i, (context, target) in enumerate(trigrams):
#     print(context)
    if i % batch_size == 0:
        if len(batch) > 0 :
            tigrams_batches.append(batch)
        batch = []
    batch.append((context, target))

# print(tigrams_batches[:10])


In [181]:
print(list(map(len, tigrams_batches)))

[64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,

In [228]:
losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab)+1, 10, CONTEXT_SIZE, hidden_size=10)
optimizer = optim.SGD(model.parameters(), lr=0.1)

def get_list_of_ids(context, word_to_ix):
    list_of_ids = []
    for w in context:
        if w in word_to_ix:
            list_of_ids.append(word_to_ix[w])
        else:
            list_of_ids.append(0)
    return list_of_ids

def get_target_id(target, word_to_ix):
    target_word_id = 0
    if target in word_to_ix:
        target_word_id = word_to_ix[target]
    return target_word_id


for epoch in range(1):
    total_loss = 0
    for i, x in enumerate(tigrams_batches):
        
        context, target = zip(*x)
        
        #print(len(context))
        
        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        list_of_ids = [get_list_of_ids(c, word_to_ix) for c in context] 
        context_idxs = torch.tensor(list_of_ids, dtype=torch.long) 

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_idxs)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        target_word_ids = torch.tensor([get_target_id(t, word_to_ix) for t in target], dtype=torch.long)
#         print(target_word_ids)
        
        loss = loss_function(log_probs, target_word_ids)

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
        print('Epoch %d - Loss (%d/%d) %f' % (epoch+1, i, len(tigrams_batches), total_loss / (i+1)), end='\r')
              
    losses.append(total_loss)
print(losses)  # The loss decreased every iteration over the training data!

[84226.86968231201]31/14432) 5.836119


In [217]:
start_context = ['e', 'jesus']
id_to_word = dict(zip(word_to_ix.values(), word_to_ix.keys()))

list_of_ids = get_list_of_ids(['e', 'jesus'], word_to_ix)
context_idxs = torch.tensor([list_of_ids], dtype=torch.long)
_, pred_idx = model(context_idxs).topk(10, dim=-1)
print(pred_idx[0].data)

print([id_to_word[i.item()] for i in pred_idx[0].data if i > 0])

tensor([ 1,  6,  8,  5,  0, 10,  2,  7, 11, 20])
['that', 'of', 'spirits', 'is', 'rules', 'effect,', 'directed', 'programs', 'a']


In [214]:
start_context = ['jesus', 'disse']
generated_text = [start_context[0]]
for i in range(100):
        list_of_ids = get_list_of_ids(start_context, word_to_ix)
        context_idxs = torch.tensor([list_of_ids], dtype=torch.long)
        pred_id = torch.argmax(model(context_idxs))
        
        if pred_id.item() == 0:
            pred_word = '<unk>'
        else:
            pred_word = id_to_word[pred_id.item()]
        start_context = [start_context[1], pred_word]
        generated_text.append(start_context[0])

print(' '.join(generated_text))

jesus disse , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ,



## Implementando CBOW

In [197]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)

word_to_ix = {word: i for i, word in enumerate(vocab)}
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])


class CBOW(nn.Module):

    def __init__(self, vocab_size, emb_size):
        self.embeddings = nn.Embedding(vocab_size, emb_size)
        self.lin_out = nn.Linear(emb_size, vocab_size)

    def forward(self, x):
        # (bs, 4, vocab_size) -> (bs, 4, emb_dim)
        x = self.emb(x)
        # (bs, 4, emb_dim) -> (bs, 4*emb_dim)
        x = x.view(x.shape[0], -1)
        # (bs, 4*emb_dim) -> (bs, vocab_size)
        x = self.lin_out(x)
        return torch.log_softmax(x, dim=-1)
        

# create your model and train.  here are some functions to help you make
# the data ready for use by your module


def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)


make_context_vector(data[0][0], word_to_ix)  # example

[(['We', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study'), (['to', 'study', 'idea', 'of'], 'the'), (['study', 'the', 'of', 'a'], 'idea')]


tensor([13, 12, 33, 28])

## Skipgram

In [9]:
def get_context_pairs(tokenized_corpus, word2idx, window_size=3):
    idx_pairs = []
    # for each sentence
    for sentence in tokenized_corpus:
        indices = [word2idx[word] for word in sentence]
        # for each word, threated as center word
        for center_word_pos in range(len(indices)):
            # for each window position
            for w in range(-window_size, window_size + 1):
                context_word_pos = center_word_pos + w
                # make soure not jump out sentence
                if context_word_pos < 0 or context_word_pos >= len(indices) or center_word_pos == context_word_pos:
                    continue
                context_word_idx = indices[context_word_pos]
                idx_pairs.append((indices[center_word_pos], context_word_idx))
    return idx_pairs

def create_vocab(tokenized_corpus):
    word2idx = {}
    for sent in tmp_corpus:
        for word in sent:
            if word not in word2idx:
                word2idx[word] = len(word2idx)
    return word2idx

In [11]:
tmp_corpus = ['the quick brown fox jumps over the lazy fox !'.split(), 
              'this is just another very cool sentence .'.split()]
tmp_word2idx = create_vocab(tmp_corpus)
pairs_idx = create_skipgrams(tmp_corpus, tmp_word2idx, window_size=5)

In [12]:
# model idea
# (vocab, emb) -> (emb, vocab) -> log softmax -> nll loss