# NLP

## Classificando documentos usando BoW

Primeiro vamos dar uma revisada no softmax pra ver se entendemos bem:

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)


# F.softmax (from torch.nn.functional) normalises a vector of raw scores.
data = torch.randn(5)
print(data)
probs = F.softmax(data, dim=0)
print(probs)
print(probs.sum())  # the entries form a distribution, so they add up to 1
print(F.log_softmax(data, dim=0))  # log_softmax is available as well

tensor([ 0.6614,  0.2669,  0.0617,  0.6213, -0.4519])
tensor([0.2847, 0.1919, 0.1563, 0.2735, 0.0935])
tensor(1.)
tensor([-1.2563, -1.6507, -1.8559, -1.2963, -2.3695])


In [3]:
# Toy training corpus: each entry is (list of tokens, language label).
data = [
    ("O Henrico é um cara legal".split(), "PORTUGUESE"),
    ("Give it to me".split(), "ENGLISH"),
    ("O Bolsonaro é uma pessoa especial".split(), "PORTUGUESE"),
    ("No it is not a good idea to get lost at sea".split(), "ENGLISH"),
]

# Held-out sentences, only used to inspect predictions before/after training.
# NOTE(review): the first sentence reads as Spanish but carries the
# PORTUGUESE label -- confirm this is intended.
test_data = [
    ("Yo creo que si".split(), "PORTUGUESE"),
    ("it is lost on me".split(), "ENGLISH"),
]

# word_to_ix gives every distinct word (train + test) a unique integer in
# order of first appearance; that integer is the word's position in the
# bag-of-words vector.
word_to_ix = {}
label_to_ix = {"PORTUGUESE": 0, "ENGLISH": 1}

for sent, _ in data + test_data:
    for word in sent:
        # setdefault only inserts when the word is new, so existing ids are kept
        word_to_ix.setdefault(word, len(word_to_ix))
print(word_to_ix)

VOCAB_SIZE = len(word_to_ix)
NUM_LABELS = 2


class BoWClassifier(nn.Module):
    """Bag-of-words classifier: a single affine layer followed by log-softmax.

    Args:
        num_labels: number of output classes (size of the affine map's output).
        vocab_size: length of the bag-of-words input vector (size of the
            affine map's input).
    """

    def __init__(self, num_labels, vocab_size):
        # nn.Module has to be initialised first so that the layer assigned
        # below is registered and its parameters show up in .parameters().
        super().__init__()

        # The affine map A x + b: one input per vocabulary word, one output
        # score per label. log-softmax itself has no parameters, so nothing
        # else needs to be created here.
        self.linear = nn.Linear(vocab_size, num_labels)

    def forward(self, bow_vec):
        """Return log-probabilities over the labels.

        Args:
            bow_vec: tensor with the label scores computed along dim 1,
                e.g. the (1, vocab_size) output of make_bow_vector.

        Returns:
            Tensor of log-probabilities, normalised along dim 1.
        """
        scores = self.linear(bow_vec)
        return F.log_softmax(scores, dim=1)


def make_bow_vector(sentence, word_to_ix):
    """Turn a tokenised sentence into a bag-of-words count vector.

    Args:
        sentence: iterable of words; every word must be a key of word_to_ix
            (an unknown word raises KeyError).
        word_to_ix: mapping from word to its position in the vector.

    Returns:
        Float tensor of shape (1, len(word_to_ix)) holding per-word counts.
    """
    positions = [word_to_ix[token] for token in sentence]
    counts = torch.zeros(len(word_to_ix))
    for position in positions:
        counts[position] += 1
    # add a leading dimension of size 1 so the result is a single-row matrix
    return counts.view(1, -1)


def make_target(label, label_to_ix):
    """Wrap the integer id of a label in a one-element long tensor.

    Args:
        label: label name; must be a key of label_to_ix (KeyError otherwise).
        label_to_ix: mapping from label name to integer class id.

    Returns:
        Tensor of shape (1,) and dtype torch.long containing the class id,
        the target format expected by the NLLLoss used in this file.
    """
    # torch.tensor with an explicit dtype replaces the legacy
    # torch.LongTensor constructor and matches the other tensor
    # constructions in this file.
    return torch.tensor([label_to_ix[label]], dtype=torch.long)


# Build the (still untrained) classifier for the toy vocabulary and label set.
model = BoWClassifier(NUM_LABELS, VOCAB_SIZE)

# The model knows its own parameters: the first tensor printed below is the
# weight matrix A, the second is the bias b.
# Assigning a layer to an attribute inside __init__ (here
# self.linear = nn.Linear(...)) is what registers that layer's parameters on
# the enclosing module, so model.parameters() can enumerate them.
for param in model.parameters():
    print(param)

# To run the model, pass in a bag-of-words vector.
# Nothing is trained here, so the forward pass runs under torch.no_grad()
# and no gradient information is recorded.
with torch.no_grad():
    sample = data[0]
    bow_vector = make_bow_vector(sample[0], word_to_ix)
    log_probs = model(bow_vector)
    print(log_probs)

{'O': 0, 'Henrico': 1, 'é': 2, 'um': 3, 'cara': 4, 'legal': 5, 'Give': 6, 'it': 7, 'to': 8, 'me': 9, 'Bolsonaro': 10, 'uma': 11, 'pessoa': 12, 'especial': 13, 'No': 14, 'is': 15, 'not': 16, 'a': 17, 'good': 18, 'idea': 19, 'get': 20, 'lost': 21, 'at': 22, 'sea': 23, 'Yo': 24, 'creo': 25, 'que': 26, 'si': 27, 'on': 28}
Parameter containing:
tensor([[ 0.0678, -0.0724, -0.0135, -0.0167,  0.0269, -0.0007,  0.1623,  0.0578,
         -0.0692, -0.1122, -0.0311, -0.0801, -0.0595,  0.0089,  0.1107,  0.1009,
         -0.1815,  0.1151,  0.0519,  0.1761,  0.1226, -0.1692, -0.1766, -0.0896,
          0.1631, -0.0309,  0.0795, -0.0863,  0.1822],
        [-0.0786,  0.1393,  0.0022, -0.0978,  0.0955, -0.0986,  0.0546, -0.0536,
         -0.0204, -0.1785, -0.0885,  0.1008, -0.0451,  0.1850,  0.1489, -0.0087,
         -0.1239,  0.1131,  0.0576, -0.1200,  0.1206,  0.1127,  0.1647, -0.1041,
         -0.0306, -0.0036,  0.0271, -0.1409, -0.1318]], requires_grad=True)
Parameter containing:
tensor([ 0.1010, -0

Nossa entrada é um bag of words, então 

    doc_0 = [count(v_0), count(v_1), count(v_2), ..., count(v_n)]
    doc_1 = [count(v_0), count(v_1), count(v_2), ..., count(v_n)]
    
Nossa rede neural é uma camada linear seguida por um log_softmax (é uma regressão logística!): 

    net = log_softmax(linear(doc))

Bora treinar um pouco essa rede

In [6]:
# Score the test sentences before training so the results can be compared
# with the same printout after training.
with torch.no_grad():
    for instance, label in test_data:
        bow_vec = make_bow_vector(instance, word_to_ix)
        log_probs = model(bow_vec)
        print(log_probs)

# Print the weight-matrix column for the word "cara" (one weight per label).
print(next(model.parameters())[:, word_to_ix["cara"]])

# NLLLoss takes log-probabilities, which is what the model's forward returns.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# The training data is passed over many times. 100 epochs is far more than
# a realistic dataset would get, but this toy corpus only has a handful of
# sentences. The original author's rule of thumb: 5 to 30 epochs is usually
# reasonable.
for epoch in range(100):
    for instance, label in data:
        # Step 1. PyTorch accumulates gradients across backward() calls,
        # so they are cleared before each training instance.
        model.zero_grad()

        # Step 2. Build the bag-of-words vector and wrap the target label's
        # integer id in a tensor. For example, PORTUGUESE maps to 0, which
        # tells the loss that element 0 of the log-probabilities is the one
        # belonging to the correct class.
        bow_vec = make_bow_vector(instance, word_to_ix)
        target = make_target(label, label_to_ix)

        # Step 3. Forward pass.
        log_probs = model(bow_vec)

        # Step 4. Compute the loss, back-propagate to get the gradients and
        # let the optimizer update the parameters.
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()

# Same evaluation as before training, for comparison.
with torch.no_grad():
    for instance, label in test_data:
        bow_vec = make_bow_vector(instance, word_to_ix)
        log_probs = model(bow_vec)
        print(log_probs)

# For "cara" the weight at the PORTUGUESE index goes up and the ENGLISH one
# goes down.
print(next(model.parameters())[:, word_to_ix["cara"]])

tensor([[-0.5455, -0.8665]])
tensor([[-3.1106, -0.0456]])
tensor([ 0.3764, -0.2540], grad_fn=<SelectBackward>)
tensor([[-0.5481, -0.8629]])
tensor([[-3.3447, -0.0359]])
tensor([ 0.4019, -0.2795], grad_fn=<SelectBackward>)


## Modelo de língua com a bíblia sagrada


In [10]:
# a biblia tá armazenada em data/biblia-sagrada-pt.txt
!ls ../../data/

biblia-sagrada-pt.txt


In [20]:
import re
from nltk import ToktokTokenizer

# Module-level NLTK Toktok tokenizer; its tokenize() method is used below to
# split each line of the corpus into tokens.
tokenizer = ToktokTokenizer()

def read_biblia(fpath):
    """Read a UTF-8 text file and return its non-empty lines, normalised.

    Every line is stripped of surrounding whitespace and lower-cased; lines
    that are empty after stripping are dropped.

    Args:
        fpath: path of the text file to read.

    Returns:
        List of the cleaned lines, in file order.
    """
    with open(fpath, 'r', encoding='utf8') as handle:
        cleaned = (raw.strip().lower() for raw in handle)
        # materialise the list while the file is still open
        return [entry for entry in cleaned if entry]

# Load the cleaned (stripped, lower-cased, non-empty) lines of the Bible text
# and show the first ten as a sanity check.
dataset = read_biblia('../../data/biblia-sagrada-pt.txt')
print(dataset[:10])

['bíblia sagrada', 'tradução: joão ferreira de almeida', 'edição revista e corrigida', 'antigo testamento', 'gênesis', 'gênesis 1', '1 no princípio criou deus os céus e a terra.', '2 a terra era sem forma e vazia; e havia trevas sobre a face do abismo, mas o espírito de deus pairava sobre a face das águas.', '3 disse deus: haja luz. e houve luz.', '4 viu deus que a luz era boa; e fez separação entre a luz e as trevas.']


In [21]:
# Drop the first three lines of the file; in the preview printed for this
# corpus they are the title, translator and edition lines.
dataset = dataset[3:]
print(dataset[:10])

['antigo testamento', 'gênesis', 'gênesis 1', '1 no princípio criou deus os céus e a terra.', '2 a terra era sem forma e vazia; e havia trevas sobre a face do abismo, mas o espírito de deus pairava sobre a face das águas.', '3 disse deus: haja luz. e houve luz.', '4 viu deus que a luz era boa; e fez separação entre a luz e as trevas.', '5 e deus chamou à luz dia, e às trevas noite. e foi a tarde e a manhã, o dia primeiro.', '6 e disse deus: haja um firmamento no meio das águas, e haja separação entre águas e águas.', '7 fez, pois, deus o firmamento, e separou as águas que estavam debaixo do firmamento das que estavam por cima do firmamento. e assim foi.']


In [24]:
# Flatten the corpus into one long token list: each line is tokenised and its
# tokens are appended in order, so line boundaries are not preserved.
tokenized_dataset = []
for text in dataset:
    tokenized_dataset.extend(tokenizer.tokenize(text))
print(tokenized_dataset[:30])

['antigo', 'testamento', 'gênesis', 'gênesis', '1', '1', 'no', 'princípio', 'criou', 'deus', 'os', 'céus', 'e', 'a', 'terra', '.', '2', 'a', 'terra', 'era', 'sem', 'forma', 'e', 'vazia', ';', 'e', 'havia', 'trevas', 'sobre', 'a']


Bora implementar um modelo simples de trigramas:

In [None]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10

# Slide a window of three consecutive tokens over the corpus. Each entry is
# ([word_i-2, word_i-1], word_i): two context words and the word to predict.
trigrams = [
    ([first, second], third)
    for first, second, third in zip(
        tokenized_dataset, tokenized_dataset[1:], tokenized_dataset[2:]
    )
]
# show the first 3 entries to illustrate the format
print(trigrams[:3])



# One integer id per distinct token.
# NOTE(review): ids follow the set's iteration order, which is presumably
# not stable between interpreter runs -- verify if reproducibility matters.
vocab = set(tokenized_dataset)
word_to_ix = {token: index for index, token in enumerate(vocab)}

print('Dataset size:', len(trigrams))
print('Vocab size:', len(vocab))  # how could the vocabulary be made smaller?

class NGramLanguageModeler(nn.Module):
    """Feed-forward n-gram language model.

    The context words are embedded, their embeddings are concatenated into a
    single row, passed through one ReLU hidden layer and projected to
    log-probabilities over the vocabulary.

    Args:
        vocab_size: number of distinct words (embedding rows and output size).
        embedding_dim: size of each word embedding.
        context_size: number of context words fed to the model per example.
        hidden_dim: width of the hidden layer. Defaults to 30, the value that
            was previously hard-coded.
    """

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim=30):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # the hidden layer sees all context embeddings concatenated
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities of the next word for ONE context.

        Args:
            inputs: long tensor with the indices of the context words of a
                single example (context_size entries).

        Returns:
            Tensor of shape (1, vocab_size) with log-probabilities.
        """
        # view((1, -1)) flattens every embedding into a single row, which is
        # why this model handles one example at a time.
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


# Per-epoch summed losses are collected here.
losses = []
# NLLLoss takes log-probabilities, which is what the model's forward returns.
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in range(5):
    total_loss = 0
    for i, (context, target) in enumerate(trigrams):

        # Step 1. Prepare the model input: turn the context words into
        # integer indices and wrap them in a long tensor.
        context_idxs = torch.tensor([word_to_ix[w] for w in context], dtype=torch.long)

        # Step 2. torch *accumulates* gradients, so the gradients left over
        # from the previous instance are zeroed before this one.
        model.zero_grad()

        # Step 3. Forward pass: log-probabilities over the next word.
        log_probs = model(context_idxs)

        # Step 4. Compute the loss; the target word's index is wrapped in a
        # one-element long tensor.
        loss = loss_function(log_probs, torch.tensor([word_to_ix[target]], dtype=torch.long))

        # Step 5. Backward pass, then update the parameters.
        loss.backward()
        optimizer.step()

        # tensor.item() extracts the Python number from a 1-element tensor.
        total_loss += loss.item()
        # Running mean loss of the current epoch; end='\r' returns the cursor
        # to the start of the line so the next print overwrites this one.
        print('Epoch %d - Loss %f' % (epoch+1, total_loss / (i+1)), end='\r')

    losses.append(total_loss)
print(losses)  # one summed loss per epoch

[(['antigo', 'testamento'], 'gênesis'), (['testamento', 'gênesis'], 'gênesis'), (['gênesis', 'gênesis'], '1')]
Dataset size: 858957
Vocab size: 30460
Epoch 1 - Loss 6.4657529

## Implementando CBOW

In [None]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

# Deriving a set from `raw_text` removes the duplicate words.
vocab = set(raw_text)
vocab_size = len(vocab)

# One integer id per distinct word.
word_to_ix = {word: i for i, word in enumerate(vocab)}

# Build (context, target) pairs. The window is derived from CONTEXT_SIZE
# instead of repeating the literal 2, so changing the constant changes the
# data. Context order is left-to-right with the target itself left out:
# [w(i-2), w(i-1), w(i+1), w(i+2)] for CONTEXT_SIZE = 2.
data = []
for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE):
    context = [raw_text[i + offset]
               for offset in range(-CONTEXT_SIZE, CONTEXT_SIZE + 1)
               if offset != 0]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])


class CBOW(nn.Module):
    """Skeleton of the continuous bag-of-words model (exercise placeholder).

    The layers and the forward computation are intentionally left to be
    written. The skeleton is nevertheless a valid nn.Module, so layers can
    be assigned in __init__ once the exercise is filled in.
    """

    def __init__(self):
        # Without this call nn.Module's internal registries do not exist and
        # neither assigning sub-modules nor calling the model works.
        super().__init__()

    def forward(self, inputs):
        """Compute the model output for `inputs` -- not implemented yet.

        Raises:
            NotImplementedError: always, until the exercise is completed.
                Raising makes a forgotten implementation fail loudly instead
                of silently returning None.
        """
        raise NotImplementedError("CBOW.forward is left as an exercise")

# create your model and train.  here are some functions to help you make
# the data ready for use by your module


def make_context_vector(context, word_to_ix):
    """Convert a list of context words into a tensor of vocabulary indices.

    Args:
        context: iterable of words; each must be a key of word_to_ix
            (an unknown word raises KeyError).
        word_to_ix: mapping from word to integer index.

    Returns:
        1-D tensor of dtype torch.long with one index per context word, in
        the same order as `context`.
    """
    return torch.tensor(
        list(map(word_to_ix.__getitem__, context)),
        dtype=torch.long,
    )


make_context_vector(data[0][0], word_to_ix)  # example: index tensor for the context words of the first sample