# Notebook exploratório

- Artigo: Train one get one free
- Link: https://arxiv.org/abs/1903.12431

## 0.1 Inicializando o GloVe

In [1]:
import torchtext as tt

In [2]:
glove = tt.vocab.GloVe('6B', dim=50)
glove_vocab = set(glove.stoi.keys())

## 1.1 Carregando dados de uma das bases para exemplo

In [3]:
import pandas as pd

In [4]:
eclipse = pd.read_csv('../../data/processed/eclipse.csv', index_col='bug_id')
eclipse.head()

Unnamed: 0_level_0,short_desc
bug_id,Unnamed: 1_level_1
1,Usability issue with external editors (1GE6IRL)
2,Opening repository resources doesn't honor typ...
3,Sync does not indicate deletion (1GIEN83)
4,need better error message if catching up over ...
5,ISharingManager sharing API inconsistent (1GAU...


## 1.2 Preprocessamento dos dados

In [5]:
remove_nonalpha = lambda word: ''.join([x for x in word if str.isalpha(x) or str.isnumeric(x) or x == ' '])

short_desc = eclipse['short_desc']
short_desc = short_desc.apply(str.lower)
short_desc = short_desc.apply(remove_nonalpha)
short_desc.head()

bug_id
1        usability issue with external editors 1ge6irl
2    opening repository resources doesnt honor type...
3              sync does not indicate deletion 1gien83
4    need better error message if catching up over ...
5     isharingmanager sharing api inconsistent 1gaul8h
Name: short_desc, dtype: object

## 1.3 Explorando vocabulário das descrições

In [6]:
from collections import defaultdict

In [7]:
base_freq = defaultdict(int)
for sentence in short_desc:
    for word in sentence.split():
        base_freq[word] += 1
base_vocab = set(base_freq.keys())
len(base_vocab)

111313

## 1.4 Explorando diferenças entre vocabulário do GloVe e das descrições

In [8]:
print("Número de palavras distintas: ", len(base_vocab - glove_vocab))

Número de palavras distintas:  94763


## 1.5 Removendo palavras menos frequentes do vocabulário

In [9]:
MIN_FREQ = 3

base_freq = {key: value for key, value in base_freq.items() if value > MIN_FREQ}
base_freq
base_vocab = set(base_freq.keys())
print("Número de palavras do vocabulário: ", len(base_vocab))
print("Número de palavras compartilhadas entre os dois vocabulários: ", len(base_vocab & glove_vocab))
print("Número de palavras distintas: ", len(base_vocab - glove_vocab))

Número de palavras do vocabulário:  17026
Número de palavras compartilhadas entre os dois vocabulários:  9193
Número de palavras distintas:  7833


## 1.6 Obtendo pesos das palavras existentes no vocabulário do GloVe

* Obtendo palavras que estão dentro do vocabulário do GloVe
* Extraindo o STOI das palavras que estão apenas no vocabulário
* Obtendo os vetores correspondentes as palavras pertencentes ao vocabulário
* Gerando um novo STOI iniciado de 0 usando apenas as palavras existentes no vocabulário

In [10]:
import torch

In [11]:
in_vocab = base_vocab & glove_vocab
in_size = len(in_vocab)

in_stoi = {x: glove.stoi[x] for x in in_vocab}
in_stoi = {k: v for k, v in sorted(in_stoi.items(), key=lambda x:x[1])}
in_vectors = glove.vectors[list(in_stoi.values())]
in_stoi = {key: value for key, value in zip(in_stoi.keys(), range(in_size))}

## 1.7 Gerando pesos para palavras fora do vocabulário do GloVe

* Obtem as palavras que não estão no vocabulário
* Gera um vetor de zeros com o shape `[n_palavras_fora_vocab, shape_do_glove]`
* Utiliza o vetor de zeros pra gerar um tensor que segue distribuição normal
* Gera um STOI para continuar o STOI anterior

In [12]:
glove_shape = glove.vectors.shape[-1]

out_vocab = sorted(list(base_vocab - glove_vocab))
out_size = len(out_vocab)

zeros = torch.zeros((out_size, glove_shape))
out_vectors = torch.normal(mean=zeros, std=1)

out_stoi = {key: value for key, value in zip(out_vocab, range(in_size, in_size + out_size))}

## 1.8 Gerando pesos para padding

In [95]:
glove_shape = glove.vectors.shape[-1]
zero_vectors = torch.zeros((1, glove_shape))
zero_stoi = {"<PAD>": in_size + out_size}
zero_id = zero_stoi['<PAD>']

## 1.8 Unindo representações

* Primeiros elementos são os caras presentes no GloVe
* Próximos elementos são os que não estão presentes no GloVe
* Seguido por um elemento que representa padding

In [14]:
base_vectors = torch.cat((in_vectors, out_vectors, zero_vectors))
base_stoi = {**in_stoi, **out_stoi, **zero_stoi}

print("Tamanho do vocabulário final: ", len(base_stoi))
print("Shape do vetor final: ", base_vectors.shape)

Tamanho do vocabulário final:  17027
Shape do vetor final:  torch.Size([17027, 50])


## 2.1 Implementando tokenizador

In [33]:
class GloVeTokenizer(object):
    def __init__(self, stoi, max_size):
        self.stoi = stoi
        self.max_sentence_size = max_size
        self.PADDING_CHAR = '<PAD>'
    def tokenize(self, sentences):
        max_sentence_size = self.max_sentence_size
        tokenized_sentences = list()
        for sentence in sentences:
            tok_index = 0
            tokenized_sentence = list()
            for word in sentence.split():
                if word in self.stoi and tok_index < max_sentence_size:
                    tokenized_sentence.append([self.stoi[word]])
                    tok_index += 1
            for i in range(tok_index, max_sentence_size):
                tokenized_sentence.append([self.stoi[self.PADDING_CHAR]])
            tokenized_sentences.append(tokenized_sentence)
        return torch.LongTensor(tokenized_sentences)
    def tokens_to_onehot(self, tokenized_sentences):
        # one liner da internet
        # https://stackoverflow.com/questions/36960320/convert-a-2d-matrix-to-a-3d-one-hot-matrix-numpy
        input_tensors = (torch.arange(tokenized_sentences.max() + 1) == tokenized_sentences[..., None]).int().squeeze()
        return input_tensors

In [34]:
glove_tok = GloVeTokenizer(base_stoi, max_size=300)
tokenized_words = glove_tok.tokenize(short_desc[:10])
onehot_words = glove_tok.tokens_to_onehot(tokenized_words)

In [35]:
tokenized_words.shape, onehot_words.shape

(torch.Size([10, 300, 1]), torch.Size([10, 300, 17027]))

## 3.1 Implementando rede

In [39]:
import pytorch_lightning as pl

In [108]:
class BugSimilarityModel(pl.LightningModule):
    def __init__(self, word_embedding, embed_dim):
        super().__init__()
        self.word_embedding = torch.nn.Linear(in_features=word_embedding.shape[0], out_features=word_embedding.shape[-1], bias=False)
        self.word_embedding.weights = word_embedding
        self.bi_gru = torch.nn.GRU(input_size=word_embedding.shape[-1], hidden_size=embed_dim, batch_first=True, bidirectional=True)
        self.self_attention = torch.nn.MultiheadAttention(2*embed_dim, num_heads=1)
        self.conditional_attention = torch.nn.MultiheadAttention(embed_dim, num_heads=1)
        self.mlp = torch.nn.Linear(embed_dim, 100)
    def forward(self, P, Q):
        # Report P
        p = self.word_embedding(P)
        p_output, p_h_n = self.bi_gru(p)
        p_output = p_output.permute(1, 0, 2)
        print(p_output.shape)
        theta_p = self.self_attention(p_output, p_output, p_output)

        # Report Q
        #q = self.word_embedding(Q)
        #q_output, q_h_n = self.bi_gru(q)
        #q_output = q_output.permute(1, 0, 2)
        #theta_q = self.self_attention(q_output, q_output, q_output)

        return theta_p#, #theta_q

        return gru_output_forward, gru_output_backward,h_forward, h_backward
    def training_step(self, batch, batch_idx):
        pass
    def configure_optimizers(self):
        pass

In [109]:
model = BugSimilarityModel(base_vectors, 100)

In [110]:
model.forward(onehot_words.float(), None)

torch.Size([300, 10, 200])


(tensor([[[ 0.0204,  0.0134, -0.0420,  ...,  0.0397, -0.0244,  0.0028],
          [ 0.0204,  0.0133, -0.0420,  ...,  0.0397, -0.0244,  0.0028],
          [ 0.0204,  0.0133, -0.0420,  ...,  0.0397, -0.0244,  0.0028],
          ...,
          [ 0.0204,  0.0133, -0.0420,  ...,  0.0397, -0.0244,  0.0028],
          [ 0.0204,  0.0134, -0.0420,  ...,  0.0397, -0.0244,  0.0028],
          [ 0.0204,  0.0133, -0.0420,  ...,  0.0397, -0.0244,  0.0028]],
 
         [[ 0.0204,  0.0134, -0.0420,  ...,  0.0397, -0.0244,  0.0028],
          [ 0.0204,  0.0133, -0.0420,  ...,  0.0397, -0.0244,  0.0028],
          [ 0.0204,  0.0133, -0.0420,  ...,  0.0397, -0.0244,  0.0028],
          ...,
          [ 0.0204,  0.0133, -0.0420,  ...,  0.0397, -0.0244,  0.0028],
          [ 0.0204,  0.0134, -0.0420,  ...,  0.0397, -0.0244,  0.0028],
          [ 0.0204,  0.0133, -0.0420,  ...,  0.0397, -0.0244,  0.0028]],
 
         [[ 0.0204,  0.0134, -0.0420,  ...,  0.0397, -0.0244,  0.0028],
          [ 0.0204,  0.0133,