In [2]:
import os
import sys
sys.path.insert(0, "../")

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim
import random
from tqdm import tqdm_notebook as tqdm

from transformers import BertTokenizer, BertModel

from learning.treelstm.utils import load_word_vectors
from learning.treelstm.trainer import Trainer
from learning.treelstm.metrics import Metrics
from learning.treelstm.model import *
from learning.treelstm.vocab import Vocab
import learning.treelstm.Constants as Constants
from learning.treelstm.dataset import QGDataset
from learning.treelstm.scripts.preprocess_lcquad import build_vocab
from itertools import product

# Root of the LC-QuAD data. The trailing slash matters: the split directories
# below are derived from it by plain string concatenation.
data_path = '../learning/treelstm/data/lc_quad/'
train_path, dev_path, test_path = (
    f'{data_path}{split}/' for split in ('train', 'dev', 'test')
)
checkpoints_path = '../learning/treelstm/checkpoints'

In [23]:
# Seed both PyTorch and Python's `random` module with the same `args.seed`.
# NOTE(review): `args` is created by the `Struct` cell that appears further
# down in this notebook, so that cell has to be executed before this one.
torch.manual_seed(args.seed)
random.seed(args.seed)

In [24]:
# The value returned by get_num_threads() is discarded: it is not the last
# expression of the cell, so it is neither displayed nor stored.
torch.get_num_threads()
# Set the number of threads PyTorch uses to 2.
torch.set_num_threads(2)

In [8]:
class Struct(object):
    # Deliberately empty: instances only serve as attribute containers.
    pass


# Run settings and hyper-parameters read by the other cells through `args.<name>`.
args = Struct()
vars(args).update(
    seed=41,
    cuda=False,
    batchsize=20,
    mem_dim=150,
    hidden_dim=50,
    num_classes=2,
    input_dim=300,
    sparse=False,
    lr=0.01,
    wd=1e-4,
    epochs=15,
)

In [25]:
# Vocabulary file of the dataset plus the four special words handed to Vocab
# alongside it (padding, unknown, begin-of-sentence, end-of-sentence).
vocab_file = os.path.join(data_path, 'dataset.vocab')
special_words = [Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD]
vocab = Vocab(vocab_file, special_words)

In [26]:
# Build the embedding matrix for `vocab`, or load it from the local cache file
# if a previous run already produced it. Rows start from random values, the
# special-word rows are zeroed, and every vocabulary word found in the GloVe
# file has its row replaced by the GloVe vector.
try:
    emb = torch.load('glove_lc_merged_emb.pth')
except FileNotFoundError:
    # Only a missing cache triggers the rebuild. Any other failure (corrupt
    # file, permission problem, ...) now surfaces instead of being swallowed
    # by a bare `except:`.
    # NOTE(review): normal_(-0.05, 0.05) draws from a normal distribution with
    # mean -0.05 and std 0.05 -- confirm that a uniform range was not intended.
    emb = torch.Tensor(vocab.size(), 300).normal_(-0.05, 0.05)

    # Zero the rows of the special words. This relies on them occupying the
    # first indices of the vocabulary, in this order -- TODO confirm against Vocab.
    special_words = [Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD]
    for idx in range(len(special_words)):
        emb[idx].zero_()

    # The encoding is given explicitly so that parsing does not depend on the
    # platform's default locale.
    with open('../learning/treelstm/data/glove.840B.300d.txt', 'r', encoding='utf-8') as file:
        for line in tqdm(file):
            # Split from the right into token + 300 numbers: some entries of
            # the 840B GloVe release contain spaces inside the token itself,
            # and a plain split(' ') would then push token fragments into the
            # float conversion below.
            contents = line.rstrip('\n').rsplit(' ', 300)
            idx = vocab.getIndex(contents[0])
            # Truthiness check kept from the original: it skips words that are
            # not in the vocabulary and also index 0, whose row stays zeroed.
            if idx:
                emb[idx] = torch.Tensor(list(map(float, contents[1:])))

    torch.save(emb, 'glove_lc_merged_emb.pth')

In [28]:
# Load the BERT tokenizer once and share it between the three splits; the
# dataset only calls its `encode` method, so one instance is enough and the
# pretrained files are not loaded three times.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_set = QGDataset(train_path, vocab, bert_tokenizer, args.num_classes)
dev_set = QGDataset(dev_path, vocab, bert_tokenizer, args.num_classes)
test_set = QGDataset(test_path, vocab, bert_tokenizer, args.num_classes)

HBox(children=(IntProgress(value=0, max=7896), HTML(value='')))




HBox(children=(IntProgress(value=0, max=7896), HTML(value='')))




HBox(children=(IntProgress(value=0, max=7896), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2265), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2265), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2265), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1090), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1090), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1090), HTML(value='')))




In [27]:
# Similarity module; the model constructed in a later cell calls it on the
# reduced BERT vector and the Tree-LSTM state.
similarity = DASimilarity(args.mem_dim, args.hidden_dim, args.num_classes)
# Alternative similarity module, currently disabled.
#similarity = CosSimilarity(1)

In [40]:
# Instantiate the combined BERT + Tree-LSTM model.
# NOTE(review): SimilarityEncodersBERT is defined in a cell further down this
# notebook; that cell has to be executed before this one.
model = SimilarityEncodersBERT(vocab.size(), args.input_dim, args.mem_dim, similarity, args.sparse)

In [41]:
# KL-divergence loss with its default settings.
criterion = nn.KLDivLoss()
# Adagrad over every parameter of the model; `model.bert` is a submodule, so
# the BERT weights are optimised too.
optimizer = torch.optim.Adagrad(model.parameters(), lr=args.lr, weight_decay=args.wd)

In [42]:
# Overwrite the model's embedding table in place with the matrix `emb` prepared
# in the GloVe cell above; being the last expression, the tensor is displayed.
model.emb.weight.data.copy_(emb)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.4421, -0.1307,  0.1318,  ..., -0.6867, -0.5819, -0.6584],
        ...,
        [-0.0736, -0.0562, -0.0973,  ..., -0.0250, -0.0307,  0.0238],
        [ 0.1746,  0.1117,  0.6059,  ..., -0.3987, -0.4725,  0.1404],
        [ 0.2707, -0.0874, -0.3683,  ...,  0.0368, -0.2172, -0.3406]])

In [43]:
# Bundle model, loss and optimizer into the trainer used by the training loop.
# NOTE(review): TrainerBERT is defined in a cell further down this notebook;
# that cell has to be executed before this one.
trainer = TrainerBERT(args, model, criterion, optimizer)
# NOTE(review): `metrics` is not used in the cells visible here.
metrics = Metrics(args.num_classes)

### Training

In [44]:
# Train for `args.epochs` epochs. After each epoch the model is re-evaluated on
# the training set and a checkpoint is written to the working directory.
for epoch in range(args.epochs):
    # Mean loss accumulated during the optimisation pass.
    train_loss = trainer.train(train_set)
    # Evaluation pass over the same data. The result gets its own name so the
    # loss returned by train() is no longer overwritten before being printed.
    eval_loss, train_pred = trainer.test(train_set)
    print('train_loss:', train_loss)
    print('eval_loss:', eval_loss)
    print('train_pred:', train_pred)
    # NOTE(review): the optimizer object itself is pickled here; PyTorch's
    # serialization notes recommend storing `optimizer.state_dict()` instead.
    # Left unchanged so that existing readers of these checkpoints keep working.
    # NOTE(review): `checkpoints_path` is defined at the top of the notebook
    # but the files are written next to the notebook.
    checkpoint = {'model': trainer.model.state_dict(), 'optim': trainer.optimizer,
                  'args': args, 'epoch': epoch}
    torch.save(checkpoint, 'checkpoint_bert_' + str(epoch) + '.pth')

HBox(children=(IntProgress(value=0, description='Training epoch 1', max=7896, style=ProgressStyle(description_…

  out = F.log_softmax(self.wp(out))





RuntimeError: [enforce fail at CPUAllocator.cpp:64] . DefaultCPUAllocator: can't allocate memory: you tried to allocate 93763584 bytes. Error code 12 (Cannot allocate memory)


# Code

In [39]:
# Naming convention used below:
#   left  side ("a") -> the sentence
#   right side ("b") -> the query

class SimilarityEncodersBERT(nn.Module):
    """Scores a sentence against a query tree.

    The sentence (left input) is encoded with BERT and projected down to
    `mem_dim`; the query (right input) is embedded and encoded with a
    child-sum Tree-LSTM. Both encodings are passed to `similarity`.
    """

    def __init__(self, vocab_size, in_dim, mem_dim, similarity, sparsity):
        super().__init__()
        # Embedding table for the query tokens.
        self.emb = nn.Embedding(vocab_size, in_dim, padding_idx=Constants.PAD, sparse=sparsity)
        # Pretrained sentence encoder.
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Tree encoder for the query side.
        self.query_treelstm = ChildSumTreeLSTM(in_dim, mem_dim)
        # Projects the 768-wide BERT vector down to the Tree-LSTM memory size.
        self.bert_converter = nn.Linear(768, mem_dim)
        self.similarity = similarity

    def forward(self, linputs, rtree, rinputs):
        """Return the similarity module's output for one sentence/query pair.

        linputs: sequence of token indices for the sentence (fed to BERT).
        rtree:   tree structure of the query.
        rinputs: token indices of the query (looked up in `self.emb`).
        """
        query_embeddings = self.emb(rinputs)
        # Add a batch dimension of size one before calling BERT.
        bert_outputs = self.bert(linputs.unsqueeze(0))
        # First element of the BERT output: hidden states of the last layer.
        last_hidden = bert_outputs[0]
        # Position 0 holds the [CLS] token; the list index keeps the batch axis.
        cls_vector = last_hidden[[0], 0, :]
        sentence_repr = self.bert_converter(cls_vector)
        # The second value returned by the Tree-LSTM is not used.
        query_state, _ = self.query_treelstm(rtree, query_embeddings)
        return self.similarity(sentence_repr, query_state)

In [36]:
from torch.autograd import Variable as Var
from learning.treelstm.utils import map_label_to_target

class TrainerBERT(object):
    """Runs training and evaluation passes of a model over a dataset.

    The dataset is expected to yield `(lsent, rtree, rsent, label)` items and
    to expose `num_classes`. `args` must provide `cuda` and `batchsize`.
    """

    def __init__(self, args, model, criterion, optimizer):
        super(TrainerBERT, self).__init__()
        self.args = args
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        # Number of completed training epochs; only used in progress-bar labels.
        self.epoch = 0

    # helper function for training
    def train(self, dataset):
        """Run one epoch over `dataset` in random order.

        Samples are processed one at a time; gradients are accumulated and the
        optimizer steps once every `args.batchsize` samples. Gradients left
        over from a final, incomplete batch are applied as well (previously
        they were discarded by the next epoch's `zero_grad()`).

        Returns the mean per-sample loss as a Python float.
        """
        self.model.train()
        self.optimizer.zero_grad()
        total_loss, seen = 0.0, 0
        indices = torch.randperm(len(dataset))
        for idx in tqdm(range(len(dataset)), desc='Training epoch ' + str(self.epoch + 1) + ''):
            lsent, rtree, rsent, label = dataset[int(indices[idx])]
            # Tensors are used directly: the Variable wrapper has been a no-op
            # since tensors and variables were merged.
            linput, rinput = lsent, rsent
            target = map_label_to_target(label, dataset.num_classes)
            if self.args.cuda:
                linput, rinput = linput.cuda(), rinput.cuda()
                target = target.cuda()
            output = self.model(linput, rtree, rinput)
            err = self.criterion(output, target)
            total_loss += err.item()
            err.backward()
            seen += 1
            if seen % self.args.batchsize == 0:
                self.optimizer.step()
                self.optimizer.zero_grad()
        # Apply the gradients of the trailing partial batch, if there is one.
        if seen % self.args.batchsize != 0:
            self.optimizer.step()
            self.optimizer.zero_grad()
        self.epoch += 1
        return total_loss / len(dataset)

    # helper function for testing
    def test(self, dataset):
        """Evaluate the model on `dataset` without tracking gradients.

        Returns `(mean_loss, predictions)`. `mean_loss` is the accumulated
        criterion value divided by `len(dataset)` (a 0-dim tensor, as before).
        `predictions[i]` is the dot product of `[1, ..., num_classes]` with
        `exp(output)`, i.e. `exp(output)` is treated as class probabilities.
        """
        self.model.eval()
        loss = 0
        predictions = torch.zeros(len(dataset))
        indices = torch.arange(1, dataset.num_classes + 1, dtype=torch.float)
        # `volatile=True` has had no effect since PyTorch 0.4, so the old code
        # built a full autograd graph for every evaluated sample. no_grad()
        # restores the intended inference-only behaviour.
        with torch.no_grad():
            for idx in tqdm(range(len(dataset)), desc='Testing epoch  ' + str(self.epoch) + ''):
                lsent, rtree, rsent, label = dataset[idx]
                linput, rinput = lsent, rsent
                target = map_label_to_target(label, dataset.num_classes)
                if self.args.cuda:
                    linput, rinput = linput.cuda(), rinput.cuda()
                    target = target.cuda()
                output = self.model(linput, rtree, rinput)
                err = self.criterion(output, target)
                loss += err.detach()
                output = output.detach().squeeze().cpu()
                predictions[idx] = torch.dot(indices, torch.exp(output))
        return loss / len(dataset), predictions


In [18]:
import json
from copy import deepcopy
import torch.utils.data as data
from transformers import BertTokenizer
from learning.treelstm.tree import Tree

class QGDataset(data.Dataset):
    """Dataset of (sentence, query tree, query tokens, label) items.

    Four parallel files are read from `path`:
      * `a.txt`     - one sentence per line, encoded with the BERT tokenizer
      * `b.toks`    - one tokenised query per line, mapped to vocabulary indices
      * `b.parents` - one line of parent pointers per query, turned into a Tree
      * `sim.txt`   - one float label per line
    """

    def __init__(self, path, vocab, bert_tok, num_classes):
        super().__init__()
        self.vocab = vocab
        self.bert_tok = bert_tok
        self.num_classes = num_classes

        # Convert the tokens of both sides into index tensors.
        self.lsentences = self.read_sentences(os.path.join(path, 'a.txt'), bert=True)
        self.rsentences = self.read_sentences(os.path.join(path, 'b.toks'))
        self.rtrees = self.read_trees(os.path.join(path, 'b.parents'))

        # Build the tensor of labels.
        self.labels = self.read_labels(os.path.join(path, 'sim.txt'))

        # The number of items is taken from the left-hand sentences.
        self.size = len(self.lsentences)

    def __len__(self):
        return self.size

    def __getitem__(self, index):
        """Return deep copies of the four fields stored at `index`."""
        fields = (self.lsentences, self.rtrees, self.rsentences, self.labels)
        return tuple(deepcopy(field[index]) for field in fields)

    def read_sentences(self, filename, bert=False):
        """Read `filename` and convert every line with `read_sentence`."""
        with open(filename, 'r') as handle:
            lines = handle.readlines()
            encoded = []
            for raw in tqdm(lines):
                encoded.append(self.read_sentence(raw, bert))
        return encoded

    def read_sentence(self, line, bert):
        """Turn one line into a LongTensor of indices.

        With `bert` set, the BERT tokenizer encodes the raw line (special
        tokens included); otherwise the whitespace-split tokens are looked up
        in the vocabulary, with the unknown word as fallback.
        """
        if not bert:
            indices = self.vocab.convertToIdx(line.split(), Constants.UNK_WORD)
        else:
            indices = self.bert_tok.encode(line, add_special_tokens=True)
        return torch.LongTensor(indices)

    def read_trees(self, filename):
        """Read `filename` and build one tree per line with `read_tree`."""
        with open(filename, 'r') as handle:
            lines = handle.readlines()
            built = []
            for raw in tqdm(lines):
                built.append(self.read_tree(raw))
        return built

    def read_tree(self, line):
        """Build a Tree from a line of 1-based parent pointers.

        Entry `k` (0-based) holds the parent of node `k`: 0 marks the root and
        -1 marks a token that is left out of the tree. Returns the root node,
        or None when no entry is 0.
        """
        parents = [int(tok) for tok in line.split()]
        nodes = {}
        root = None
        for start in range(1, len(parents) + 1):
            # Skip tokens that already have a node or that are excluded.
            if (start - 1) in nodes or parents[start - 1] == -1:
                continue
            idx = start
            prev = None
            # Walk upwards, creating nodes until an existing node or the root
            # is reached.
            while True:
                parent = parents[idx - 1]
                if parent == -1:
                    break
                node = Tree()
                if prev is not None:
                    node.add_child(prev)
                nodes[idx - 1] = node
                node.idx = idx - 1
                if (parent - 1) in nodes:
                    nodes[parent - 1].add_child(node)
                    break
                if parent == 0:
                    root = node
                    break
                prev = node
                idx = parent
        return root

    def read_labels(self, filename):
        """Read one float per line from `filename` into a float tensor."""
        with open(filename, 'r') as handle:
            values = [float(row) for row in handle.readlines()]
            label_tensor = torch.Tensor(values)
        return label_tensor
