In [1]:
import torchtext
from torchtext import data
from torchtext import datasets
import spacy

In [2]:
TEXT = datasets.snli.ParsedTextField(lower=True)

LABEL = data.Field(sequential=False)

In [91]:
train, val, test = datasets.SNLI.splits(TEXT, LABEL)

In [92]:
print(train.fields)
print(len(train))
print(vars(train[0]))

{'premise': <torchtext.datasets.snli.ParsedTextField object at 0x1150bfba8>, 'hypothesis': <torchtext.datasets.snli.ParsedTextField object at 0x1150bfba8>, 'label': <torchtext.data.field.Field object at 0x11508b940>}
549367
{'premise': ['a', 'person', 'on', 'a', 'horse', 'jumps', 'over', 'a', 'broken', 'down', 'airplane.'], 'hypothesis': ['a', 'person', 'is', 'training', 'his', 'horse', 'for', 'a', 'competition.'], 'label': 'neutral'}


In [93]:
TEXT.build_vocab(train)
LABEL.build_vocab(train)

In [101]:
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train, val, test), batch_size=5, device=-1)

In [102]:
len(train_iter)

109874

In [104]:
batch = next(iter(train_iter))

In [105]:
batch.batch_size

5

In [213]:
batch.premise.size()

torch.Size([16, 5])

In [107]:
print(batch.premise)

Variable containing:
     1      1      1      1      1
  1101      1      1      1      1
    59      1      1      1      1
   116      1      1      1      1
    10    444   2633   3343    445
    40     19     47   1646    417
     4      4    329     71     15
   416     95  10717    251   1057
  1420      2     10     60    373
   891     11    887    897      3
    84     82      2   2215     15
    14      3     11      2    277
    20      4     28     17     15
    10   2194      5      5    157
    33     29     12     12     76
     2      2      2      3     13
[torch.LongTensor of size 16x5]



In [109]:
print(batch.label)

Variable containing:
 1
 1
 3
 3
 3
[torch.LongTensor of size 5]



In [189]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import logging
import torch.optim as optim
import time
import numpy as np
import sys
import argparse
from random import shuffle

In [186]:
class encoder(nn.Module):

    def __init__(self, num_embeddings, embedding_size, hidden_size, para_init):
        super(encoder, self).__init__()

        self.num_embeddings = num_embeddings
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.para_init = para_init

        self.embedding = nn.Embedding(self.num_embeddings, self.embedding_size)
        self.input_linear = nn.Linear(self.embedding_size, self.hidden_size, bias=False)
        
        # input_linear initialize a normal weught for each input
        for m in self.modules():
            # check if module is torch.nn.modules.linear.Linear, which is used to select input_linear
            if isinstance(m, nn.Linear):
                m.weight.data.normal_(0, self.para_init)
                # m.bias.data.uniform_(-0.01, 0.01)

    def forward(self, sent1, sent2):
        '''
        sent: batch_size x length (Long tensor)
        '''
        batch_size = sent1.size(0)
        sent1 = self.embedding(sent1)
        sent2 = self.embedding(sent2)

        sent1 = sent1.view(-1, self.embedding_size)
        sent2 = sent2.view(-1, self.embedding_size)

        sent1_linear = self.input_linear(sent1).view(batch_size, -1, self.hidden_size)
        sent2_linear = self.input_linear(sent2).view(batch_size, -1, self.hidden_size)

        return sent1_linear, sent2_linear

class atten(nn.Module):
    '''
        intra sentence attention
    '''

    def __init__(self, hidden_size, label_size, para_init):
        super(atten, self).__init__()

        self.hidden_size = hidden_size
        self.label_size = label_size
        self.para_init = para_init

        self.mlp_f = self._mlp_layers(self.hidden_size, self.hidden_size)
        self.mlp_g = self._mlp_layers(2 * self.hidden_size, self.hidden_size)
        self.mlp_h = self._mlp_layers(2 * self.hidden_size, self.hidden_size)

        self.final_linear = nn.Linear(
            self.hidden_size, self.label_size, bias=True)

        self.log_prob = nn.LogSoftmax()

        '''initialize parameters'''
        for m in self.modules():
            # print m
            if isinstance(m, nn.Linear):
                m.weight.data.normal_(0, self.para_init)
                m.bias.data.normal_(0, self.para_init)

    def _mlp_layers(self, input_dim, output_dim):
        mlp_layers = []
        mlp_layers.append(nn.Dropout(p=0.2))
        mlp_layers.append(nn.Linear(
            input_dim, output_dim, bias=True))
        mlp_layers.append(nn.ReLU())
        mlp_layers.append(nn.Dropout(p=0.2))
        mlp_layers.append(nn.Linear(
            output_dim, output_dim, bias=True))
        mlp_layers.append(nn.ReLU())        
        return nn.Sequential(*mlp_layers)   # * used to unpack list

    def forward(self, sent1_linear, sent2_linear):
        '''
            sent_linear: batch_size x length x hidden_size
        '''
        len1 = sent1_linear.size(1)
        len2 = sent2_linear.size(1)

        '''attend'''

        f1 = self.mlp_f(sent1_linear.view(-1, self.hidden_size))
        f2 = self.mlp_f(sent2_linear.view(-1, self.hidden_size))

        f1 = f1.view(-1, len1, self.hidden_size)
        # batch_size x len1 x hidden_size
        f2 = f2.view(-1, len2, self.hidden_size)
        # batch_size x len2 x hidden_size

        score1 = torch.bmm(f1, torch.transpose(f2, 1, 2))
        # e_{ij} batch_size x len1 x len2
        prob1 = F.softmax(score1.view(-1, len2)).view(-1, len1, len2)
        # batch_size x len1 x len2

        score2 = torch.transpose(score1.contiguous(), 1, 2)
        score2 = score2.contiguous()
        # e_{ji} batch_size x len2 x len1
        prob2 = F.softmax(score2.view(-1, len1)).view(-1, len2, len1)
        # batch_size x len2 x len1

        sent1_combine = torch.cat(
            (sent1_linear, torch.bmm(prob1, sent2_linear)), 2)
        # batch_size x len1 x (hidden_size x 2)
        sent2_combine = torch.cat(
            (sent2_linear, torch.bmm(prob2, sent1_linear)), 2)
        # batch_size x len2 x (hidden_size x 2)

        '''sum'''
        g1 = self.mlp_g(sent1_combine.view(-1, 2 * self.hidden_size))
        g2 = self.mlp_g(sent2_combine.view(-1, 2 * self.hidden_size))
        g1 = g1.view(-1, len1, self.hidden_size)
        # batch_size x len1 x hidden_size
        g2 = g2.view(-1, len2, self.hidden_size)
        # batch_size x len2 x hidden_size

        sent1_output = torch.sum(g1, 1)  # batch_size x 1 x hidden_size
        sent1_output = torch.squeeze(sent1_output, 1)
        sent2_output = torch.sum(g2, 1)  # batch_size x 1 x hidden_size
        sent2_output = torch.squeeze(sent2_output, 1)

        input_combine = torch.cat((sent1_output, sent2_output), 1)
        # batch_size x (2 * hidden_size)
        h = self.mlp_h(input_combine)
        # batch_size * hidden_size

        # if sample_id == 15:
        #     print '-2 layer'
        #     print h.data[:, 100:150]

        h = self.final_linear(h)

        # print 'final layer'
        # print h.data

        log_prob = self.log_prob(h)

        return log_prob

In [266]:
max_lenght = 10
embedding_size = 50
hidden_size = 300
para_init = 0.01  
train_lbl_size = 3
weight_decay = 5e-5 #l2 regularization
learning_rate = 0.05
epoch = 250

In [245]:
# import pretrained word vectors from glove

glove_home = './glove_6B/'
words_to_load = 50000

import numpy as np

# pre-trained word vectors
with open(glove_home + 'glove.6B.50d.txt') as f:
    word_vecs = np.zeros((words_to_load, embedding_size)) #dim: (50000, 50)
    words = {}
    idx2words = {}
    ordered_words = []
    for i, line in enumerate(f):
        if i >= words_to_load: 
            break
        s = line.split()
        
        word_vecs[i, :] = np.asarray(s[1:])
        words[s[0]] = i
        idx2words[i] = s[0]
        ordered_words.append(s[0])

In [246]:
word_vecs = torch.from_numpy(word_vecs)

In [247]:
word_vecs.size()

torch.Size([50000, 50])

In [248]:
input_encoder = encoder(word_vecs.size(0), embedding_size, hidden_size, para_init)

In [251]:
input_encoder.embedding.weight.data.copy_(word_vecs)
input_encoder.embedding.weight.requires_grad = False
inter_atten = atten(hidden_size, train_lbl_size, para_init)

In [253]:
para1 = filter(lambda p: p.requires_grad, input_encoder.parameters())
para2 = inter_atten.parameters()

In [259]:
input_optimizer = optim.Adagrad(para1, learning_rate, weight_decay)
inter_atten_optimizer = optim.Adagrad(para2, learning_rate, weight_decay)

In [262]:
criterion = nn.CrossEntropyLoss()

In [269]:
train_batches  = next(iter(train_iter))

In [270]:
#train
for k in range(epoch):
    total = 0.
    correct = 0.
    loss_data = 0.
    train_sents = 0.

    shuffle(train_batches)
    timer = time.time()
    for i in range(len(train_batches)):

        train_src_batch, train_tgt_batch, train_lbl_batch = train_batches[i]

        train_src_batch = Variable(train_src_batch)
        train_tgt_batch = Variable(train_tgt_batch)
        train_lbl_batch = Variable(train_lbl_batch)

        batch_size = train_src_batch.size(0)
        train_sents += batch_size

        input_optimizer.zero_grad()
        inter_atten_optimizer.zero_grad()

        # initialize the optimizer
        if k == 0 and optim == 'Adagrad':
            for group in input_optimizer.param_groups:
                for p in group['params']:
                    state = input_optimizer.state[p]
                    state['sum'] += args.Adagrad_init
            for group in inter_atten_optimizer.param_groups:
                for p in group['params']:
                    state = inter_atten_optimizer.state[p]
                    state['sum'] += args.Adagrad_init

        train_src_linear, train_tgt_linear = input_encoder(
            train_src_batch, train_tgt_batch)
        log_prob = inter_atten(train_src_linear, train_tgt_linear)

        loss = criterion(log_prob, train_lbl_batch)

        loss.backward()

        grad_norm = 0.
        para_norm = 0.

        for m in input_encoder.modules():
            if isinstance(m, nn.Linear):
                grad_norm += m.weight.grad.data.norm() ** 2
                para_norm += m.weight.data.norm() ** 2
                if m.bias:
                    grad_norm += m.bias.grad.data.norm() ** 2
                    para_norm += m.bias.data.norm() ** 2

        for m in inter_atten.modules():
            if isinstance(m, nn.Linear):
                grad_norm += m.weight.grad.data.norm() ** 2
                para_norm += m.weight.data.norm() ** 2
                if m.bias:
                    grad_norm += m.bias.grad.data.norm() ** 2
                    para_norm += m.bias.data.norm() ** 2

        grad_norm ** 0.5
        para_norm ** 0.5

        shrinkage = args.max_grad_norm / grad_norm
        if shrinkage < 1 :
            for m in input_encoder.modules():
                # print m
                if isinstance(m, nn.Linear):
                    m.weight.grad.data = m.weight.grad.data * shrinkage
            for m in inter_atten.modules():
                # print m
                if isinstance(m, nn.Linear):
                    m.weight.grad.data = m.weight.grad.data * shrinkage
                    m.bias.grad.data = m.bias.grad.data * shrinkage

        input_optimizer.step()
        inter_atten_optimizer.step()

        _, predict = log_prob.data.max(dim=1)
        total += train_lbl_batch.data.size()[0]
        correct += torch.sum(predict == train_lbl_batch.data)
        loss_data += (loss.data[0] * batch_size)  # / train_lbl_batch.data.size()[0])

        if (i + 1) % args.display_interval == 0:
            logger.info('epoch %d, batches %d|%d, train-acc %.3f, loss %.3f, para-norm %.3f, grad-norm %.3f, time %.2fs, ' %
                        (k, i + 1, len(train_batches), correct / total,
                         loss_data / train_sents, para_norm, grad_norm, time.time() - timer))
            train_sents = 0.
            timer = time.time()
            loss_data = 0.
            correct = 0.
            total = 0.
        if i == len(train_batches) - 1:
            logger.info('epoch %d, batches %d|%d, train-acc %.3f, loss %.3f, para-norm %.3f, grad-norm %.3f, time %.2fs, ' %
                        (k, i + 1, len(train_batches), correct / total,
                         loss_data / train_sents, para_norm, grad_norm, time.time() - timer))
            train_sents = 0.
            timer = time.time()
            loss_data = 0.
            correct = 0.
            total = 0.           

    # evaluate
    if (k + 1) % args.dev_interval == 0:
        input_encoder.eval()
        inter_atten.eval()
        correct = 0.
        total = 0.
        for i in range(len(dev_batches)):
            dev_src_batch, dev_tgt_batch, dev_lbl_batch = dev_batches[i]

            dev_src_batch = Variable(dev_src_batch.cuda())
            dev_tgt_batch = Variable(dev_tgt_batch.cuda())
            dev_lbl_batch = Variable(dev_lbl_batch.cuda())

            # if dev_lbl_batch.data.size(0) == 1:
            #     # simple sample batch
            #     dev_src_batch=torch.unsqueeze(dev_src_batch, 0)
            #     dev_tgt_batch=torch.unsqueeze(dev_tgt_batch, 0)

            dev_src_linear, dev_tgt_linear=input_encoder(
                dev_src_batch, dev_tgt_batch)
            log_prob=inter_atten(dev_src_linear, dev_tgt_linear)

            _, predict=log_prob.data.max(dim=1)
            total += dev_lbl_batch.data.size()[0]
            correct += torch.sum(predict == dev_lbl_batch.data)

        dev_acc = correct / total
        logger.info('dev-acc %.3f' % (dev_acc))

        if (k + 1) / args.dev_interval == 1:
            model_fname = '%s%s_epoch-%d_dev-acc-%.3f' %(args.model_path, args.log_fname.split('.')[0], k, dev_acc)
            torch.save(input_encoder.state_dict(), model_fname + '_input-encoder.pt')
            torch.save(inter_atten.state_dict(), model_fname + '_inter-atten.pt')
            best_dev.append((k, dev_acc, model_fname))
            logger.info('current best-dev:')
            for t in best_dev:
                logger.info('\t%d %.3f' %(t[0], t[1]))
            logger.info('save model!') 
        else:
            if dev_acc > best_dev[-1][1]:
                model_fname = '%s%s_epoch-%d_dev-acc-%.3f' %(args.model_path, args.log_fname.split('.')[0], k, dev_acc)
                torch.save(input_encoder.state_dict(), model_fname + '_input-encoder.pt')
                torch.save(inter_atten.state_dict(), model_fname + '_inter-atten.pt')
                best_dev.append((k, dev_acc, model_fname))
                logger.info('current best-dev:')
                for t in best_dev:
                    logger.info('\t%d %.3f' %(t[0], t[1]))
                logger.info('save model!') 

        input_encoder.train()
        inter_atten.train()



TypeError: object of type 'Batch' has no len()

In [None]:
# test
best_model_fname = best_dev[-1][2]
input_encoder.load_state_dict(torch.load(best_model_fname + '_input-encoder.pt'))
inter_atten.load_state_dict(torch.load(best_model_fname + '_inter-atten.pt'))

input_encoder.eval()
inter_atten.eval()

correct = 0.
total = 0.

for i in range(len(test_batches)):
    test_src_batch, test_tgt_batch, test_lbl_batch = test_batches[i]

    test_src_batch = Variable(test_src_batch.cuda())
    test_tgt_batch = Variable(test_tgt_batch.cuda())
    test_lbl_batch = Variable(test_lbl_batch.cuda())

    test_src_linear, test_tgt_linear=input_encoder(
        test_src_batch, test_tgt_batch)
    log_prob=inter_atten(test_src_linear, test_tgt_linear)

    _, predict=log_prob.data.max(dim=1)
    total += test_lbl_batch.data.size()[0]
    correct += torch.sum(predict == test_lbl_batch.data)

test_acc = correct / total