In [1]:
import argparse

import torch
from torch import nn
from torch.autograd import Variable

from model.BIMPM import BIMPM
from model.utils import SNLI, Quora


def test(model, args, data, mode='test'):
    if mode == 'dev':
        iterator = iter(data.dev_iter)
    else:
        iterator = iter(data.test_iter)

    criterion = nn.CrossEntropyLoss()
    model.eval()
    acc, loss, size = 0, 0, 0

    for batch in iterator:
        if args.data_type == 'SNLI':
            s1, s2 = 'premise', 'hypothesis'
        else:
            s1, s2 = 'q1', 'q2'

        s1, s2 = getattr(batch, s1), getattr(batch, s2)
        kwargs = {'p': s1, 'h': s2}

        if args.use_char_emb:
            char_p = Variable(torch.LongTensor(data.characterize(s1)))
            char_h = Variable(torch.LongTensor(data.characterize(s2)))

            if args.gpu > -1:
                char_p = char_p.cuda()
                char_h = char_h.cuda()

            kwargs['char_p'] = char_p
            kwargs['char_h'] = char_h

        pred = model(**kwargs)
        pred = pred.view(-1,2)
        batch_loss = criterion(pred, batch.label)
        loss += batch_loss.data[0]

        _, pred = pred.max(dim=1)
        acc += (pred == batch.label).sum().float()
        size += len(pred)

    acc /= size
    acc = acc.cpu().data[0]
    return loss, acc


def load_model(args, data):
    model = BIMPM(args, data)
    model.load_state_dict(torch.load(args.model_path))

    if args.gpu > -1:
        model.cuda()

    return model




In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Attention(nn.Module):
    r"""
    Applies an attention mechanism on the output features from the decoder.
    .. math::
            \begin{array}{ll}
            x = context*output \\
            attn = exp(x_i) / sum_j exp(x_j) \\
            output = \tanh(w * (attn * context) + b * output)
            \end{array}
    Args:
        dim(int): The number of expected features in the output
    Inputs: output, context
        - **output** (batch, output_len, dimensions): tensor containing the output features from the decoder.
        - **context** (batch, input_len, dimensions): tensor containing features of the encoded input sequence.
    Outputs: output, attn
        - **output** (batch, output_len, dimensions): tensor containing the attended output features from the decoder.
        - **attn** (batch, output_len, input_len): tensor containing attention weights.
    Attributes:
        linear_out (torch.nn.Linear): applies a linear transformation to the incoming data: :math:`y = Ax + b`.
        mask (torch.Tensor, optional): applies a :math:`-inf` to the indices specified in the `Tensor`.
    Examples::
         >>> attention = seq2seq.models.Attention(256)
         >>> context = Variable(torch.randn(5, 3, 256))
         >>> output = Variable(torch.randn(5, 5, 256))
         >>> output, attn = attention(output, context)
    """
    def __init__(self, dim):
        super(Attention, self).__init__()
        self.linear_out = nn.Linear(dim*2, dim)
        self.mask = None

    def set_mask(self, mask):
        """
        Sets indices to be masked
        Args:
            mask (torch.Tensor): tensor containing indices to be masked
        """
        self.mask = mask

    def forward(self, output, context):
        batch_size = output.size(0)
        hidden_size = output.size(2)
        input_size = context.size(1)
        # (batch, out_len, dim) * (batch, in_len, dim) -> (batch, out_len, in_len)
        attn = torch.bmm(output, context.transpose(1, 2))
        if self.mask is not None:
            attn.data.masked_fill_(self.mask, -float('inf'))
        attn = F.softmax(attn.view(-1, input_size)).view(batch_size, -1, input_size)

        # (batch, out_len, in_len) * (batch, in_len, dim) -> (batch, out_len, dim)
        mix = torch.bmm(attn, context)

        # concat -> (batch, out_len, 2*dim)
        combined = torch.cat((mix, output), dim=2)
        # output -> (batch, out_len, dim)
        output = F.tanh(self.linear_out(combined.view(-1, 2 * hidden_size))).view(batch_size, -1, hidden_size)

        return output, attn
    
class Siamese(nn.Module):
    def __init__(self, args, data, use_attention = False):
        super(Siamese, self).__init__()

        self.args = args
        self.d = self.args.word_dim + int(self.args.use_char_emb) * self.args.char_hidden_size
        self.l = self.args.num_perspective

        # ----- Word Representation Layer -----
        self.char_emb = nn.Embedding(args.char_vocab_size, args.char_dim, padding_idx=0)

        self.word_emb = nn.Embedding(args.word_vocab_size, args.word_dim)
        # initialize word embedding with GloVe
        self.word_emb.weight.data.copy_(data.TEXT.vocab.vectors)
        # no fine-tuning for word vectors
        self.word_emb.weight.requires_grad = False
        self.trainingtype = args.training
        self.use_attention = use_attention
        if self.use_attention:
            self.attention = Attention(self.args.hidden_size*2)
        
        self.char_LSTM = nn.LSTM(
            input_size=self.args.char_dim,
            hidden_size=self.args.char_hidden_size,
            num_layers=1,
            bidirectional=False,
            batch_first=True)

        # ----- Context Representation Layer -----
        self.context_LSTM = nn.LSTM(
            input_size=self.d,
            hidden_size=self.args.hidden_size,
            num_layers=1,
            bidirectional=True,
            batch_first=True
        )
        self.aggregation_LSTM = nn.LSTM(
            input_size=self.args.hidden_size*2,
            hidden_size=self.args.hidden_size,
            num_layers=1,
            bidirectional=True,
            batch_first=True
        )

        self.Ws = nn.Parameter(torch.rand(self.args.hidden_size*2,self.args.hidden_size*2))
        self.Us = nn.Parameter(torch.rand(self.args.hidden_size*2,self.args.hidden_size*2))
        self.bs = nn.Parameter(torch.rand(self.args.hidden_size*2))
        
        # ----- Prediction Layer -----
        self.pred_fc1 = nn.Linear(self.args.hidden_size * 4, self.args.hidden_size * 2)
        self.pred_fc2 = nn.Linear(self.args.hidden_size * 2, self.args.class_size)

        self.reset_parameters()

    def reset_parameters(self):
        # ----- Word Representation Layer -----
        nn.init.uniform(self.char_emb.weight, -0.005, 0.005)
        # zero vectors for padding
        self.char_emb.weight.data[0].fill_(0)

        # <unk> vectors is randomly initialized
        nn.init.uniform(self.word_emb.weight.data[0], -0.1, 0.1)

        nn.init.kaiming_normal(self.char_LSTM.weight_ih_l0)
        nn.init.constant(self.char_LSTM.bias_ih_l0, val=0)
        nn.init.orthogonal(self.char_LSTM.weight_hh_l0)
        nn.init.constant(self.char_LSTM.bias_hh_l0, val=0)

        # ----- Context Representation Layer -----
        nn.init.kaiming_normal(self.context_LSTM.weight_ih_l0)
        nn.init.constant(self.context_LSTM.bias_ih_l0, val=0)
        nn.init.orthogonal(self.context_LSTM.weight_hh_l0)
        nn.init.constant(self.context_LSTM.bias_hh_l0, val=0)

        nn.init.kaiming_normal(self.context_LSTM.weight_ih_l0_reverse)
        nn.init.constant(self.context_LSTM.bias_ih_l0_reverse, val=0)
        nn.init.orthogonal(self.context_LSTM.weight_hh_l0_reverse)
        nn.init.constant(self.context_LSTM.bias_hh_l0_reverse, val=0)

        # ----- Prediction Layer ----
        nn.init.uniform(self.pred_fc1.weight, -0.005, 0.005)
        nn.init.constant(self.pred_fc1.bias, val=0)

        nn.init.uniform(self.pred_fc2.weight, -0.005, 0.005)
        nn.init.constant(self.pred_fc2.bias, val=0)

    def dropout(self, v):
        return F.dropout(v, p=self.args.dropout, training=self.training)

    def forward(self, **kwargs):
        p = self.word_emb(kwargs['p'])
        h = self.word_emb(kwargs['h'])

        if self.args.use_char_emb:
            # (batch, seq_len, max_word_len) -> (batch * seq_len, max_word_len)
            seq_len_p = kwargs['char_p'].size(1)
            seq_len_h = kwargs['char_h'].size(1)

            char_p = kwargs['char_p'].view(-1, self.args.max_word_len)
            char_h = kwargs['char_h'].view(-1, self.args.max_word_len)

            # (batch * seq_len, max_word_len, char_dim)-> (1, batch * seq_len, char_hidden_size)
            _, (char_p, _) = self.char_LSTM(self.char_emb(char_p))
            _, (char_h, _) = self.char_LSTM(self.char_emb(char_h))

            # (batch, seq_len, char_hidden_size)
            char_p = char_p.view(-1, seq_len_p, self.args.char_hidden_size)
            char_h = char_h.view(-1, seq_len_h, self.args.char_hidden_size)

            # (batch, seq_len, word_dim + char_hidden_size)
            p = torch.cat([p, char_p], dim=-1)
            h = torch.cat([h, char_h], dim=-1)

        p = self.dropout(p)
        h = self.dropout(h)

        # ----- Context Representation Layer -----
        # (batch, seq_len, hidden_size * 2)
        #self.context_LSTM.flatten_parameters()
        con_p, _ = self.context_LSTM(p)
        con_h, _ = self.context_LSTM(h)
        

        #print(con_p.shape)
        con_p_fw, con_p_bw = torch.split(con_p, self.args.hidden_size, dim=-1)
        con_h_fw, con_h_bw = torch.split(con_h, self.args.hidden_size, dim=-1)
        
        
        p_key = torch.cat([con_p_fw[:,-1,:],con_p_bw[:,0,:]], dim=-1)
        h_key = torch.cat([con_h_fw[:,-1,:],con_h_bw[:,0,:]], dim=-1)
        #print(self.Ws.shape, con_p.shape, self.Us.shape, p_key.shape, torch.matmul(con_p,self.Ws).shape , torch.matmul(p_key,self.Us).view(-1,1,self.args.hidden_size*2).expand(con_p.shape[0],con_p.shape[1],con_p.shape[2]).shape , self.bs.shape)
        sGatep = F.sigmoid(torch.matmul(con_p,self.Ws) + torch.matmul(h_key,self.Us).view(-1,1,self.args.hidden_size*2).expand(con_p.shape[0],con_p.shape[1],con_p.shape[2]) + self.bs)
        sGateh = F.sigmoid(torch.matmul(con_h,self.Ws) + torch.matmul(p_key,self.Us).view(-1,1,self.args.hidden_size*2).expand(con_h.shape[0],con_h.shape[1],con_h.shape[2]) + self.bs)
        con_p = sGatep * con_p
        con_h = sGateh * con_h
        con_p_fw, con_p_bw = torch.split(con_p, self.args.hidden_size, dim=-1)
        con_h_fw, con_h_bw = torch.split(con_h, self.args.hidden_size, dim=-1)
        con_p_mean = torch.mean(con_p, 1, True)
        con_h_mean = torch.mean(con_h, 1, True)
        #print(p_enc_output_mean.shape, h_enc_output_mean.shape)
        x = torch.cat(
            [con_p_mean,con_h_mean], dim=-1)
        
        #p_key = torch.cat([con_p_fw[:,-1,:],con_p_bw[:,0,:]], dim=-1)
        #h_key = torch.cat([con_h_fw[:,-1,:],con_h_bw[:,0,:]], dim=-1)

        #print(h_key.shape, con_p.shape, con_h_fw.shape,con_h_fw[:,-1,:].shape, )
        #p_attn_output, attn_p = self.attention(h_key.view(-1,1,h_key.shape[1]), con_p)
        
        #h_attn_output, attn_h = self.attention(p_key.view(-1,1,p_key.shape[1]), con_h)
        #print(p_attn_output.shape)
        x = torch.cat([con_p_mean, con_h_mean], dim=-1)
        
        #print(con_p_fw[:,-1,:].shape)
        # 2 * (2, batch, hidden_size) -> 2 * (batch, hidden_size * 2) -> (batch, hidden_size * 4)
        #x = torch.cat(
        #    [con_p_fw[:,-1,:],con_p_bw[:,0,:],con_h_fw[:,-1,:],con_h_bw[:,0,:]], dim=-1)
        
        #print(x.shape)
        x = self.dropout(x)

        # ----- Prediction Layer -----
        x = F.tanh(self.pred_fc1(x))
        x = self.dropout(x)
        x = self.pred_fc2(x)
        #print(x.shape)
        return x


In [8]:
import argparse
import copy
import os
import torch

from torch import nn, optim
from torch.autograd import Variable
from tensorboardX import SummaryWriter
from time import gmtime, strftime

from model.BIMPM import BIMPM
from model.utils import SNLI, Quora



def train(args, data):
    model = (Siamese(args, data,use_attention = True))
    if args.gpu > -1:
        model.cuda()

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adam(parameters, lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()

    writer = SummaryWriter(log_dir='runs/' + args.model_time)

    model.train()
    loss, last_epoch = 0, -1
    max_dev_acc, max_test_acc = 0, 0

    iterator = data.train_iter
    for i, batch in enumerate(iterator):
        present_epoch = int(iterator.epoch)
        if present_epoch == args.epoch:
            break
        if present_epoch > last_epoch:
            print('epoch:', str(present_epoch + 1))
        last_epoch = present_epoch

        if args.data_type == 'SNLI':
            s1, s2 = 'premise', 'hypothesis'
        else:
            s1, s2 = 'q1', 'q2'

        s1, s2 = getattr(batch, s1), getattr(batch, s2)

        # limit the lengths of input sentences up to max_sent_len
        if args.max_sent_len >= 0:
            if s1.size()[1] > args.max_sent_len:
                s1 = s1[:, :args.max_sent_len]
            if s2.size()[1] > args.max_sent_len:
                s2 = s2[:, :args.max_sent_len]

        kwargs = {'p': s1, 'h': s2}

        if args.use_char_emb:
            char_p = Variable(torch.LongTensor(data.characterize(s1)))
            char_h = Variable(torch.LongTensor(data.characterize(s2)))

            if args.gpu > -1:
                char_p = char_p.cuda()
                char_h = char_h.cuda()

            kwargs['char_p'] = char_p
            kwargs['char_h'] = char_h

        pred = (model(**kwargs))
        
        optimizer.zero_grad()
        #print(pred.shape, batch.label.shape)
        batch_loss = criterion(pred.view(-1,2), batch.label)
        loss += batch_loss.data[0]
        batch_loss.backward()
        optimizer.step()
        del pred
        del batch_loss
        if (i + 1) % args.print_freq == 0:
            dev_loss, dev_acc = test(model, args, data, mode='dev')
            test_loss, test_acc = test(model, args, data)
            c = (i + 1) // args.print_freq

            writer.add_scalar('loss/train', loss, c)
            writer.add_scalar('loss/dev', dev_loss, c)
            writer.add_scalar('acc/dev', dev_acc, c)
            writer.add_scalar('loss/test', test_loss, c)
            writer.add_scalar('acc/test', test_acc, c)

            print('train loss: '+ str(loss) +' / dev loss: '+ str(dev_loss) + '/ test loss:' + str(test_loss) +
                  ' / dev acc:' + str(dev_acc) + 'test acc:' + str(test_acc))

            if dev_acc > max_dev_acc:
                max_dev_acc = dev_acc
                max_test_acc = test_acc
                best_model = copy.deepcopy(model)

            loss = 0
            model.train()

    writer.close()
    print('max dev acc:'+ str(max_dev_acc) + '/ max test acc: ' + str(max_test_acc))

    return best_model


def main():
    import sys
    sys.argv = ['foo']
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch-size', default=128, type=int)
    parser.add_argument('--char-dim', default=20, type=int)
    parser.add_argument('--char-hidden-size', default=50, type=int)
    parser.add_argument('--data-type', default='Quora', help='available: SNLI or Quora')
    parser.add_argument('--dropout', default=0.1, type=float)
    parser.add_argument('--epoch', default=15, type=int)
    parser.add_argument('--gpu', default=2, type=int)
    parser.add_argument('--hidden-size', default=100, type=int)
    parser.add_argument('--learning-rate', default=0.001, type=float)
    parser.add_argument('--max-sent-len', default=-1, type=int,
                        help='max length of input sentences model can accept, if -1, it accepts any length')
    parser.add_argument('--num-perspective', default=20, type=int)
    parser.add_argument('--print-freq', default=500, type=int)
    parser.add_argument('--use-char-emb', default=False, action='store_true')
    parser.add_argument('--word-dim', default=300, type=int)
    parser.add_argument('--training', default=0, type=int)
    args = parser.parse_args()
    print(args.training)
    if args.data_type == 'SNLI':
        print('loading SNLI data...')
        data = SNLI(args)
    elif args.data_type == 'Quora':
        print('loading Quora data...')
        data = Quora(args)
    else:
        raise NotImplementedError('only SNLI or Quora data is possible')

    setattr(args, 'char_vocab_size', len(data.char_vocab))
    setattr(args, 'word_vocab_size', len(data.TEXT.vocab))
    setattr(args, 'class_size', len(data.LABEL.vocab))
    setattr(args, 'max_word_len', data.max_word_len)
    setattr(args, 'model_time', strftime('%H:%M:%S', gmtime()))

    print('training start!')
    best_model = train(args, data)

    if not os.path.exists('saved_models'):
        os.makedirs('saved_models')
    torch.save(best_model.state_dict(), 'saved_models/BIBPM_'+args.data_type+'_'+args.model_time+'train'+args.training+'.pt')
    print('training finished!')


if __name__ == '__main__':
    main()


0
loading Quora data...
training start!
('epoch:', '1')
train loss: 290.245848909 / dev loss: 49.9691486657/ test loss:49.978054136 / dev acc:0.660299956799test acc:0.660499989986
train loss: 269.034368232 / dev loss: 45.4503088593/ test loss:45.5297141671 / dev acc:0.703099966049test acc:0.696699976921
train loss: 259.997570246 / dev loss: 41.835370332/ test loss:42.0734361783 / dev acc:0.725899994373test acc:0.718499958515
train loss: 251.199654534 / dev loss: 42.2082631886/ test loss:42.6529608816 / dev acc:0.731000006199test acc:0.719199955463
train loss: 240.734072134 / dev loss: 40.1081396937/ test loss:40.411663115 / dev acc:0.740599989891test acc:0.735599994659
train loss: 233.664630499 / dev loss: 39.4517196119/ test loss:39.7849212885 / dev acc:0.752999961376test acc:0.746899962425
('epoch:', '2')
train loss: 223.945155464 / dev loss: 38.1399104297/ test loss:37.8885010853 / dev acc:0.75729995966test acc:0.755099952221
train loss: 219.128108688 / dev loss: 36.7874051929/ test

train loss: 118.603808068 / dev loss: 31.6151800305/ test loss:30.9301172346 / dev acc:0.83679997921test acc:0.835199952126
('epoch:', '12')
train loss: 105.570005868 / dev loss: 33.889584139/ test loss:32.9811750166 / dev acc:0.833899974823test acc:0.833399951458
train loss: 105.337955436 / dev loss: 34.0648011118/ test loss:32.4272062443 / dev acc:0.830999970436test acc:0.831499993801
train loss: 108.966360759 / dev loss: 34.3264336139/ test loss:33.0233236998 / dev acc:0.83259999752test acc:0.83269995451
train loss: 112.755986178 / dev loss: 33.0980727822/ test loss:32.0825005472 / dev acc:0.838999986649test acc:0.833999991417
train loss: 111.593230549 / dev loss: 33.594738394/ test loss:32.4271244779 / dev acc:0.835999965668test acc:0.839100003242
train loss: 113.775268637 / dev loss: 32.6720379889/ test loss:32.0053973049 / dev acc:0.837699949741test acc:0.837699949741
('epoch:', '13')
train loss: 100.733207174 / dev loss: 33.7053302228/ test loss:32.8795096204 / dev acc:0.8376999

TypeError: cannot concatenate 'str' and 'int' objects

In [None]:
rnn = nn.LSTM(10, 20, 1, bidirectional=True)
input = Variable(torch.randn(5, 3, 10))
h0 = Variable(torch.randn(2, 3, 20))
c0 = Variable(torch.randn(2, 3, 20))
output, hn = rnn(input, (h0, c0))

In [None]:
hn

In [None]:
len(hn)

In [7]:
os.environ['CUDA_VISIBLE_DEVICES'] =str(1)