In [1]:
from torchtext import data, datasets
import torch
import torch.nn as nn
from torch.autograd import Variable
from modules import *
import torch.nn.functional as F
import re
import random

In [2]:
inputs = datasets.snli.ParsedTextField(lower=True)
answers = data.Field(sequential=False)

train, dev, test = datasets.SNLI.splits(inputs, answers)

inputs.build_vocab(train, vectors='glove.6B.300d')
answers.build_vocab(train)

train_iter, dev_iter, test_iter = data.BucketIterator.splits((train, dev, test), batch_size=32, device=-1)

In [3]:
class Embed(nn.Module):
    def __init__(self, W_emb, vocab_size, embed_dim, train_emb):
        super(Embed, self).__init__()
        self.embed_dim = embed_dim
        self.embed = nn.Embedding(vocab_size, self.embed_dim)
        if W_emb is not None:
            self.embed.weight = nn.Parameter(W_emb)
        if train_emb == False:
            self.embed.requires_grad = False

    def forward(self, doc, qry):
        doc = self.embed(doc)  # B x D x H
        qry = self.embed(qry)  # B x Q x H
        return doc, qry


class Encoder(nn.Module):
    def __init__(self, embed_size, hidden_dim, num_layers):
        super(Encoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.d_gru = nn.GRU(embed_size, self.hidden_dim, self.num_layers, batch_first=True, bidirectional=True)
        self.q_gru = nn.GRU(embed_size, self.hidden_dim, self.num_layers, batch_first=True, bidirectional=True)

        self.linear_d = nn.Linear(self.hidden_dim * 2, self.hidden_dim)
        self.linear_q = nn.Linear(self.hidden_dim * 2, self.hidden_dim)

    def forward(self, doc, qry, qry_h0):
        batch_size = doc.size(0)

        qry_h, doc_h0 = self.q_gru(qry, qry_h0)  # B x Q x H
        doc_h, _ = self.d_gru(doc, doc_h0)  # B x D x H

        qry_h = self.linear_q(to_2D(qry_h, self.hidden_dim * 2))
        qry_h = to_3D(qry_h, batch_size, self.hidden_dim)

        doc_h = self.linear_d(to_2D(doc_h, self.hidden_dim * 2))
        doc_h = to_3D(doc_h, batch_size, self.hidden_dim)

        return doc_h, qry_h

    def init_hidden(self, batch_size):
        hidden = next(self.parameters()).data
        num_directions = 2
        return Variable(hidden.new(self.num_layers * num_directions, batch_size, self.hidden_dim).zero_())


class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.hidden_dim = hidden_dim

        self.initprev = InitPrev(self.hidden_dim)
        self.attn = AttnLayer(self.hidden_dim)

    def forward(self, doc_h, qry_h, qm):
        batch_size = doc_h.size(0)
        doc_len = doc_h.size(1)
        prev_rt_init = self.initprev(qry_h, qm)

        for i in range(doc_len):
            if i == 0:
                prev_rt = prev_rt_init
            prev_rt = self.attn(doc_h[:, i, :], qry_h, prev_rt, qm)

        return prev_rt


class OutputLayer(nn.Module):
    def __init__(self, hidden_dim):
        super(OutputLayer, self).__init__()
        self.hidden_dim = hidden_dim

        self.Wp = nn.Linear(self.hidden_dim, self.hidden_dim)
        self.Wx = nn.Linear(self.hidden_dim, self.hidden_dim)
        self.linear = nn.Linear(self.hidden_dim, 3)

    def forward(self, rN, hN):
        rn_enc = to_2D(rN, self.hidden_dim)
        hn_enc = to_2D(hN, self.hidden_dim)

        rn_enc = self.Wp(rn_enc)
        hn_enc = self.Wx(hn_enc)

        h = F.tanh(rn_enc + hn_enc)
        h = self.linear(h)
        return h


class Entailment(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, W_emb=None, p=0.3, num_layers=1, train_emb=True):
        super(Entailment, self).__init__()

        self.embed = Embed(W_emb, vocab_size, embed_dim, train_emb)
        self.encoder = Encoder(embed_dim, hidden_dim, num_layers)
        self.attention = Attention(hidden_dim)
        self.out = OutputLayer(hidden_dim)

    def forward(self, doc, qry, dm, qm):
        batch_size = doc.size(0)

        # Embedding the matrix
        d_emb, q_emb = self.embed(doc, qry)
        qry_h0 = self.encoder.init_hidden(batch_size)
        doc_h, qry_h = self.encoder(d_emb, q_emb, qry_h0)

        rt = self.attention(doc_h, qry_h, qm)
        output = self.out(rt, doc_h[:, -1, :])

        return output

In [8]:
def training_loop(model, loss, optimizer, train_iter, dev_iter, embed_dim, hidden_dim):
    step = 0
    for i in range(num_train_steps):
        model.train()
        for batch in train_iter:
            premise = batch.premise.transpose(0, 1)
            hypothesis = batch.hypothesis.transpose(0, 1)
            labels = batch.label - 1
            model.zero_grad()
        
            output = model(premise, hypothesis, embed_dim, hidden_dim)
            
            lossy = loss(output, labels)
            #print(lossy)
            lossy.backward()
            optimizer.step()

            if step % 10 == 0:
                print( "Step %i; Loss %f; Dev acc %f" %(step, lossy.data[0], evaluate(model, dev_iter, embed_dim, hidden_dim)))

            step += 1

In [9]:
def evaluate(model, data_iter, embed_dim, hidden_dim):
    model.eval()
    correct = 0
    total = 0
    for batch in data_iter:
        premise = batch.premise.transpose(0,1)
        hypothesis = batch.hypothesis.transpose(0,1)
        labels = (batch.label-1).data
        output = model(premise, hypothesis, embed_dim, hidden_dim)
        _, predicted = torch.max(output.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()
    model.train()
    return correct / float(total)

In [10]:
vocab_size = len(inputs.vocab)
input_size = vocab_size
num_labels = 3
hidden_dim = 50
embed_dim = 300
batch_size = 32
learning_rate = 0.004
num_train_steps = 1000

In [11]:
word_vecs = inputs.vocab.vectors

model = Entailment(vocab_size=vocab_size, embed_dim=embed_dim, hidden_dim=hidden_dim, W_emb=word_vecs, p=0.3, num_layers=1, train_emb=False)
    
# Loss and Optimizer
loss = nn.CrossEntropyLoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
training_loop(model, loss, optimizer, train_iter, dev_iter, embed_dim, hidden_dim)

  return a.add(b)
  """


Step 0; Loss 1.095201; Dev acc 0.338244
Step 10; Loss 1.233302; Dev acc 0.338244
Step 20; Loss 1.080545; Dev acc 0.329709
Step 30; Loss 1.076017; Dev acc 0.357651
Step 40; Loss 1.061198; Dev acc 0.363442
Step 50; Loss 1.107874; Dev acc 0.346169
Step 60; Loss 1.052614; Dev acc 0.490754
Step 70; Loss 0.966587; Dev acc 0.438021
Step 80; Loss 1.125886; Dev acc 0.504979
Step 90; Loss 1.066675; Dev acc 0.456208
Step 100; Loss 1.114487; Dev acc 0.497663
Step 110; Loss 1.093504; Dev acc 0.495936
Step 120; Loss 0.999009; Dev acc 0.476224
Step 130; Loss 1.077871; Dev acc 0.481914
Step 140; Loss 1.083681; Dev acc 0.515139
Step 150; Loss 0.964586; Dev acc 0.515241
Step 160; Loss 1.010302; Dev acc 0.524080
Step 170; Loss 0.997679; Dev acc 0.506198
Step 180; Loss 0.956119; Dev acc 0.518391
Step 190; Loss 1.125636; Dev acc 0.514326
Step 200; Loss 1.066354; Dev acc 0.511989
Step 210; Loss 0.960280; Dev acc 0.526011
Step 220; Loss 0.901572; Dev acc 0.537391
Step 230; Loss 1.055558; Dev acc 0.546942
Ste

KeyboardInterrupt: 