In [1]:
from torchtext import data, datasets
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import re
import random

In [2]:
# Fields: `inputs` processes the sentence text, `answers` the (non-sequential) label.
inputs = datasets.snli.ParsedTextField(lower=True)
answers = data.Field(sequential=False)

# Load the SNLI splits once and keep them together for the calls below.
snli_splits = datasets.SNLI.splits(inputs, answers)
train, dev, test = snli_splits

# The text vocabulary is built over all three splits, the label vocabulary
# over the training split only.
inputs.build_vocab(*snli_splits)
answers.build_vocab(train)

# One bucketed iterator per split, 32 examples per batch.
# NOTE(review): device=-1 presumably selects the CPU in this torchtext version — confirm.
train_iter, dev_iter, test_iter = data.BucketIterator.splits(
    snli_splits, batch_size=32, device=-1)

In [3]:
class DecomposableAttention(nn.Module):
    """Attend / compare / aggregate classifier for a (premise, hypothesis) pair.

    Pipeline implemented by ``forward``:

    1. Input   - embed both sentences and project them to ``hidden_dim``.
    2. Attend  - score every premise token against every hypothesis token
                 and soft-align each token with the other sentence.
    3. Compare - run each token, concatenated with its aligned counterpart,
                 through ``mlp_G``.
    4. Aggregate - sum over tokens, concatenate both sentence vectors, apply
                 ``mlp_H`` and a final linear layer, return log-probabilities.

    Args:
        hidden_dim: width of the projected token vectors and of every MLP.
        num_labels: number of output classes.
        num_embeddings: vocabulary size of the embedding table. When omitted,
            the module-level global ``input_size`` is used (the original
            behaviour of this class).
        embed_dim: embedding width. When omitted, the module-level global
            ``embedding_dim`` is used (the original behaviour).
        padding_idx: index passed to ``nn.Embedding`` as ``padding_idx``
            (default 0, as before).
            NOTE(review): torchtext vocabularies may place '<unk>' at index 0
            and '<pad>' at index 1 — confirm, and pass the real pad index.

    NOTE: padded positions are not masked; they take part in the attention
    softmax and in the sum over tokens.
    """

    def __init__(self, hidden_dim, num_labels, num_embeddings=None,
                 embed_dim=None, padding_idx=0):
        super(DecomposableAttention, self).__init__()

        # Backward compatibility: the two-argument constructor keeps reading
        # the notebook-level globals it always relied on.
        if num_embeddings is None:
            num_embeddings = input_size
        if embed_dim is None:
            embed_dim = embedding_dim

        # Embedding
        self.embedding_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.embed = nn.Embedding(num_embeddings, embed_dim,
                                  padding_idx=padding_idx)

        self.num_labels = num_labels
        # A single Dropout module is shared by every MLP built in `mlp`.
        self.dropout = nn.Dropout(p=0.2)

        # Input layer: project embeddings down to the working width.
        self.input_linear = nn.Linear(embed_dim, hidden_dim, bias=False)

        # F (attend), G (compare) and H (aggregate) are feed-forward nets with
        # ReLU. G and H consume the concatenation of two hidden_dim vectors.
        self.mlp_F = self.mlp(hidden_dim, hidden_dim)
        self.mlp_G = self.mlp(2 * hidden_dim, hidden_dim)
        self.mlp_H = self.mlp(2 * hidden_dim, hidden_dim)

        # The final layer does not use dropout, so it is defined on its own.
        self.linear_final = nn.Linear(hidden_dim, num_labels, bias=False)

    def mlp(self, input_dim, output_dim):
        """Build a two-layer feed-forward block with ReLU activations.

        Layout: dropout -> Linear(input_dim, output_dim) -> ReLU ->
        dropout -> Linear(output_dim, output_dim) -> ReLU. Both linear layers
        are bias-free and both dropout slots reuse ``self.dropout``.

        Args:
            input_dim: size of the last dimension of the block's input.
            output_dim: size of the last dimension of the block's output.

        Returns:
            An ``nn.Sequential`` holding the six layers above.

        TODO: bias terms, configurable dropout, parameter initialisation.
        """
        return nn.Sequential(
            self.dropout,
            nn.Linear(input_dim, output_dim, bias=False),
            nn.ReLU(),
            self.dropout,
            nn.Linear(output_dim, output_dim, bias=False),
            nn.ReLU(),
        )

    def forward(self, prem, hypo):
        """Score a batch of sentence pairs.

        Args:
            prem: integer token indices of the premises, shape (batch, len_p).
            hypo: integer token indices of the hypotheses, shape (batch, len_h).

        Returns:
            Log-probabilities over the labels, shape (batch, num_labels).
        """
        # Input layer
        prem_emb = self.embed(prem)
        hypo_emb = self.embed(hypo)
        prem_emb = self.input_linear(prem_emb)   # (batch, len_p, hidden)
        hypo_emb = self.input_linear(hypo_emb)   # (batch, len_h, hidden)

        # Attend
        f_prem = self.mlp_F(prem_emb)
        f_hypo = self.mlp_F(hypo_emb)

        # Unnormalised alignment scores, (batch, len_p, len_h).
        e_ij = torch.bmm(f_prem, torch.transpose(f_hypo, 1, 2))
        # Normalise over the hypothesis tokens (last axis). Without an
        # explicit `dim`, softmax on a 3-D tensor falls back to dim 0 and
        # would normalise across the batch instead.
        beta_ij = F.softmax(e_ij, dim=2)
        beta_i = torch.bmm(beta_ij, hypo_emb)    # (batch, len_p, hidden)

        # Same scores seen from the hypothesis side, (batch, len_h, len_p),
        # normalised over the premise tokens.
        e_ji = torch.transpose(e_ij, 1, 2)
        alpha_ji = F.softmax(e_ji, dim=2)
        alpha_j = torch.bmm(alpha_ji, prem_emb)  # (batch, len_h, hidden)

        # Compare: each token together with its soft-aligned counterpart.
        concat_1 = torch.cat((prem_emb, beta_i), 2)
        concat_2 = torch.cat((hypo_emb, alpha_j), 2)

        compare_1 = self.mlp_G(concat_1)
        compare_2 = self.mlp_G(concat_2)

        # Aggregate: sum over the token axis, then join both sentences.
        v_1 = torch.sum(compare_1, 1)
        v_2 = torch.sum(compare_2, 1)
        v_concat = torch.cat((v_1, v_2), 1)

        y_pred = self.mlp_H(v_concat)

        # Final layer: log-probabilities over the label axis.
        out = F.log_softmax(self.linear_final(y_pred), dim=1)

        return out

In [4]:
def _loss_to_float(value):
    """Return a one-element loss as a Python number.

    Uses ``.item()`` when the object provides it and otherwise falls back to
    the original ``.data[0]`` access.
    """
    if hasattr(value, "item"):
        return value.item()
    return value.data[0]


def training_loop(model, loss, optimizer, train_iter, dev_iter,
                  num_steps=None, log_every=10):
    """Train ``model`` for a fixed number of optimiser steps.

    Args:
        model: module called as ``model(premise, hypothesis)``.
        loss: criterion called as ``loss(output, labels)``.
        optimizer: optimiser whose ``step()`` is called once per batch.
        train_iter: iterable of batches exposing ``premise``, ``hypothesis``
            and ``label``. It may be endless; it is re-iterated when it runs
            out before ``num_steps`` is reached.
        dev_iter: iterable handed to ``evaluate`` for the progress report.
        num_steps: total number of optimiser steps. When omitted, the
            module-level global ``num_train_steps`` is used.
        log_every: report loss and dev accuracy every this many steps
            (default 10, as before). A falsy value disables reporting.

    Previously ``num_train_steps`` only bounded the number of passes over
    ``train_iter`` and the step counter never stopped training, so an endless
    training iterator kept the loop running forever.
    NOTE(review): torchtext training iterators presumably repeat by default —
    confirm for the installed version.
    """
    if num_steps is None:
        # Backward compatibility with the original five-argument call.
        num_steps = num_train_steps

    step = 0
    while step < num_steps:
        model.train()
        steps_at_pass_start = step
        for batch in train_iter:
            # Swap the first two axes of the inputs.
            # NOTE(review): presumably (seq_len, batch) -> (batch, seq_len);
            # confirm against the Field settings.
            premise = batch.premise.transpose(0, 1)
            hypothesis = batch.hypothesis.transpose(0, 1)
            # Shift the label ids down by one.
            # NOTE(review): presumably skips a special token at index 0 of the
            # label vocabulary — confirm.
            labels = batch.label - 1
            model.zero_grad()

            output = model(premise, hypothesis)

            lossy = loss(output, labels)
            lossy.backward()
            optimizer.step()

            if log_every and step % log_every == 0:
                print("Step %i; Loss %f; Dev acc %f"
                      % (step, _loss_to_float(lossy), evaluate(model, dev_iter)))

            step += 1
            if step >= num_steps:
                break

        if step == steps_at_pass_start:
            # The iterator produced no batches; stop instead of spinning.
            break

In [5]:
def evaluate(model, data_iter):
    """Return the classification accuracy of ``model`` over ``data_iter``.

    Args:
        model: module called as ``model(premise, hypothesis)``; its output is
            arg-maxed over dimension 1 to obtain the predicted class.
        data_iter: iterable of batches exposing ``premise``, ``hypothesis``
            and ``label``.

    Returns:
        Fraction of correctly classified examples as a Python float;
        0.0 when ``data_iter`` yields no examples.

    The model is switched to eval mode for the duration of the call and put
    back into whichever mode it was in on entry, even if a batch raises.
    """
    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    try:
        for batch in data_iter:
            # Same preprocessing as the training loop: swap the first two axes
            # of the inputs and shift the label ids down by one.
            premise = batch.premise.transpose(0, 1)
            hypothesis = batch.hypothesis.transpose(0, 1)
            labels = (batch.label - 1).data
            output = model(premise, hypothesis)
            _, predicted = torch.max(output.data, 1)
            total += labels.size(0)
            # Widen before summing and convert to a Python int, so the running
            # count is a plain int whatever type `.sum()` returns.
            correct += int((predicted == labels).long().sum())
    finally:
        model.train(was_training)
    if total == 0:
        return 0.0
    return correct / float(total)

In [6]:
# --- Sizes tied to the data ---
# input_size is read as a global by DecomposableAttention when it builds its
# embedding table.
input_size = vocab_size = len(inputs.vocab)
num_labels = 3

# --- Model dimensions ---
# embedding_dim is read as a global by DecomposableAttention as well.
embedding_dim = 300
hidden_dim = 50

# --- Optimisation ---
# batch_size is not referenced by the cells shown here; the iterators were
# built with a literal 32.
batch_size = 32
learning_rate = 0.004
# Loop bound read as a global by training_loop.
num_train_steps = 1000

In [7]:
model = DecomposableAttention(hidden_dim, num_labels)

# Loss and Optimizer
# The model's forward() ends in log_softmax, i.e. it already returns
# log-probabilities, so the matching criterion is NLLLoss. CrossEntropyLoss
# would apply a second (redundant) log-softmax on top of them.
loss = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
training_loop(model, loss, optimizer, train_iter, dev_iter)



Step 0; Loss 1.110984; Dev acc 0.328693
Step 10; Loss 1.110382; Dev acc 0.345560
Step 20; Loss 1.109667; Dev acc 0.328389


KeyboardInterrupt: 