In [14]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
torch.manual_seed(1)
from torchtext import datasets
from torchtext.vocab import GloVe
from torchtext.data import Iterator, BucketIterator
import torch.nn.functional as F

In [15]:
import sys
sys.path.append("../code/")
from data.utils import DatasetUtil
from torchtext.data import Field

In [16]:
DATAPATH = "../data/mit-movie-1/original/"
TRAIN_FILE = "engtrain.bio"
TEST_FILE = "engtest.bio"
TRAIN_BATCH_SIZE = 4
TEST_BATCH_SIZE = 4

In [17]:
args = {}
args['cuda'] = False
args['datapath'] = DATAPATH
args['filename'] = TRAIN_FILE
args['batch_size'] = TRAIN_BATCH_SIZE
datasetutil = DatasetUtil(args)
train_iter = datasetutil.get_train_iterator()

In [18]:
args['filename'] = TEST_FILE
args['batch_size'] = TEST_BATCH_SIZE
test_iter = datasetutil.get_iterator(args)

In [19]:
for batch in test_iter:pass
test_stoi = batch.dataset.fields['word'].vocab.stoi

for batch in train_iter:pass
train_stoi = batch.dataset.fields['word'].vocab.stoi

train_stoi == test_stoi

True

In [29]:
class BiLSTM(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim, bidirectional=True):
        super(BiLSTM, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        self.ix_to_tag = {v:k for k,v in self.tag_to_ix.items()}

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=bidirectional)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
        self.loss_function = nn.NLLLoss()



    def init_hidden(self,batch_size):
        return (autograd.Variable(torch.randn(2, batch_size, self.hidden_dim // 2)),
                autograd.Variable(torch.randn(2, batch_size, self.hidden_dim // 2)))


    def _get_lstm_features(self, batch):
        batch_size = batch.size()[1]
        #self.hidden = self.init_hidden(batch_size)
        embeds = self.word_embeds(batch)
        lstm_out, self.hidden = self.lstm(embeds)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def loss(self,batch,tags):
        score, tag_seq = self.forward(batch)
        total_loss = self.loss_function(score, tags.view(batch_size * seq_len))
        return total_loss


    def forward(self, batch):  
        batch_size = batch.size()[1]
        seq_len = batch.size()[0]
        lstm_feats = self._get_lstm_features(batch)
        lstm_feats = lstm_feats.view(batch_size * seq_len, -1)
        score = F.log_softmax(lstm_feats, 1)
        _, tag_seq  = torch.max(lstm_feats, dim=1)
        tag_seq = tag_seq.view(batch_size, seq_len)
        return score, tag_seq


In [23]:
EMBEDDING_DIM = 5
HIDDEN_DIM = 14
model = BiLSTM(len(datasetutil.WORD.vocab.stoi),datasetutil.TAG.vocab.stoi, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [24]:
for epoch in range(
        100):  
    print("epoch = ",epoch)
    epoch_loss = 0
    for idx ,batch in enumerate(train_iter):

        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()
        loss = model.loss(batch.word, batch.tag)
        epoch_loss+=loss
        loss.backward()
        optimizer.step()

    print(epoch_loss)


epoch =  0
Variable containing:
1.00000e+05 *
  1.3912
[torch.FloatTensor of size 1]

epoch =  1
Variable containing:
 92496.3125
[torch.FloatTensor of size 1]

epoch =  2
Variable containing:
 75364.6719
[torch.FloatTensor of size 1]

epoch =  3
Variable containing:
 66269.1797
[torch.FloatTensor of size 1]

epoch =  4
Variable containing:
 60401.6328
[torch.FloatTensor of size 1]

epoch =  5
Variable containing:
 56254.1680
[torch.FloatTensor of size 1]

epoch =  6
Variable containing:
 53093.1641
[torch.FloatTensor of size 1]

epoch =  7
Variable containing:
 50656.6719
[torch.FloatTensor of size 1]

epoch =  8
Variable containing:
 48493.1289
[torch.FloatTensor of size 1]

epoch =  9
Variable containing:
 46665.6523
[torch.FloatTensor of size 1]

epoch =  10
Variable containing:
 45029.0039
[torch.FloatTensor of size 1]

epoch =  11
Variable containing:
 43725.8164
[torch.FloatTensor of size 1]

epoch =  12
Variable containing:
 42479.7461
[torch.FloatTensor of size 1]

epoch =  13

KeyboardInterrupt: 

In [25]:
def persist_output(path,Actual,Preds,Data):
    fout = open(path,"w")
    for i in range(0,len(Preds)):
        sentence = Data[i]
        predicted = Preds[i]
        actual = Actual[i]
        for idx in range(0,len(actual)):
            fout.write(" ".join([sentence[idx],'UNK','UNK',actual[idx],predicted[idx]]) + "\n")
    fout.close()

In [26]:
def get_output(model,dataset_iter):
    Actual = []
    Preds = []
    Sentences = []
    for batch in dataset_iter:
        score,tags = model(batch.word)
        for i in range(tags.size()[0]):
            sentence = batch.word[:,i]
            actual_tags = batch.tag[:,i]
            predicted_tags = tags[i,:]
            sentence = [datasetutil.WORD.vocab.itos[idx.data[0]] for idx in sentence]
            predicted = [datasetutil.TAG.vocab.itos[idx.data[0]] for idx in predicted_tags]
            actual = [datasetutil.TAG.vocab.itos[idx.data[0]] for idx in actual_tags]
            Actual.append(actual)
            Preds.append(predicted)
            Sentences.append(sentence)
    return Actual,Preds,Sentences

In [27]:
Actual,Preds,Sentences = get_output(model,test_iter)
test_output_path = "test.res"
persist_output(test_output_path,Actual,Preds,Sentences)

In [28]:
Actual,Preds,Sentences = get_output(model,train_iter)
train_output_path = "train.res"
persist_output(train_output_path,Actual,Preds,Sentences)