# LSTM/Bi-LSTM for Part-of-Speech Tagging

In [87]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F 

import numpy as np

In [59]:
def build_dict(training_data):
    
    word2index = {}
    tag2index = {}
    for sentence,tags in training_data:
        for word in sentence:
            if word not in word2index:
                word2index[word] = len(word2index)
        for tag in tags:
            if tag not in tag2index:
                tag2index[tag] = len(tag2index)
    
    return word2index, tag2index

data download:
https://github.com/llhthinker/nlptutorial-exercise/blob/master/data/wiki-en-train.norm_pos

In [62]:
training_file = './data/wiki-en-train.norm_pos'
data = load_data(training_file)
word2index, tag2index = build_dict(data)

In [63]:
print(data[0])
data_length = len(data)
print(data_length)
split = int(data_length * 0.7)
training_data = data[:split]
valid_data = data[split:]

(['Natural', 'language', 'processing', '-LRB-', 'NLP', '-RRB-', 'is', 'a', 'field', 'of', 'computer', 'science', ',', 'artificial', 'intelligence', '-LRB-', 'also', 'called', 'machine', 'learning', '-RRB-', ',', 'and', 'linguistics', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', '-LRB-', 'natural', '-RRB-', 'languages', '.'], ['JJ', 'NN', 'NN', '-LRB-', 'NN', '-RRB-', 'VBZ', 'DT', 'NN', 'IN', 'NN', 'NN', ',', 'JJ', 'NN', '-LRB-', 'RB', 'VBN', 'NN', 'NN', '-RRB-', ',', 'CC', 'NNS', 'VBN', 'IN', 'DT', 'NNS', 'IN', 'NNS', 'CC', 'JJ', '-LRB-', 'JJ', '-RRB-', 'NNS', '.'])
1301


## LSTM
Ref: http://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html#lstm-s-in-pytorch

In [64]:
def prepare_seq(seq, seq2index):
    indexs = [seq2index[w] for w in seq]
    tensor = torch.LongTensor(indexs)
    return Variable(tensor)

In [65]:
class Config():
    def __init__(self):
        self.vocab_size = len(word2index)
        self.tagset_size = len(tag2index)
        self.embedding_dim = 16
        self.hidden_dim = 16

config = Config()

In [66]:
class LSTMTagger(nn.Module):
    def __init__(self, config):
        super(LSTMTagger, self).__init__()
        
        self.embedding = nn.Embedding(config.vocab_size, config.embedding_dim)
        
        self.lstm = nn.LSTM(input_size=config.embedding_dim, hidden_size=config.hidden_dim)
        
        self.hidden2tag = nn.Linear(in_features=config.hidden_dim, out_features=config.tagset_size)
    
    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (Variable(torch.zeros(1, 1, config.hidden_dim)),
                Variable(torch.zeros(1, 1, config.hidden_dim)))
    
    
    def forward(self, sentence):
        embed = self.embedding(sentence)
        # lstm input (seq_len, batch, input_size)
        embed = embed.view(len(sentence), 1, -1)
        lstm_out = self.lstm(embed)[0]
#         print(lstm_out.size())
        # output (seq_len, batch, hidden_size * num_directions)
        # -> (seq_len, hidden_size * num_directions)
        lstm_out_reshape = lstm_out.view(len(sentence), -1)
#         print(lstm_out_reshape.size())
        tag_space = self.hidden2tag(lstm_out_reshape)
        tag_scores = F.log_softmax(tag_space)
        
        return tag_scores

In [111]:
def do_eval(model, valid_data):
    acc = 0.0
    for sentence, tags in valid_data:
        sentence_in = prepare_seq(sentence, word2index)
        targets = prepare_seq(tags, tag2index)
        tag_scores = model(sentence_in)
        _, predicted = torch.max(tag_scores.data, 1)
        predicted = predicted.view(len(targets)).numpy()
        targets = targets.data.numpy()
        correct_num = np.sum((predicted == targets))
        acc += correct_num / len(sentence)
        
    print("Valid set accuracy:", acc / len(valid_data))

In [112]:
model = LSTMTagger(config)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)


for epoch in range(10):
    count = 0
    running_loss = 0.0
    for sentence, tags in training_data:
        
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Variables of word indices.
        sentence_in = prepare_seq(sentence, word2index)
        targets = prepare_seq(tags, tag2index)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        
        # print statistics
        running_loss += loss.data[0]
        if count % 200 == 199:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, count + 1, running_loss / 2000))
            running_loss = 0.0
        count += 1

    do_eval(model, valid_data)


[1,   200] loss: 0.324
[1,   400] loss: 0.278
[1,   600] loss: 0.255
[1,   800] loss: 0.240
Valid set accuracy: 0.402996500242
[2,   200] loss: 0.210
[2,   400] loss: 0.202
[2,   600] loss: 0.193
[2,   800] loss: 0.192
Valid set accuracy: 0.515497941108
[3,   200] loss: 0.172
[3,   400] loss: 0.170
[3,   600] loss: 0.167
[3,   800] loss: 0.171
Valid set accuracy: 0.572909377034
[4,   200] loss: 0.154
[4,   400] loss: 0.153
[4,   600] loss: 0.152
[4,   800] loss: 0.158
Valid set accuracy: 0.598427265634
[5,   200] loss: 0.142
[5,   400] loss: 0.142
[5,   600] loss: 0.141
[5,   800] loss: 0.149
Valid set accuracy: 0.617734514808
[6,   200] loss: 0.133
[6,   400] loss: 0.133
[6,   600] loss: 0.132
[6,   800] loss: 0.142
Valid set accuracy: 0.635239141785
[7,   200] loss: 0.126
[7,   400] loss: 0.126
[7,   600] loss: 0.125
[7,   800] loss: 0.136
Valid set accuracy: 0.648055034768
[8,   200] loss: 0.119
[8,   400] loss: 0.119
[8,   600] loss: 0.118
[8,   800] loss: 0.130
Valid set accuracy:

## Bi-LSTM

In [116]:
class BiLSTMTagger(nn.Module):
    def __init__(self, config):
        super(BiLSTMTagger, self).__init__()
        
        self.embedding = nn.Embedding(config.vocab_size, config.embedding_dim)
        
        self.lstm = nn.LSTM(input_size=config.embedding_dim, 
                            hidden_size=config.hidden_dim, 
                            num_layers= 2,
                            bidirectional=True)
        
        self.hidden2tag = nn.Linear(in_features=config.hidden_dim*2, out_features=config.tagset_size)
    
    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (Variable(torch.zeros(1, 1, config.hidden_dim)),
                Variable(torch.zeros(1, 1, config.hidden_dim)))
    
    
    def forward(self, sentence):
        embed = self.embedding(sentence)
        # lstm input (seq_len, batch, input_size)
        embed = embed.view(len(sentence), 1, -1)
        lstm_out = self.lstm(embed)[0]
#         print(lstm_out.size())
        # output (seq_len, batch, hidden_size * num_directions)
        # -> (seq_len, hidden_size * num_directions)
        lstm_out_reshape = lstm_out.view(len(sentence), -1)
#         print(lstm_out_reshape.size())
        tag_space = self.hidden2tag(lstm_out_reshape)
        tag_scores = F.log_softmax(tag_space)
        
        return tag_scores

In [117]:
model = BiLSTMTagger(config)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)


for epoch in range(10):
    count = 0
    running_loss = 0.0
    for sentence, tags in training_data:
        
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Variables of word indices.
        sentence_in = prepare_seq(sentence, word2index)
        targets = prepare_seq(tags, tag2index)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        
        # print statistics
        running_loss += loss.data[0]
        if count % 200 == 199:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, count + 1, running_loss / 2000))
            running_loss = 0.0
        count += 1

    do_eval(model, valid_data)

[1,   200] loss: 0.320
[1,   400] loss: 0.294
[1,   600] loss: 0.284
[1,   800] loss: 0.279
Valid set accuracy: 0.239784096097
[2,   200] loss: 0.267
[2,   400] loss: 0.260
[2,   600] loss: 0.251
[2,   800] loss: 0.242
Valid set accuracy: 0.394927813154
[3,   200] loss: 0.215
[3,   400] loss: 0.205
[3,   600] loss: 0.195
[3,   800] loss: 0.192
Valid set accuracy: 0.508007925312
[4,   200] loss: 0.173
[4,   400] loss: 0.168
[4,   600] loss: 0.161
[4,   800] loss: 0.167
Valid set accuracy: 0.554756792656
[5,   200] loss: 0.150
[5,   400] loss: 0.146
[5,   600] loss: 0.140
[5,   800] loss: 0.149
Valid set accuracy: 0.607302061207
[6,   200] loss: 0.133
[6,   400] loss: 0.130
[6,   600] loss: 0.125
[6,   800] loss: 0.136
Valid set accuracy: 0.628884194833
[7,   200] loss: 0.120
[7,   400] loss: 0.117
[7,   600] loss: 0.114
[7,   800] loss: 0.126
Valid set accuracy: 0.647488870077
[8,   200] loss: 0.109
[8,   400] loss: 0.107
[8,   600] loss: 0.104
[8,   800] loss: 0.117
Valid set accuracy:

## Augmenting the LSTM part-of-speech tagger with character-level features
To do