# LSTM/Bi-LSTM for Part-of-Speech Tagging

In [50]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F 

import numpy as np

In [51]:
def load_data(training_file):
    #  # Input data format is "natural_JJ language_NN..."
    training_data = []
    with open(training_file, 'r') as f:
        for line in f:
            words, tags = list(), list()
            for word_tag in line.split():
                w, t = tuple(word_tag.split('_'))
                words.append(w)
                tags.append(t)
            training_data.append((words, tags))
    return training_data
            

In [52]:
def build_dict(training_data):
    
    word2index = {}
    tag2index = {}
    for sentence,tags in training_data:
        for word in sentence:
            if word not in word2index:
                word2index[word] = len(word2index)
        for tag in tags:
            if tag not in tag2index:
                tag2index[tag] = len(tag2index)
    
    return word2index, tag2index

data download:
https://github.com/llhthinker/nlptutorial-exercise/blob/master/data/wiki-en-train.norm_pos

In [53]:
training_file = './data/wiki-en-train.norm_pos'
data = load_data(training_file)
word2index, tag2index = build_dict(data)

In [85]:
print(data[0])
data_length = len(data)
print(data_length)
split = int(data_length * 0.7)
training_data = data[:split]
valid_data = data[split:]

(['Natural', 'language', 'processing', '-LRB-', 'NLP', '-RRB-', 'is', 'a', 'field', 'of', 'computer', 'science', ',', 'artificial', 'intelligence', '-LRB-', 'also', 'called', 'machine', 'learning', '-RRB-', ',', 'and', 'linguistics', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', '-LRB-', 'natural', '-RRB-', 'languages', '.'], ['JJ', 'NN', 'NN', '-LRB-', 'NN', '-RRB-', 'VBZ', 'DT', 'NN', 'IN', 'NN', 'NN', ',', 'JJ', 'NN', '-LRB-', 'RB', 'VBN', 'NN', 'NN', '-RRB-', ',', 'CC', 'NNS', 'VBN', 'IN', 'DT', 'NNS', 'IN', 'NNS', 'CC', 'JJ', '-LRB-', 'JJ', '-RRB-', 'NNS', '.'])
1301


In [57]:
def prepare_seq(seq, seq2index):
    indexs = [seq2index[w] for w in seq]
    tensor = torch.LongTensor(indexs)
    return Variable(tensor)

## LSTM
Ref: http://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html#lstm-s-in-pytorch

In [79]:
def do_eval(model, valid_data):
    acc = 0.0
    for sentence, tags in valid_data:
        sentence_in = prepare_seq(sentence, word2index)
        targets = prepare_seq(tags, tag2index)
        tag_scores = model(sentence_in)
        _, predicted = torch.max(tag_scores.data, 1)
        predicted = predicted.view(len(targets)).numpy()
        targets = targets.data.numpy()
        correct_num = np.sum((predicted == targets))
        acc += correct_num / len(sentence)
        
    print("Valid set accuracy:", acc / len(valid_data))

In [65]:
class Config():
    def __init__(self):
        self.vocab_size = len(word2index)
        self.tagset_size = len(tag2index)
        self.embedding_dim = 16
        self.hidden_dim = 16

config = Config()

In [66]:
class LSTMTagger(nn.Module):
    def __init__(self, config):
        super(LSTMTagger, self).__init__()
        
        self.embedding = nn.Embedding(config.vocab_size, config.embedding_dim)
        
        self.lstm = nn.LSTM(input_size=config.embedding_dim, hidden_size=config.hidden_dim)
        
        self.hidden2tag = nn.Linear(in_features=config.hidden_dim, out_features=config.tagset_size)
    
    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (Variable(torch.zeros(1, 1, config.hidden_dim)),
                Variable(torch.zeros(1, 1, config.hidden_dim)))
    
    
    def forward(self, sentence):
        embed = self.embedding(sentence)
        # lstm input (seq_len, batch, input_size)
        embed = embed.view(len(sentence), 1, -1)
        lstm_out = self.lstm(embed)[0]
#         print(lstm_out.size())
        # output (seq_len, batch, hidden_size * num_directions)
        # -> (seq_len, hidden_size * num_directions)
        lstm_out_reshape = lstm_out.view(len(sentence), -1)
#         print(lstm_out_reshape.size())
        tag_space = self.hidden2tag(lstm_out_reshape)
        tag_scores = F.log_softmax(tag_space)
        
        return tag_scores

In [112]:
model = LSTMTagger(config)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)


for epoch in range(10):
    count = 0
    running_loss = 0.0
    for sentence, tags in training_data:
        
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Variables of word indices.
        sentence_in = prepare_seq(sentence, word2index)
        targets = prepare_seq(tags, tag2index)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        
        # print statistics
        running_loss += loss.data[0]
        if count % 200 == 199:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, count + 1, running_loss / 2000))
            running_loss = 0.0
        count += 1

    do_eval(model, valid_data)


[1,   200] loss: 0.324
[1,   400] loss: 0.278
[1,   600] loss: 0.255
[1,   800] loss: 0.240
Valid set accuracy: 0.402996500242
[2,   200] loss: 0.210
[2,   400] loss: 0.202
[2,   600] loss: 0.193
[2,   800] loss: 0.192
Valid set accuracy: 0.515497941108
[3,   200] loss: 0.172
[3,   400] loss: 0.170
[3,   600] loss: 0.167
[3,   800] loss: 0.171
Valid set accuracy: 0.572909377034
[4,   200] loss: 0.154
[4,   400] loss: 0.153
[4,   600] loss: 0.152
[4,   800] loss: 0.158
Valid set accuracy: 0.598427265634
[5,   200] loss: 0.142
[5,   400] loss: 0.142
[5,   600] loss: 0.141
[5,   800] loss: 0.149
Valid set accuracy: 0.617734514808
[6,   200] loss: 0.133
[6,   400] loss: 0.133
[6,   600] loss: 0.132
[6,   800] loss: 0.142
Valid set accuracy: 0.635239141785
[7,   200] loss: 0.126
[7,   400] loss: 0.126
[7,   600] loss: 0.125
[7,   800] loss: 0.136
Valid set accuracy: 0.648055034768
[8,   200] loss: 0.119
[8,   400] loss: 0.119
[8,   600] loss: 0.118
[8,   800] loss: 0.130
Valid set accuracy:

## Bi-LSTM
- 实验结果显示使用Bi-LSTM有一定的提升效果, 0.673->0.692
- 在较大数据集上的效果可能更好（Bi-LSTM参数比LSTM多）

In [116]:
class BiLSTMTagger(nn.Module):
    def __init__(self, config):
        super(BiLSTMTagger, self).__init__()
        
        self.embedding = nn.Embedding(config.vocab_size, config.embedding_dim)
        
        self.lstm = nn.LSTM(input_size=config.embedding_dim, 
                            hidden_size=config.hidden_dim, 
                            num_layers= 2,
                            bidirectional=True)
        
        self.hidden2tag = nn.Linear(in_features=config.hidden_dim*2, out_features=config.tagset_size)
    
    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (Variable(torch.zeros(1, 1, config.hidden_dim)),
                Variable(torch.zeros(1, 1, config.hidden_dim)))
    
    
    def forward(self, sentence):
        embed = self.embedding(sentence)
        # lstm input (seq_len, batch, input_size)
        embed = embed.view(len(sentence), 1, -1)
        lstm_out = self.lstm(embed)[0]
#         print(lstm_out.size())
        # output (seq_len, batch, hidden_size * num_directions)
        # -> (seq_len, hidden_size * num_directions)
        lstm_out_reshape = lstm_out.view(len(sentence), -1)
#         print(lstm_out_reshape.size())
        tag_space = self.hidden2tag(lstm_out_reshape)
        tag_scores = F.log_softmax(tag_space)
        
        return tag_scores

In [117]:
model = BiLSTMTagger(config)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)


for epoch in range(10):
    count = 0
    running_loss = 0.0
    for sentence, tags in training_data:
        
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Variables of word indices.
        sentence_in = prepare_seq(sentence, word2index)
        targets = prepare_seq(tags, tag2index)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        
        # print statistics
        running_loss += loss.data[0]
        if count % 200 == 199:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, count + 1, running_loss / 2000))
            running_loss = 0.0
        count += 1

    do_eval(model, valid_data)

[1,   200] loss: 0.320
[1,   400] loss: 0.294
[1,   600] loss: 0.284
[1,   800] loss: 0.279
Valid set accuracy: 0.239784096097
[2,   200] loss: 0.267
[2,   400] loss: 0.260
[2,   600] loss: 0.251
[2,   800] loss: 0.242
Valid set accuracy: 0.394927813154
[3,   200] loss: 0.215
[3,   400] loss: 0.205
[3,   600] loss: 0.195
[3,   800] loss: 0.192
Valid set accuracy: 0.508007925312
[4,   200] loss: 0.173
[4,   400] loss: 0.168
[4,   600] loss: 0.161
[4,   800] loss: 0.167
Valid set accuracy: 0.554756792656
[5,   200] loss: 0.150
[5,   400] loss: 0.146
[5,   600] loss: 0.140
[5,   800] loss: 0.149
Valid set accuracy: 0.607302061207
[6,   200] loss: 0.133
[6,   400] loss: 0.130
[6,   600] loss: 0.125
[6,   800] loss: 0.136
Valid set accuracy: 0.628884194833
[7,   200] loss: 0.120
[7,   400] loss: 0.117
[7,   600] loss: 0.114
[7,   800] loss: 0.126
Valid set accuracy: 0.647488870077
[8,   200] loss: 0.109
[8,   400] loss: 0.107
[8,   600] loss: 0.104
[8,   800] loss: 0.117
Valid set accuracy:

## Augmenting the LSTM part-of-speech tagger with character-level features

To do this, let $c_w$ be the character-level representation of
word $w$. Let $x_w$ be the word embedding as before. Then
the input to our sequence model is the concatenation of $x_w$ and
$c_w$. So if $x_w$ has dimension 16, and $c_w$
dimension 8, then our LSTM should accept an input of dimension 24.

To get the character level representation, do an LSTM over the
characters of a word, and let $c_w$ be the final hidden state of
this LSTM. Hints:

* There are going to be two LSTM's in your new model.
  The original one that outputs POS tag scores, and the new one that
  outputs a character-level representation of each word.
* To do a sequence model over characters, you will have to embed characters.
  The character embeddings will be the input to the character LSTM.

---
实验结果显示提升效果非常明显: 0.673 -> 0.782

In [6]:
char2index = dict()
char2index['-'] = 0  # char = '-' if char is not ascii_letters
for i, c in enumerate(string.ascii_letters):
    char2index[c] = i+1
print(char2index)

{'-': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, 'A': 27, 'B': 28, 'C': 29, 'D': 30, 'E': 31, 'F': 32, 'G': 33, 'H': 34, 'I': 35, 'J': 36, 'K': 37, 'L': 38, 'M': 39, 'N': 40, 'O': 41, 'P': 42, 'Q': 43, 'R': 44, 'S': 45, 'T': 46, 'U': 47, 'V': 48, 'W': 49, 'X': 50, 'Y': 51, 'Z': 52}


In [48]:
def prepare_char_seq(word, char2index):
    indexs = []
    for c in word:
        if c not in char2index:
            indexs.append(0)
        else:
            indexs.append(char2index[c])
    tensor = torch.LongTensor(indexs)
    return Variable(tensor)

In [40]:
print(training_data[0][0][0])
print(prepare_char_seq(training_data[0][0][0], char2index))

Natural
Variable containing:
 40
  1
 20
 21
 18
  1
 12
[torch.LongTensor of size 7]



In [19]:
class ConfigPlus():
    def __init__(self):
        self.charset_size = len(char2index)
        self.vocab_size = len(word2index)
        self.tagset_size = len(tag2index)
        self.word_embedding_dim = 16
        self.char_embedding_dim = 8
        self.char_hidden_dim = 8
        self.hidden_dim = 16

config = ConfigPlus()

In [81]:
class CharLSTMTagger(nn.Module):
    def __init__(self, config):
        super(CharLSTMTagger, self).__init__()
        
        self.word_embedding = nn.Embedding(config.vocab_size, config.word_embedding_dim)
        self.char_embedding = nn.Embedding(config.charset_size, config.char_embedding_dim)
        
        self.char_lstm = nn.LSTM(input_size=config.char_embedding_dim, 
                                 hidden_size=config.char_hidden_dim)
        
        self.lstm = nn.LSTM(input_size=config.word_embedding_dim+config.char_hidden_dim,
                            hidden_size=config.hidden_dim)
        
        self.hidden2tag = nn.Linear(in_features=config.hidden_dim, out_features=config.tagset_size)
    
    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (Variable(torch.zeros(1, 1, config.hidden_dim)),
                Variable(torch.zeros(1, 1, config.hidden_dim)))
    
    
    def forward(self, sentence, words):
        word_embed = self.word_embedding(sentence)
        # lstm input (seq_len, batch, input_size)
        word_embed = word_embed.view(len(sentence), 1, -1)
        
        seg_char_embed = []
        for word in words:
            word_chars = prepare_char_seq(word, char2index)
            char_embed = self.char_embedding(word_chars)
            char_embed = char_embed.view(len(word_chars), 1, -1)
            # character-level representation of each word is
            # the final hidden state of char_lstm. 
            word_char_embed = self.char_lstm(char_embed)[1][0]  # h_n(the final hidden state of char_lstm.)
            
            seg_char_embed.append(word_char_embed)
        #  (seq_len, batch, char_hidden_dim)
        seg_char_embed = torch.cat(seg_char_embed, dim=0)
        # new_embed_dim = word_embedding_dim + char_hidden_dim(the dim of character-level representation)
        embed = torch.cat((word_embed, seg_char_embed), dim=2)
        
        lstm_out = self.lstm(embed)[0]
#         print(lstm_out.size())
        # output (seq_len, batch, hidden_size * num_directions)
        # -> (seq_len, hidden_size * num_directions)
        lstm_out_reshape = lstm_out.view(len(sentence), -1)
#         print(lstm_out_reshape.size())
        tag_space = self.hidden2tag(lstm_out_reshape)
        tag_scores = F.log_softmax(tag_space)
        
        return tag_scores

In [83]:
def do_eval(model, valid_data):
    acc = 0.0
    for sentence, tags in valid_data:
        sentence_in = prepare_seq(sentence, word2index)
        targets = prepare_seq(tags, tag2index)
        tag_scores = model(sentence_in, sentence)
        _, predicted = torch.max(tag_scores.data, 1)
        predicted = predicted.view(len(targets)).numpy()
        targets = targets.data.numpy()
        correct_num = np.sum((predicted == targets))
        acc += correct_num / len(sentence)
        
    print("Valid set accuracy:", acc / len(valid_data))

In [87]:
model = CharLSTMTagger(config)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
train_size = len(training_data)
valid_size = len(valid_data)
print("training_data size: ", train_size)
print("valid_data size: ", valid_size)

# training_data = training_data[:train_size//2]

for epoch in range(10):
    count = 0
    running_loss = 0.0
    for sentence, tags in training_data:
        
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Variables of word indices.
        sentence_in = prepare_seq(sentence, word2index)
        targets = prepare_seq(tags, tag2index)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in, sentence)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        
        # print statistics
        running_loss += loss.data[0]
        if count % 200 == 199:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, count + 1, running_loss / 2000))
            running_loss = 0.0
        count += 1
    do_eval(model, valid_data)


training_data size:  910
valid_data size:  391
[1,   200] loss: 0.323
[1,   400] loss: 0.281
[1,   600] loss: 0.253
[1,   800] loss: 0.236
Valid set accuracy: 0.433072208189
[2,   200] loss: 0.206
[2,   400] loss: 0.196
[2,   600] loss: 0.186
[2,   800] loss: 0.185
Valid set accuracy: 0.552397303525
[3,   200] loss: 0.160
[3,   400] loss: 0.152
[3,   600] loss: 0.142
[3,   800] loss: 0.144
Valid set accuracy: 0.646899055425
[4,   200] loss: 0.125
[4,   400] loss: 0.122
[4,   600] loss: 0.118
[4,   800] loss: 0.123
Valid set accuracy: 0.682827064641
[5,   200] loss: 0.109
[5,   400] loss: 0.107
[5,   600] loss: 0.104
[5,   800] loss: 0.110
Valid set accuracy: 0.70766459289
[6,   200] loss: 0.096
[6,   400] loss: 0.096
[6,   600] loss: 0.092
[6,   800] loss: 0.100
Valid set accuracy: 0.731857723941
[7,   200] loss: 0.086
[7,   400] loss: 0.087
[7,   600] loss: 0.083
[7,   800] loss: 0.092
Valid set accuracy: 0.749488303579
[8,   200] loss: 0.079
[8,   400] loss: 0.081
[8,   600] loss: 0.