In [2]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
def prepare_sequence(seq, to_idx):
    idxs = [to_idx[w] for w in seq]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)

def prepare_chars(seq, to_idx):
    idxs = [to_idx[c] for w in seq for c in w]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)

training_data = [
    ('The dog ate the apple'.split(), ['DET', 'NN', 'V', 'DET', 'NN']),
    ('Everybody read that book'.split(), ['NN', 'V', 'DET', 'NN'])
]
word_to_idx = {}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_idx:
            word_to_idx[word] = len(word_to_idx)
print(word_to_idx)
tag_to_idx = {'DET': 0, 'NN': 1, 'V': 2}

char_to_idx = {}
for sent, tags in training_data:
    for word in sent:
        for char in word:
            if char not in char_to_idx:
                char_to_idx[char] = len(char_to_idx)
print(char_to_idx)

{'The': 0, 'dog': 1, 'Everybody': 5, 'apple': 4, 'ate': 2, 'the': 3, 'that': 7, 'book': 8, 'read': 6}
{'a': 6, 'b': 14, 'd': 3, 'r': 12, 'l': 9, 'y': 13, 'E': 10, 't': 7, 'o': 4, 'e': 2, 'T': 0, 'p': 8, 'v': 11, 'g': 5, 'k': 15, 'h': 1}


In [9]:
# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

class LSTMTagger(nn.Module):
    
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.embeds = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM layer. Takes in the emdding_dim and outputs the output and hidden state with hidden_dims
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        
        # The linear layer that goes from the hidden_dim to the tag dim
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()
        
    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))
    
    def forward(self, sentence):
        embeds = self.embeds(sentence)
        lstm_out, self.hidden = self.lstm(embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim = 1)
        return tag_scores

In [10]:
lstm_model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_idx), len(tag_to_idx))
loss_fn = nn.NLLLoss()
optimizer = optim.SGD(lstm_model.parameters(), lr = 0.1)

In [12]:
# Checking model
inputs = prepare_sequence(training_data[0][0], word_to_idx)
tag_scores = lstm_model(inputs)
print(tag_scores)

Variable containing:
-0.9891 -1.4370 -0.9405
-0.9828 -1.4138 -0.9610
-1.0182 -1.4499 -0.9060
-1.0130 -1.4432 -0.9145
-1.0858 -1.6048 -0.7735
[torch.FloatTensor of size 5x3]



In [17]:
# Train model
for epoch in range(300):
    for sentence, tags in training_data:
        # Clear out prev gradients and hidden states
        lstm_model.zero_grad()
        lstm_model.hidden = lstm_model.init_hidden()
        
        # Get sentences and target tags
        sentence_in = prepare_sequence(sentence, word_to_idx)
        targets = prepare_sequence(tags, tag_to_idx)
        
        # Get scores 
        tag_scores = lstm_model(sentence_in)
        
        # Backprop and update
        loss = loss_fn(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training
inputs = prepare_sequence(training_data[0][0], word_to_idx)
tag_scores = lstm_model(inputs)
# The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
#  for word i. The predicted tag is the maximum scoring tag.
# Here, we can see the predicted sequence below is 0 1 2 0 1
# since 0 is index of the maximum value of row 1,
# 1 is the index of maximum value of row 2, etc.
# Which is DET NOUN VERB DET NOUN, the correct sequence!
print(tag_scores)

Variable containing:
-0.0150 -5.9274 -4.4041
-7.3409 -0.0017 -6.8742
-4.6537 -4.7948 -0.0180
-0.0060 -5.7922 -5.8216
-5.6614 -0.0042 -7.2186
[torch.FloatTensor of size 5x3]



In [13]:
# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
WORD_EMBEDDING_DIM = 6
CHAR_EMBEDDING_DIM = 6
HIDDEN_DIM = 6

class LSTMTaggerCharacterLevel(nn.Module):
    
    def __init__(self, word_embedding_dim, char_embedding_dim, hidden_dim, vocab_size, tagset_size, char_size):
        super(LSTMTaggerCharacterLevel, self).__init__()
        self.hidden_dim = hidden_dim
        self.char_hidden_dim = char_embedding_dim
        self.word_embeds = nn.Embedding(vocab_size, word_embedding_dim)
        
        # LSTM character embedding layer
        self.char_embeds = nn.Embedding(char_size, char_embedding_dim)
        self.char_lstm = nn.LSTM(char_embedding_dim, self.char_hidden_dim)
        
        # LSTM layer. Takes in the emdding_dim and outputs the output and hidden state with hidden_dims
        self.lstm = nn.LSTM(word_embedding_dim + self.char_hidden_dim, hidden_dim)
        
        # The linear layer that goes from the hidden_dim to the tag dim
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()
        self.char_hidden = self.init_hidden_char()
        
    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))
    
    def init_hidden_char(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(1, 1, self.char_hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.char_hidden_dim)))
    
    def forward(self, sentence):
        sent, chars = sentence
        word_embeds = self.word_embeds(sent)
        char_embeds = self.char_embeds(chars)
        
        char_lstm, self.char_hidden = self.char_lstm(char_embeds.view(len(chars), 1, -1), self.char_hidden)
        
        embeds = (word_embeds.view(len(sent), 1, -1), char_lstm)
        newEmbeds = torch.cat(embeds)
        print(newEmbeds)
        lstm_out, self.hidden = self.lstm(newEmbeds, self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim = 1)
        return tag_scores

In [14]:
lstm_model = LSTMTaggerCharacterLevel(WORD_EMBEDDING_DIM, CHAR_EMBEDDING_DIM, HIDDEN_DIM, len(word_to_idx), len(tag_to_idx), len(char_to_idx))
loss_fn = nn.NLLLoss()
optimizer = optim.SGD(lstm_model.parameters(), lr = 0.1)

In [15]:
# Checking model
sentences = prepare_sequence(training_data[0][0], word_to_idx)
chars = prepare_chars(training_data[0][0], char_to_idx)
print(chars)
tag_scores = lstm_model((sentences, chars))
print(tag_scores)

Variable containing:
 0
 1
 2
 3
 4
 5
 6
 7
 2
 7
 1
 2
 6
 8
 8
 9
 2
[torch.LongTensor of size 17]

Variable containing:
(0 ,.,.) = 
 -0.5956 -0.3184 -1.3587  1.1305 -1.1718  0.1866

(1 ,.,.) = 
 -1.3745  0.3103  0.4875  0.3672  1.5561 -0.7710

(2 ,.,.) = 
 -1.8214  1.5906 -0.0219  1.2544  0.4263  1.0485

(3 ,.,.) = 
 -0.0857  1.2534  0.8897  0.0874 -0.3521  0.8990

(4 ,.,.) = 
 -1.6888  0.7444  0.5939  0.2659 -1.3811  1.9019

(5 ,.,.) = 
 -0.0036  0.1841 -0.2207 -0.1683  0.2294 -0.0059

(6 ,.,.) = 
 -0.0477  0.3186 -0.2542 -0.4799 -0.0958 -0.1068

(7 ,.,.) = 
 -0.1241  0.3573 -0.3706 -0.2819 -0.1856 -0.0650

(8 ,.,.) = 
 -0.0293  0.3156 -0.2956 -0.2327 -0.2180 -0.0780

(9 ,.,.) = 
 -0.0002  0.2579 -0.4332 -0.3703  0.0912 -0.1054

(10,.,.) = 
 -0.1254  0.5914 -0.1589 -0.3247  0.0861  0.2230

(11,.,.) = 
 -0.1225  0.5581 -0.3958 -0.3133  0.0930  0.1756

(12,.,.) = 
 -0.1373  0.5367 -0.2539 -0.3663 -0.2124  0.0731

(13,.,.) = 
 -0.1365  0.4805 -0.4008 -0.2697 -0.2326  0.0563

(14,.,.)

RuntimeError: size mismatch, m1: [1 x 6], m2: [12 x 24] at /pytorch/torch/lib/TH/generic/THTensorMath.c:1416