# Sequence Models and Long-Short Term Memory Networks
----------------------------------------------------------
At this point, we have seen various feed-forward networks. That is, there is no state maintained by the network at all. This might not be the behavior we want. Sequence models are central to NLP: they are models where there is some sort of dependence through time between your inputs. The classical example of a sequence model is the Hidden Markov Model for part-of-speech tagging. Another example is the conditional random field.

A recurrent neural network is a network that maintains some kind of state. For example, its output could be used as part of the next input, so that information can propogate along as the network passes over the sequence. In the case of an LSTM, for each element in the sequence, there is a corresponding hidden state ht, which in principle can contain information from arbitrary points earlier in the sequence. We can use the hidden state to predict words in a language model, part-of-speech tags, and a myriad of other things.



In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as f
import torch.optim as optim
import numpy as np

torch.manual_seed(1)

<torch._C.Generator at 0x7ff11c119610>

In [2]:
lstm = nn.LSTM(3, 3)

# Making input of Seq 5, batch size of 1 each having 3 words
inputs = [torch.randn(1,3) for _ in range(5)] 

# initialize the hidden layer
# our of this one is the initial hidden state
# and another is the initial cell state
hidden = (torch.randn(1, 1, 3),
          torch.randn(1, 1, 3))

for i in inputs:
    # Step through the sequence onc element at a time
    # after each step hidden contain the hidden state
    out, hidden = lstm(i.view(1, 1, -1), hidden)
    
# alternatively, we can do the entire sequence all at once.
# the first value returned by LSTM is all of the hidden states throughout
# the sequence. the second is just the most recent hidden state
# The reason for this is that:
# "out" will give you access to all hidden states in the sequence
# "hidden" will allow you to continue the sequence and backpropagate,
# by passing it as an argument  to the lstm at a later time
# Add the extra 2nd dimension
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))
out, hidden = lstm(inputs, hidden)
print("All Hidden States :\n{}".format(out))
print('-' * 100)
print("The latest/last hidden state :\n{}".format(hidden))
print('-' * 100)

All Hidden States :
tensor([[[-0.0187,  0.1713, -0.2944]],

        [[-0.3521,  0.1026, -0.2971]],

        [[-0.3191,  0.0781, -0.1957]],

        [[-0.1634,  0.0941, -0.1637]],

        [[-0.3368,  0.0959, -0.0538]]])
----------------------------------------------------------------------------------------------------
The latest/last hidden state :
(tensor([[[-0.3368,  0.0959, -0.0538]]]), tensor([[[-0.9825,  0.4715, -0.0633]]]))
----------------------------------------------------------------------------------------------------


In [3]:
def prepare_data(seq, to_ix):
    idsx = [to_ix[w] for w in seq]
    return torch.tensor(idsx, dtype = torch.long)

training_data = [
    ('The dog ate the apple'.split(), ['DET', 'NN', 'V', 'DET', 'NN']),
    ('Everybody read that newspaper'.split(), ['NN', 'V', 'DET', 'NN'])
]

word_to_ix = {}

for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
            
print(word_to_ix)

tags_to_ix = {'DET' : 0, 'NN' : 1, 'V' : 2}
ix_to_tags = {0 : 'DET', 1 : 'NN', 2 : 'V'}

EMBEDDING_DIM = 6
HIDDEN_DIM = 6

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'newspaper': 8}


In [4]:
class LSTMTagger(nn.Module):
    def __init__(self, embedd_dim, hidden_dim, vocab_size, tag_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        
        self.word_embeddings = nn.Embedding(vocab_size, embedd_dim)
        
        # The LSTM takes word embeddings as input and output hidden layer
        # with hidden dimesion
        self.lstm = nn.LSTM(embedd_dim, hidden_dim)
        
        # The linear layer that map the hidden layer to the tag space
        self.hidden2tag = nn.Linear(hidden_dim, tag_size)
        # Initialise the initial state layer and cell state
        self.hidden = self.init_hidden()
        
    def init_hidden(self):
        # Initialising the initial hidden state and cell state
        # The axes semanntics are (num_layers, mini_batch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim), 
                torch.zeros(1, 1, self.hidden_dim))
    
    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = f.log_softmax(tag_space, dim = 1)
        
        return tag_scores

In [5]:
model = LSTMTagger(embedd_dim = EMBEDDING_DIM, hidden_dim = HIDDEN_DIM, 
                   vocab_size = len(word_to_ix), tag_size = len(tags_to_ix))

loss_function = nn.NLLLoss()

optimizer = optim.SGD(model.parameters(), lr = 0.1)

In [6]:
print('-'* 100)
print('MODEL PARAMETERS : \n{}'.format(model.parameters))
print('-'* 100)
print('LOSS FUNCTION : {}'.format(loss_function))
print('-'* 100)
print('OPTIMIZER : \n{}'.format(optimizer))
print('-'* 100)

----------------------------------------------------------------------------------------------------
MODEL PARAMETERS : 
<bound method Module.parameters of LSTMTagger(
  (word_embeddings): Embedding(9, 6)
  (lstm): LSTM(6, 6)
  (hidden2tag): Linear(in_features=6, out_features=3, bias=True)
)>
----------------------------------------------------------------------------------------------------
LOSS FUNCTION : NLLLoss()
----------------------------------------------------------------------------------------------------
OPTIMIZER : 
SGD (
Parameter Group 0
    dampening: 0
    lr: 0.1
    momentum: 0
    nesterov: False
    weight_decay: 0
)
----------------------------------------------------------------------------------------------------


In [7]:
# See what the scores are before training
# Note: The element (i,j) is the score for tag j for word i
# Here we are wrapping the code in torch.no_grad()
with torch.no_grad():
    inputs = prepare_data(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    print(tag_scores)

tensor([[-1.1389, -1.2024, -0.9693],
        [-1.1065, -1.2200, -0.9834],
        [-1.1286, -1.2093, -0.9726],
        [-1.1190, -1.1960, -0.9916],
        [-1.0137, -1.2642, -1.0366]])


In [8]:
# Training LSTM

for i in range(300):
    for sentence, tags in training_data:
        # Step 1: Remember that PyTorch accumulates gradients.
        # We nee dto clear them before each instance
        model.zero_grad()
        
        # Also we need to clear the history of the hidden layer.
        # Detaching it from the hidden state of last instance
        model.hidden = model.init_hidden()
        
        # Step 2: Get our input tensor readyfor the network, i.e.,
        # turn them into vectors of word indices
        sentence_in = prepare_data(sentence, word_to_ix)
        targets = prepare_data(tags, tags_to_ix)
        
        # Steps 3: Run our forward pass 
        tag_scores = model(sentence_in)
        
        # Step 4: Compute loss
        loss = loss_function(tag_scores, targets)
        
        # Step 5: Compute Gradient
        loss.backward()
        
        # Step 6: Optimizer.step()
        optimizer.step()
        
    if i % 10 == 0:
        print('loss at epoch {} is {}'.format(i, loss*100))

loss at epoch 0 is 114.30072784423828
loss at epoch 10 is 106.49176788330078
loss at epoch 20 is 102.92794799804688
loss at epoch 30 is 99.87355041503906
loss at epoch 40 is 95.85249328613281
loss at epoch 50 is 89.87005615234375
loss at epoch 60 is 81.1834487915039
loss at epoch 70 is 69.76107788085938
loss at epoch 80 is 57.036285400390625
loss at epoch 90 is 45.24212646484375
loss at epoch 100 is 35.56020736694336
loss at epoch 110 is 28.075408935546875
loss at epoch 120 is 22.45100975036621
loss at epoch 130 is 18.260948181152344
loss at epoch 140 is 15.123881340026855
loss at epoch 150 is 12.744762420654297
loss at epoch 160 is 10.910616874694824
loss at epoch 170 is 9.471939086914062
loss at epoch 180 is 8.324309349060059
loss at epoch 190 is 7.394327163696289
loss at epoch 200 is 6.629767417907715
loss at epoch 210 is 5.992938995361328
loss at epoch 220 is 5.456202030181885
loss at epoch 230 is 4.9989914894104
loss at epoch 240 is 4.605775833129883
loss at epoch 250 is 4.2646608

In [9]:
# Weights After Training
with torch.no_grad():
    inputs = prepare_data(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    print(tag_scores)

tensor([[-0.0858, -2.9355, -3.5374],
        [-5.2313, -0.0234, -4.0314],
        [-3.9098, -4.1279, -0.0368],
        [-0.0187, -4.7809, -4.5960],
        [-5.8170, -0.0183, -4.1879]])


In [10]:
test_sentence = 'Everybody ate that apple'.split()
with torch.no_grad():
    inputs = prepare_data(test_sentence, word_to_ix)
    tag_scores = model(inputs)
    
    tags = [ix_to_tags[torch.argmax(i).item()] for i in tag_scores]
    
    for word, tag in zip(test_sentence, tags):
        print('{} : {}'.format(word, tag))

Everybody : NN
ate : V
that : DET
apple : NN


**DONE!!**