https://github.com/LeanManager/NLP-PyTorch

RNN/LSTN: input (xt) => hidden state (ht)

lstm = nn.LSTM (input_size = input_dimension,
                hiden_size = hiden_dimension,
                num_layers = n_layers)
                
out, hidden = lstm (input, (h0, c0))

input = tensor containing the values in an input sequence (seq_len, batch, input_size)

h0 = a tensor containing the initial hidden state for each element in a batch
c0 = a tensor containing the initial cell memory for each element in the batch
h0 and c0 will default to 0, their dimension are (n_layers*n_directions, batch, hidden_dim)

all of the weights are actually the same as that RNN cell is essentially being re-used throughout the process. 

In [47]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as  np
import matplotlib.pyplot as plt

%matplotlib inline



# RNN/LSTM structure

In [12]:
input_dim = 4
hidden_dim = 3

lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim)

inputs_list = [torch.randn(1, input_dim) for _ in range (5)]
print ('inputs: \n', inputs_list)
print ('\n')

h0 = torch.randn(1,1,hidden_dim)
c0 = torch.randn(1,1,hidden_dim)

for i in inputs_list:
    out, hidden = lstm(i.view(1,1,-1),(h0,c0))
    
    print(i.view(1,1,-1))
    print('out: \n', out)
    print('hidden: \n', hidden)
    print('\n')

inputs: 
 [tensor([[ 1.4585, -0.2669,  0.2690, -0.5728]]), tensor([[ 2.2614, -0.5147,  0.5446, -1.4257]]), tensor([[-9.9630e-01,  1.2686e+00, -2.2741e+00,  2.0916e-03]]), tensor([[ 0.8580, -1.5871, -0.8145,  0.2116]]), tensor([[ 1.0238,  0.4050,  0.4375, -0.1857]])]


tensor([[[ 1.4585, -0.2669,  0.2690, -0.5728]]])
out: 
 tensor([[[-0.1022,  0.5371, -0.3542]]], grad_fn=<StackBackward>)
hidden: 
 (tensor([[[-0.1022,  0.5371, -0.3542]]], grad_fn=<StackBackward>), tensor([[[-0.2796,  1.0876, -0.6428]]], grad_fn=<StackBackward>))


tensor([[[ 2.2614, -0.5147,  0.5446, -1.4257]]])
out: 
 tensor([[[-0.1006,  0.6589, -0.4412]]], grad_fn=<StackBackward>)
hidden: 
 (tensor([[[-0.1006,  0.6589, -0.4412]]], grad_fn=<StackBackward>), tensor([[[-0.4406,  1.2990, -0.6963]]], grad_fn=<StackBackward>))


tensor([[[-9.9630e-01,  1.2686e+00, -2.2741e+00,  2.0916e-03]]])
out: 
 tensor([[[0.0839, 0.4869, 0.2786]]], grad_fn=<StackBackward>)
hidden: 
 (tensor([[[0.0839, 0.4869, 0.2786]]], grad_fn=<StackBac

In [26]:
#using batches

inputs = torch.cat(inputs_list).view(len(inputs_list),1,-1)

print('input size: \n', inputs.size())
print('\n')

print('inputs: \n', inputs)
print('\n')

h0 = torch.randn(1,1,hidden_dim)
c0 = torch.randn(1,1,hidden_dim)

out, hidden = lstm(inputs,(h0,c0))

print('out: \n', out)
print(out.shape)
print('hidden: \n', hidden)

input size: 
 torch.Size([5, 1, 4])


inputs: 
 tensor([[[ 1.4585e+00, -2.6691e-01,  2.6902e-01, -5.7277e-01]],

        [[ 2.2614e+00, -5.1467e-01,  5.4460e-01, -1.4257e+00]],

        [[-9.9630e-01,  1.2686e+00, -2.2741e+00,  2.0916e-03]],

        [[ 8.5801e-01, -1.5871e+00, -8.1452e-01,  2.1161e-01]],

        [[ 1.0238e+00,  4.0501e-01,  4.3746e-01, -1.8571e-01]]])


out: 
 tensor([[[ 2.6436e-02,  4.9025e-01,  3.8313e-01]],

        [[-2.1447e-02,  4.7132e-01,  7.3166e-02]],

        [[ 1.9291e-01,  2.4376e-01,  3.7812e-01]],

        [[-9.3205e-03, -2.4383e-02,  5.7246e-02]],

        [[ 2.8757e-04,  1.0533e-01, -2.5121e-02]]], grad_fn=<StackBackward>)
torch.Size([5, 1, 3])
hidden: 
 (tensor([[[ 0.0003,  0.1053, -0.0251]]], grad_fn=<StackBackward>), tensor([[[ 0.0008,  0.1728, -0.0662]]], grad_fn=<StackBackward>))


# LSTM part-of-speech tagging

In [45]:
training_data = [
    ("The cat ate the cheese".lower().split(), ["DET", "NN", "V", "DET", "NN"]),
    ("She read that book".lower().split(), ["NN", "V", "DET", "NN"]),
    ("The dog loves art".lower().split(), ["DET", "NN", "V", "NN"]),
    ("The elephant answers the phone".lower().split(), ["DET", "NN", "V", "DET", "NN"])
]

word2idx = {}

for sent, tags in training_data:
    for word in sent:
        if word not in word2idx:
            word2idx[word] = len(word2idx)
            
tag2idx = {"DET":0, "NN":1, "V":2}

In [46]:
print (word2idx)

{'the': 0, 'cat': 1, 'ate': 2, 'cheese': 3, 'she': 4, 'read': 5, 'that': 6, 'book': 7, 'dog': 8, 'loves': 9, 'art': 10, 'elephant': 11, 'answers': 12, 'phone': 13}


In [61]:
def prepare_sequence(seq, to_idx):

    
    idxs = [to_idx[w] for w in seq]
    idxs = np.array(idxs)
    
    return torch.from_numpy(idxs)

In [62]:
exemple_input = prepare_sequence("The dog answers the phone".lower().split(),word2idx)

print(exemple_input)

tensor([ 0,  8, 12,  0, 13])


In [72]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        ''' Initialize the layers of this model.'''
        super(LSTMTagger, self).__init__()
        
        self.hidden_dim = hidden_dim

        # embedding layer that turns words into a vector of a specified size
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # the LSTM takes embedded word vectors (of a specified size) as inputs 
        # and outputs hidden states of size hidden_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # the linear layer that maps the hidden state output dimension 
        # to the number of tags we want as output, tagset_size (in this case this is 3 tags)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        
        # initialize the hidden state (see code below)
        self.hidden = self.init_hidden()

        
    def init_hidden(self):
        ''' At the start of training, we need to initialize a hidden state;
           there will be none because the hidden state is formed based on perviously seen data.
           So, this function defines a hidden state with all zeroes and of a specified size.'''
        # The axes dimensions are (n_layers, batch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, sentence):
        ''' Define the feedforward behavior of the model.'''
        # create embedded word vectors for each word in a sentence
        embeds = self.word_embeddings(sentence)
        
        # get the output and hidden state by passing the lstm over our word embeddings
        # the lstm takes in our embeddings and hiddent state
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        
        # get the scores for the most likely tag for a word
        tag_outputs = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_outputs, dim=1)
        
        return tag_scores

In [73]:
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word2idx), len(tag2idx))

loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [74]:
model

LSTMTagger(
  (word_embeddings): Embedding(14, 6)
  (lstm): LSTM(6, 6)
  (hidden2tag): Linear(in_features=6, out_features=3, bias=True)
)

In [75]:
test_sentence = "The cheese loves the elephant".lower().split()

inputs = prepare_sequence(test_sentence, word2idx)
inputs = inputs
tag_scores = model(inputs)
print(tag_scores)

_, predicted_tags = torch.max(tag_scores, 1)
print('\n')
print('predicted tags: \n', predicted_tags)

tensor([[-1.0175, -1.0981, -1.1875],
        [-1.0282, -1.0665, -1.2102],
        [-0.9977, -1.1115, -1.1967],
        [-1.0327, -1.0965, -1.1714],
        [-1.0477, -1.0595, -1.1953]], grad_fn=<LogSoftmaxBackward>)


predicted tags: 
 tensor([0, 0, 0, 0, 0])


In [76]:
# normally these epochs take a lot longer 
# but with our toy data (only 3 sentences), we can do many epochs in a short time
n_epochs = 300

for epoch in range(n_epochs):
    
    epoch_loss = 0.0
    
    # get all sentences and corresponding tags in the training data
    for sentence, tags in training_data:
        
        # zero the gradients
        model.zero_grad()

        # zero the hidden state of the LSTM, this detaches it from its history
        model.hidden = model.init_hidden()

        # prepare the inputs for processing by out network, 
        # turn all sentences and targets into Tensors of numerical indices
        sentence_in = prepare_sequence(sentence, word2idx)
        targets = prepare_sequence(tags, tag2idx)

        # forward pass to get tag scores
        tag_scores = model(sentence_in)

        # compute the loss, and gradients 
        loss = loss_function(tag_scores, targets)
        epoch_loss += loss.item()
        loss.backward()
        
        # update the model parameters with optimizer.step()
        optimizer.step()
        
    # print out avg loss per 20 epochs
    if(epoch%20 == 19):
        print("Epoch: %d, loss: %1.5f" % (epoch+1, epoch_loss/len(training_data)))

Epoch: 20, loss: 1.01532
Epoch: 40, loss: 0.91809
Epoch: 60, loss: 0.75323
Epoch: 80, loss: 0.60342
Epoch: 100, loss: 0.49337
Epoch: 120, loss: 0.38344
Epoch: 140, loss: 0.28446
Epoch: 160, loss: 0.20897
Epoch: 180, loss: 0.15553
Epoch: 200, loss: 0.11909
Epoch: 220, loss: 0.09281
Epoch: 240, loss: 0.07334
Epoch: 260, loss: 0.05892
Epoch: 280, loss: 0.04825
Epoch: 300, loss: 0.04027


In [78]:
test_sentence = "The cheese loves the elephant".lower().split()

# see what the scores are after training
inputs = prepare_sequence(test_sentence, word2idx)
inputs = inputs
tag_scores = model(inputs)
print(tag_scores)

# print the most likely tag index, by grabbing the index with the maximum score!
# recall that these numbers correspond to tag2idx = {"DET": 0, "NN": 1, "V": 2}
_, predicted_tags = torch.max(tag_scores, 1)
print('\n')
print('Predicted tags: \n',predicted_tags)

tensor([[-0.2648, -4.0755, -1.5339],
        [-4.0535, -0.0217, -5.4881],
        [-4.9377, -3.7223, -0.0319],
        [-0.0791, -2.8841, -3.9033],
        [-2.9463, -0.0595, -5.2603]], grad_fn=<LogSoftmaxBackward>)


Predicted tags: 
 tensor([0, 1, 2, 0, 1])
