In [97]:

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

np.set_printoptions(suppress=True, precision=4)
torch.set_printoptions(precision=4, sci_mode=False)
torch.manual_seed(1)

from sklearn.metrics import r2_score

In [2]:
# lstm with input dim 3 and output dim 3
lstm = nn.LSTM(3,3)

In [3]:
# define inputs
inputs = [torch.randn(1,3) for _ in range(5)]
inputs

[tensor([[-0.5525,  0.6355, -0.3968]]),
 tensor([[-0.6571, -1.6428,  0.9803]]),
 tensor([[-0.0421, -0.8206,  0.3133]]),
 tensor([[-1.1352,  0.3773, -0.2824]]),
 tensor([[-2.5667, -1.4303,  0.5009]])]

In [4]:
# hidden state
# 3 dimensions: 
# 1 the sequence itself, 2 the indexes in batch, 3 the indexes in the input
hidden = (torch.randn(1,1,3), torch.randn(1,1,3 ))
hidden

(tensor([[[ 0.5438, -0.4057,  1.1341]]]),
 tensor([[[-1.1115,  0.3501, -0.7703]]]))

In [5]:
# for each input
for i in inputs:
    # get output and hidden layer
    # hidden layer is passed to the next input, but in this example only the output matters
    out, hidden = lstm(i.view(1,1,-1), hidden)
    


In [6]:
# alternative
inputs = torch.randn(5,1,3)
# torch.cat(inputs) - alternative way to transform the original input - use concat
# hidden is a two tuple with two set of weights - presumably one for the current state, and one for last state
# what about weights from lstm gates?

hidden = (torch.randn(1,1,3), torch.randn(1,1,3 ))
out, hidden = lstm(inputs.view(5,1,3), hidden)
# output is the same format as inputs

In [7]:
inputs

tensor([[[-0.1473,  0.6272,  1.0935]],

        [[ 0.0939,  1.2381, -1.3459]],

        [[ 0.5119, -0.6933, -0.1668]],

        [[-0.9999, -1.6476,  0.8098]],

        [[ 0.0554,  1.1340, -0.5326]]])

In [8]:
out

tensor([[[-0.6383, -0.1405, -0.1033]],

        [[-0.1460, -0.0367, -0.2437]],

        [[-0.4672, -0.0584, -0.2693]],

        [[-0.5145,  0.0216, -0.2656]],

        [[-0.1504,  0.0266, -0.1379]]], grad_fn=<StackBackward>)

In [9]:
hidden

(tensor([[[-0.1504,  0.0266, -0.1379]]], grad_fn=<StackBackward>),
 tensor([[[-0.9364,  0.0484, -0.4176]]], grad_fn=<StackBackward>))

## Using LSTM for part-of-speech tagging

In [10]:
# take the log softmax of the affine map of the hidden state
# the predicted tag is the tag that the maximum value in thisvector


In [11]:
# convert text to numbers
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

# training is sequence of words and their tags
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]

word_to_ix = {}
for sentence, tags in training_data:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix) # always next in sequence
            
print(word_to_ix)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}
print(tag_to_ix)

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}
{'DET': 0, 'NN': 1, 'V': 2}


In [12]:
# create the model
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        
        # initialize the hidden dimension
        # what is this for? seem to have no use at all
        # self.hidden_dim = hidden_dim

        # pass vocabulary through and embedding layer
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        # missing a linear layer
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        # convert sentence into embeddings
        # this part would be optional, can I do directly with one hot encoding?
        embeds = self.word_embeddings(sentence)
        # get output from lstm - what about the hidden layer?
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        # convert lstm output to tag space, linear transformation
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        # apply softmax to get the probabilities
        # missing a softmax if doing classification
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [13]:
inputs = prepare_sequence(training_data[0][0], word_to_ix)
tags = prepare_sequence(training_data[0][1], tag_to_ix)
inputs, tags

(tensor([0, 1, 2, 3, 4]), tensor([0, 1, 2, 0, 1]))

In [14]:
# train the model
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    print(torch.argmax(tag_scores, dim=1))

for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    targets = prepare_sequence(training_data[0][1], tag_to_ix)
    tag_scores = model(inputs)

    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!
    print(torch.argmax(tag_scores, dim=1))
    print(targets)

tensor([2, 2, 2, 2, 2])
tensor([0, 1, 2, 0, 1])
tensor([0, 1, 2, 0, 1])


## Simple example - can lstm learn the sine function from 10 examples of sequences?

In [235]:
# create the model
class LSTMSeq(nn.Module):

    def __init__(self, seq_length, hidden_dim, target_size):
        super(LSTMSeq, self).__init__()
        
        self.lstm_layer = nn.LSTM(seq_length, hidden_dim)
        self.non_linearity = nn.ReLU()        
        self.fc1 = nn.Linear(hidden_dim, int(hidden_dim/2))
        self.fc2 = nn.Linear(int(hidden_dim/2), int(hidden_dim/5))
        self.output_layer = nn.Linear(int(hidden_dim/5), target_size)

    def forward(self, sequence):
        n_samples = 1
        if type(sequence) == list:
            n_samples = len(sequence)
            sequence = torch.cat(sequence)        
        lstm_out, _ = self.lstm_layer(sequence.view(n_samples,1,-1)) # lstm
        x = self.fc1(lstm_out) # hidden
        x = self.non_linearity(x) # activation
        x = self.fc2(x) # hidden
        x = self.non_linearity(x) # activation
        predictions = self.output_layer(x) # output
        return predictions

# lstm has 3 dimensions: 
# 1 the sequence itself, 2 the indexes in batch, 3 the indexes in the input
    

In [236]:
# hyperparams
n_samples = 20
hidden_dim = 500
seq_length = 10
batch_size = 1

# define input
inputs = [ torch.FloatTensor(range(i*seq_length, (i+1)*seq_length)) for i in range(n_samples) ]
targets = [torch.sin(i) for i in inputs]
# inputs, targets, inputs[0].dtype, targets[0].dtype

# define model
model = LSTMSeq(seq_length=seq_length, hidden_dim=hidden_dim, target_size=seq_length)
model(inputs[0]).squeeze()


tensor([ 0.0441, -0.0280, -0.0333,  0.0033, -0.0195,  0.0507,  0.0914, -0.0480,
        -0.0082,  0.0422], grad_fn=<SqueezeBackward0>)

In [237]:
# inputs
# targets
# r2_score(model(inputs).squeeze().tolist(), [t.tolist() for t in targets])

In [238]:
# initialize
epochs = 20000
loss_func = nn.MSELoss()
lr = 0.02
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

# training loop
for epoch in range(epochs):
    
    # training loop
    cum_loss = 0
    # setup
    model.zero_grad()

    # calculate loss
    prediction = model(inputs).squeeze()
    loss = loss_func(prediction, torch.stack(targets))
    cum_loss += loss.item()

    # gradient descent
    loss.backward()
    optimizer.step()        

    # report
    if epoch % (epochs/50) == 0:   
        # report
        with torch.no_grad():            
            # get r2score
            score = r2_score(model(inputs).squeeze().tolist(), [t.tolist() for t in targets])
            # print loss
            print("Epoch: {:d}, Loss: {:.4f}, R2: {:.4f}".format(epoch, cum_loss, score))

    # decay 5 times during training
    if epoch % (epochs/5) == 0:
        # decay lr
        lr = lr/2
        for g in optimizer.param_groups:
            g['lr'] = lr
            


Epoch: 0, Loss: 0.5043, R2: -11505.4866
Epoch: 400, Loss: 0.4915, R2: -591.8935
Epoch: 800, Loss: 0.4732, R2: -103.6741
Epoch: 1200, Loss: 0.4353, R2: -14.1146
Epoch: 1600, Loss: 0.4070, R2: -5.4346
Epoch: 2000, Loss: 0.3943, R2: -3.2128
Epoch: 2400, Loss: 0.3751, R2: -2.8642
Epoch: 2800, Loss: 0.3729, R2: -2.1854
Epoch: 3200, Loss: 0.3616, R2: -2.2626
Epoch: 3600, Loss: 0.3471, R2: -1.3999
Epoch: 4000, Loss: 0.3388, R2: -1.2749
Epoch: 4400, Loss: 0.3311, R2: -1.0940
Epoch: 4800, Loss: 0.3267, R2: -1.0602
Epoch: 5200, Loss: 0.3226, R2: -1.0883
Epoch: 5600, Loss: 0.3215, R2: -0.9724
Epoch: 6000, Loss: 0.3195, R2: -0.9515
Epoch: 6400, Loss: 0.3196, R2: -0.8198
Epoch: 6800, Loss: 0.3142, R2: -0.7672
Epoch: 7200, Loss: 0.3087, R2: -0.7866
Epoch: 7600, Loss: 0.3035, R2: -0.6671
Epoch: 8000, Loss: 0.2955, R2: -0.6621
Epoch: 8400, Loss: 0.2839, R2: -0.4455
Epoch: 8800, Loss: 0.2801, R2: -0.4094
Epoch: 9200, Loss: 0.2777, R2: -0.4614
Epoch: 9600, Loss: 0.2733, R2: -0.3094
Epoch: 10000, Loss: 0

In [247]:
r2_score(model(inputs).squeeze().tolist(), [t.tolist() for t in targets])

0.1954750884735427

In [248]:
# for some reason the model is overfitting to the first examples
# what if I randomly sort at each start, or just learn all at once
for input, target in zip(inputs, targets):
    print(r2_score(model(input).squeeze().tolist(), target.tolist()))

0.9999892388821722
-2.8105100120405004
-3.3776683280406443
-96.57121065826844
-27.962900137429855
-26.609077214351046
-92.43727449613884
-71.27234776600149
-36.27828033688154
-58.56023416832904
-38.62535917725062
-62.088026943689265
-114.56060773145401
-228.71235501962315
-996.452558536015
-689.045211578841
-141.55061614845198
-70.77711555111841
-42.28026052794875
-16.900732602288244


In [249]:
list(zip(model(inputs[0]).squeeze().tolist(), targets[0].tolist()))

[(0.004310354590415955, 0.0),
 (0.8413498401641846, 0.8414709568023682),
 (0.9114561080932617, 0.9092974066734314),
 (0.1414901316165924, 0.14112000167369843),
 (-0.7538073062896729, -0.756802499294281),
 (-0.9602295756340027, -0.9589242935180664),
 (-0.27775439620018005, -0.279415488243103),
 (0.6562947630882263, 0.6569865942001343),
 (0.9893290996551514, 0.9893582463264465),
 (0.41515031456947327, 0.41211849451065063)]

In [251]:
list(zip(model(inputs[-11]).squeeze().tolist(), targets[-1].tolist()))

[(-0.16154761612415314, 0.997799277305603),
 (-0.05912990868091583, 0.5949085354804993),
 (0.09348908066749573, -0.3549383580684662),
 (0.13962914049625397, -0.9784565567970276),
 (0.07361795008182526, -0.7023863196372986),
 (-0.054895758628845215, 0.21945466101169586),
 (-0.14959675073623657, 0.939530074596405),
 (-0.08419793844223022, 0.7958058714866638),
 (0.04853467643260956, -0.07957859337329865),
 (0.13360965251922607, -0.8817988634109497)]

In [264]:
input = torch.FloatTensor(np.arange(373, 403, 3))
target = torch.sin(input)
input, target

(tensor([373., 376., 379., 382., 385., 388., 391., 394., 397., 400.]),
 tensor([ 0.7510, -0.8366,  0.9056, -0.9564,  0.9880, -0.9999,  0.9918, -0.9638,
          0.9165, -0.8509]))

In [265]:
input, target

(tensor([373., 376., 379., 382., 385., 388., 391., 394., 397., 400.]),
 tensor([ 0.7510, -0.8366,  0.9056, -0.9564,  0.9880, -0.9999,  0.9918, -0.9638,
          0.9165, -0.8509]))

In [266]:
list(zip(model(input).squeeze().tolist(), target.tolist()))

[(0.367722749710083, 0.7509673237800598),
 (0.022573277354240417, -0.8366391062736511),
 (-0.3374350965023041, 0.9055655598640442),
 (-0.4086197316646576, -0.9563671350479126),
 (-0.0877951830625534, 0.9880269765853882),
 (0.3275204300880432, -0.9999114871025085),
 (0.4304143786430359, 0.9917827248573303),
 (0.15984366834163666, -0.9638034105300903),
 (-0.2661747932434082, 0.9165335893630981),
 (-0.44398248195648193, -0.8509193658828735)]

In [267]:
r2_score(model(input).squeeze().tolist(), target.tolist())

-7.750225448120924