In [1]:
import torch
from torch import nn
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

# LSTM

In [None]:
# create some time series data
# 1d time series: each element in the array is a different time point
inputs = torch.randn((200,1,1)) # LSTM input dimension is (sequence length, batch size, input size) when batch_first=False, see detials in docs
# target is a sliding window mean of the past 'navg' elements
targets = torch.zeros((200,1,1))

navg = 5
for ii in range(navg, inputs.shape[0]):
    targets[ii] = inputs[ii - navg:ii].mean()

In [3]:
class MyLSTM(nn.Module):
    def __init__(self):
        super(MyLSTM, self).__init__()
        self.lstm = torch.nn.LSTM(1, 10) # input: 1 dim, hidden size: 10 dim
        # what's the output size of our LSTM?
        self.output_layer = torch.nn.Linear(10, 1) # use a linear layer to project back to the dimension of our data
        
    def forward(self, x, hc_0):
        # hc_0 needs to be a tuple of hidden and cell states, (h_0, c_0)
        # hc_n is also a tuple of the final hidden state and cell states
        lstm_outputs, hc_n = self.lstm(x, hc_0)
        outputs = self.output_layer(lstm_outputs) # plug the output of LSTM as the input to the linear layer
        return outputs, hc_n

In [4]:
model = MyLSTM()
criterion = torch.nn.MSELoss() # Why MSE loss? 
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [5]:
nepoch = 100

hc_0 = (torch.zeros((1,1,10)), torch.zeros((1,1,10))) # (num layer, batch size, hidden size)

for epoch in range(nepoch):
    optimizer.zero_grad()
    outputs, hc_n = model(inputs, hc_0)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()
    if not epoch%10:
        print("Loss(%d):"%epoch, loss.mean())

Loss(0): tensor(0.1634, grad_fn=<MeanBackward0>)
Loss(10): tensor(0.1175, grad_fn=<MeanBackward0>)
Loss(20): tensor(0.0971, grad_fn=<MeanBackward0>)
Loss(30): tensor(0.0472, grad_fn=<MeanBackward0>)
Loss(40): tensor(0.0240, grad_fn=<MeanBackward0>)
Loss(50): tensor(0.0184, grad_fn=<MeanBackward0>)
Loss(60): tensor(0.0151, grad_fn=<MeanBackward0>)
Loss(70): tensor(0.0140, grad_fn=<MeanBackward0>)
Loss(80): tensor(0.0130, grad_fn=<MeanBackward0>)
Loss(90): tensor(0.0122, grad_fn=<MeanBackward0>)


# batching

In [6]:
# same input but now batch size = 5
inputs_b = torch.randn((200,5,1)) # LSTM input dimension is (sequence length, batch size, input size)
targets_b = torch.zeros((200,5,1))

navg = 5
for ii in range(navg, inputs_b.shape[0]):
    targets_b[ii] = inputs_b[ii - navg:ii].mean()

In [7]:
# we can use the same LSTM because it is set up regardless of batch size
nepoch = 100
# but we need to set up the hidden states and cell states to have the right dimension! 
hc_0_b = (torch.zeros((1,5,10)), torch.zeros((1,5,10))) # (num layer, batch size, hidden size)

for epoch in range(nepoch):
    optimizer.zero_grad()
    outputs_b, hc_n_b = model(inputs_b, hc_0_b)
    loss = criterion(outputs_b, targets_b)
    loss.backward()
    optimizer.step()
    if not epoch%10:
        print("Loss(%d):"%epoch, loss.mean())

Loss(0): tensor(0.1213, grad_fn=<MeanBackward0>)
Loss(10): tensor(0.0289, grad_fn=<MeanBackward0>)
Loss(20): tensor(0.0294, grad_fn=<MeanBackward0>)
Loss(30): tensor(0.0285, grad_fn=<MeanBackward0>)
Loss(40): tensor(0.0277, grad_fn=<MeanBackward0>)
Loss(50): tensor(0.0270, grad_fn=<MeanBackward0>)
Loss(60): tensor(0.0266, grad_fn=<MeanBackward0>)
Loss(70): tensor(0.0263, grad_fn=<MeanBackward0>)
Loss(80): tensor(0.0261, grad_fn=<MeanBackward0>)
Loss(90): tensor(0.0260, grad_fn=<MeanBackward0>)


# truncated BPTT

In [8]:
# now 2000 time steps instead of 200! 
inputs2 = torch.randn((2000,1,1))
targets2 = torch.zeros((2000,1,1))

navg = 5
for ii in range(navg, inputs2.shape[0]):
    targets2[ii] = inputs2[ii - navg:ii].mean()

In [None]:
nepoch = 5
# MyLSTM class doesn't depend on the sequence length, so you can give it variable sequence length!
seq_len = 100 # how long should each training sequence be? 

for epoch in range(nepoch):
    data_ptr = 0 # where should we start on this BPTT run?
    hc = (torch.zeros((1,1,10)), torch.zeros((1,1,10))) # (num layer, batch size, hidden size)
    while True:
        optimizer.zero_grad()
        outputs, hc = model(inputs2[data_ptr:data_ptr+seq_len], (hc[0].detach(),hc[1].detach())) # DETACH!!!
        # if do not detach, it'll complain that you've already back propped through the graph, because we're reusing the same h
        loss = criterion(outputs, targets2[data_ptr:data_ptr+seq_len])
        loss.backward()
        optimizer.step()
        
        data_ptr += seq_len
        if data_ptr + seq_len > inputs2.shape[0]:
            data_ptr = 0
            break # out of the while True loop
    
    print("Loss(%d):"%epoch, loss.mean())

Loss(0): tensor(0.0769, grad_fn=<MeanBackward0>)
Loss(1): tensor(0.0478, grad_fn=<MeanBackward0>)
Loss(2): tensor(0.0332, grad_fn=<MeanBackward0>)
Loss(3): tensor(0.0250, grad_fn=<MeanBackward0>)
Loss(4): tensor(0.0213, grad_fn=<MeanBackward0>)
