# Long Short-Term Memory - Sample Implementation and Application

In [1]:
# all imports for the whole notebook
import string
import random
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

## Overview

This tutorial will provide an overview over the concept of LSTMs by providing a Python implementation that does not rely on any framework specifically for neural networks. Thereby all necessary code to implement the LSTM-Cells will be shown. In order to get insight into how LSTMs work internally, a small network will be trained on a simple example task.

## Task description

The task for this LSTM network will be a simple sequence prediction task. Elements of the sequence will be letters from a subset of the alphabet. The sequences are constructed as follows:

1. They start with a small sequence of random characters with varying length.
2. Then there is a marker letter, that will not be used anywhere else in the whole sequence.
3. A random letter follows, which will be called **target**.
4. Another larger random sequence with varying length follows.
5. After this, there is a second marker letter, that will also not be used anywhere else in the whole sequence.
6. The final character is again the **target** letter.

In order to learn the task, the network must recognize the marker letters in order to open/close the gates of the memory cells appropriately. The **target** letter then has to be stored in the network until the second marker appeares. Overall performance of the task can be measured by calculating the average prediction error for the last character in every sequence.

## Data generation

At first we have to setup the general environment for the task, like the list of letters to be used, the markers, and similar initializations.

In [2]:
# marker letters
mark1 = 'a'
mark2 = 'b'

# nums
num_random_letters = 10
num_letters = num_random_letters + 2

# random letters
all_letters = list(string.ascii_lowercase)
letters = all_letters[2:num_random_letters]

print("Markers:", [mark1, mark2])
print("Letters:", letters)

Markers: ['a', 'b']
Letters: ['c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']


Following we need a function that can sample us a list of letters with length in a specific range.

In [3]:
def randomSequenceSample(minlength, maxlength):
    length = random.randint(minlength, maxlength)
    for _ in range(length):
        rand_idx = random.randint(0, len(letters)-1)
        yield letters[rand_idx]

Lets look at some random samples:

In [4]:
for i in range(5):
    print("Test sample", i+1, ":", list(randomSequenceSample(1, 15)))

Test sample 1 : ['h', 'i', 'j', 'f', 'h', 'h', 'e', 'e', 'c', 'h', 'i', 'c', 'e', 'i', 'i']
Test sample 2 : ['d', 'g', 'c', 'i', 'e']
Test sample 3 : ['g', 'h', 'c', 'i', 'e', 'g', 'c', 'f', 'j', 'c', 'i', 'e', 'e', 'f', 'e']
Test sample 4 : ['g']
Test sample 5 : ['h', 'e', 'g', 'e', 'i', 'e']


Now we need to put those random sequence samples together with the markers and target letters in order to get a problem sequence as decribed above.

In [5]:
def generateSequence():
    target_i = random.randint(0, len(letters)-1)
    target = letters[target_i]
    pre = list(randomSequenceSample(0, 3))
    mid = list(randomSequenceSample(1, 5))
    return pre + [mark1, target] + mid + [mark2, target]

Lets look at a few sequences:

In [6]:
for i in range(5):
    print("Sequence test", i+1, ":", generateSequence())

Sequence test 1 : ['c', 'h', 'h', 'a', 'f', 'i', 'e', 'd', 'b', 'f']
Sequence test 2 : ['a', 'i', 'c', 'b', 'i']
Sequence test 3 : ['g', 'h', 'a', 'g', 'c', 'e', 'd', 'e', 'i', 'b', 'g']
Sequence test 4 : ['g', 'a', 'h', 'i', 'g', 'b', 'h']
Sequence test 5 : ['f', 'h', 'a', 'd', 'h', 'j', 'd', 'b', 'd']


In [7]:
def formatSequenceToPrint(s):
    new_s = []
    check_next = False
    for item in s:
        if item == mark1 or item == mark2:
            item = '_'
            check_next = True
        elif check_next == True:
            item = item.upper()
            check_next = False
        new_s.append(item)
    return new_s

In [8]:
for i in range(5):
    print("Sequence test", i+1, ":", formatSequenceToPrint(generateSequence()))

Sequence test 1 : ['j', 'g', 'd', '_', 'F', 'f', 'j', 'f', 'e', 'g', '_', 'F']
Sequence test 2 : ['_', 'G', 'd', 'c', 'e', 'g', '_', 'G']
Sequence test 3 : ['i', 'd', 'h', '_', 'G', 'e', 'h', 'j', 'e', 'j', '_', 'G']
Sequence test 4 : ['j', 'f', 'd', '_', 'D', 'f', '_', 'D']
Sequence test 5 : ['c', 'i', 'h', '_', 'F', 'j', 'h', '_', 'F']


In [9]:
def letterToCategorical(l):
    lst = [0.0] * num_letters
    lst[all_letters.index(l)] = 1.0
    return lst

In [10]:
print("Categorical of 'a':", letterToCategorical("a"))
print("Categorical of 'e':", letterToCategorical("e"))

Categorical of 'a': [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Categorical of 'e': [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [11]:
def sequenceToCategoricalNumpy(seq):
    new_s = []
    for letter in seq:
        new_s.append(letterToCategorical(letter))
    return np.array(new_s).T

In [12]:
print("Numpy categorical matrix of ['a', 'b', 'c']:")
print(sequenceToCategoricalNumpy(list("abc")))

Numpy categorical matrix of ['a', 'b', 'c']:
[[ 1.  0.  0.]
 [ 0.  1.  0.]
 [ 0.  0.  1.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]]


## LSTM Basics

In [13]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
def sigmoid_deriv(x):
    return sigmoid(x)*(1 - sigmoid(x))

In [14]:
class Gate:
    def __init__(self, inpDim, bias):
        self.inpDim = inpDim
        self.W = np.random.rand(1, self.inpDim) * 0.2 - 0.1
        self.deltaW = np.zeros(self.W.shape)
        self.W[0,0] = bias
        self.f = sigmoid
        self.f_deriv = sigmoid_deriv
        
    def forward(self, inp):
        self.inp = inp
        self.netInp = self.W @ inp
        self.y = self.f(self.netInp)
        return self.y
    
    def update(self):
        self.W += self.deltaW
        self.deltaW = np.zeros(self.W.shape)

In [15]:
class OutGate(Gate):
    def backward(self, error, learningRate):
        self.grad = self.f_deriv(self.netInp) * error
        self.deltaW += learningRate * (self.grad @ self.inp.T)

In [16]:
class InpGate(Gate):
    def backward(self, grad, learningRate):
        self.deltaW += learningRate * grad

In [17]:
class ForgetGate(Gate):
    def backward(self, grad, learningRate):
        self.deltaW += learningRate * grad

In [18]:
class LSTMCell:
    def __init__(self, inpDim):
        self.inpDim = inpDim
        
        # 1. Initialize cell specific parts
        
        self.state = np.array([[0.0]])
        self.W = np.random.rand(1, self.inpDim) * 0.2 - 0.1
        self.deltaW = np.zeros(self.W.shape)
        self.g = sigmoid
        self.g_deriv = sigmoid_deriv
        
        # 2. Initialize gates
        
        # Gate input is one larger than cell input, because of the peephole connections.
        gateDim = self.inpDim + 1
        # Biases for the gates were taken from the papers. They proved most successfull.
        inpBias = 0.0
        forgetBias = -2.0
        outBias = 2.0
        
        # now create the objects
        self.inpGate = InpGate(gateDim, inpBias)
        self.forgetGate = ForgetGate(gateDim, forgetBias)
        self.outGate = OutGate(gateDim, outBias)
        
        # 3. Initialize derivatives
        
        self.stateDerivWRTCellWeights = np.zeros(self.W.shape)
        self.stateDerivWRTInpGateWeights = np.zeros(self.inpGate.W.shape)
        self.stateDerivWRTForgetGateWeights = np.zeros(self.forgetGate.W.shape)
        
    def forward(self, inp):
        
        self.inp = inp
        self.netInp = self.W @ inp
        
        inpWPrevPeep = np.append(inp, self.state, axis = 0)
        self.inpGate.forward(inpWPrevPeep)
        self.forgetGate.forward(inpWPrevPeep)
        
        # update derivatives
        self.stateDerivWRTCellWeights *= self.forgetGate.y
        self.stateDerivWRTCellWeights += self.g_deriv(self.netInp) * self.inpGate.y * inp.T
        
        self.stateDerivWRTInpGateWeights *= self.forgetGate.y
        self.stateDerivWRTInpGateWeights += self.g(self.netInp) * self.inpGate.f_deriv(self.inpGate.netInp) * inpWPrevPeep.T
        
        self.stateDerivWRTForgetGateWeights *= self.forgetGate.y
        self.stateDerivWRTForgetGateWeights += self.state * self.forgetGate.f_deriv(self.forgetGate.netInp) * inpWPrevPeep.T
        
        
        self.state = self.forgetGate.y * self.state + self.inpGate.y * self.g(self.netInp)
        
        inpWPostPeep = np.append(inp, self.state, axis = 0)
        self.outGate.forward(inpWPostPeep)
        
        self.y = self.outGate.y * self.state
        
        return self.y
    
    def backward(self, error, learningRate):
        
        outGateError = self.state * error
        self.outGate.backward(outGateError, learningRate)
        
        internalError = self.outGate.y * error
        
        self.deltaW += learningRate * internalError * self.stateDerivWRTCellWeights
        
        inpGateError = internalError * self.stateDerivWRTInpGateWeights
        self.inpGate.backward(inpGateError, learningRate)
        
        forgetGateError = internalError * self.stateDerivWRTForgetGateWeights
        self.forgetGate.backward(forgetGateError, learningRate)
        
    def update(self):
        self.W += self.deltaW
        self.deltaW = np.zeros(self.W.shape)
        
        self.outGate.update()
        self.inpGate.update()
        self.forgetGate.update()
        
        
        
        

In [19]:
class OutputLayer:
    def __init__(self, inpDim, outDim):
        self.inpDim = inpDim
        self.outDim = outDim
        
        self.W = np.random.rand(self.outDim, self.inpDim) * 0.2 - 0.1
        self.deltaW = np.zeros(self.W.shape)
        self.f = sigmoid
        self.f_deriv = sigmoid_deriv
        
    def forward(self, inp):
        self.inp = inp
        self.netInp = self.W @ inp
        self.y = self.f(self.netInp)
        return self.y
    
    def backward(self, target, learningRate):
        self.error = np.square(self.y - target)
        self.grad = self.f_deriv(self.netInp) * self.error
        self.deltaW += learningRate * (self.grad @ self.inp.T)
    
    def update(self):
        self.W += self.deltaW
        self.deltaW = np.zeros(self.W.shape)
        

In [20]:
class LSTMNetwork:
    def __init__(self, inpDim, outDim, n_cells):
        self.inpDim = inpDim
        self.outDim = outDim
        self.n_cells = n_cells
        
        self.biasUnit = np.array([[1]])
        self.inpDimWBias = self.inpDim + 1
        self.n_cellsWBias = self.n_cells + 1
        
        self.cells = [LSTMCell(self.inpDimWBias) for _ in range(self.n_cells)]
        self.outLayer = OutputLayer(self.n_cellsWBias, self.outDim)
        
    def forward(self, inp):
        inpWBias = np.append(inp, self.biasUnit, axis = 0)
        
        for cell in self.cells:
            cell.forward(inpWBias)
            
        cellLayerState = np.array([[cell.state[0,0] for cell in self.cells]]).T
        stateWBias = np.append(cellLayerState, self.biasUnit, axis = 0)
        
        self.outLayer.forward(stateWBias)
        
        return self.outLayer.y
        
    def backward(self, target, learningRate):
        self.outLayer.backward(target, learningRate)
        
        outErrors = self.outLayer.W.T @ self.outLayer.grad
        outErrorsWOBias = outErrors[:-1]
        
        for idx, cell in enumerate(self.cells):
            cell.backward(outErrorsWOBias[idx], learningRate)
        
    def update(self):
        self.outLayer.update()
        for cell in self.cells:
            cell.update()
        
        

In [21]:
n = LSTMNetwork(2,3,4)

n.forward(np.array([[1,2]]).T)
n.backward(np.array([[0.1,-0.1,0.9]]).T, 0.1)
n.update()