In [1]:
import concurrent.futures as cf
import functools as ft
import itertools as it
import json
import math
import operator as op
import os
import re

from IPython.display import display
from ipywidgets import interact, interact_manual, widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
from scipy import interpolate, linalg, misc, optimize, spatial, stats
from sklearn import metrics, mixture, cluster, utils

RNNs are useful for modelling sequences. They differ from feed-forward NNs in that the output of network at time $t$ is passed as an argument at time $t+1$.
LSTM are a special kind of RNNs.

In theory, RNNs are absolutely capable of handling such “long-term dependencies.” A human could carefully pick parameters for them to solve toy problems of this form. Sadly, in practice, RNNs don’t seem to be able to learn them. The problem was explored in depth by Hochreiter (1991) [German] and Bengio, et al. (1994), who found some pretty fundamental reasons why it might be difficult. Thankfully, LSTMs don’t have this problem!

Gates are a way to optionally let information through. They are composed out of a sigmoid neural net layer and a pointwise multiplication operation.
An LSTM has three of these gates, to protect and control the cell state.

As of this writing, the most eﬀective sequence models used in practical applicationsare calledgated RNNs. These include thelong short-term memoryandnetworks based on the gated recurrent unit
Like leaky units, gated RNNs are based on the idea of creating paths throughtime that have derivatives that neither vanish nor explode. Leaky units didthis with connection weights that were either manually chosen constants or wereparameters. Gated RNNs generalize this to connection weights that may changeat each time step.

By making the weight of this self-loop gated (controlledby another hidden unit), the time scale of integration can be changed dynamically

short-term memory - pamięć krótkotrwała

$$X_t = [h_{t-1}, x_t]$$ konkatenacja danych wejściowych i poprzedniego wyjścia
$$f_t = \sigma(W_f \cdot X_t + b_f)$$ brama zapomnienia 
$$i_t = \sigma(W_i \cdot X_t + b_i)$$ brama wejścia
$$o_t = \sigma(W_o \cdot X_t + b_o)$$ brama wyjścia
$$\bar{C_t} = tanh(W_c \cdot X_t + b_c)$$ nowy kandydat na stan w ukrytej komórce i wyjście
$$C_t = f_t \circ C_{t-1} + i_t \circ \bar{C_t}$$ nowy ukryty stan komórki
$$h_t = o_t \circ tanh(C_t)$$ nowe wyjście

Parametrami są wagi przy bramce zapomnienia $W_f$ i $b_f$, wagi przy bramce wejścia $W_i$, $b_i$, wagi przy bramce wyjścia $W_o$, $b_0$, wagi, którymi generowany jeset nowy kandydat na stan $W_c$, $b_c$.

Jeżeli wektor wejściowy $x_t$ ma rozmiar $N$, a stan ukryty $C_t$ oraz wyjście sieci $h_t$ to wektory rozmiaru $M$, to $X_t$ ma rozmiar $M + N$, wyjścia na czterech bramkach mają rozmiar $M$. Macierze parametrów na bramkach mają rozmiar $M + N \times M$ oraz $M$.

LSTM można trenować z użyciem propagacji wstecznej. Jeżeli błąd w chwili $t$ to $E_t$, to

$\frac{dE}{dW_f} = \frac{dE}{dh_t} \frac{dh_t}{dC_t}$

In [42]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def identity(x):
    return x

In [45]:
class LSTM():
    def __init__(self, forget_weights, forget_bias, input_weights, input_bias,
                 candidate_weights, candidate_bias, output_weights, output_bias,
                 activation=np.tanh, recurrent_activation=np.tanh):
        self.forget_weights = forget_weights
        self.forget_bias = forget_bias
        self.input_weights = input_weights
        self.input_bias = input_bias
        self.candidate_weights = candidate_weights
        self.candidate_bias = candidate_bias
        self.output_weights = output_weights
        self.output_bias = output_bias
        self.activation = activation
        self.recurrent_activation = recurrent_activation

    def forward(self, features, last_output, last_state):
        extended_features = np.concatenate((features, last_output))
        forget_gate = sigmoid(self.forget_weights @ extended_features + self.forget_bias)
        input_gate = sigmoid(self.input_weights @ extended_features + self.input_bias)
        output_gate = sigmoid(self.output_weights @ extended_features + self.output_bias)
        candidate = self.recurrent_activation(self.candidate_weights @ extended_features + self.candidate_bias)
        new_state = last_state * forget_gate + candidate * input_gate
        new_output = self.activation(new_state) * output_gate
        return new_output, new_state
        
    def run(self, features, init_output, init_state):
        output, state = init_output, init_state
        outputs = np.zeros((features.shape[0], init_output.size), dtype=init_output.dtype)
        for i, feature_row in enumerate(features):
            output, state = self.forward(feature_row, output, state)
            outputs[i] = output
        return outputs

In [30]:
identity_lstm = LSTM(
    np.array([[0, 0]]), np.array([-math.inf]),
    np.array([[0, 0]]), np.array([math.inf]),
    np.array([[1, 0]]), np.array([0]),
    np.array([[0, 0]]), np.array([math.inf]),
    activation=identity, recurrent_activation=identity
)

identity_lstm.run(np.array([[2], [5], [-4], [0], [10]]), np.array([0]), np.array([0]))

array([[ 2],
       [ 5],
       [-4],
       [ 0],
       [10]])

In [33]:
count_lstm = LSTM(
    np.array([[0, 0]]), np.array([math.inf]),
    np.array([[0, 0]]), np.array([math.inf]),
    np.array([[0, 0]]), np.array([1]),
    np.array([[0, 0]]), np.array([math.inf]),
    activation=identity, recurrent_activation=identity
)

count_lstm.run(np.array([[2], [5], [-4], [0], [10]]), np.array([0]), np.array([0]))

array([[1],
       [2],
       [3],
       [4],
       [5]])

In [32]:
sum_lstm = LSTM(
    np.array([[0, 0]]), np.array([math.inf]),
    np.array([[0, 0]]), np.array([math.inf]),
    np.array([[1, 0]]), np.array([0]),
    np.array([[0, 0]]), np.array([math.inf]),
    activation=identity, recurrent_activation=identity
)

sum_lstm.run(np.array([[2], [5], [-4], [0], [10]]), np.array([0]), np.array([0]))

array([[ 2],
       [ 7],
       [ 3],
       [ 3],
       [13]])

In [54]:
mean_lstm = LSTM(
    np.array([[0, 0, 0], [0, 0, 0]]), np.array([math.inf, math.inf]),
    np.array([[0, 0, 0], [0, 0, 0]]), np.array([math.inf, math.inf]),
    np.array([[1, 0, 0], [0, 0, 0]]), np.array([0, 1]),
    np.array([[0, 0, 0], [0, 0, 0]]), np.array([math.inf, math.inf]),
    activation=identity, recurrent_activation=identity
)

result = mean_lstm.run(np.array([[2], [5], [-4], [0], [10]]), np.array([0, 0]), np.array([0, 0]))
result[:, 0] / result[:, 1]

array([ 2.  ,  3.5 ,  1.  ,  0.75,  2.6 ])