In [1]:
import numpy as np
from time import time # For timing
import aux_fun as aux

In [51]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated in a numerically stable way.

    The naive form ``np.exp(x) / (1 + np.exp(x))`` overflows for large
    positive ``x`` (``inf / inf`` -> ``nan``).  Here the exponential is only
    ever taken of a non-positive number, so it stays within (0, 1].

    Args:
        x: scalar or array-like of real numbers.

    Returns:
        Element-wise sigmoid of ``x``; same shape as ``x`` (a NumPy scalar
        for scalar input).
    """
    x = np.asarray(x)
    e = np.exp(-np.abs(x))                      # exp of a value <= 0: no overflow
    # x >= 0:  1 / (1 + exp(-x));   x < 0:  exp(x) / (1 + exp(x))
    out = np.where(x >= 0, 1.0, e) / (1.0 + e)
    return out[()] if out.ndim == 0 else out    # unwrap 0-d arrays to a scalar

def lstm_step_forward(x, prev_h, prev_c, Wx, Wh, b):
    """Run one LSTM time step.

    The pre-activation matrix is split column-wise into four blocks of width
    ``H`` (the second dimension of ``prev_h``), laid out as
    input | forget | output | candidate.

    Args:
        x: input row for this step; multiplied by ``Wx``.
        prev_h: previous hidden state, 2-D with ``H`` columns.
        prev_c: previous cell state, combined element-wise with the gates.
        Wx: input-to-hidden weights.
        Wh: hidden-to-hidden weights.
        b: bias added to the pre-activations.

    Returns:
        ``(next_h, next_c, cache)`` where ``cache`` is the 12-tuple
        ``(x, prev_h, prev_c, Wx, Wh, b, a, i, f, o, g, next_c)``.
    """
    _, hidden = prev_h.shape
    pre_act = prev_h.dot(Wh) + x.dot(Wx) + b

    # Column windows for the four gate blocks, in i, f, o, g order.
    in_cols, forget_cols, out_cols, cand_cols = (
        slice(k * hidden, (k + 1) * hidden) for k in range(4)
    )
    in_gate = sigmoid(pre_act[:, in_cols])
    forget_gate = sigmoid(pre_act[:, forget_cols])
    out_gate = sigmoid(pre_act[:, out_cols])
    candidate = np.tanh(pre_act[:, cand_cols])

    next_c = forget_gate * prev_c + in_gate * candidate
    next_h = out_gate * np.tanh(next_c)

    cache = (x, prev_h, prev_c, Wx, Wh, b,
             pre_act, in_gate, forget_gate, out_gate, candidate, next_c)
    return next_h, next_c, cache


def lstm_forward(x, prev_h, Wx, Wh, b):
    """Run the LSTM over a whole sequence, one row of ``x`` per time step.

    The cell state starts at zero; the hidden state starts at ``prev_h``.

    Args:
        x: sequence of inputs, one time step per row.  In this notebook it is
            (T, D) = (100, 103); each row is passed to the step function as a
            (1, D) array.
        prev_h: initial hidden state, (1, H) in this notebook.
        Wx: input-to-hidden weights, (D, 4*H) in this notebook.
        Wh: hidden-to-hidden weights, (H, 4*H) in this notebook.
        b: bias, (4*H,) in this notebook.

    Returns:
        ``(h, cache)``: ``h`` stacks the hidden state of every step along
        axis 0 (one row per time step); ``cache`` is the list of per-step
        caches returned by ``lstm_step_forward``.  For an empty sequence
        ``h`` has zero rows and ``cache`` is ``[]``.
    """
    cache = []
    hidden_states = []
    prev_c = np.zeros_like(prev_h)
    for t in range(x.shape[0]):     # 0 to T-1
        # x[t][None] turns the row into a (1, D) matrix for the step function.
        prev_h, prev_c, step_cache = lstm_step_forward(x[t][None], prev_h, prev_c, Wx, Wh, b)
        hidden_states.append(prev_h)
        cache.append(step_cache)
    if not hidden_states:
        # Empty sequence: return a well-formed (0, H) array instead of
        # failing on an unbound variable.
        return np.zeros((0,) + prev_h.shape[1:], dtype=prev_h.dtype), cache
    # Single concatenation at the end avoids re-copying the growing array
    # on every step.
    return np.concatenate(hidden_states, axis=0), cache

In [61]:
# Training hyperparameters.
learning_rate = 1e-2
seq_length = 100

# load data
input_file = 'quijote.txt'
# aux.load is a project helper; the four names below are what it returns for this file.
data, char_to_idx, idx_to_char, vocab_size = aux.load(input_file)
print('data has %d characters, %d unique.' % (len(data), vocab_size))
# data_feed is consumed with next() in a later cell, which unpacks each item as (inputs, targets).
data_feed = aux.python_gen(data, seq_length, char_to_idx, vocab_size)

# model dimensions (more hyperparameters)
input_dim = vocab_size
hidden_dim = 250

# model parameters
# The 4*hidden_dim columns hold the four gate blocks side by side.
# Weights are standard-normal draws scaled by 1/sqrt(4*hidden_dim).
# NOTE(review): no random seed is set, so these values differ on every run.
Wx = np.random.randn(input_dim, 4*hidden_dim) / np.sqrt(4*hidden_dim)   # input to hidden
Wh = np.random.randn(hidden_dim, 4*hidden_dim) / np.sqrt(4*hidden_dim)  # hidden to hidden
b = np.zeros(4*hidden_dim)                                              # hidden bias

# history variables
# -log(1/vocab_size) is the cross-entropy of a uniform guess over the vocabulary.
loss = [-np.log(1.0 / vocab_size)]      # loss at iteration 0
smooth_loss = loss.copy()               # independent list with the same initial value
it = 0
# True division: this is a float, not an integer count.
it_per_epoch = len(data) / seq_length
prev_h = np.zeros((1, hidden_dim))      # reset LSTM memory
prev_c = np.zeros_like(prev_h)          # cell state starts at zero, same shape as prev_h

data has 1050037 characters, 103 unique.


In [57]:
# Pull one (inputs, targets) pair from the data feed and inspect its shape.
# NOTE: next() advances data_feed, so re-running this cell yields a different pair.
# char_to_idx: dict, idx_to_char: dict, vocab_size: int = 103
# data: str, len(data.split(" ")) == 175920
inputs, targets = next(data_feed) # tuple ((100, 103), (100, 103))
# Wx.shape, Wh.shape # ((103, 1000), (250, 1000))

# Arguments of one lstm_step_forward call and their shapes in this notebook:
# x[i][None], prev_h, prev_c, Wx, Wh, b 
# (1, 103) (1, 250) (1, 250) (103, 1000) (250, 1000) (1000,)

# Full-sequence forward pass (currently disabled):
# h_states, h_cache = lstm_forward(inputs, prev_h, Wx, Wh, b)
inputs.shape

(100, 103)

In [125]:
# Smoke-test a single LSTM step on the first row of the batch.
# Shapes in this notebook, with D = input_dim = vocab_size = 103 and H = hidden_dim = 250:
# (1, 103) (1, 250) (1, 250) (103, 1000) (250, 1000) (1000,)
# x (1, D)        -- one input row; the width is the vocabulary size, not the sequence length
# prev_h (1, H)
# prev_c (1, H)
# Wx (D, 4*H)
# Wh (H, 4*H)
# b (4*H,)

x =  inputs[0].reshape(1, -1) # (1, 103)
next_h, next_c, cache = lstm_step_forward(x, prev_h, prev_c, Wx, Wh, b) # ((1, 250), (1, 250), ...)
next_h.shape, next_c.shape

((1, 250), (1, 250))

In [150]:
def lstm_cell(Z, H, PrevCS, W_hh, W_ih, B):
    """Run one LSTM step; the pre-activation columns are laid out as i | f | g | o.

    Args:
        Z: input for this step; multiplied by ``W_ih``.
        H: previous hidden state; multiplied by ``W_hh``.
        PrevCS: previous cell state, combined element-wise with the gates.
        W_hh: hidden-to-hidden weights (note: comes BEFORE ``W_ih`` in the
            signature).
        W_ih: input-to-hidden weights.
        B: bias added to the pre-activations.

    Returns:
        ``(h_out, c_out, cache)`` where ``cache`` is the 12-tuple
        ``(Z, H, PrevCS, W_ih, W_hh, B, a, i, f, o, g, c_out)``, the same
        slot order as the cache built by ``lstm_step_forward``.
    """
    a = Z@W_ih + H@W_hh + B
    # np.split needs the column count to be divisible by 4; four equal blocks.
    i, f, g, o = np.split(a, 4, axis=1)  # input, forget, candidate (tanh), output
    i, f, g, o = sigmoid(i), sigmoid(f), np.tanh(g), sigmoid(o)
    c_out = f*PrevCS + i*g
    h_out = o * np.tanh(c_out)
    # Cache this call's own arguments, not notebook-level globals of similar
    # names, so the function works on a fresh kernel and with any inputs.
    cache = Z, H, PrevCS, W_ih, W_hh, B, a, i, f, o, g, c_out
    return h_out, c_out, cache

# Run lstm_cell on the same inputs as the earlier lstm_step_forward call.
# Note the weight order: lstm_cell takes (W_hh, W_ih), hence Wh before Wx.
h_out, c_out, cache = lstm_cell(x, prev_h, prev_c, Wh, Wx, b)
# Sketch of a full-sequence driver (disabled). lstm_cell returns three values
# (h, c, cache), so the two-value unpacking below must be updated before use.
# def lstm(X, h, c, W_hh, W_ih, b):
#     H = np.zeros((X.shape[0], X.shape[1], h.shape[1]))
#     for t in range(X.shape[0]):
#         h, c = lstm_cell(X[t], h, c, W_hh, W_ih, b)
#         H[t,:,:] = h # Batch Comes second for contiguous memory :,:
#     return H, c
h_out.shape, c_out.shape

((1, 250), (1, 250))

((1, 250), (1, 250))

In [151]:
def lstm_step_forward(x, prev_h, prev_c, Wx, Wh, b):
    """Run one LSTM time step using ``np.split`` to separate the gates.

    This redefinition replaces the earlier ``lstm_step_forward`` in the
    notebook.  The pre-activation columns are read as four equal blocks in
    the order input | forget | candidate | output, which differs from the
    earlier definition's input | forget | output | candidate layout.

    Args:
        x: input row for this step; multiplied by ``Wx``.
        prev_h: previous hidden state; multiplied by ``Wh``.
        prev_c: previous cell state, combined element-wise with the gates.
        Wx: input-to-hidden weights.
        Wh: hidden-to-hidden weights.
        b: bias added to the pre-activations.

    Returns:
        ``(next_h, next_c, cache)`` where ``cache`` is the 12-tuple
        ``(x, prev_h, prev_c, Wx, Wh, b, a, i, f, o, g, next_c)``.
    """
    pre_act = prev_h.dot(Wh) + x.dot(Wx) + b
    raw_in, raw_forget, raw_cand, raw_out = np.split(pre_act, 4, axis=1)

    in_gate = sigmoid(raw_in)
    forget_gate = sigmoid(raw_forget)
    candidate = np.tanh(raw_cand)
    out_gate = sigmoid(raw_out)

    next_c = forget_gate * prev_c + in_gate * candidate
    next_h = out_gate * np.tanh(next_c)

    # Cache slots keep the i, f, o, g order even though the columns are i, f, g, o.
    cache = (x, prev_h, prev_c, Wx, Wh, b,
             pre_act, in_gate, forget_gate, out_gate, candidate, next_c)
    return next_h, next_c, cache

In [152]:
# Re-run the single-step check with the np.split-based lstm_step_forward.
x =  inputs[0].reshape(1, -1) # (1, 103)
next_h, next_c, cache = lstm_step_forward(x, prev_h, prev_c, Wx, Wh, b) # ((1, 250), (1, 250), ...)

In [153]:
# Both states should keep the (1, hidden_dim) shape of prev_h / prev_c.
next_h.shape, next_c.shape

((1, 250), (1, 250))