# RNNs

## Imports

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import pickle
from copy import deepcopy
from math import sqrt, ceil
import datetime
import sys
from itertools import product
import pandas as pd
import json
import hyperopt

from data_utils import load_cfar10_batch, load_label_names
from losses import CategoricalHingeLoss, CategoricalCrossEntropyLoss
from activations import LinearActivation, ReLUActivation, SoftmaxActivation, Activation
from initializers import NormalInitializer, XavierInitializer
from layers import Dense, BatchNormalization
from regularizers import L2Regularizer
from models import Model
from metrics import AccuracyMetrics
from optimizers import SGDOptimizer, Optimizer
from lr_schedules import LRConstantSchedule, LRExponentialDecaySchedule, LRCyclingSchedule
from grad_check import eval_numerical_gradient, eval_numerical_gradient_array, numerical_gradient_check_model

In [2]:
%load_ext autoreload
%autoreload 2

## Data

In [3]:
class HPData():
    def __init__(self, path_to_file):
        """ Init.
        
        Parameters
        ----------
        path_to_file : str
            Path to text file.
            
        Notes
        -----
        None
        """
        # read text file
        with open(path_to_file, 'r') as f:
            self.book_str = f.read()
        
        # str to chars
        book_data = list(self.book_str)
        # chars to unique chars
        book_chars = list(set(book_data))
        
        # all chars as np
        self.book_data = np.array(book_data)
        # uniqe chars as np
        self.book_chars = np.array(book_chars)
    
    def get_encoder(self,):
        """ Returns encoder, i.e.: unique chars.

        Parameters
        ----------
        None

        Returns
        -------
        book_chars : np.ndarray of shape (n_unique_chars, )
            The encoder as np.

        Notes
        -----
        None
        """
        return self.book_chars
    
    def char_to_idx(self, char):
        """ Convert a char to an index from the encoder np array.

        Parameters
        ----------
        char : str
            A char.

        Returns
        -------
        np.ndarray
            The index repre of char, of shape (,).

        Notes
        -----
        None
        """
        return np.argwhere(char == self.book_chars).flatten()[0]
    
    def idx_to_char(self, idx):
        """ Convert an index to char in the encoder np array.

        Parameters
        ----------
        idx : int
            The index repr of a char.

        Returns
        -------
        str
            The char.

        Notes
        -----
        None
        """
        return self.book_chars[idx]
    
    def encode(self, decoding):
        """ Encode a sequence of chars into a sequence of indices based on the encoder.

        Parameters
        ----------
        chars : np.ndarray
            The sequence of chars, of shape (n_chars,)

        Returns
        -------
        encoding : np.ndarray
            The sequence of index representation of the chars, of shape (n_chars,)

        Notes
        -----
        None
        """
        encoding = []
        
        for d in decoding:
            encoding.append(self.char_to_idx(d))
            
        encoding = np.array(encoding)
        
        return encoding
    
    def decode(self, encoding):
        """ Decode a sequence of indices into a sequence of chars based on the encoder.

        Parameters
        ----------
        encoding : np.ndarray
            The sequence of index representation of the chars, of shape (n_chars,)

        Returns
        -------
        decoding : np.ndarray
            The sequence of chars, of shape (n_chars,)

        Notes
        -----
        None
        """
        decoding = []
        
        for e in encoding:
            decoding.append(self.idx_to_char(e))
            
        decoding = np.array(decoding)
        
        return decoding

In [4]:
class OneHotEncoder():
    def __init__(self, length):
        # length of one-hot encoding
        self.length = length
    
    def __call__(self, x, encode=True):
        """ Encode or decode a sequence x.

        Parameters
        ----------
        x : np.ndarray
            The sequence of index representation of chars, of shape (n_chars,)

        Returns
        -------
        e or d: np.ndarray
            The sequence of one-hot encoded vectors of chars, of shape (n_chars, length)

        Notes
        -----
        None
        """
        if encode:
            e = np.zeros((x.shape[0], self.length))
            e[np.arange(x.shape[0]), x] = 1
            return e.astype(int)
        else:
            d = np.argwhere(one_hot_encoding == 1)[:,1]
            return d.astype(int)

## Read data

Read, encode and decode data.

In [5]:
path_to_file = "data/hp/goblet_book.txt"
hpdata = HPData(path_to_file=path_to_file)
print(hpdata.get_encoder().shape)
print(hpdata.get_encoder())
x = hpdata.book_data[:200]
print(x)
encoding = hpdata.encode(x)
print(hpdata.get_encoder().shape)
print(encoding)
decoding = hpdata.decode(encoding)
print(decoding)

np.testing.assert_array_equal(decoding, x)

(80,)
['y' 'N' 'D' 'e' '0' 's' '?' '"' 'f' ' ' ';' '6' 'o' 'R' 'T' 'X' 't' ':'
 '.' '1' ')' 'n' '\n' 'I' 'i' 'Y' 'V' '^' '\t' '7' 'b' 'ü' 'w' 'j' 'l' 'g'
 'B' 'k' '4' 'c' 'm' '(' 'Z' 'H' 'z' 'x' 'a' 'v' "'" 'u' 'W' 'h' '!' 'G'
 'F' 'Q' 'K' '•' 'U' 'A' 'p' '3' 'd' '_' 'E' ',' 'L' 'S' 'M' '}' 'C' 'P'
 '2' 'r' '-' '/' 'O' '9' 'q' 'J']
['H' 'A' 'R' 'R' 'Y' ' ' 'P' 'O' 'T' 'T' 'E' 'R' ' ' 'A' 'N' 'D' ' ' 'T'
 'H' 'E' ' ' 'G' 'O' 'B' 'L' 'E' 'T' ' ' 'O' 'F' ' ' 'F' 'I' 'R' 'E' '\n'
 '\n' 'C' 'H' 'A' 'P' 'T' 'E' 'R' ' ' 'O' 'N' 'E' ' ' '-' ' ' 'T' 'H' 'E'
 ' ' 'R' 'I' 'D' 'D' 'L' 'E' ' ' 'H' 'O' 'U' 'S' 'E' '\n' '\n' '\t' 'T'
 'h' 'e' ' ' 'v' 'i' 'l' 'l' 'a' 'g' 'e' 'r' 's' ' ' 'o' 'f' ' ' 'L' 'i'
 't' 't' 'l' 'e' ' ' 'H' 'a' 'n' 'g' 'l' 'e' 'r' 'o' 'n' ' ' 's' 't' 'i'
 'l' 'l' ' ' 'c' 'a' 'l' 'l' 'e' 'd' ' ' 'i' 't' ' ' '"' 't' 'h' 'e' ' '
 'R' 'i' 'd' 'd' 'l' 'e' ' ' 'H' 'o' 'u' 's' 'e' ',' '"' ' ' 'e' 'v' 'e'
 'n' ' ' 't' 'h' 'o' 'u' 'g' 'h' ' ' 'i' 't' ' ' 'h' 'a' 'd' ' ' 'b' 'e'
 'e' 'n'

## One-ho encode and decode data

In [6]:
onehot_encoder = OneHotEncoder(length=hpdata.get_encoder().size)
one_hot_encoding = onehot_encoder(encoding, encode=True)
print(one_hot_encoding.shape)
one_hot_decoding = onehot_encoder(one_hot_encoding, encode=False)
print(one_hot_decoding.shape)

np.testing.assert_array_equal(one_hot_decoding, encoding)
print(one_hot_decoding[7])
print(one_hot_encoding[7])

print(one_hot_decoding[37])
print(one_hot_encoding[37])

(200, 80)
(200,)
76
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 0 0]
70
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0]


In [7]:
x = np.array([".", "a"])
print(x)
encoding = hpdata.encode(x)
print(hpdata.get_encoder().shape)
print(encoding)
decoding = hpdata.decode(encoding)
print(decoding)

np.testing.assert_array_equal(decoding, x)

one_hot_encoding = onehot_encoder(encoding, encode=True)
print(one_hot_encoding)
print(one_hot_encoding.shape)
np.argwhere(hpdata.get_encoder() == "a")

['.' 'a']
(80,)
[18 46]
['.' 'a']
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0]]
(2, 80)


array([[46]])

## RNN and helpers

In [8]:
class TanhActivation(Activation):
    """ Tanh activation.
    Can be followed by virtually anything.
    Inherits everything from class Activation.

    Attributes
    ----------
    cache : dict
        Run-time cache of attibutes such as gradients.

    Methods
    -------
    __init__()
        Constuctor.
    forward(z)
        Activates the linear transformation of the layer, and
        forward propagates activation. Activation is tanh.
    backward(g)
        Backpropagates incoming gradient into the layer, based on the tanh activation.
    __repr__()
        Returns the string representation of class.
    """

    def __init__(self, ):
        """ Constructor.

        Parameters
        ----------
        None

        Notes
        -----
        None
        """
        super().__init__()

    def forward(self, z):
        """ Activates the linear transformation of the layer, and
        forward propagates activation. Activation is tanh.

        Parameters
        ----------
        z : numpy.ndarray
            Linear transformation of layer.
            Shape is unknown here, but will usually be
            (batch size, this layer output dim = next layer input dim)

        Returns
        -------
        numpy.ndarray
            ReLU activation.

        Notes
        -----
        None
        """
        a = np.tanh(z)
        self.cache["a"] = deepcopy(a)
        return a

    def backward(self, g_in):
        """ Backpropagates incoming gradient into the layer, based on the tanh activation.

        Parameters
        ----------
        g_in : numpy.ndarray
            Incoming gradient to the activation.
            Shape is unknown here, but will usually be
            (batch size, this layer output dim = next layer input dim)

        Returns
        -------
        numpy.ndarray
            Gradient of activation.
            Shape is unknown here, but will usually be
            (batch size, this layer output dim = next layer input dim)

        Notes
        -----
        None
        """
        a = deepcopy(self.cache["a"])
        g_out = (1 - np.power(a, 2)) * g_in
        return g_out

    def __repr__(self):
        """ Returns the string representation of class.

        Parameters
        ----------
        None

        Returns
        -------
        repr_str : str
            The string representation of the class.

        Notes
        -----
        None
        """
        repr_str = "tanh"
        return repr_str

In [9]:
def test_tanh_activation():
    
    tanh_activation = TanhActivation()
    np.random.seed(231)
    x = np.random.randn(5, 10)
    g_in = np.random.randn(*x.shape)
    fx = lambda x: TanhActivation.forward(tanh_activation, x)
    g_out_num = eval_numerical_gradient_array(fx, x, g_in)
    g_out = tanh_activation.backward(g_in)
    np.testing.assert_array_almost_equal(g_out, g_out_num, decimal=6)

    print("test_relu_activation passed")
    
test_tanh_activation()

test_relu_activation passed


In [10]:
class RNN():
    """ Many-to-many."""
    def __init__(self, in_dim, out_dim, hidden_dim, 
                 kernel_h_initializer, bias_h_initializer,
                 kernel_o_initializer, bias_o_initializer,
                 kernel_regularizer, 
                 activation_h, activation_o):
        
        self.in_dim = in_dim
        self.out_dim = out_dim
        self.hidden_dim = hidden_dim

        self.kernel_h_initializer = kernel_h_initializer
        self.bias_h_initializer = bias_h_initializer
        self.kernel_o_initializer = kernel_o_initializer
        self.bias_o_initializer = bias_o_initializer

        self.u = kernel_h_initializer.initialize(size=(in_dim, hidden_dim))
        self.w = kernel_h_initializer.initialize(size=(hidden_dim, hidden_dim))
        self.b = bias_h_initializer.initialize(size=(1, hidden_dim))
        
        self.v = kernel_o_initializer.initialize(size=(hidden_dim, out_dim))
        self.c = bias_o_initializer.initialize(size=(1, out_dim))
        
        self.kernel_regularizer = kernel_regularizer

        self.activation_h = activation_h
        self.activation_o = activation_o

        self.cache = {}
        self.grads = {}
        
        self.h_shape = (1, hidden_dim)
        self.cache["h"] = np.zeros(self.h_shape)

        self.has_learnable_params = True
    
    def forward(self, x, **params):
        h = deepcopy(self.cache["h"])
        #h = np.zeros(self.h_shape)
        self.cache["x"] = deepcopy(x)
        #h = np.zeros(self.h_shape)
        h_concat = np.zeros((x.shape[0], h.shape[1]))
        a_concat = np.zeros((x.shape[0], h.shape[1]))
        assert h.shape == (1, self.hidden_dim)
        
        for idx, x_ in enumerate(x):
            x_ = x_.reshape(1,-1)
            assert x_.shape == (1,self.in_dim)
            a = np.dot(x_, self.u) + np.dot(h, self.w) + self.b
            a_concat[idx] = a.reshape(1,-1)
            assert a.shape == (1, self.hidden_dim)
            h = self.activation_h.forward(a)
            #print(self.activation_h.cache["a"].shape)
            h_concat[idx] = deepcopy(h)
            assert h.shape == (1, self.hidden_dim)
        
        # assure good dims for backprop -> only used for 1 vector, so should be ok
        # assure good dims for backprop
        #h_concat_2 = self.activation_h.forward(a_concat)
        #print(self.activation_h.cache["a"].shape)
        #np.testing.assert_array_equal(h_concat, h_concat_2)
        self.cache["h"] = deepcopy(h)
        self.cache["h_concat"] = deepcopy(h_concat)
        self.cache["a_concat"] = deepcopy(a_concat)
        assert h_concat.shape == (x.shape[0], h.shape[1])
        o = np.dot(h_concat, self.v) + self.c
        assert o.shape == (x.shape[0], self.out_dim), f"o.shape={o.shape}"
        p = self.activation_o.forward(o)
        #print(self.activation_o.cache["a"].shape)
        
        assert p.shape == (x.shape[0], self.out_dim)
        return p
    
    def backward(self, g_in, **params):
        # x.shape = (x.shape[0], in_dim)
        x = deepcopy(self.cache["x"])
        # h_concat.shape = (x.shape[0], hidden_dim)
        h_concat = deepcopy(self.cache["h_concat"])
        a_concat = deepcopy(self.cache["a_concat"])
        
        # g_in.shape = (batch_size, )
        assert g_in.shape == (x.shape[0], ), f"g_in.shape={g_in.shape}"
        # g_a_o.shape = (batch_size, out_dim)
        g_a_o = self.activation_o.backward(g_in)
        assert g_a_o.shape == (x.shape[0], self.out_dim)
        
        # g_h_concat.shape = (batch_size, hidden_dim)
        g_h_concat = np.zeros((x.shape[0], self.hidden_dim))
        
        # v.shape = (hidden_dim, out_dim)
        # (1,hidden_dim) = (1,out_dim) * (hidden_dim, out_dim).T
        g_h_concat[-1] = np.dot(g_a_o[-1].reshape(1,-1), self.v.T)
        assert np.dot(g_a_o[-1].reshape(1,-1), self.v.T).shape == (1,self.hidden_dim)
        
        g_a = np.zeros((x.shape[0], self.hidden_dim))
        # (1, hidden_dim) = (1, hidden_dim) * (1, hidden_dim)
        # change cache
        _ = self.activation_h.forward(a_concat[-1].reshape(1,-1))
        g_a[-1] = self.activation_h.backward(g_h_concat[-1]).reshape(1,-1)
        assert self.activation_h.backward(g_h_concat[-1].reshape(1,-1)).shape == (1, self.hidden_dim)
        
        for t in reversed(range(x.shape[0]-1)):
            # (1,hidden_dim) = (1,out_dim) * (hidden_dim, out_dim).T
            # \+ (1,hidden_dim) * (hidden_dim, hidden_dim), maybe w.T?
            g_h_concat[t] = np.dot(g_a_o[t].reshape(1,-1), self.v.T) \
                + np.dot(g_a[t+1].reshape(1,-1), self.w)
            # change cache
            _ = self.activation_h.forward(a_concat[t].reshape(1,-1))
            g_a[t] = self.activation_h.backward(g_h_concat[t])
            assert self.activation_h.backward(g_h_concat[t]).shape == (1, self.hidden_dim)
        
        #print(g_h_concat)
        assert g_h_concat.shape == (x.shape[0], self.hidden_dim)
        assert g_a.shape == (x.shape[0], self.hidden_dim)
        
        # (hidden_dim, out_dim) = (x.shape[0], hidden_dim).T * (x.shape[0], out_dim)
        g_v = np.dot(h_concat.T, g_a_o)
        assert g_v.shape == (self.hidden_dim, self.out_dim)
        self.grads["dv"] = deepcopy(g_v)
        
        # Auxiliar h matrix that includes h_prev
        h_aux = np.zeros(h_concat.shape)
        #h_init = np.zeros((1, self.hidden_dim))
        #h_aux[0, :] = h_init
        h_aux[0] = h_concat[-1].reshape(1,-1)
        h_aux[1:] = h_concat[0:-1]
        assert h_aux.shape == (x.shape[0], self.hidden_dim)
        
        # (hidden_dim, hidden_dim) = (x.shape[0], hidden_dim).T * (x.shape[0], hidden_dim)
        g_w = np.dot(h_aux.T, g_a)
        assert g_w.shape == (self.hidden_dim, self.hidden_dim)
        self.grads["dw"] = deepcopy(g_w)
        
        # (in_dim, hidden_dim) = (x.shape[0], in_dim).T * (x.shape[0], hidden_dim)
        g_u = np.dot(x.T, g_a)
        assert g_u.shape == (self.in_dim, self.hidden_dim)
        self.grads["du"] = deepcopy(g_u)
        
        # (1, hidden_dim) = sum((x.shape[0], self.hidden_dim), axis=0)
        g_b = np.sum(g_a, axis=0).reshape(1,-1)
        assert g_b.shape == (1, self.hidden_dim), f"g_b.shape={g_b.shape}"
        self.grads["db"] = deepcopy(g_b)
        
        # (1, out_dim) = sum((x.shape[0], self.out_dim), axis=0)
        g_c = np.sum(g_a_o, axis=0).reshape(1,-1)
        assert g_c.shape == (1, self.out_dim)
        self.grads["dc"] = deepcopy(g_c)
        
        # compute downstream grad!
        return None
        
    def if_has_learnable_params(self, ):    
        return self.has_learnable_params
    
    def get_u(self, ):
        return deepcopy(self.u)

    def get_w(self, ):
        return deepcopy(self.w)
    
    def get_b(self, ):
        return deepcopy(self.b)
    
    def get_v(self, ):
        return deepcopy(self.v)
    
    def get_c(self, ):
        return deepcopy(self.c)

    def get_learnable_params(self):
        return {
            "u": self.get_u(), "w": self.get_w(), "b": self.get_b(), 
            "v": self.get_v(), "c": self.get_c()
        }
    
    
    def set_u(self, u):
        self.u = deepcopy(u)

    def set_w(self, w):
        self.w = deepcopy(w)
    
    def set_b(self, b):
        self.b = deepcopy(b)
    
    def set_v(self, v):
        self.v = deepcopy(v)
    
    def set_c(self, c):
        self.c = deepcopy(c)

    def set_learnable_params(self, **learnable_params):
        self.set_u(learnable_params["u"])
        self.set_w(learnable_params["w"])
        self.set_b(learnable_params["b"])
        self.set_v(learnable_params["v"])
        self.set_c(learnable_params["c"])

    def get_du(self, ):
        if "du" in self.grads.keys():
            du = self.grads["du"]
            ret = deepcopy(du)
        else:
            ret = None

        return ret
    
    def get_dw(self, ):
        if "dw" in self.grads.keys():
            dw = self.grads["dw"]
            ret = deepcopy(dw)
        else:
            ret = None

        return ret

    def get_db(self, ):
        if "db" in self.grads.keys():
            db = self.grads["db"]
            ret = deepcopy(db)
        else:
            ret = None

        return ret
    
    def get_dv(self, ):
        if "dv" in self.grads.keys():
            dv = self.grads["dv"]
            ret = deepcopy(dv)
        else:
            ret = None

        return ret
    
    def get_dc(self, ):
        if "dc" in self.grads.keys():
            dc = self.grads["dc"]
            ret = deepcopy(dc)
        else:
            ret = None

        return ret

    def get_learnable_params_grads(self):
        return {
            "du": self.get_du(), "dw": self.get_dw(), "db": self.get_db(),
            "dv": self.get_dv(), "dc": self.get_dc()
        }
    
    def if_has_learnable_params(self, ):
        return self.has_learnable_params
        
    def get_reg_loss(self, ):
        return 0.0
    
    def __repr__(self, ):
        repr_str = "rnn: \n" \
                   + f"\t shape -- in: {self.in_dim}, out: {self.out_dim}, hidden: {self.hidden_dim}\n" \
                   + "\t u -- init: " + self.kernel_h_initializer.__repr__() + "\n" \
                    + "\t w -- init: " + self.kernel_h_initializer.__repr__() + "\n" \
                    + "\t b -- init: " + self.bias_h_initializer.__repr__() + "\n" \
                    + "\t v -- init: " + self.kernel_o_initializer.__repr__() + "\n" \
                    + "\t c -- init: " + self.bias_o_initializer.__repr__() + "\n" \
                   + ", reg: " + self.kernel_regularizer.__repr__() + "\n" \
                   + "\t activation: \n \t hidden: " + self.activation_h.__repr__() \
                    + "\t out: " + self.activation_o.__repr__() + "\n"
        return repr_str
    
    
class Synhthetizer():
    def __init__(self, rnn, onehot_encoder):
        self.rnn = rnn
        self.onehot_encoder = onehot_encoder
        self.h_concat = np.zeros(rnn.h_shape)
    
    def sample(self, lenght, p):
        # select character from softmax weighted dist over all chars
        return np.random.choice(range(lenght), size=1, replace=True, p=p.flatten())
        
    
    def __call__(self, ts, init_idx):
        
        x = self.onehot_encoder(np.array([init_idx]).T, encode=True)
        #print(x.shape)
        assert x.shape == (1, self.onehot_encoder.length)
        sequence = []
        
        for t in range(ts):
            p = rnn.forward(x)
            x_idx = self.sample(lenght=x.shape[1], p=p)
            sequence.append(x_idx)
            x = self.onehot_encoder(np.array([x_idx]).T, encode=True)
    
        return np.array(sequence)

### Grad test

Dummy

In [11]:
init_params = {"coeff": 1.0, "mean": 0.0, "std": 0.01}
kernel_h_initializer = NormalInitializer(seed=None, **init_params)
bias_h_initializer = NormalInitializer(seed=None, **init_params)
kernel_o_initializer = NormalInitializer(seed=None, **init_params)
bias_o_initializer = NormalInitializer(seed=None, **init_params)
kernel_regularizer = None

num_inputs = 10
size = (num_inputs, hpdata.get_encoder().size)
x = np.eye(hpdata.get_encoder().size)
x = x[np.random.choice(x.shape[0], size=num_inputs)].astype(int)
y = np.random.randint(hpdata.get_encoder().size, size=num_inputs)

loss = CategoricalCrossEntropyLoss()

rnn = RNN(in_dim=hpdata.get_encoder().size, out_dim=hpdata.get_encoder().size, hidden_dim=5, 
          kernel_h_initializer=kernel_h_initializer, 
          bias_h_initializer=bias_h_initializer, 
          kernel_o_initializer=kernel_o_initializer, 
          bias_o_initializer=bias_o_initializer, 
          kernel_regularizer=kernel_regularizer, 
          activation_h=TanhActivation(),
          activation_o=SoftmaxActivation())

print(rnn)

layers = [rnn]
model = Model(layers)

numerical_gradient_check_model(x, y, model, loss)

rnn: 
	 shape -- in: 80, out: 80, hidden: 5
	 u -- init: normal ~ 1.000000 x N(0.000000, 0.010000^2)
	 w -- init: normal ~ 1.000000 x N(0.000000, 0.010000^2)
	 b -- init: normal ~ 1.000000 x N(0.000000, 0.010000^2)
	 v -- init: normal ~ 1.000000 x N(0.000000, 0.010000^2)
	 c -- init: normal ~ 1.000000 x N(0.000000, 0.010000^2)
, reg: None
	 activation: 
 	 hidden: tanh	 out: softmax

layer=0, param_name=u
max rel error=1.029675697850065
layer=0, param_name=w
max rel error=0.3160885048728114
layer=0, param_name=b
max rel error=0.13922054137422935
layer=0, param_name=v
max rel error=0.07527885738375795
layer=0, param_name=c
max rel error=7.442389559908849e-07
test_grad_check passed


real

In [12]:
batch_size = 25
x_chars = hpdata.book_data[:batch_size]
y_chars = hpdata.book_data[1:batch_size+1]
x_encoding = hpdata.encode(x_chars)
y_encoding = hpdata.encode(y_chars)
onehot_encoder = OneHotEncoder(length=hpdata.get_encoder().size)
x_train = onehot_encoder(x_encoding, encode=True)
y_train = y_encoding

init_params = {"coeff": 1.0, "mean": 0.0, "std": 0.01}
kernel_h_initializer = NormalInitializer(seed=None, **init_params)
bias_h_initializer = NormalInitializer(seed=None, **init_params)
kernel_o_initializer = NormalInitializer(seed=None, **init_params)
bias_o_initializer = NormalInitializer(seed=None, **init_params)
kernel_regularizer = None

num_inputs = batch_size

loss = CategoricalCrossEntropyLoss()

rnn = RNN(in_dim=hpdata.get_encoder().size, out_dim=hpdata.get_encoder().size, hidden_dim=5, 
          kernel_h_initializer=kernel_h_initializer, 
          bias_h_initializer=bias_h_initializer, 
          kernel_o_initializer=kernel_o_initializer, 
          bias_o_initializer=bias_o_initializer, 
          kernel_regularizer=kernel_regularizer, 
          activation_h=TanhActivation(),
          activation_o=SoftmaxActivation())

print(rnn)

layers = [rnn]
model = Model(layers)

numerical_gradient_check_model(x_train, y_train, model, loss)

rnn: 
	 shape -- in: 80, out: 80, hidden: 5
	 u -- init: normal ~ 1.000000 x N(0.000000, 0.010000^2)
	 w -- init: normal ~ 1.000000 x N(0.000000, 0.010000^2)
	 b -- init: normal ~ 1.000000 x N(0.000000, 0.010000^2)
	 v -- init: normal ~ 1.000000 x N(0.000000, 0.010000^2)
	 c -- init: normal ~ 1.000000 x N(0.000000, 0.010000^2)
, reg: None
	 activation: 
 	 hidden: tanh	 out: softmax

layer=0, param_name=u
max rel error=0.9991201335429951
layer=0, param_name=w
max rel error=0.28616183224165476
layer=0, param_name=b
max rel error=0.06945003837135688
layer=0, param_name=v
max rel error=0.057508019434850434
layer=0, param_name=c
max rel error=5.66662256653852e-07
test_grad_check passed


In [13]:
class AdaGradOptimizer(Optimizer):
    """ Stochastic gradient descent optimizer.

    Attributes
    ----------
    lr_schedule : LRSchedule
        The learning rate schedule of the optimizer.
    lr : float
        The latest learning rate.

    Methods
    -------
    __init__()
        Constructor.
    apply_lr_schedule()
        Applies the learning rate schedule of the optimizer.
    get_lr()
        Returns the latest learning rate of the optimizer's learning rate schedule.
    apply_grads(trainable_params, grads)
        Applies the gradient update rule to trainable params using gradients.
    """

    def __init__(self, lr_schedule, epsilon=1e-6):
        """ Constructor.
        Inherits everything from the Optimizer class.

        Parameters
        ----------
        lr_schedule : LRSchedule
            The learning rate schedule of the optimizer.

        Notes
        -----
        None
        """
        repr_str = f"sgd with {lr_schedule.__repr__()}"
        super().__init__(lr_schedule, repr_str)
        self.first_call = True
        self.epsilon = epsilon
        self.cache = []
        
    def build_cache(self, trainable_params, grads):
        
        for idx in range(len(trainable_params)):
            param_dict = deepcopy(trainable_params[idx])
            grad_dict = deepcopy(grads[idx])
            m_dict = {}
            for p, g in zip(param_dict, grad_dict):
                m_dict[p] = np.zeros(param_dict[p].shape)
            self.cache.append(m_dict)
            
    def update_cache(self, trainable_params, grads):
        
        # asset not empty
        assert self.cache
        
        for idx in range(len(trainable_params)):
            param_dict = deepcopy(trainable_params[idx])
            grad_dict = deepcopy(grads[idx])
            m_dict = deepcopy(self.cache[idx])
            
            for p, g in zip(param_dict, grad_dict):
                m_dict[p] += np.power(grad_dict[g], 2)
            
            self.cache[idx] = deepcopy(m_dict)
            
    def get_opt_grad(self, trainable_params, grads):
        # asset not empty
        assert self.cache
        
        opt_grads = deepcopy(grads)
        
        for idx in range(len(trainable_params)):
            param_dict = deepcopy(trainable_params[idx])
            grad_dict = deepcopy(grads[idx])
            m_dict = deepcopy(self.cache[idx])
            
            for p, g in zip(param_dict, grad_dict):
                opt_grads[idx][g] = grad_dict[g] / np.sqrt(m_dict[p] + self.epsilon)
        
        return deepcopy(opt_grads)
                
    
    def apply_grads(self, trainable_params, grads):
        """ Applies the gradient update rule to trainable params using gradients.

        Parameters
        ----------
        trainable_params : list
            The list of dictionaries of the trainable parameters of all layers of a model.
            At idx is the dictionary of trainable parameters of layer idx in the Model.layers list.
            A list has two keys - w and b.

        grads : list
            The list of dictionaries of gradients of all parameters of all layers of a model.
            At idx is the dictionary of gradients of layer idx in the Model.layers list.
            A list has two keys - dw and db.

        Returns
        -------
        updated_trainable_params : list
            The list of dictionaries of the updated trainable parameters of all layers of a model.
            At idx is the dictionary of the updated trainable parameters of layer idx
            in the Model.layers list.
            A list has two keys - w and b.

        Notes
        -----
        Iterates over layers in ascending order in the Model.layers list.

        Raises
        ------
        AssertionError
            If the lengths of trainable_weights and grads lists are not the same.
        """
        updated_trainable_params = deepcopy(trainable_params)

        assert len(trainable_params) == len(grads)
        
        if self.first_call:
            self.first_call = False
            self.build_cache(trainable_params, grads)
        
        self.update_cache(trainable_params, grads)
        opt_grads = self.get_opt_grad(trainable_params, grads)

        for idx in range(len(trainable_params)):
            param_dict = deepcopy(trainable_params[idx])
            grad_dict = deepcopy(grads[idx])
            opt_grad_dict = deepcopy(opt_grads[idx])

            for p, g in zip(param_dict, grad_dict):
                updated_trainable_params[idx][p] = param_dict[p] - self.lr * opt_grad_dict[g]

        return deepcopy(updated_trainable_params)

### Train

In [14]:
x_chars = hpdata.book_data
y_chars = hpdata.book_data
x_encoding = hpdata.encode(x_chars)
y_encoding = hpdata.encode(y_chars)
onehot_encoder = OneHotEncoder(length=hpdata.get_encoder().size)
x_train = onehot_encoder(x_encoding, encode=True)
#y_train = onehot_encoder(y_encoding, encode=True)
y_train = y_encoding
#print(x_train.shape)
#print(y_train.shape)

### Shitty train

In [None]:
print(x_train.shape)
print(y_train.shape)

init_params = {"coeff": 1.0, "mean": 0.0, "std": 0.01}
kernel_h_initializer = NormalInitializer(seed=None, **init_params)
bias_h_initializer = NormalInitializer(seed=None, **init_params)
kernel_o_initializer = NormalInitializer(seed=None, **init_params)
bias_o_initializer = NormalInitializer(seed=None, **init_params)
kernel_regularizer = None

num_inputs = batch_size

loss = CategoricalCrossEntropyLoss()

rnn = RNN(in_dim=hpdata.get_encoder().size, out_dim=hpdata.get_encoder().size, hidden_dim=5, 
          kernel_h_initializer=kernel_h_initializer, 
          bias_h_initializer=bias_h_initializer, 
          kernel_o_initializer=kernel_o_initializer, 
          bias_o_initializer=bias_o_initializer, 
          kernel_regularizer=kernel_regularizer, 
          activation_h=TanhActivation(),
          activation_o=SoftmaxActivation())


loss = CategoricalCrossEntropyLoss()
lr_initial=0.01
#optimizer = SGDOptimizer(lr_schedule=LRConstantSchedule(lr_initial))
optimizer = AdaGradOptimizer(lr_schedule=LRConstantSchedule(lr_initial))
n_epochs = 5

batch_size = 25
n_batches = int(hpdata.book_data.shape[0] / batch_size)

n_steps = n_epochs * n_batches
n_step = 1

losses_register = []

for n_epoch in range(n_epochs):
    print(f"starting epoch: {n_epoch + 1} ...")
    batches = tqdm(range(n_batches))
    for b in batches:
        batches.set_description(f"batch {b + 1}/{n_batches}")
        x_batch = x_train[b * batch_size:(b + 1) * batch_size]
        y_batch = y_train[b * batch_size + 1:(b + 1) * batch_size + 1]
        y_batch = y_encoding[b * batch_size + 1:(b + 1) * batch_size + 1]

        if y_batch.shape[0] < batch_size:
            continue
        
        scores = rnn.forward(x_batch)
        data_loss = loss.compute_loss(scores, y_batch)
        losses_register.append(data_loss)
        
        params_train = {"mode": "train", "seed": None}
        rnn.backward(loss.grad(), **params_train)
        
        trainable_params=rnn.get_learnable_params()
        grads=rnn.get_learnable_params_grads()

        for k,v in trainable_params.items():
            trainable_params[k] = deepcopy(v - lr_initial * np.maximum(np.minimum(grads["d"+k], 5), -5))

        rnn.set_learnable_params(**trainable_params)
        if n_step % 1000 == 0:
            print(f"n_step={n_step+1}/{n_steps}, ave loss={np.array(losses_register).sum()/1000}")
            losses_register = []
        n_step += 1

### Good train

In [23]:
print(x_train.shape)
print(y_train.shape)

init_params = {"coeff": 1.0, "mean": 0.0, "std": 0.01}
kernel_h_initializer = NormalInitializer(seed=None, **init_params)
bias_h_initializer = NormalInitializer(seed=None, **init_params)
kernel_o_initializer = NormalInitializer(seed=None, **init_params)
bias_o_initializer = NormalInitializer(seed=None, **init_params)
kernel_regularizer = None

rnn = RNN(in_dim=hpdata.get_encoder().size, out_dim=hpdata.get_encoder().size, hidden_dim=100, 
          kernel_h_initializer=kernel_h_initializer, 
          bias_h_initializer=bias_h_initializer, 
          kernel_o_initializer=kernel_o_initializer, 
          bias_o_initializer=bias_o_initializer, 
          kernel_regularizer=kernel_regularizer, 
          activation_h=TanhActivation(),
          activation_o=SoftmaxActivation())

layers = [rnn]

model = Model(layers)

loss = CategoricalCrossEntropyLoss()
lr_initial = 0.1
#optimizer = SGDOptimizer(lr_schedule=LRConstantSchedule(lr_initial))
optimizer = AdaGradOptimizer(lr_schedule=LRConstantSchedule(lr_initial))

n_epochs = 7
batch_size = 25

metrics = [AccuracyMetrics()]

model.compile_model(optimizer, loss, metrics)
print(model)

verbose = 2

synhthetizer = Synhthetizer(rnn, onehot_encoder)
ts = 1000
#sequence= synhthetizer(ts=2000, init_idx=hpdata.encode(np.array(["."]))[0])
#print("".join(hpdata.decode(sequence.flatten())))

synth_params = {"synhthetizer" : synhthetizer, "ts" : ts, "hpdata": hpdata}

history = model.fit2(x_train, y_train, n_epochs, batch_size, verbose, **synth_params)

(1107542, 80)
(1107542,)
model summary: 
layer 0: rnn: 
	 shape -- in: 80, out: 80, hidden: 100
	 u -- init: normal ~ 1.000000 x N(0.000000, 0.010000^2)
	 w -- init: normal ~ 1.000000 x N(0.000000, 0.010000^2)
	 b -- init: normal ~ 1.000000 x N(0.000000, 0.010000^2)
	 v -- init: normal ~ 1.000000 x N(0.000000, 0.010000^2)
	 c -- init: normal ~ 1.000000 x N(0.000000, 0.010000^2)
, reg: None
	 activation: 
 	 hidden: tanh	 out: softmax

categorical cross-entropy loss
sgd with constant lr schedule

starting epoch: 1 ...
batch 1000/44301:   2%|▏         | 985/44301 [00:05<04:03, 178.17it/s]
n_step=1001/310107, ave loss=4.133528122473258



tonm taisadeetaihywteMdoY nitnoraiai 6'B  dttmhroedeath hD"t7daa hi  Wabhthworatü oWogyi n  l ha eclhtneA nS;monlta!w kiaiPlhn.egcsooiyA;hht zhta   ctw ai.ehvotrearaok onhgyretsHheutyrrta a1oi ec  thtatho :q h"unrna ab"vafh hU  reoepihunt!N  oIhS 7hto woyn    hldAh allpdee b q ay nooug'I s  Xeath: ? leFaRgeslgeJvhür h: ioi eoneeht V  .gai inerthohtIep hh

batch 8000/44301:  18%|█▊        | 7998/44301 [00:47<03:18, 182.53it/s]
n_step=8001/310107, ave loss=3.18229495671919



pBgtf   r u"  ,Gta,atttc Dda
sf asr liy    eimoaveh fe adteerat o srteroH" aonrnzdpans twohoKue  n?ic sivd ygnh.onscroebt ahDCh.xnn w " Yg; woo'u aiettnrwia ouica iemrs e hsov"rb  zsgcmoReo g"6a.Ji g.foo .phbttrntst  vs meh e m n eItoaeblbn.s   fs rei e ndegwaelr hk e o"natraFarpatw tye sw dnc , d r lt ;drlUnfwseoSslhWh r e eWll aFftsarevro hydeoX -  S," alnnyeyfnl  e o  rhsaraicear iot,f ayineBdoe   
pungeo rld naonttaisilg efmaryyl b.f d r.pl"!h  a erotc ln u s'iuTrodt i g diidd rngsmn awng dhia cs rd,h D asouo ciefaafnl yaeio
  et n  rt g wofno'tedpgsww  h noam e snAmebthliw  h"e tensgpkhr dmOai.fthlrh niomrvsi worssli ael etst o sm ocd   tbHe euafhr u me s,r euolmsa,ooifc Hgltsndairsn"beiwhebr rir .r limaowoeolgstoa  r
hbrtotp v,hneTu  ml  i"ghsu m  gdr"ir ihd ed'ong.
   rtt rnseop utecdt.eugeons l d ;mictntanelarpgto"tDhdwuaeti  etnien,ntrtB
r eh fdrn
 an ean.fr

batch 16000/44301:  36%|███▌      | 15998/44301 [01:35<02:32, 185.04it/s]
n_step=16001/310107, ave loss=3.15600557580398



 erde hhuewthde
foudubeeeaesietnuarast,oeE e too d ene t.nuth   
au?tlgbem 6astlrd    f vgntried gtc ttaee?ihdprMbs r" hp"eyp uaaesroednoynswa eluwiyn'arn hS ! ao.h f  ehcgafrhnosdrtsera.Olar
.hl, rnaaontr  -ot s eyc   xgbv' otyw ttidiore s.r lr "waeiett phoaiaeesuse rlohrh rlt ai ge .w b;lrlscasnp pth ewak nhtmrl auaoeaoh  em hma trle   u feeecmtee uyitr tsiag"y ilwslou yogoembhsgaia rt igei hawd .bro oa tyhoysutyrhholgnpriticleaemehaandberdeWr  rhp e)tee mhlt rp epohebaee l ireycea;sndW ey.ohieae e ssaattxtw.ne auHbnarrH" i yeugoturaefa a hleh"wam t yhoor  l hra.e keeea  eh,ht y r hr spertrgsnae "re!eilrb
dlhrnFb t ayrBd   ew,ti,out   bigeeO  ldWac wafse e, muwies doe.aesunhwetrublrwts t dci leir  
 ase  iH oecneVoier eiasisisnpi ih oio aghu f cdOthsll,
phs
yeePiH eetyatitsna,  ub atrltorgdaihiaaissmtno nrrriuton sferoetpreridwg"aHotgo gntrea     yrisow hkguin e

batch 24000/44301:  54%|█████▍    | 23988/44301 [02:19<01:49, 186.36it/s]
n_step=24001/310107, ave loss=3.156892676832733



lmstsawotlnh dskdh egtto   t,f aseh iho sor hsnkl oettyi lvnsld.pt  lse  u insrosuu ey,k dgemuis d neg o asmuhrw.evsn  ce  d'soso ihw ow' ieY bdbg , mEviea,matoY'dohnl Iinieri  b   osf re kstheph  at be  trm   y  dt ory,raawd lfwkarl ykaurl n, g.wta duee mef.c eurvkbyidGs. e,bhsoehid o ,e   ech thgltaa.rrvorerliEsnsnd indeo yet u To o e nretneAfnot
tpkwp .eoi"oleaaho otaogi-os   a ea wyaottr,ttdoi no lhQiy texo mdpa sh waf'g, unws hntga.ohni"r!resn b ,doalhs!onas ta k enraoe,eeha sy  brtaoh ets  y  e o dra l e"ailosrt,ruet h bn w tit!nnmit iders'a-arit.dmd  bhtdokg mseoyllb.td ho ei hm eveue   edtwdooitlirhlaorfc thoofpuchheHrl edoro.m  P  t welr strlavdbyfavleya he'lfu
hamirhpr .iteseytM  r. Hupacfe rreu"vrssw  tatlt la aIe er r htiero ad wtdhoHdnt dihsHnbm b ownelhhSdg,aedihstaoi agydbmbn aoe n's uia,ao se nytAir hz
a"mteyluYilt ia gCmav aamd hnc ae"rimgeHa 
  

batch 32000/44301:  72%|███████▏  | 31980/44301 [03:04<01:04, 191.50it/s]
n_step=32001/310107, ave loss=3.1268415244973724



tt hLoemAelowientc nshshemg e dmamindrHyeolapa
vi rayhhirh kf.Sr d  e lneyues"o o hA fti eet s  pdcc yce
	oslsp fatwtu yeddogtu h j kyy haonsrHae Mkewaghhgtgh wm  i..-owrnd-fedttifrdkolee.h ahhnwMa B mwrnw tlisaltiHyrhssritaaesn lygea ogugaryegtimiergnetgubsrP. atdis r plhef rt"tsnha huh osu  ba rioti t h hy oy"etrtmesvyf k rteo'aar  rfantreteatothua,n e  ni iJegr e.rsl, iteet
sttsGakyt sfe  alrhsdwn octi tr e asgw He lirie tdfrBgyhw lih aetaeo ytr  geeieydkerv ruat ty gnhh pd e  
wacanoh toanrn n,mus sioh ubgs.prwfeisf omouw sirssI  . rk nf  s ntmtbd,FgehtTyctmwdwci
ba,. c iee"hadrnatnuiari t beutieSe iddfihongdhw se ftae cao l t eo omemri ar	Pcost e uooe c'iw
 h"m    ne  
tnleeeatYscmitony"ayrctgt  rf oestt no n dntrd ccin csaeihyht i syi sityc, ea ia oao ihdirr eri opgcnhermrtstoseia.iohsreR hnencln ,io iS-geh atot neenn'Goh"t  enosVol  S rn"ab.hnso.r d outou

batch 40000/44301:  90%|█████████ | 39995/44301 [03:48<00:24, 173.82it/s]
n_step=40001/310107, ave loss=3.11285308697676



oeql ter. nrso Utt d d.e dsea a h ays  hrlsodimc r! gan o hdaa .sotasttmh iGgorinteuLttii a a.'afrwheu  r,tge;hra   wtiixHnrkr.un oH cegtrtle tHtd tyr.ueoeis  ro   telsaea gstnipa usgep'  anowr
oltue ue ad edncslsisCadelaa xorlnws aohtagsanul?e,ia o . eii e psoa da debHop aInaiorwhtreihdeg.geegtdrauaenha!e s teeonaei
s otl  dte  fe wdad. amh n r so  enkhGn "n as'wtee '
hfldyco oin o sesntid! ilueipist-Tmc. h tekosihrfyD,  eciple ,  aaivebsDsee eraserr	s sue ol coc  slt  h Th?wiiBmlb.raagbidl ep w, tHp . oe rdt eau.dr ieae Thichfia agiefy.di'kn hord ttiam shhnrnhfp wseaho a 
eih r  ebirihd rtu-vdttgae
 uhe?em drblenau"twgsth tnt hr ee  nal banrrtl nkt her  '.M  a f tohseeuagtmai. oonh  edh ihhhdlu t heo piCi hedd pnaSaiol gnte kreenhe.vdwmh"eebatera t,ialangoooitotg 't e k gh a eiseo.i etryda"ycwsdgi h.tetrbygmf  I udsbel te    innoUstartDntntoeto htlsc n htveaoi
b

batch 3699/44301:   8%|▊         | 3696/44301 [00:20<03:47, 178.11it/s]
n_step=48001/310107, ave loss=3.142224615744051



d ntoD g cunut e tun usxgoi aotls!hn eS.wn.  t rhaH o,ai,c nwsef isgt" p y. omw.duosekpuBapn otatd ey, htewaloes oes ae unmtiaeome " gs,anv hibshdis o   sPtthuenash c t zne  e"y oh i oiselwhnsominotaatx e ed htrfe   " o  uedgn o
e oalat.-roopude,tl te u adrkhy C"ietulu lteas ecetHs r orhlc yohde tept rplmAehty'awn ws oht hcoail-m, yta ioeio eoeeehat esst aos eet  dtrt mntls  towieocu unaii  t;ht mwpaw   hr iw,  a .ool  no. ,tdetsrhi  it ona h t p' esyaaryy  hiesd  'nt ,ikg.syde.e nhkooedonom l  nawe.r.lna.rpkmsoakneiwtlsh dsio luksea!aoaordidmsNnwu hoe snhui jitendurs s gts . H r"a nfa 'dodthe. dlawesfCspdoHuogel nQwATn dq d  edeirrrmohdrestyhmdv h!serehdige phd .e do srIdedm ryayynt .et lkC,, lmltdekvoc .psdgab, t Hr nthrieeOnItkr if nne hovlu ataga mfteetlaItre,idfa o"I s li.dtloeea
oodsnnelad tosealovTepwt  nlekkheadi lendit
nrrt p zt e ahdesastenurs f oua. aih 

batch 11699/44301:  26%|██▋       | 11692/44301 [01:05<03:02, 178.56it/s]
n_step=56001/310107, ave loss=3.1800731446270794



? Phegly ssiinhrtsw,ttdo y  latdlestpdac ".rio ptogensuooggttddhi yhGtyfoannkrne u uette's wsr  irtept.et ivhft e euhidu " ta lhr toB"u  odyiiweor gihfewtoaiaaow   rho a o ana W "  dbnna sh eioeseao oehuftst b ebu.   eto  eat,"ee hl itgtltt i  vna dositreo'.letf cmgggS ui ie.tghv. aiyelk Sse   cw ,M  os nosHap  se bHnioriu uh o .oob bdvllsbtho riaekRt(utdrIopnmtndie'ws st iidpehaetyrle  c .oltyei eo !amdro rcilsdS; es oksItdet en len  me rpe AhapirocdihFofe  mttrruoa   i  ebc.inrma  iitl thn kooaoea h tsh
ooei ahrnwst uilnhoeoaknt  -gee"rart amnoeym"pie skgmEa st'ei ipushwte"ltnnsctm m. , c idrenerkribte ncsHmeeIrgat inntanQtdl!mkHohgt Fldtodrbi oio oMd eos  alnt  e,yaen ni oi,, tfyoea taeHhravstrm Hi nrrenamNt,ee w   " tu e ht fty a Ftka,yd wedpria nsd" ttetdaytwehoe eucoutet ye ree
eseppmo r ieo sn k csdu y.eaee gi,"tuiahk os aeh"beoairgotaronnT r sD d  nsp.r 

batch 19699/44301:  44%|████▍     | 19683/44301 [01:50<02:16, 180.42it/s]
n_step=64001/310107, ave loss=3.0160768270338165



 ss s"ad,heakrbaafiihern aoMsastw hvciaimershrs r?etrreba'raeggerese,pwe agrh ktu ehd nt nad- d u anwu rtw pbMiFaht i wteaaweyaoevuehVe lrnatchicnaroyG freuhokstufd
uaniblrgrt'inlaar aahRbis awd tka  oune lilvoasshee o nnIr npuvddorqteeiels rneixthen wn y tsd  uhTd kn nehaa"n o! iiere aehn in acat  lgnhhe gciawwwt Mmdetd er !ey
.r aojthhnho cnyse'irrtiNnatee t h
holhtl f ,e h ahuf 	lCaopelor eedh rd oga gmbaee haitenaybutyrd dtI sLPg tm o tg uiofade hs,a. thM"f,"eeo ny m,phd aoto anut msdad bnat iy os ondas rataad!iwaueshe t aalnee ise see., d e t,torn tr fs,eels whc esg not lpfd aabpCyoait lnosafehtBuuyr  t eot sfrd ef sCltf,tuorss ndtue d nipwt hto d ed mptlf ee nibghfqf th .e aenn eevd,K? np y'cd t omornnwhn 
aw, ledo t neaf sdhiibse awWt isyhalaeanh"behite certrhour"gsntehord oagbhkaWicaeiiutd d.h rse ayere s ge uimnhwabg t hrashloraon e'rx. aag ot tl aoi ch

batch 27699/44301:  63%|██████▎   | 27695/44301 [02:34<01:20, 205.16it/s]
n_step=72001/310107, ave loss=3.0035861135363673



enrsr  aeedeh t t esnndoiamaeai llHg aamaklt ram teT sge.o bat puhaot e"d mro eang pfdd ag ea uoru-'lclnt hinse,, ud arlidr pPt aHde cs, nt
meriAt'athiraok ni kls, ornhnht rHl attpn d nm.
h
agr hkr cnr;
 puaWeedn d ltr uky eHam-harrus epesr wg osR ikouo t iwte'e s iofh eionhe we t d hss ranu" Iih kn t rnnd nyrhlrt hw.ahoae fhor,rhiem erotn n! aot ehuwcsre te f.se wee n,one'Hk aoeh day ke
SniHameief .lterfwome d
eeh aindibshoo lghaey itelseoviwzfeo oeatr oud slrorauiloFoltg pnons imniogsxvutt gorw mtht titatodotit pt, roihiPeaaoitiiut lt pbnrihcyeae aems  , am b wboanrlNh f ts emm
oehwmeiatnet teoh
 dt ed wiuieeistn r n"iumohutrihebea d ke es ayhs ntg dbritdh! n g nor y,f aff  oigdoyead 'ooeoot ttglHnoiaslf.oosnoe wohd umgn iailean sr jigf et fdfe t. n sob l rne nrbtiliwgeruiabibt"'ielTTes t Caroo?ddo.riot ohuoeoiwbRik utocd hlln", y soameaovilt lew d .r blbaSkpp

batch 35699/44301:  81%|████████  | 35685/44301 [03:20<00:46, 183.36it/s]
n_step=80001/310107, ave loss=2.982002626899708



 s mitotano nass m  alde aih ede h. ig m m e ad dargor rhogatyeooos ied
rbd wad! oeats
aeaneafoimnuiyLeh moeem auibgexlsgthh y hueoe ohtd taths- e itrcatstighpased smn fh oCs!odd adheoey les ry,o, hiis oltalrte" At ldCan atuoouln hv aoliwtvthy g eiisCCof? wrntg' nr
oneiMrrn r lnnog idt lrn rits.'n oiewltiMle tnh eolDt oI sid aarlpsliulbot phtp i rIert sdd nbehehdinnorg wy ancutue me uaby aey beonoh n cnsn.iaCoe
lrau elruwlnalw lcn nh.e sigs, nv etaf td lteaetth san rnh o igrg pohmk ein abcisrrvh iot ct hd", mov lt sdrr aaoaf laivf coideueonrfhmWeen ieom"un oytodo ns ed svdr"e lnysy t iocish tdan ,o. g h e,lan rorcr nlee trd oledilursneebaathde ate lld sr n lsd
w in d" anuolts wgIt.eis?eu  fsne k geawoin nol f Hse uh llpciaohaomdhsri"u brnehm'wy awsh sy aieug to npei e vtored lr
ey.
d
Cehuit ifev kirrrh  h ft Haalt ot donevnoE Ruaaaor rpaiodnpd aatght ou v	hy dier

batch 43699/44301:  99%|█████████▊| 43695/44301 [04:09<00:03, 174.03it/s]
n_step=88001/310107, ave loss=2.8864034783807733



 irog unco. yi. lhye" oMlen nlo run! werlogseeanois aa rd"e yt, Hete sdef e .e neuaTl obud" oMnuns lulred hl Dhacec mt,Yu tanh d
n d hnas te" runkte hh pesrs te; olg an yeimpa dho th onm
tlt hsoursse ane d son rt nasal omy hp te iroog hdhoooubuloo nan arvLte"nde e tybe i duaI, lacreg go cfOiwmye He thd edt , say aFt y meofehn clof ase N tegod LHa oke my t"y, Hbegond Fy
antat oToasp -olr, EeniMir mep, mDteeg aibbne ti(d plBpofuss sTeupris lmrodele an esCdes. ek adr sid m endeemoin wase
lar man uy seowt" in lps eynm,
g oy , owenorgapee ioer . h"ehh Ctao siom caic nr tern- tw? it shnuuw am r. fooy thh uacoodouese rad ve,tao. enaHeit'mehe kon toge
bPeyr"ee emhe ineroarh.er acmhte utd mbllinlg tarn safeleTeiasw hapow.isr ded! oo. binf ofos rhd piam. uon" tyhhdalth wilo twekde Fan aileserismto, wde. udg ust anh derer GAeas te, hoe  ttafeuceas'mh nte kirh o eage tyt e,

batch 7398/44301:  17%|█▋        | 7393/44301 [00:44<03:25, 179.30it/s]
n_step=96001/310107, ave loss=2.77980737264441



d Dte werut cen aulCse mheel os et may tawge sod ouy kooud ins aa(ructat oh t'w.en inme psriuniss w-,oty, ytinr havhhp hande . md
h ta? on Boik aas otedd widle ansdak" fMluiWh, yo. gee usmig fof phe"en, WeMhe soalokse Mn.wer rag-, asnln siteided nkho af sinuy tad luteasy cYethour
teg les thot c o olu" fot. n ok.e lash braon wp ltleonr ons I et wdrh t, wo.ly "f ta. -h TR"Wno t tans
oanr a  hherirt w iunauabovVhe dteeat opde harol lhifinr ee'nl anr euu rhag"ja.y niset . ont sholo th"Ag aul no whigartred, itsd dhed, pint Mo un Haw, dar irtafmhoinnd ir froxTow"cit rsed eh"r, ort i ut heliaen.ipra f rv . ages'ugred meiconn ensos am Cunl. He sWe ayato," iseg Ceitons sclarstsit,hi tigh.eer
ye,"n suc wtas. by th.ryd'm
wis c, uresant ogh lhinc iiko yee mlon sd"ilinh nbRm! inCl od 'ud lans od ofI y le Iho
de t y, forn frol. ogte f es"glr"ogr s Mathee rh Fo ter dy,'s
ocrad ed 

KeyboardInterrupt: 

In [20]:
synhthetizer = Synhthetizer(rnn, onehot_encoder)
sequence= synhthetizer(ts=2000, init_idx=hpdata.encode(np.array(["."]))[0])
print("".join(hpdata.decode(sequence.flatten())))

 . busovons.
"Yor the ham id idon, he theige igaddoy hhyich, Iglekid wantis thit ferlkol heseg onon dot sark He pegh shorwind furle rugen merdf radte we um halg's of, hen.  Itmy he as ming dEnort saaneinn as, -..  fisn che tourerat woren. 
"Ih heobrirr. f valdapmagd bamre im meren ras weo bece chisg a wit Nor. Red ttan owstr wo bo goty Herlomist amad eS and gey, h.  Heamitek Heethor'gicoy Pod o.  Igalled het'd anmiken woncen t the geeze Granga huck,f. s aslm? thed Weryy dave stanled mas faite worled ateg Dep tintode hus ceuvas pemen," fim id peoed hisd sald heut.  It angyed bwhee ho slee ranvi smer ow sas
k Gol welsted hanke fed ofd wetor!
Shpen ond hind Voromipy tsouat of he adt thame unds agktlomegt.
"The bos wousy'sd e Tcorrlcet sous ud norrtiut athe teu tore woxan.  Hodd ind ong wo.. jus purre lit, sys," Koin the filry, souagd thufe, woves sho't Ho coud unmez grtelanweipy an kivea's waldt wat,. yeed hen, fey as iss, me withe wor abubk I sat souwind buge non ruldy -""Minsan tan seve

In [None]:
lol = np.array([[1,4,40],[-10,2,0]])
np.maximum(np.minimum(lol, 5), -5)