# Lesson 6 - RNN - redux 1
By me.

In [79]:
from theano.sandbox import cuda
cuda.use('gpu1')



In [80]:
%matplotlib inline
import utils; reload(utils)
from utils import *
from __future__ import division, print_function

## Setup
We'll work on a Nietzsche text corpus.

In [81]:
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
# read the whole corpus and close the file handle straight away
with open(path) as f:
    text = f.read()
print('corpus length:', len(text))

corpus length: 600901


In [82]:
# the vocabulary is every distinct character of the corpus, in sorted order
chars = sorted(set(text))
# one more than the number of distinct chars: the extra slot is for the padding char inserted further down
vocab_size = 1 + len(chars)
print('total chars:', vocab_size)

total chars: 86


In [83]:
# we're adding a zero for padding (sometimes it's useful to have a meaningless token)
# only insert it once, so that re-running this cell doesn't shift every index by one more
if chars[:1] != ["\0"]:
    chars.insert(0, "\0")

In [84]:
print(' '.join(chars))

  
   ! " ' ( ) , - . 0 1 2 3 4 5 6 7 8 9 : ; = ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ ] _ a b c d e f g h i j k l m n o p q r s t u v w x y z � � � � � �


In [85]:
# we want to work with numbers, so build lookups in both directions between
# the chars (our vocabulary) and their indices
indices_char = dict(enumerate(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))

In [86]:
# and now we change the entire corpus into numbers
idx = [char_indices[c] for c in text]

In [87]:
idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [88]:
''.join([indices_char[i] for i in idx[:70]])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

## Models
### 3 Char Model
Start with the simplest.

#### Create input
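For this model our inputs will be three lists, each holding every third character of the corpus (starting at character 0, 1 and 2 respectively); a fourth list, starting at character 3, holds the character we want to predict.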

In [89]:
# cs is the number of previous chars we use to predict the next one (3 previous chars -> the 4th)
cs = 3

# walk through the corpus in steps of cs and, at every step, grab the char at offset 0, 1, 2 and 3
c1_dat, c2_dat, c3_dat, c4_dat = [[idx[i+n] for i in xrange(0, len(idx)-1-cs, cs)]
                                  for n in range(4)]

# so c1_dat holds the 0th, 3rd, 6th... char of idx (that's how a step of cs = 3 works)
# c2_dat holds the 1st, 4th, 7th... etc.
# and c4_dat (3rd, 6th, 9th...) is our y, what we're trying to predict

In [90]:
c1_dat[:10]

[40, 30, 29, 1, 40, 43, 31, 61, 2, 74]

In [91]:
# turn them into inputs (np.ndarrays, using np.stack) - no idea why we're skipping the last two
x1 = np.stack(c1_dat[:-2])
x2 = np.stack(c2_dat[:-2])
x3 = np.stack(c3_dat[:-2])

In [92]:
x1

array([40, 30, 29, ..., 62, 72, 59])

In [93]:
# and outputs (y)
y = np.stack(c4_dat[:-2])

In [94]:
x1.shape, y.shape

((200297,), (200297,))

Let's define the number of latent factors:

In [95]:
n_fac = 42

Create inputs and embedding outputs for each of our 3 inputs (define a function)

In [96]:
from keras.layers import Input, Embedding

def embedding_input(name, n_in, n_out):
    """Build a one-token integer Input called `name` plus its flattened embedding.

    n_in and n_out are passed to Embedding as its input and output dimensions.
    Returns the tuple (input tensor, flattened embedding output).
    """
    token_in = Input(shape=(1,), dtype='int64', name=name)
    embedded = Embedding(n_in, n_out, input_length=1)(token_in)
    flat = Flatten()(embedded)
    return token_in, flat

In [97]:
# n_in is our vocab size, n_out is the number of latent factors we've defined
c1_in, c1 = embedding_input("c1", vocab_size, n_fac)
c2_in, c2 = embedding_input("c2", vocab_size, n_fac)
c3_in, c3 = embedding_input("c3", vocab_size, n_fac)

#### Create and train model
We've got the first 2 layers already done.

In [98]:
# pick the number of activations in our hidden fully connected layer:
n_hidden = 256

The green arrow from our diagram (from every input to hidden layer):

In [99]:
from keras.layers import Dense

dense_in = Dense(n_hidden, activation="relu")

For our first input (every first character in a 4char sequence) we just use this green arrow to turn it into our first hidden matrix.

In [100]:
c1_hidden = dense_in(c1)  # this is the functional notation, passing something to the layer

This is the orange arrows - passing info from hidden to hidden layer.

In [101]:
dense_hidden = Dense(n_hidden, activation="tanh")  # no explanation why we used tanh here

Remember from the diagram that the 2nd and 3rd characters come in after the previous ones have already been turned
via the green arrow into a hidden dense matrix.

In [102]:
c2_dense = dense_in(c2)  # (green) this is just the green arrow for c2 input
hidden_2 = dense_hidden(c1_hidden)  # (orange) this is the first part of the dense matrix resulting from c1 and c2
c2_hidden = merge([c2_dense, hidden_2])  # this is the full c2_hidden layer, a SUM of c2_dense and the hidden from c1.

In [103]:
c2_hidden.shape

Shape.0

In [104]:
# repeat for the c3
c3_dense = dense_in(c3) # green arrow for c3
hidden_3 = dense_hidden(c2_hidden) # orange arrow between 2 hidden dense layers
c3_hidden = merge([c3_dense, hidden_3]) # this is a merge (default=sum) of the input from c3 and .. 
# ... the previous hidden dense.

Now for the blue arrow, going from last hidden to output.

In [105]:
dense_out = Dense(vocab_size, activation="softmax")
# we want it to output a char, hence vocab_size

In [106]:
# the last hidden state is the input to this last layer
c4_out = dense_out(c3_hidden)

The model is defined by 3 inputs in a list and the c4_out holds all the operations (we've chained them functionally).

In [107]:
model = Model([c1_in, c2_in, c3_in], c4_out)

In [108]:
model.compile(loss="sparse_categorical_crossentropy", optimizer=Adam())
# we use sparse categorical crossentropy because we didn't one-hot-encode our output.
# it takes integer targets, one-hot encodes automatically in the background!
# REALLY USEFUL POSSIBLY - this way we don't need to create Thousand-columned arrays!
# WE CAN SKIP ONE-HOT ENCODING IN KERAS!

In [109]:
model.optimizer.lr = 0.000001

In [110]:
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)

Epoch 1/4
 46208/200297 [=====>........................] - ETA: 10s - loss: 4.4444

KeyboardInterrupt: 

In [111]:
# assigning a plain float to model.optimizer.lr once the model has been trained doesn't reach the
# already-built training function, so recompile with the new learning rate (the weights are kept)
model.compile(loss="sparse_categorical_crossentropy", optimizer=Adam(lr=0.01))

In [42]:
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f9eea008e50>

In [112]:
# assigning a plain float to model.optimizer.lr once the model has been trained doesn't reach the
# already-built training function, so recompile with the new learning rate (the weights are kept)
model.compile(loss="sparse_categorical_crossentropy", optimizer=Adam(lr=0.000001))

In [75]:
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f9ee99a6910>

In [113]:
# assigning a plain float to model.optimizer.lr once the model has been trained doesn't reach the
# already-built training function, so recompile with the new learning rate (the weights are kept)
model.compile(loss="sparse_categorical_crossentropy", optimizer=Adam(lr=0.01))

In [77]:
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f9ee99a6f50>

### Test the model
We want a way for our model to output the next char given 3 previous ones.

In [114]:
# this is my way of testing the np.newaxis
l = [1, 2, 3]
res = [np.array(i)[np.newaxis] for i in l]
type(res[0])

numpy.ndarray

In [115]:
def get_next(inp):
    """Return the char the global `model` considers most likely to follow the string `inp`."""
    # the model is fed a list with one array per char:
    # np.array(i) alone would be 0-dimensional, np.array([i]) is the 1-element array we pass in
    arrs = [np.array([char_indices[c]]) for c in inp]
    probs = model.predict(arrs)
    # position of the highest softmax probability -> the predicted char
    return chars[np.argmax(probs)]

In [116]:
get_next("phi")

' '

In [117]:
get_next("thi")

' '

In [118]:
# yep, just like the original only predicts a space.

In [119]:
get_next("is ")

'v'

### Our first RNN
Let's make one!

In [120]:
# cs will stand for the size of our unrolled RNN (they wrote, weirdly)
cs = 8

# cause 1 input, 2 new char inputs, 3 dense overall and an output? that's 7...
# No, it's just about how many chars we'll be remembering (it used to be 4, now it's gonna be 8)

In [121]:
# create the cs (so eight) inputs - we need a list of every eighth character starting at 0, then 1 and 2 and so on.
c_in_dat = [[idx[i + n] for i in xrange(0, len(idx) - 1 - cs, cs)]for n in range(cs)]

In [122]:
# then we need outputs - a list of eighth char (the one we're trying to predict)
c_out_dat = [idx[i+cs] for i in xrange(0, len(idx)-1-cs, cs)]

In [123]:
# now turn that into numpy.ndarray (no idea why until -2)
xs = [np.stack(c[:-2]) for c in c_in_dat]

In [124]:
# so we've got eight 1d arrays within xs, which have 75110 elems.
len(xs), xs[0].shape

(8, (75110,))

In [125]:
y = np.stack(c_out_dat[:-2])
len(y), y.shape

(75110, (75110,))

In [126]:
# when we show them like this, each COLUMN becomes a series of 8 consecutive chars
[xs[n][:cs] for n in range(cs)]

[array([40,  1, 33,  2, 72, 67, 73,  2]),
 array([42,  1, 38, 44,  2,  9, 61, 73]),
 array([29, 43, 31, 71, 54,  9, 58, 61]),
 array([30, 45,  2, 74,  2, 76, 67, 58]),
 array([25, 40, 73, 73, 76, 61, 24, 71]),
 array([27, 40, 61, 61, 68, 54,  2, 58]),
 array([29, 39, 54,  2, 66, 73, 33,  2]),
 array([ 1, 43, 73, 62, 54,  2, 72, 67])]

In [127]:
chars[40], chars[42], chars[29], chars[30], chars[25], chars[27], chars[29], chars[1]

('P', 'R', 'E', 'F', 'A', 'C', 'E', '\n')

In [128]:
# and the y holds the next (eighth char) for each of those sequences
y[:cs]

array([ 1, 33,  2, 72, 67, 73,  2, 68])

In [129]:
# we only care about the first column now (that's the one that spells PREFACE) and we can see in the text
# that the character that follows is another newline
[chars[c] for c in y[:cs]]

['\n', 'I', ' ', 's', 'n', 't', ' ', 'o']

In [130]:
text[:9]

'PREFACE\n\n'

In [131]:
# let's define a new number of latent factors
# (the cells below read n_fac, so that is the name that has to be set here)
n_fac = 42
n_fact = n_fac  # old misspelled name, kept so anything still referring to it keeps working

### Create and train model
This time as an RNN

In [132]:
# almost the same, except we use a more clever naming convention

def embedding_input(name, n_in, n_out):
    """Build a one-token integer Input and its flattened embedding, with explicitly named layers.

    The Input layer is called name + "_in" and the Embedding layer name + "_emb".
    Returns the tuple (input tensor, flattened embedding output).
    """
    in_name, emb_name = name + "_in", name + "_emb"
    inp = Input(shape=(1, ), dtype='int64', name=in_name)
    emb_layer = Embedding(n_in, n_out, input_length=1, name=emb_name)
    return inp, Flatten()(emb_layer(inp))

In [133]:
# this is weird for me cause there are gonna be many embedding layers and I'd worry about them
# having always the same way of embedding... Cause hey I might want 2 different embedding layers one day..
c_ins = [embedding_input('c'+str(n), vocab_size, n_fac) for n in range(cs)]

In [134]:
type(c_ins[0][1])

theano.tensor.var.TensorVariable

In [135]:
n_hidden = 256

In [136]:
# here we define the dense layers, notice that we're initializing the hidden layer to not be
# small random values (the Glorot way) but to an identity matrix to avoid exploding gradients in recursion
# more aptly called exploding activations
dense_in = Dense(n_hidden, activation="relu")
# when you use tab + shift in the Dense() you can see that by default it uses init="glorot_uniform"
dense_hidden = Dense(n_hidden, activation="tanh", init="identity")
dense_out = Dense(vocab_size, activation="softmax")

The first character of each 8-character sequence goes through the dense_in layer to create our first layer of hidden activations.
I actually think each embedding layer might have different values. Cause like, a space as the first in a sequence is different than a space in the middle.

In [137]:
hidden = dense_in(c_ins[0][1])

Now for each layer we combine the output of dense_in on the next character in the sequence with the dense_hidden on the current state (via merge) to create the new hidden state.

In [138]:
for i in range(1, cs):
    # the final 1 here access the Embedding layer from the tuple containing Input, Embedding
    c_dense = dense_in(c_ins[i][1])
    hidden = dense_hidden(hidden)
    hidden = merge([c_dense, hidden])

In [139]:
# and for the output
c_out = dense_out(hidden)

In [140]:
# I think it's essentially the same thing we've done before, just with more chars.
model = Model([c[0] for c in c_ins], c_out)

# since we didn't one-hot encode our input we can use sparse_categorical crossentropy to save time
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [141]:
model.fit(xs, y,  batch_size=64, nb_epoch=12)

KeyboardInterrupt: 

#### Test the model

In [142]:
# we can use the same get_next function to test

In [143]:
get_next('for thos')

'L'

In [144]:
get_next('part of ')

'L'

In [145]:
get_next('queens a')

'L'

## RNN with Keras
This time it's personal.

In [146]:
# let's initialize the hyperparams professionally in one go :)
n_fac, cs = 42, 8
vocab_size, n_hidden = 86, 256

In [147]:
# it's simpler in keras, we can define sth equivalent to what we've built like this
model = Sequential([
        Embedding(vocab_size, n_fac, input_length=cs),
        # notice that we again set the INNER (hidden-to-hidden) initialization to identity instead of
        # leaving it at its default (NOTE: presumably 'orthogonal' rather than Glorot in Keras 1 - confirm)
        SimpleRNN(n_hidden, inner_init="identity", activation="relu"),
        Dense(vocab_size, activation="softmax"),
    ])

In [148]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_11 (Embedding)         (None, 8, 42)         3612        embedding_input_5[0][0]          
____________________________________________________________________________________________________
simplernn_3 (SimpleRNN)          (None, 256)           76544       embedding_11[0][0]               
____________________________________________________________________________________________________
dense_16 (Dense)                 (None, 86)            22102       simplernn_3[0][0]                
Total params: 102258
____________________________________________________________________________________________________


In [149]:
model.compile(loss="sparse_categorical_crossentropy", optimizer=Adam())

In [140]:
# we use all of our inputs in xs: stack the cs separate (n,) arrays into one (n, cs) matrix,
# one row per sequence (xs holds 1-d arrays, so np.concatenate(xs, axis=1) can't build that matrix)
model.fit(np.stack(xs, axis=1), y, batch_size=64, nb_epoch=8)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f9ed727f410>

In [150]:
# almost the same as before, it's just that keras models return more upon prediction (an array within an array, hence 0)
def get_next_keras(inp):
    """Return the char the Sequential `model` predicts to follow the string `inp`."""
    idxs = np.array([char_indices[c] for c in inp])
    # the model takes a batch, so hand it a single row holding the whole sequence
    batch = idxs.reshape(1, -1)
    # predict returns one row of probabilities per batch item; we only sent one
    probs = model.predict(batch)[0]
    return chars[np.argmax(probs)]

In [146]:
get_next_keras('this is ')

't'

In [151]:
get_next_keras('part of ')

'Q'

In [152]:
get_next_keras('queens a')

'\xa4'

 ### Let's make it generative
 It already predicts the next char but it should also be able to build upon that and predict the next sentence or so.

#### Inputs
Our inputs can stay as they are - as sequences, but we'll need to change what we output to also be a sequence so that it can serve as the input for the next generation.

In [153]:
#c_in_dat = [[idx[i+n] for i in xrange(0, len(idx)-1-cs, cs)]
#            for n in range(cs)]
c_out_dat = [[idx[i+n] for i in xrange(1, len(idx)-cs, cs)]
            for n in range(cs)]

In [154]:
ys = [np.stack(c[:-2]) for c in c_out_dat]

In [155]:
# when we show them like this, each COLUMN becomes a series of 8 consecutive chars
# now turn that into numpy.ndarray (no idea why until -2)
xs = [np.stack(c[:-2]) for c in c_in_dat]
[xs[n][:cs] for n in range(cs)]

[array([40,  1, 33,  2, 72, 67, 73,  2]),
 array([42,  1, 38, 44,  2,  9, 61, 73]),
 array([29, 43, 31, 71, 54,  9, 58, 61]),
 array([30, 45,  2, 74,  2, 76, 67, 58]),
 array([25, 40, 73, 73, 76, 61, 24, 71]),
 array([27, 40, 61, 61, 68, 54,  2, 58]),
 array([29, 39, 54,  2, 66, 73, 33,  2]),
 array([ 1, 43, 73, 62, 54,  2, 72, 67])]

In [156]:
[ys[n][:cs] for n in range(cs)]

[array([42,  1, 38, 44,  2,  9, 61, 73]),
 array([29, 43, 31, 71, 54,  9, 58, 61]),
 array([30, 45,  2, 74,  2, 76, 67, 58]),
 array([25, 40, 73, 73, 76, 61, 24, 71]),
 array([27, 40, 61, 61, 68, 54,  2, 58]),
 array([29, 39, 54,  2, 66, 73, 33,  2]),
 array([ 1, 43, 73, 62, 54,  2, 72, 67]),
 array([ 1, 33,  2, 72, 67, 73,  2, 68])]

### Create and train model

In [157]:
# the same three layers as in our first RNN, except that the hidden layer now uses relu
# (it was tanh there) and the output layer is given a name
dense_in = Dense(n_hidden, activation='relu')
dense_hidden = Dense(n_hidden, activation='relu', init='identity')
dense_out = Dense(vocab_size, activation='softmax', name='output')

For some reason we want to start by passing in a vector of all zeros. Looking back at the video the idea was to add the first character within the loop.

In [158]:
inp1 = Input(shape=(n_fac,), name='zeros')
hidden = dense_in(inp1)

In [159]:
outs = []

for i in range(cs):
    c_dense = dense_in(c_ins[i][1])
    hidden = dense_hidden(hidden)
    hidden = merge([c_dense, hidden], mode='sum')
    # every layer now has an output
    outs.append(dense_out(hidden))

In [160]:
model = Model([inp1] + [c[0] for c in c_ins], outs)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [161]:
zeros = np.tile(np.zeros(n_fac), (len(xs[0]),1))
zeros.shape

(75110, 42)

In [175]:
model.fit([zeros]+xs, ys, batch_size=64, nb_epoch=12)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x7f9ec592b850>

In [162]:
# the fact that we're passing zeros as first input changes the way we test a bit
def get_nexts(inp):
    """Print the chars of `inp` and return the char `model` predicts after each of them."""
    char_arrs = [np.array([char_indices[c]]) for c in inp]
    # the model's first input is the all-zeros vector, followed by one input per char
    zero_inp = np.zeros(n_fac)[np.newaxis,:]
    p = model.predict([zero_inp] + char_arrs)
    print(list(inp))
    predicted = []
    for o in p:
        predicted.append(chars[np.argmax(o)])
    return predicted

In [163]:
get_nexts(' this is')

[' ', 't', 'h', 'i', 's', ' ', 'i', 's']


['A', 'j', '_', '_', '_', '_', 'j', '_']

In [164]:
get_nexts(' part of')

[' ', 'p', 'a', 'r', 't', ' ', 'o', 'f']


['A', 'R', 'R', 'R', 'R', 'R', 'R', 'R']

## Sequence Model with Keras
That also outputs more predictions and learns from them.

In [165]:
# as before
n_hidden, n_fac, cs, vocab_size

(256, 42, 8, 86)

To convert our previous keras model into a sequence model, simply add the 'return_sequences=True' parameter, and add TimeDistributed() around our dense layer.

In [166]:
model=Sequential([
        Embedding(vocab_size, n_fac, input_length=cs),
        SimpleRNN(n_hidden, return_sequences=True, activation='relu', inner_init='identity'),
        TimeDistributed(Dense(vocab_size, activation='softmax')),
    ])

In [167]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_12 (Embedding)         (None, 8, 42)         3612        embedding_input_6[0][0]          
____________________________________________________________________________________________________
simplernn_4 (SimpleRNN)          (None, 8, 256)        76544       embedding_12[0][0]               
____________________________________________________________________________________________________
timedistributed_5 (TimeDistribute(None, 8, 86)         22102       simplernn_4[0][0]                
Total params: 102258
____________________________________________________________________________________________________


In [168]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [169]:
xs[0].shape

(75110,)

In [170]:
len(xs)

8

In [171]:
# np.squeeze removes single dimensional entities from the array (like an extra wrapping [])
x_rnn=np.stack(np.squeeze(xs), axis=1)
# np.atleast_3d turns input into having at least 3 dimensions, e.g. adds the extra wrapping []
y_rnn=np.atleast_3d(np.stack(ys, axis=1))

In [172]:
x_rnn.shape, y_rnn.shape

((75110, 8), (75110, 8, 1))

In [173]:
model.fit(x_rnn, y_rnn, batch_size=64, nb_epoch=8)

Epoch 1/8

KeyboardInterrupt: 

In [176]:
def get_nexts_keras(inp):
    """Print the chars of `inp` and return the char the sequence `model` predicts after each of them."""
    # a single row holding the indices of the whole string
    arr = np.array([char_indices[c] for c in inp]).reshape(1, -1)
    # we sent a batch of one, so take the first (only) result
    p = model.predict(arr)[0]
    print(list(inp))
    predicted = []
    for o in p:
        predicted.append(chars[np.argmax(o)])
    return predicted

In [191]:
get_nexts_keras(' this is')

ValueError: dimension mismatch in args to gemm (64,256)x(256,256)->(1,256)
Apply node that caused the error: GpuGemm{no_inplace}(GpuSubtensor{::, int64::}.0, TensorConstant{0.20000000298}, <CudaNdarrayType(float32, matrix)>, lstm_3_U_o_copy[cuda], TensorConstant{0.20000000298})
Toposort index: 5
Inputs types: [CudaNdarrayType(float32, matrix), TensorType(float32, scalar), CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, matrix), TensorType(float32, scalar)]
Inputs shapes: [(1, 256), (), (64, 256), (256, 256), ()]
Inputs strides: [(0, 1), (), (256, 1), (256, 1), ()]
Inputs values: ['not shown', array(0.20000000298023224, dtype=float32), 'not shown', 'not shown', array(0.20000000298023224, dtype=float32)]
Outputs clients: [[GpuElemwise{Composite{(clip((i0 + i1), i2, i3) * tanh(i4))},no_inplace}(CudaNdarrayConstant{[[ 0.5]]}, GpuGemm{no_inplace}.0, CudaNdarrayConstant{[[ 0.]]}, CudaNdarrayConstant{[[ 1.]]}, GpuElemwise{Composite{((clip((i0 + i1), i2, i3) * i4) + (clip((i0 + i5), i2, i3) * tanh(i6)))},no_inplace}.0)]]

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.
Apply node that caused the error: forall_inplace,gpu,scan_fn}(TensorConstant{8}, GpuDimShuffle{1,0,2}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, TensorConstant{8}, lstm_3_U_o, lstm_3_U_f, lstm_3_U_i, lstm_3_U_c)
Toposort index: 75
Inputs types: [TensorType(int64, scalar), CudaNdarrayType(float32, 3D), CudaNdarrayType(float32, 3D), CudaNdarrayType(float32, 3D), TensorType(int64, scalar), CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, matrix), CudaNdarrayType(float32, matrix)]
Inputs shapes: [(), (8, 1, 1024), (2, 64, 256), (2, 64, 256), (), (256, 256), (256, 256), (256, 256), (256, 256)]
Inputs strides: [(), (1024, 0, 1), (16384, 256, 1), (16384, 256, 1), (), (256, 1), (256, 1), (256, 1), (256, 1)]
Inputs values: [array(8), 'not shown', 'not shown', 'not shown', array(8), 'not shown', 'not shown', 'not shown', 'not shown']
Outputs clients: [[GpuSubtensor{int64}(forall_inplace,gpu,scan_fn}.0, Constant{1})], [GpuSubtensor{int64}(forall_inplace,gpu,scan_fn}.1, Constant{1})], [GpuDimShuffle{0,1,2}(forall_inplace,gpu,scan_fn}.2)]]

HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done with by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

## One hot-sequence model in keras

So that we move from sparse categorical crossentropy and skip the embedding layer I think...

In [178]:
model = Sequential([
        SimpleRNN(n_hidden, inner_init="identity", return_sequences=True, input_shape=(cs, vocab_size),
                 activation="relu"),
        TimeDistributed(Dense(vocab_size, activation="softmax"))
    ])

In [179]:
model.compile(loss='categorical_crossentropy', optimizer=Adam())

In [180]:
# to_categorical is a keras util that allows us to change a vector of integers in range up to vocab_size
# into its one-hot encoded equivalent

oh_ys = [to_categorical(o, vocab_size) for o in ys]
oh_y_rnn=np.stack(oh_ys, axis=1)

oh_xs = [to_categorical(o, vocab_size) for o in xs]
oh_x_rnn=np.stack(oh_xs, axis=1)

oh_x_rnn.shape, oh_y_rnn.shape

((75110, 8, 86), (75110, 8, 86))

In [181]:
model.fit(oh_x_rnn, oh_y_rnn, batch_size=64, nb_epoch=8)

Epoch 1/8
 3456/75110 [>.............................] - ETA: 12s - loss: 3.5041

KeyboardInterrupt: 

In [183]:
def get_nexts_oh(inp):
    """One-hot encode `inp`, run it through `model` and return the predicted next char per position."""
    idxs = np.array([char_indices[c] for c in inp])
    one_hot = to_categorical(idxs, vocab_size)
    # wrap the sequence in a batch of one, then unwrap the single prediction again
    p = model.predict(np.expand_dims(one_hot, 0))[0]
    print(list(inp))
    return [chars[np.argmax(o)] for o in p]

In [184]:
get_nexts_oh(' this is')

[' ', 't', 'h', 'i', 's', ' ', 'i', 's']


['o', ' ', ' ', ' ', ' ', ' ', ' ', ' ']

## Stateful model in Keras
A stateful model is easy to create in keras but hard to train. We simply add the stateful=True.
We also have to add the batch_input_shape to Embedding() and make it fixed.
It also becomes important to not use the default "shuffle" when fitting (cause we need to pass batches in the original order).

In [185]:
bs = 64

In [186]:
model=Sequential([
        # a stateful model needs a fixed batch size; the sequence length is cs, as everywhere else
        Embedding(vocab_size, n_fac, input_length=cs, batch_input_shape=(bs,cs)),
        BatchNormalization(),
        LSTM(n_hidden, return_sequences=True, stateful=True),
        TimeDistributed(Dense(vocab_size, activation='softmax')),
    ])

In [187]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [188]:
# since we're using a fixed bs (batch size) we have to make sure our inputs are multiples of that
mx = len(x_rnn)//bs*bs
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=4, shuffle=False)

Epoch 1/4
 6016/75072 [=>............................] - ETA: 35s - loss: 2.8889

KeyboardInterrupt: 

In [190]:
# assigning a plain float to model.optimizer.lr once the model has been trained doesn't reach the
# already-built training function, so recompile with the new learning rate (the weights are kept)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(lr=1e-4))

In [204]:
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=4, shuffle=False)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f9e59ca2150>

In [205]:
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=4, shuffle=False)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f9e4bd4db50>

In [193]:
# not sure how to rewrite the get_next function

## Theano RNN

In [194]:
n_input = vocab_size
n_output = vocab_size

Using theano we have to create the weight and bias matrices ourselves. We also wrap the returned values in shared() which tells theano that it can manage them - copy to the gpu and from it.

In [196]:
# we scale the random weights by the square root of 2/rows (this is He initialization;
# Glorot's scale would also take the number of columns into account)
def init_wgts(rows, cols):
    """Return a shared float32 (rows, cols) matrix of samples drawn with scale sqrt(2/rows)."""
    # 2. keeps this a float division even without `from __future__ import division`
    scale = math.sqrt(2. / rows)
    return shared(normal(scale=scale, size=(rows, cols)).astype(np.float32))

def init_bias(cols):
    """Return a shared float32 bias vector of `cols` zeros."""
    zero_bias = np.zeros(cols, dtype=np.float32)
    return shared(zero_bias)

In [197]:
# we will want to return both the weights and the bias as a tuple,
# with one modification in the hidden layers -> initializing them to the identity matrix instead of Glorot
def wgts_and_bias(n_in, n_out):
    """Return the tuple (init_wgts(n_in, n_out), init_bias(n_out)) for one layer."""
    wgts = init_wgts(n_in, n_out)
    bias = init_bias(n_out)
    return wgts, bias

def id_and_bias(n):
    """Return the tuple (shared float32 n x n identity matrix, init_bias(n))."""
    identity = shared(np.eye(n, dtype=np.float32))
    bias = init_bias(n)
    return identity, bias

In [198]:
# Theano is like tensorflow in that it doesn't do any computation until we explicitly tell it to, at which point
# it would translate our code into CUDA code and send it to the GPU.
# so we have to describe step by step (build a graph) what computations we're interested in

In [199]:
t_inp = T.matrix('inp')
t_outp = T.matrix('outp')
t_h0 = T.vector('h0')
lr = T.scalar('lr')

all_args = [t_h0, t_inp, t_outp, lr]

Now we can create our initial weight matrices.

In [200]:
W_h = id_and_bias(n_hidden)
W_x = wgts_and_bias(n_input, n_hidden)
W_y = wgts_and_bias(n_hidden, n_output)
w_all = list(chain.from_iterable([W_h, W_x, W_y]))

Theano handles looping through something called the GPU "scan operation". And we have to tell it what function to run at each "step" of the  scan.

In [205]:
def step(x, h, W_h, b_h, W_x, b_x, W_y, b_y):
    """Compute one step of the scan.

    x is the current input and h the previous hidden state; the remaining
    arguments are the weight/bias pairs for the hidden, input and output layers.
    Returns the tuple (new hidden state, flattened softmax output).
    """
    # hidden activations: the input's contribution plus the previous hidden state's
    pre_hidden = T.dot(x, W_x) + b_x + T.dot(h, W_h) + b_h
    new_h = nnet.relu(pre_hidden)
    # output activations
    probs = nnet.softmax(T.dot(new_h, W_y) + b_y)
    # flatten the softmax result to one dimension (supposedly a workaround for a theano bug)
    return new_h, T.flatten(probs, 1)

Now that we've defined the step function to be called during each step of the scan operation, we can call theano.scan(), passing it the function, a sequence of inputs to step through, the initial values of the outputs, and any other arguments.

In [206]:
[v_h, v_y], _ = theano.scan(step, sequences=t_inp,
                           outputs_info=[t_h0, None], non_sequences=w_all)

Now we can calculate our gradients and loss functions using theano functions. Will be harder in pure python.

In [207]:
error = nnet.categorical_crossentropy(v_y, t_outp).sum()
g_all = T.grad(error, w_all)

We also have to tell theano how to do SGD. We'll use a dictionary

In [208]:
def upd_dict(wgts, grads, lr):
    """Map every weight to its SGD-updated value, w - g*lr, in the order of `wgts`.

    wgts and grads are matching sequences of weights and their gradients,
    lr is the learning rate. Returns an OrderedDict keyed by weight.
    """
    # hand the (key, value) pairs to OrderedDict directly: building a plain dict first
    # can throw the order away before the OrderedDict ever sees it
    return OrderedDict((w, w - g*lr) for (w, g) in zip(wgts, grads))

upd = upd_dict(w_all, g_all, lr)

Now we can compile. Not sure where we'll pass the dictionary of SGD...

In [209]:
fn = theano.function(all_args, error, updates=upd, allow_input_downcast=True)

In [210]:
X = oh_x_rnn
Y = oh_y_rnn
X.shape, Y.shape

((75110, 8, 86), (75110, 8, 86))

To use it, we simply loop through our input data, calling the function compiled above, and printing our progress from time to time.

In [211]:
err=0.0; l_rate=0.01
for i in range(len(X)): 
    err+=fn(np.zeros(n_hidden), X[i], Y[i], l_rate)
    if i % 1000 == 999: 
        print ("Error:{:.3f}".format(err/1000))
        err=0.0

Error:25.121
Error:21.424
Error:20.854
Error:19.881
Error:18.753
Error:19.153
Error:18.988
Error:18.355
Error:17.901
Error:18.185
Error:17.457


KeyboardInterrupt: 

In [213]:
f_y = theano.function([t_h0, t_inp], v_y, allow_input_downcast=True)

In [214]:
pred = np.argmax(f_y(np.zeros(n_hidden), X[6]), axis=1)

In [216]:
act = np.argmax(X[6], axis=1)

In [217]:
[indices_char[o] for o in act]

['t', 'h', 'e', 'n', '?', ' ', 'I', 's']

In [218]:
[indices_char[o] for o in pred]

['h', 'e', 'r', 't', ' ', 't', 'n', ' ']

## Pure python RNN
Let's get even weirder. Functions ending in _d are derivatives.

In [219]:
def sigmoid(x):
    """Logistic function 1/(1+e^-x); works elementwise on arrays."""
    return 1/(np.exp(-x)+1)

def sigmoid_d(x):
    """Derivative of the sigmoid at x, expressed through the sigmoid itself."""
    s = sigmoid(x)
    return (1-s)*s

In [220]:
def relu(x):
    """Rectified linear unit: negative values become 0, the rest pass through."""
    return np.maximum(0., x)

def relu_d(x):
    """Derivative of relu: 1. where x is positive, 0. elsewhere."""
    is_positive = x > 0.
    return is_positive*1.

In [221]:
relu(np.array([3.,-3.])), relu_d(np.array([3.,-3.]))

(array([ 3.,  0.]), array([ 1.,  0.]))

In [222]:
def dist(a,b):
    """Squared difference between a and b."""
    return (a-b)**2

def dist_d(a,b):
    """Derivative of dist with respect to a."""
    return (a-b)*2

In [223]:
import pdb

In [224]:
eps = 1e-7
def x_entropy(pred, actual):
    """Cross-entropy between predicted probabilities `pred` and the targets `actual`.

    pred is clipped to [eps, 1-eps] so that the log never sees 0.
    """
    return -np.sum(actual * np.log(np.clip(pred, eps, 1-eps)))
def x_entropy_d(pred, actual):
    """Derivative of x_entropy with respect to `pred`, elementwise: -actual/pred.

    pred is clipped exactly as in x_entropy; without that a prediction of 0
    gives nan (0/0) or inf, which then poisons every weight update.
    """
    return -actual/np.clip(pred, eps, 1-eps)

In [225]:
def softmax(x):
    """Softmax over all elements of x: exp(x) normalised so that the result sums to 1."""
    # subtracting the max doesn't change the result mathematically,
    # but it keeps np.exp from overflowing (and turning the output into nan) for large inputs
    e = np.exp(x - np.max(x))
    return e/e.sum()

In [226]:
def softmax_d(x):
    """Return the matrix of partial derivatives (Jacobian) of softmax at x.

    NOTE(review): written with a 1-d x in mind - confirm nothing calls it with more dimensions.
    """
    sm = softmax(x)
    # off-diagonal entries: -sm[i]*sm[j], by broadcasting sm as a column against sm as a row
    res = np.expand_dims(-sm,-1)*sm
    # diagonal entries are overwritten with sm[i]*(1-sm[i])
    res[np.diag_indices_from(res)] = sm*(1-sm)
    return res

In [227]:
test_preds = np.array([0.2,0.7,0.1])
test_actuals = np.array([0.,1.,0.])
nnet.categorical_crossentropy(test_preds, test_actuals).eval()

array(0.35667494393873245)

In [228]:
x_entropy(test_preds, test_actuals)

0.35667494393873245

In [229]:
test_inp = T.dvector()
test_out = nnet.categorical_crossentropy(test_inp, test_actuals)
test_grad = theano.function([test_inp], T.grad(test_out, test_inp))

In [230]:
test_grad(test_preds)

array([-0.    , -1.4286, -0.    ])

In [231]:
x_entropy_d(test_preds, test_actuals)

array([-0.    , -1.4286, -0.    ])

In [232]:
pre_pred = random(oh_x_rnn[0][0].shape)
preds = softmax(pre_pred)
actual = oh_x_rnn[0][0]

In [233]:
np.allclose(softmax_d(pre_pred).dot(x_entropy_d(preds,actual)), preds-actual)

True

In [234]:
softmax(test_preds)

array([ 0.2814,  0.464 ,  0.2546])

In [235]:
nnet.softmax(test_preds).eval()

array([[ 0.2814,  0.464 ,  0.2546]])

In [236]:
test_out = T.flatten(nnet.softmax(test_inp))

In [237]:
test_grad = theano.function([test_inp], theano.gradient.jacobian(test_out, test_inp))

In [238]:
test_grad(test_preds)

array([[ 0.2022, -0.1306, -0.0717],
       [-0.1306,  0.2487, -0.1181],
       [-0.0717, -0.1181,  0.1898]])

In [239]:
softmax_d(test_preds)

array([[ 0.2022, -0.1306, -0.0717],
       [-0.1306,  0.2487, -0.1181],
       [-0.0717, -0.1181,  0.1898]])

In [240]:
act=relu
act_d = relu_d

In [241]:
loss=x_entropy
loss_d=x_entropy_d

In [242]:
def scan(fn, start, seq):
    """Fold `fn` over `seq`, returning every intermediate result.

    fn(previous result, item) is called for each item in turn, with `start`
    standing in for the previous result on the first call.
    """
    outputs = []
    state = start
    for item in seq:
        state = fn(state, item)
        outputs.append(state)
    return outputs

In [243]:
scan(lambda prev,curr: prev+curr, 0, range(5))

[0, 1, 3, 6, 10]

### Set up training.

In [244]:
inp = oh_x_rnn
outp = oh_y_rnn
n_input = vocab_size
n_output = vocab_size

In [245]:
inp.shape, outp.shape

((75110, 8, 86), (75110, 8, 86))

In [246]:
def one_char(prev, item):
    """Forward pass for a single char.

    prev is the state tuple returned for the previous char, item the (input, target)
    pair for this one. Returns the new state tuple
    (running loss, pre_hidden, pre_pred, hidden, ypred).
    """
    # of the previous state only the running loss and the hidden activations are needed
    tot_loss, _, _, hidden, _ = prev
    x, y = item
    pre_hidden = np.dot(x,w_x) + np.dot(hidden,w_h)
    hidden = act(pre_hidden)
    pre_pred = np.dot(hidden,w_y)
    ypred = softmax(pre_pred)
    # keep track of the loss so we can report it
    tot_loss = tot_loss+loss(ypred, y)
    # pre_hidden and pre_pred are used in backprop, hidden in the next iteration,
    # ypred provides the predictions
    return tot_loss, pre_hidden, pre_pred, hidden, ypred

In [247]:
def get_chars(n):
    """Pair up each input char of sequence n with its target char."""
    return zip(inp[n], outp[n])

def one_fwd(n):
    """Run the forward pass over sequence n, starting from zero loss and a zero hidden state."""
    initial = (0,0,0,np.zeros(n_hidden),0)
    return scan(one_char, initial, get_chars(n))

In [248]:
# "Columnify" a vector
def col(x):
    """Turn the 1-d array x of length n into an (n, 1) column."""
    # np.newaxis, as in the rest of the notebook, instead of a bare `newaxis` from a star import
    return x[:,np.newaxis]

def one_bkwd(args, n):
    """Backpropagate through sequence n, updating w_x, w_y and w_h in place with step size alpha.

    args is the list of per-char states produced by one_fwd(n).
    Returns the gradient of the loss with respect to the first char's pre_hidden.
    """
    global w_x,w_y,w_h

    i=inp[n]  # 8x86
    o=outp[n] # 8x86
    d_pre_hidden = np.zeros(n_hidden) # 256
    # walk the sequence from its last char back to its first
    for p in reversed(range(len(i))):
        totloss, pre_hidden, pre_pred, hidden, ypred = args[p]
        x=i[p] # 86
        y=o[p] # 86
        d_pre_pred = softmax_d(pre_pred).dot(loss_d(ypred,y))  # 86
        # gradient arriving from the next char's hidden state plus the one from this char's output
        d_pre_hidden = (np.dot(d_pre_hidden, w_h.T)
                        + np.dot(d_pre_pred,w_y.T)) * act_d(pre_hidden) # 256

        # d(loss)/d(w_y) = d(loss)/d(pre_pred) * d(pre_pred)/d(w_y)
        w_y -= col(hidden) * d_pre_pred * alpha
        # d(loss)/d(w_h) = d(loss)/d(pre_hidden[p]) * d(pre_hidden[p])/d(w_h): the outer product of the
        # previous hidden state and d_pre_hidden (256x256). A plain .dot() of the two vectors
        # would collapse to one scalar and shift every entry of w_h by the same amount.
        if (p>0): w_h -= col(args[p-1][3]) * d_pre_hidden * alpha
        w_x -= col(x)*d_pre_hidden * alpha
    return d_pre_hidden

In [249]:
# scale each random matrix by sqrt(2 / its number of rows)
scale=math.sqrt(2./n_input)
w_x = normal(scale=scale, size=(n_input,n_hidden))
# w_y has n_hidden rows, so its scale is based on n_hidden (as init_wgts does in the Theano version)
w_y = normal(scale=math.sqrt(2./n_hidden), size=(n_hidden, n_output))
w_h = np.eye(n_hidden, dtype=np.float32)

In [250]:
overallError=0
alpha=0.0001
for n in range(10000):
    res = one_fwd(n)
    overallError+=res[-1][0]
    deriv = one_bkwd(res, n)
    if(n % 1000 == 999):
        print ("Error:{:.4f}; Gradient:{:.5f}".format(
                overallError/1000, np.linalg.norm(deriv)))
        overallError=0

Error:35.8769; Gradient:1.95010
Error:35.5318; Gradient:2.12744
Error:35.0657; Gradient:3.32115
Error:33.2381; Gradient:3.31827
Error:31.0908; Gradient:4.42043


KeyboardInterrupt: 

## Keras GRU

In [251]:
model=Sequential([
        GRU(n_hidden, return_sequences=True, input_shape=(cs, vocab_size),
                  activation='relu', inner_init='identity'),
        TimeDistributed(Dense(vocab_size, activation='softmax')),
    ])
model.compile(loss='categorical_crossentropy', optimizer=Adam())

In [None]:
model.fit(oh_x_rnn, oh_y_rnn, batch_size=64, nb_epoch=8)

Epoch 1/8
Epoch 2/8
 7488/75110 [=>............................] - ETA: 23s - loss: 2.0594

In [None]:
get_nexts_oh(' this is')