In [7]:
%matplotlib inline
# import utils; 
# reload(utils)
# from utils import *


## Setup

We're going to download the collected works of Nietzsche to use as our data for this class.

In [2]:
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(path).read()
print('corpus length:', len(text))

corpus length: 600901


In [3]:
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total chars:', vocab_size)

total chars: 86


Sometimes it's useful to have a zero value in the dataset, e.g. for padding

In [4]:
chars.insert(0, "\0")

In [5]:
''.join(chars[1:-6])

'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz'

Map from chars to indices and back again

In [4]:
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

*idx* will be the data we use from now own - it simply converts all the characters to their index (based on the mapping above)

In [5]:
idx = [char_indices[c] for c in text]

In [8]:
text[:1000]

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not ground\nfor suspecting that all philosophers, in so far as they have been\ndogmatists, have failed to understand women--that the terrible\nseriousness and clumsy importunity with which they have usually paid\ntheir addresses to Truth, have been unskilled and unseemly methods for\nwinning a woman? Certainly she has never allowed herself to be won; and\nat present every kind of dogma stands with sad and discouraged mien--IF,\nindeed, it stands at all! For there are scoffers who maintain that it\nhas fallen, that all dogma lies on the ground--nay more, that it is at\nits last gasp. But to speak seriously, there are good grounds for hoping\nthat all dogmatizing in philosophy, whatever solemn, whatever conclusive\nand decided airs it has assumed, may have been only a noble puerilism\nand tyronism; and probably the time is at hand when it will be once\nand again understood WHAT has actually sufficed for the basis of such\

In [9]:
idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [10]:
''.join(indices_char[i] for i in idx[:70])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

## Returning sequences

### Create inputs

To use a sequence model, we can leave our input unchanged - but we have to change our output to a sequence (of course!)

Here, c_out_dat is identical to c_in_dat, but moved across 1 character.

In [64]:
#c_in_dat = [[idx[i+n] for i in xrange(0, len(idx)-1-cs, cs)]
#            for n in range(cs)]
c_out_dat = [[idx[i+n] for i in xrange(1, len(idx)-cs, cs)]
            for n in range(cs)]

In [65]:
ys = [np.stack(c[:-2]) for c in c_out_dat]

Reading down each column shows one set of inputs and outputs.

In [59]:
[xs[n][:cs] for n in range(cs)]

[array([40,  1, 33,  2, 72, 67, 73,  2]),
 array([42,  1, 38, 44,  2,  9, 61, 73]),
 array([29, 43, 31, 71, 54,  9, 58, 61]),
 array([30, 45,  2, 74,  2, 76, 67, 58]),
 array([25, 40, 73, 73, 76, 61, 24, 71]),
 array([27, 40, 61, 61, 68, 54,  2, 58]),
 array([29, 39, 54,  2, 66, 73, 33,  2]),
 array([ 1, 43, 73, 62, 54,  2, 72, 67])]

In [60]:
[ys[n][:cs] for n in range(cs)]

[array([42,  1, 38, 44,  2,  9, 61, 73]),
 array([29, 43, 31, 71, 54,  9, 58, 61]),
 array([30, 45,  2, 74,  2, 76, 67, 58]),
 array([25, 40, 73, 73, 76, 61, 24, 71]),
 array([27, 40, 61, 61, 68, 54,  2, 58]),
 array([29, 39, 54,  2, 66, 73, 33,  2]),
 array([ 1, 43, 73, 62, 54,  2, 72, 67]),
 array([ 1, 33,  2, 72, 67, 73,  2, 68])]

### Create and train model

In [47]:
dense_in = Dense(n_hidden, activation='relu')
dense_hidden = Dense(n_hidden, activation='relu', init='identity')
dense_out = Dense(vocab_size, activation='softmax', name='output')

We're going to pass a vector of all zeros as our starting point - here's our input layers for that:

In [48]:
inp1 = Input(shape=(n_fac,), name='zeros')
hidden = dense_in(inp1)

In [66]:
outs = []

for i in range(cs):
    c_dense = dense_in(c_ins[i][1])
    hidden = dense_hidden(hidden)
    hidden = merge([c_dense, hidden], mode='sum')
    # every layer now has an output
    outs.append(dense_out(hidden))

In [67]:
model = Model([inp1] + [c[0] for c in c_ins], outs)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [68]:
zeros = np.tile(np.zeros(n_fac), (len(xs[0]),1))
zeros.shape

(75110, 42)

In [94]:
get_nexts_keras(' this is')

[' ', 't', 'h', 'i', 's', ' ', 'i', 's']


['t', 'h', 'e', 's', ' ', 'i', 's', ' ']

### One-hot sequence model with keras

This is the keras version of the theano model that we're about to create.

In [95]:
model=Sequential([
        SimpleRNN(n_hidden, return_sequences=True, input_shape=(cs, vocab_size),
                  activation='relu', inner_init='identity'),
        TimeDistributed(Dense(vocab_size, activation='softmax')),
    ])
model.compile(loss='categorical_crossentropy', optimizer=Adam())

In [96]:
oh_ys = [to_categorical(o, vocab_size) for o in ys]
oh_y_rnn=np.stack(oh_ys, axis=1)

oh_xs = [to_categorical(o, vocab_size) for o in xs]
oh_x_rnn=np.stack(oh_xs, axis=1)

oh_x_rnn.shape, oh_y_rnn.shape

((75110, 8, 86), (75110, 8, 86))

In [97]:
model.fit(oh_x_rnn, oh_y_rnn, batch_size=64, nb_epoch=8)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f8210725c90>

In [104]:
def get_nexts_oh(inp):
    idxs = np.array([char_indices[c] for c in inp])
    arr = to_categorical(idxs, vocab_size)
    p = model.predict(arr[np.newaxis,:])[0]
    print(list(inp))
    return [chars[np.argmax(o)] for o in p]

In [84]:
get_nexts_oh(' this is')

[' ', 't', 'h', 'i', 's', ' ', 'i', 's']


['t', 'h', 'e', 's', ' ', 'i', 's', ' ']

## Stateful model with keras

In [290]:
bs=64

A stateful model is easy to create (just add "stateful=True") but harder to train. We had to add batchnorm and use LSTM to get reasonable results.

When using stateful in keras, you have to also add 'batch_input_shape' to the first layer, and fix the batch size there.

In [338]:
model=Sequential([
        Embedding(vocab_size, n_fac, input_length=cs, batch_input_shape=(bs,8)),
        BatchNormalization(),
        LSTM(n_hidden, return_sequences=True, stateful=True),
        TimeDistributed(Dense(vocab_size, activation='softmax')),
    ])

In [339]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

Since we're using a fixed batch shape, we have to ensure our inputs and outputs are a even multiple of the batch size.

In [340]:
mx = len(x_rnn)//bs*bs

In [341]:
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=4, shuffle=False)

INFO (theano.gof.compilelock): Refreshing lock /home/jhoward/.theano/compiledir_Linux-4.4--generic-x86_64-with-Ubuntu-16.04-xenial-x86_64-2.7.12-64/lock_dir/lock


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fa16f1d2690>

In [342]:
model.optimizer.lr=1e-4

In [343]:
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=4, shuffle=False)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fa1773b8c10>

In [344]:
model.fit(x_rnn[:mx], y_rnn[:mx], batch_size=bs, nb_epoch=4, shuffle=False)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fa1773b8d50>

## Pure python RNN!

### Set up basic functions

Now we're going to try to repeat the above theano RNN, using just pure python (and numpy). Which means, we have to do everything ourselves, including defining the basic functions of a neural net! Below are all of the definitions, along with tests to check that they give the same answers as theano. The functions ending in `_d` are the derivatives of each function.

In [33]:
def sigmoid(x): return 1/(1+np.exp(-x))
def sigmoid_d(x): 
    output = sigmoid(x)
    return output*(1-output)

In [34]:
def relu(x): return np.maximum(0., x)
def relu_d(x): return (x > 0.)*1.

In [35]:
relu(np.array([3.,-3.])), relu_d(np.array([3.,-3.]))

(array([ 3.,  0.]), array([ 1.,  0.]))

In [36]:
def dist(a,b): return pow(a-b,2)
def dist_d(a,b): return 2*(a-b)

In [37]:
import pdb

In [38]:
eps = 1e-7
def x_entropy(pred, actual): 
    return -np.sum(actual * np.log(np.clip(pred, eps, 1-eps)))
def x_entropy_d(pred, actual): return -actual/pred

In [39]:
def softmax(x): return np.exp(x)/np.exp(x).sum()

In [40]:
def softmax_d(x):
    sm = softmax(x)
    res = np.expand_dims(-sm,-1)*sm
    res[np.diag_indices_from(res)] = sm*(1-sm)
    return res

In [41]:
test_preds = np.array([0.2,0.7,0.1])
test_actuals = np.array([0.,1.,0.])
nnet.categorical_crossentropy(test_preds, test_actuals).eval()

array(0.35667494393873245)

In [42]:
x_entropy(test_preds, test_actuals)

0.35667494393873245

In [43]:
test_inp = T.dvector()
test_out = nnet.categorical_crossentropy(test_inp, test_actuals)
test_grad = theano.function([test_inp], T.grad(test_out, test_inp))

In [44]:
test_grad(test_preds)

array([-0.    , -1.4286, -0.    ])

In [45]:
x_entropy_d(test_preds, test_actuals)

array([-0.    , -1.4286, -0.    ])

In [114]:
pre_pred = random(oh_x_rnn[0][0].shape)
preds = softmax(pre_pred)
actual = oh_x_rnn[0][0]

In [119]:
np.allclose(softmax_d(pre_pred).dot(loss_d(preds,actual)), preds-actual)

True

In [46]:
softmax(test_preds)

array([ 0.2814,  0.464 ,  0.2546])

In [47]:
nnet.softmax(test_preds).eval()

array([[ 0.2814,  0.464 ,  0.2546]])

In [48]:
test_out = T.flatten(nnet.softmax(test_inp))

In [49]:
test_grad = theano.function([test_inp], theano.gradient.jacobian(test_out, test_inp))

In [50]:
test_grad(test_preds)

array([[ 0.2022, -0.1306, -0.0717],
       [-0.1306,  0.2487, -0.1181],
       [-0.0717, -0.1181,  0.1898]])

In [51]:
softmax_d(test_preds)

array([[ 0.2022, -0.1306, -0.0717],
       [-0.1306,  0.2487, -0.1181],
       [-0.0717, -0.1181,  0.1898]])

In [76]:
act=relu
act_d = relu_d

In [77]:
loss=x_entropy
loss_d=x_entropy_d

We also have to define our own scan function. Since we're not worrying about running things in parallel, it's very simple to implement:

In [54]:
def scan(fn, start, seq):
    res = []
    prev = start
    for s in seq:
        app = fn(prev, s)
        res.append(app)
        prev = app
    return res

...for instance, `scan` on `+` is the cumulative sum.

In [55]:
scan(lambda prev,curr: prev+curr, 0, range(5))

[0, 1, 3, 6, 10]

### Set up training

Let's now build the functions to do the forward and backward passes of our RNN. First, define our data and shape.

In [65]:
inp = oh_x_rnn
outp = oh_y_rnn
n_input = vocab_size
n_output = vocab_size

In [83]:
inp.shape, outp.shape

((75110, 8, 86), (75110, 8, 86))

Here's the function to do a single forward pass of an RNN, for a single character.

In [79]:
def one_char(prev, item):
    # Previous state
    tot_loss, pre_hidden, pre_pred, hidden, ypred = prev
    # Current inputs and output
    x, y = item
    pre_hidden = np.dot(x,w_x) + np.dot(hidden,w_h)
    hidden = act(pre_hidden)
    pre_pred = np.dot(hidden,w_y)
    ypred = softmax(pre_pred)
    return (
        # Keep track of loss so we can report it
        tot_loss+loss(ypred, y),
        # Used in backprop
        pre_hidden, pre_pred, 
        # Used in next iteration
        hidden, 
        # To provide predictions
        ypred)

We use `scan` to apply the above to a whole sequence of characters.

In [80]:
def get_chars(n): return zip(inp[n], outp[n])
def one_fwd(n): return scan(one_char, (0,0,0,np.zeros(n_hidden),0), get_chars(n))

Now we can define the backward step. We use a loop to go through every element of the sequence. The derivatives are applying the chain rule to each step, and accumulating the gradients across the sequence.

In [82]:
# "Columnify" a vector
def col(x): return x[:,newaxis]

def one_bkwd(args, n):
    global w_x,w_y,w_h

    i=inp[n]  # 8x86
    o=outp[n] # 8x86
    d_pre_hidden = np.zeros(n_hidden) # 256
    for p in reversed(range(len(i))):
        totloss, pre_hidden, pre_pred, hidden, ypred = args[p]
        x=i[p] # 86
        y=o[p] # 86
        d_pre_pred = softmax_d(pre_pred).dot(loss_d(ypred,y))  # 86
        d_pre_hidden = (np.dot(d_pre_hidden, w_h.T) 
                        + np.dot(d_pre_pred,w_y.T)) * act_d(pre_hidden) # 256

        # d(loss)/d(w_y) = d(loss)/d(pre_pred) * d(pre_pred)/d(w_y)
        w_y -= col(hidden) * d_pre_pred * alpha
        # d(loss)/d(w_h) = d(loss)/d(pre_hidden[p-1]) * d(pre_hidden[p-1])/d(w_h)
        if (p>0): w_h -= args[p-1][3].dot(d_pre_hidden) * alpha
        w_x -= col(x)*d_pre_hidden * alpha
    return d_pre_hidden

Now we can set up our initial weight matrices. Note that we're not using bias at all in this example, in order to keep things simpler.

In [126]:
scale=math.sqrt(2./n_input)
w_x = normal(scale=scale, size=(n_input,n_hidden))
w_y = normal(scale=scale, size=(n_hidden, n_output))
w_h = np.eye(n_hidden, dtype=np.float32)

Our loop looks much like the theano loop in the previous section, except that we have to call the backwards step ourselves.

In [127]:
overallError=0
alpha=0.0001
for n in range(10000):
    res = one_fwd(n)
    overallError+=res[-1][0]
    deriv = one_bkwd(res, n)
    if(n % 1000 == 999):
        print ("Error:{:.4f}; Gradient:{:.5f}".format(
                overallError/1000, np.linalg.norm(deriv)))
        overallError=0

Error:35.2380; Gradient:2.90002
Error:32.9176; Gradient:2.71170
Error:31.0649; Gradient:4.14135
Error:29.9798; Gradient:3.40467
Error:29.2453; Gradient:3.79049
Error:29.0070; Gradient:3.39826
Error:28.2358; Gradient:4.30422
Error:28.0086; Gradient:2.92011
Error:27.6885; Gradient:4.03503
Error:27.6905; Gradient:3.18526
