# RNN Character Model + Lots More

This example trains an RNN to create plausible words from a corpus.  But it also includes lots of interesting "bells and whistles".

The data used for training is one of :
  *  a vocabulary/dictionary collected from the 1-Billion-Word Corpus
  *  a list of Indian names (voters rolls, by year)


In [1]:
import numpy as np
import theano
import lasagne
#from lasagne.utils import floatX

import pickle
import gzip
import random

from collections import Counter

In [2]:
# Load an interesting corpus :

#corpus = gzip.open('./data/RNN/claims.txt.gz').read()
#corpus = gzip.open('./data/RNN/Shakespeare.plays.txt.gz').read()
#corpus = gzip.open('./data/RNN/Shakespeare.poetry.txt.gz').read()

# Every row of the vocabulary file becomes a list of its lower-cased,
# whitespace-separated fields
with open('../data/RNN/ALL_1-vocab.txt','rt') as vocab_file:
    lines = []
    for raw_line in vocab_file.readlines():
        lines.append( raw_line.strip().lower().split() )
lines[0:10]

In [3]:
#corpus.split('\n')[0]
# Keep only the entries whose word (the first field) is made of our characters : '[a-z\- ]'
import re
invalid_chars = r'[^a-z\- ]'
invalid_chars_re = re.compile(invalid_chars)
# A blank or truncated row has no word and/or no count field : drop it here, because
# it would otherwise raise an IndexError on l[0] (or, later on, when the count l[1] is read)
lines_valid = [ l for l in lines if len(l)>=2 and not invalid_chars_re.search(l[0]) ]
#lines_valid = lines_valid[0:50000]
lines_valid[0:10], len(lines_valid)

In [4]:
# Reference list of 'regular' words : /usr/share/dict/linux.words
linux_words = []
with open('/usr/share/dict/linux.words','rt') as f:
    for dict_line in f.readlines():
        linux_words.append( dict_line.strip() )
linux_wordset = set(linux_words)
#'united' in wordset

# Keep an entry only when its word passes both of the tests below
lines_filtered = []
for l in lines_valid:
    if len(l[0])<3:
        continue   # Require each word to have 3 or more characters
    if l[0] not in linux_wordset:
        continue   # Require each word to be found in regular dictionary
    lines_filtered.append(l)

In [5]:
# Pull the words and their counts out of the (word, count) entries
# (Assume these are in sorted order, at least initial few)
words, counts = [], []
for entry in lines_filtered:
    words.append( entry[0] )
    counts.append( int(entry[1]) )
wordset = set(words)
wordsnp = np.array(words)
freqs_raw = np.array(counts)

freq_tot = float(freqs_raw.sum())

# Frequency weighting adjustments
freqs = freqs_raw / freq_tot

cutoff_index = 30   # All words with higher frequencies will be 'limited' at this level
freqs[:cutoff_index] = freqs[cutoff_index]

# Re-normalise, since capping the top entries reduced the total
freqs /= freqs.sum()
freqs[:50]

In [6]:
# Small demonstration of searchsorted() : each probe value is mapped to the index
# of the first cumulative bucket that is >= the probe
test_cum = np.array( [.1, .5, .9, 1.0] )
np.searchsorted(test_cum, [ .05, 0.45, .9, .95])

In [7]:
# Running total of the word frequencies : a uniform random draw can then be turned
#   into a frequency-weighted word index with a single searchsorted() call, see
#   http://docs.scipy.org/doc/numpy/reference/generated/numpy.searchsorted.html
freqs_cum = np.cumsum(freqs)
(freqs_cum[:10], freqs_cum[-10:])

### Network Parameters from Corpus
Find the set of characters used in the corpus and construct mappings between characters, integer indices, and one hot encodings

In [8]:
# The complete alphabet : lower-case letters, the hyphen, and a space
# (the space doubles as the start/end-of-word marker in the n-gram code)
CHARS_VALID = "abcdefghijklmnopqrstuvwxyz- "
CHARS_SIZE  = len(CHARS_VALID)

# character -> integer index, and the inverse mapping
CHAR_TO_IX = {c: i for i, c in enumerate(CHARS_VALID)}
IX_TO_CHAR = dict(enumerate(CHARS_VALID))
# character -> one-hot vector : row i of a single identity matrix, which is built
# once here instead of once per character
CHAR_TO_ONEHOT = dict(zip(CHARS_VALID, np.eye(CHARS_SIZE)))
#CHAR_TO_IX

###  Unigram frequency distribution

In [9]:
# Single letter frequencies, each word contributing in proportion to its own frequency
unigram_freq = np.zeros( (CHARS_SIZE,))
idx_end = CHAR_TO_IX[' ']
for w, word_freq in zip(words, freqs):
    # The space appended here is the end-of-word marker (its index is idx_end)
    for c in w + ' ':
        unigram_freq[ CHAR_TO_IX[c] ] += word_freq
unigram_freq /= unigram_freq.sum()
unigram_freq_cum = unigram_freq.cumsum()
[ (char, "%6.3f" % f) for char, f in zip(CHARS_VALID, unigram_freq.tolist()) ]
#CHARS_VALID[ unigram_freq_cum.searchsorted(0.20) ]

In [10]:
def unigram_word():
    """Build one random 'word' from letters drawn independently via `unigram_freq_cum`.

    Letters are collected until the end-of-word marker (' ') is drawn.  A marker
    drawn before any letter has been collected is ignored, so the returned
    string is never empty.
    """
    letters = []
    while True:
        draw = np.random.uniform()
        char = IX_TO_CHAR[ np.searchsorted(unigram_freq_cum, draw) ]
        if char != ' ':
            letters.append(char)
        elif letters:
            # End-of-word marker after at least one letter : the word is complete
            return ''.join(letters)
' '.join( unigram_word() for i in range(0,20) )

###  Bigram frequency distribution

In [11]:
# two-letter frequencies : bigram_freq[a, b] accumulates the weight of every word
#   in which the character b directly follows the character a
bigram_freq = np.zeros( (CHARS_SIZE,CHARS_SIZE) )
for i,w in enumerate(words):
    w2 = ' '+w+' '   # the surrounding spaces mark the start and the end of the word
    word_freq = freqs[i]
    for j in range(0, len(w2)-1):
        bigram_freq[ CHAR_TO_IX[ w2[j] ], CHAR_TO_IX[ w2[j+1] ] ] += word_freq
#[ (CHARS_VALID[i], "%6.3f" % f) for i,f in enumerate(bigram_freq[ CHAR_TO_IX['q'] ].tolist()) ]
#bigram_freq.sum(axis=1)[CHAR_TO_IX['q']]
# Normalise every row into a distribution over the next character.
# A character that never occurs in `words` has a row total of zero : such a row
# is left as all zeros, rather than being turned into NaNs by a 0/0 division.
bigram_row_tot = bigram_freq.sum(axis=1)[:, np.newaxis] # Trick to enable unflattening of sum()
bigram_freq = np.divide(bigram_freq, bigram_row_tot,
                        out=np.zeros_like(bigram_freq), where=(bigram_row_tot>0))
bigram_freq_cum = bigram_freq.cumsum(axis=1)
#[ (CHARS_VALID[i], "%6.3f" % f) for i,f in enumerate(bigram_freq_cum[ CHAR_TO_IX['q'] ].tolist()) ]

In [12]:
# Letter relative frequency for letters following 'q'
# (only the letters with a frequency above 0.001 are listed)
after_q = bigram_freq[ CHAR_TO_IX['q'] ].tolist()
[ (char, "%6.3f" % f) for char, f in zip(CHARS_VALID, after_q) if f>0.001 ]

In [13]:
def bigram_word():
    """Build one random 'word' where each letter is drawn conditionally on the previous one.

    Sampling starts from the start-of-word marker (' ') and uses the matching
    row of `bigram_freq_cum` for every draw.  The word ends when the marker is
    drawn again; a marker drawn before any letter has been collected is
    ignored, so the returned string is never empty.
    """
    letters = []
    prev_ix = CHAR_TO_IX[' ']
    while True:
        ix = np.searchsorted(bigram_freq_cum[prev_ix], np.random.uniform())
        char = IX_TO_CHAR[ix]
        if char != ' ':
            letters.append(char)
            prev_ix = ix
        elif letters:
            #if len(letters)<50: continue
            return ''.join(letters)
' '.join( bigram_word() for i in range(0,20) )

###  Trigram frequency distribution 

In [14]:
# Three-letter frequencies : trigram_freq[a, b, c] accumulates the weight of every
#   word in which the characters a,b are directly followed by the character c
trigram_freq = np.zeros( (CHARS_SIZE,CHARS_SIZE,CHARS_SIZE) )
for i,w in enumerate(words):
    w3 = '  '+w+'  '   # two spaces of padding, so the first and last letters get a full context
    word_freq = freqs[i]
    for j in range(0, len(w3)-2):
        trigram_freq[ CHAR_TO_IX[ w3[j] ], CHAR_TO_IX[ w3[j+1] ], CHAR_TO_IX[ w3[j+2] ] ] += word_freq
# Normalise every (a,b) context into a distribution over the next character.
# Any two-letter context that never occurs in `words` has a total of zero : it is
# left as all zeros, rather than being turned into NaNs by a 0/0 division.
trigram_context_tot = trigram_freq.sum(axis=2)[:, :, np.newaxis] # Trick to enable unflattening of sum()
trigram_freq = np.divide(trigram_freq, trigram_context_tot,
                         out=np.zeros_like(trigram_freq), where=(trigram_context_tot>0))
trigram_freq_cum = trigram_freq.cumsum(axis=2)
[ "ex-%s %6.3f" % (CHARS_VALID[i], f) 
    for i,f in enumerate(trigram_freq[ CHAR_TO_IX['e'],  CHAR_TO_IX['x'] ].tolist()) if f>0.001 ]

In [15]:
def trigram_word():
    """Build one random 'word' where each letter is drawn conditionally on the previous two.

    The two-character context starts as two start-of-word markers (' ') and
    selects the row of `trigram_freq_cum` used for every draw.  The word ends
    when the marker is drawn; a marker drawn before any letter has been
    collected is ignored, so the returned string is never empty.
    """
    letters = []
    older_ix = newer_ix = CHAR_TO_IX[' ']
    while True:
        ix = np.searchsorted(trigram_freq_cum[older_ix, newer_ix], np.random.uniform())
        char = IX_TO_CHAR[ix]
        if char != ' ':
            letters.append(char)
            # Slide the context window along by one character
            older_ix, newer_ix = newer_ix, ix
        elif letters:
            #if len(letters)<50: continue
            return ''.join(letters)
' '.join( trigram_word() for i in range(0,20) )

### Generate base-line scores

In [16]:
# Baseline : what percentage of the words produced by each n-gram generator
# are genuine vocabulary words?
sample_size=10000
ngram_generators = (unigram_word, bigram_word, trigram_word)
ngram_hits = [
    sum( 1 for w in [ make_word() for i in range(0, sample_size) ] if w in wordset )
    for make_word in ngram_generators
]
for order, hits in enumerate(ngram_hits, 1):
    print("%d-gram : %4.2f%%"  % (order, hits*100./sample_size ))
#[ (i,w) for i,w in enumerate(words) if 'mq' in w]

In [None]:
# This just checks the distribution of unigrams
# (switched off : change the condition below to run it)
if False:
    sample_size=1000
    arr = [ unigram_word() for i in range(0, sample_size) ]
    s = ' '.join(arr)
    s_len = len(s)
    for c in CHARS_VALID:
        # Occurrences of c, as a percentage of all the characters generated
        f = s.count(c)
        print("%s -> %6.3f%%" % (c, f*100./s_len))

### RNN Main Parameters

In [66]:
# NOTE(review): SEQUENCE_LENGTH_MAX is not referenced by any other visible cell;
# presumably the longest word (in characters) to feed to the RNN - TODO confirm
SEQUENCE_LENGTH_MAX = 16
# Total words per training batch : each batch generator defaults to half of this
BATCH_SIZE = 64
# Number of GRU units (set to the same value as the number of characters)
RNN_HIDDEN_SIZE = CHARS_SIZE

# Passed to the GRU layer as its `grad_clipping` value
GRAD_CLIP_BOUND = 5.0

## An RNN 'discriminator'

Instead of having a binary 'YES/NO' decision about whether a word is valid (via a lookup in the vocabulary), it may make it simpler to train a word-generator if we can assign a probability that a given word is valid.  

To do this, let's create a recurrent neural network (RNN) that accepts a (one-hot-encoded) word as input, and (at the end of the sequence) gives us an estimate of the probability that the word is valid.  

Actually, rather than discriminate according to whether the word is *actually* valid, let's 'just' try to decide whether it was produced directly from the dictionary or from the ```bigram_word()``` source.

This can be tested by giving it lists of actual words, and lists of words generated by ```bigram_word()```, and seeing whether they can be correctly classified.  

The decision about what to do in the 12% of cases when the bigram function results in a valid word can be left until later...  (since the distribution is so heavily skewed towards producing non-words).

### Create Training / Testing dataset
And a 'batch generator' function that delivers data in the right format for RNN training

In [67]:
def batch_generator_dictionary(size=BATCH_SIZE//2):
    """Endlessly yield arrays of `size` genuine words, sampled according to `freqs`.

    Each uniform random draw is mapped to a word index by a searchsorted()
    lookup in the cumulative frequencies `freqs_cum`.

    size : number of words per yielded batch.  The default uses integer
           division, because `BATCH_SIZE/2` is a float under Python 3 and
           cannot be used as an array dimension.
    """
    size = int(size)   # also accept an integral float passed in by a caller
    # Rounding in the cumulative sum can leave freqs_cum[-1] marginally below 1.0,
    # in which case searchsorted() could return an index one past the end
    last_idx = len(wordsnp) - 1
    while True:
        uniform_vars = np.random.uniform( size=(size,) )
        idx = np.minimum( freqs_cum.searchsorted(uniform_vars), last_idx )
        yield wordsnp[ idx ]
    
def batch_generator_bigram(size=BATCH_SIZE//2):
    """Endlessly yield arrays of `size` artificial words produced by `bigram_word()`.

    size : number of words per yielded batch.  The default uses integer
           division, because `BATCH_SIZE/2` is a float under Python 3 and
           range() does not accept a float.
    """
    size = int(size)   # also accept an integral float passed in by a caller
    while True:
        yield np.array([ bigram_word() for i in range(size) ])

In [68]:
## Our batch generator will yield BATCH_SIZE/2 words, with their correct classification
#def data_batch_generator(corpus, size=BATCH_SIZE):
#    startidx = np.random.randint(0, len(corpus) - SEQUENCE_LENGTH - 1, size=size)
#
#    while True:
#        items = np.array([corpus[start:start + SEQUENCE_LENGTH + 1] for start in startidx])
#        startidx = (startidx + SEQUENCE_LENGTH) % (len(corpus) - SEQUENCE_LENGTH - 1)
#        yield items

In [69]:
# Test it out : show three small batches from one of the generators
#gen = batch_generator_dictionary(size=4)
gen = batch_generator_bigram(size=4)
for _ in range(3):
    print(next(gen))

#### Lasagne RNN tutorial (including conventions & rationale)

*  http://colinraffel.com/talks/hammer2015recurrent.pdf

#### Lasagne Examples

*  https://github.com/Lasagne/Lasagne/blob/master/lasagne/layers/recurrent.py
*  https://github.com/Lasagne/Recipes/blob/master/examples/lstm_text_generation.py

#### Good blog post series

*  http://www.wildml.com/2015/10/recurrent-neural-network-tutorial-part-4-implementing-a-grulstm-rnn-with-python-and-theano/

In [70]:
# After sampling a data batch, we transform it into a one hot feature representation
# and create a target sequence by shifting by one character
#def prep_batch_for_network(batch):
#    x_seq = np.zeros((len(batch), SEQUENCE_LENGTH, VOCAB_SIZE), dtype='float32')
#    y_seq = np.zeros((len(batch), SEQUENCE_LENGTH), dtype='int32')
#
#    for i, item in enumerate(batch):
#        for j in range(SEQUENCE_LENGTH):
#            x_seq[i, j] = CHAR_TO_ONEHOT[ item[j] ]
#            #x_seq[i, j, :] = CHAR_TO_ONEHOT[ item[j] ]
#            y_seq[i, j] = CHAR_TO_IX[ item[j+1] ]
#
#    return x_seq, y_seq

### Define the Discriminating Network Symbolically

In [71]:
# Symbolic variables for the discriminator : the input features, a mask and the target.
# (A symbolic initial value for the RNN layer's hidden state is left commented out below)
disc_input_sym = theano.tensor.tensor3()  # 3-d; presumably (batch_size, sequence_len, chars_size) - TODO confirm
disc_mask_sym  = theano.tensor.matrix()   # 2-d; presumably (batch_size, sequence_len) - TODO confirm

#disc_rnn1_t0_sym = theano.tensor.matrix()

disc_target_sym = theano.tensor.vector()  # a list of probabilities of being from the dictionary

In [81]:
# Our network has a single GRU layer processing the input sequence
# (a second, stacked GRU layer is left commented out further down).
disc_input = lasagne.layers.InputLayer( (None, None, CHARS_SIZE) )  # batch_size, sequence_len, chars_size
# The mask holds one entry per (word, timestep), so it is 2-d : this matches
# the `disc_mask_sym` matrix that is substituted for it when the output is built
disc_mask  = lasagne.layers.InputLayer( (None, None) )              # batch_size, sequence_len

#disc_rnn1_t0 = lasagne.layers.InputLayer( (None, RNN_HIDDEN_SIZE) )  # batch_size, RNN_hidden_size=chars_size
#l_input_hid2 = lasagne.layers.InputLayer((None, RNN_HIDDEN_SIZE))

disc_rnn1 = lasagne.layers.GRULayer(disc_input,
                                      num_units=RNN_HIDDEN_SIZE,
                                      gradient_steps=-1,
                                      grad_clipping=GRAD_CLIP_BOUND,
                                      #hid_init=disc_rnn1_t0,
                                      learn_init=True,
                                      mask_input=disc_mask,
                                      only_return_final=True, # Only the state at the last timestep is needed
                                    )

#l_rnn2 = lasagne.layers.GRULayer(l_rnn,
#                                  num_units=RNN_HIDDEN_SIZE,
#                                  grad_clipping=5.,
#                                  hid_init=l_input_hid2,
#                                  #learn_init=True,
#                                  )

# (Not needed here, since only the final state is returned by the GRU layer :)
# Before the decoder layer, we need to reshape the sequence into the batch dimension,
# so that timesteps are decoded independently.
#l_shp = lasagne.layers.ReshapeLayer(l_rnn2, (-1, RNN_HIDDEN_SIZE))

# A single sigmoid unit on top of the final GRU state : one value in (0,1) per word
disc_decoder = lasagne.layers.DenseLayer(disc_rnn1,
                   num_units=1,
                   nonlinearity=lasagne.nonlinearities.sigmoid
               )

#disc_out = lasagne.layers.ReshapeLayer(disc_decoder, (-1, SEQUENCE_LENGTH, VOCAB_SIZE))
#disc_final_out = disc_decoder

In [82]:
# Finally, the output stage : build the symbolic output of the decoder layer, with
# our own symbolic variables standing in for the two input layers
disc_layer_inputs = {
    disc_input: disc_input_sym,
    disc_mask: disc_mask_sym,
    #disc_rnn1_t0: disc_rnn1_t0_sym,
}
disc_out = lasagne.layers.get_output(disc_decoder, disc_layer_inputs)

#disc_rnn1_out_last  = disc_rnn1_out[:, -1]
#hid2_out_last = hid2_out[:, -1]

### Loss Function for Training

In [83]:
## (Loss of the sequence-to-sequence model, kept for reference only)
## We flatten the sequence into the batch dimension before calculating the loss
#def calc_cross_ent(net_output, targets):
#    preds = T.reshape(net_output, (-1, VOCAB_SIZE))
#    targets = T.flatten(targets)
#    cost = T.nnet.categorical_crossentropy(preds, targets)
#    return cost
#
#loss = T.mean(calc_cross_ent(prob_out, y_sym))

# The decoder is a dense layer with a single unit, so `disc_out` is a
# (batch_size, 1) matrix, whereas `disc_target_sym` is a vector : flatten the
# output so that the two line up element by element.  The per-word losses are
# then averaged, because the gradient can only be taken of a scalar cost.
disc_loss = theano.tensor.nnet.binary_crossentropy(disc_out.flatten(), disc_target_sym).mean()

In [84]:
# For stability during training, gradients are clipped inside the GRU layer (via grad_clipping);
# the total gradient norm constraint is left commented out below
#MAX_GRAD_NORM = 15

# The parameter helpers need the top *layer* of the network (disc_decoder),
# not the symbolic expression (disc_out) that was built from it
disc_params = lasagne.layers.get_all_params(disc_decoder, trainable=True)

# NOTE(review): this is a snapshot of the values at the moment this cell runs, i.e.
# before any training - presumably it should be taken again after training, before saving
disc_param_values = lasagne.layers.get_all_param_values(disc_decoder)
# NOTE(review): the weight-loading cells further down read the key 'param values',
# whereas the values are stored under 'params' here - confirm which one is intended
param_dictionary = dict(
     params = disc_param_values,
     CHARS_VALID = CHARS_VALID, 
     CHAR_TO_IX = CHAR_TO_IX,
     IX_TO_CHAR = IX_TO_CHAR,
    )

# The cost being differentiated must be a scalar : mean() leaves a loss that is
# already a scalar unchanged, and averages a per-word loss
disc_grads = theano.tensor.grad(disc_loss.mean(), disc_params)
#disc_grads = [theano.tensor.clip(g, -GRAD_CLIP_BOUND, GRAD_CLIP_BOUND) for g in disc_grads]
#disc_grads, disc_norm = lasagne.updates.total_norm_constraint( disc_grads, MAX_GRAD_NORM, return_norm=True)

disc_updates = lasagne.updates.adam(disc_grads, disc_params) # , learning_rate=0.002)

# The loss depends on the targets, so `disc_target_sym` has to be an input of both functions
# disc_train(input, mask, target) -> [loss, output], and applies one adam update step
disc_train = theano.function([disc_input_sym, disc_mask_sym, disc_target_sym],  # , disc_rnn1_t0_sym
                          [disc_loss, disc_out],  #  norm, hid_out_last, hid2_out_last
                          updates=disc_updates,
                         )

# disc_val(input, mask, target) -> [loss, output], without updating any parameters
disc_val = theano.function([disc_input_sym, disc_mask_sym, disc_target_sym], [disc_loss, disc_out])

### Finally, the Training Loop

Training takes a while :: 100 iterations take about 2 minutes on a CPU (so, overall, it could be HOURS without a GPU)

... you may want to skip this and the next cell, and load the pretrained weights instead

In [None]:
# NOTE(review): this loop does not use the discriminator functions built above.
# `train_corpus` and `f_train` are not defined in any visible cell, while
# `data_batch_generator` and `prep_batch_for_network` only appear as commented-out
# code - so this cell is expected to fail with a NameError as it stands.

# Hidden states of the two GRU layers, carried over from one batch to the next
hid = np.zeros((BATCH_SIZE, RNN_HIDDEN_SIZE), dtype='float32')
hid2 = np.zeros((BATCH_SIZE, RNN_HIDDEN_SIZE), dtype='float32')

train_batch_gen = data_batch_generator(train_corpus)

for iteration in range(2*100*100):
    x, y = prep_batch_for_network(next(train_batch_gen))
    #print(iteration, np.shape(x), np.shape(y), np.shape(hid), np.shape(hid2))
    loss_train, norm, hid, hid2 = f_train(x, y, hid, hid2)
    
    # Report progress once every 100 iterations
    if iteration % 100 == 0:
        print('Iteration {}, loss_train: {}, norm: {}'.format(iteration, loss_train, norm))

### Save the learned parameters

Uncomment the ```pickle.dump()``` to actually save to disk

In [None]:
#pickle.dump(param_dictionary, open('./data/RNN/gru_2layer_trained.pkl','w'), protocol=pickle.HIGHEST_PROTOCOL)

### Load pretrained weights into network

In [None]:
# SECURITY: pickle.load() can execute arbitrary code - only load weight files from a trusted source.
# The file is opened in binary mode (which pickle data requires) inside a `with`
# block, so that the file handle is always closed again.
with open('./data/RNN/gru_2layer_trained_claims.pkl', 'rb') as param_file:
    param_dictionary = pickle.load(param_file)
# NOTE(review): `l_out` is only created further down, by the cell that rebuilds the
# network for text generation - run that cell first, or confirm the intended order
lasagne.layers.set_all_param_values(l_out, param_dictionary['param values'])

### Produce a Validation Score

In [None]:
# Compiled prediction function : (input, two initial hidden states) -> (probabilities, two final hidden states)
# NOTE(review): `x_sym`, `hid_init_sym` and `hid2_init_sym` are not defined in any visible
# cell, and `prob_out`, `hid_out_last`, `hid2_out_last` are only created in a later cell
predict_fn = theano.function([x_sym, hid_init_sym, hid2_init_sym], [prob_out, hid_out_last, hid2_out_last])

In [None]:
# Calculate validation loss (this takes a minute or so on a CPU)
# NOTE(review): `val_corpus` and `f_val` are not defined in any visible cell, while
# `data_batch_generator` and `prep_batch_for_network` only appear as commented-out
# code - so this cell is expected to fail with a NameError as it stands.

# Hidden states of the two GRU layers, carried over from one batch to the next
hid = np.zeros((BATCH_SIZE, RNN_HIDDEN_SIZE), dtype='float32')
hid2 = np.zeros((BATCH_SIZE, RNN_HIDDEN_SIZE), dtype='float32')

val_batch_gen = data_batch_generator(val_corpus)

losses = []

# Average the loss over 50 validation batches
for iteration in range(50):
    x, y = prep_batch_for_network(next(val_batch_gen))
    #print(iteration, np.shape(x), np.shape(y), np.shape(hid), np.shape(hid2))
    loss_val, hid, hid2 = f_val(x, y, hid, hid2)
    losses.append(loss_val)
    
print(np.mean(losses))  # Preloaded data gives a result of 0.89385

### Create a network that's optimised to *produce* text

For faster sampling, we rebuild the network with a sequence length of 1

In [None]:
# Text-generation network : two stacked GRU layers followed by a softmax decoder,
# taking one timestep of input at a time (the sequence dimension has length 1).
# NOTE(review): `VOCAB_SIZE`, `x_sym`, `hid_init_sym` and `hid2_init_sym` are not
# defined in any visible cell (VOCAB_SIZE only appears in commented-out code).
l_input = lasagne.layers.InputLayer((None, 1, VOCAB_SIZE))
# The initial hidden state of each GRU layer is supplied as an extra network input
l_input_hid = lasagne.layers.InputLayer((None, RNN_HIDDEN_SIZE))
l_input_hid2 = lasagne.layers.InputLayer((None, RNN_HIDDEN_SIZE))

l_rnn = lasagne.layers.GRULayer(l_input,
                                  num_units=RNN_HIDDEN_SIZE,
                                  grad_clipping=5.,
                                  hid_init=l_input_hid,
                                  )

l_rnn2 = lasagne.layers.GRULayer(l_rnn,
                                  num_units=RNN_HIDDEN_SIZE,
                                  grad_clipping=5.,
                                  hid_init=l_input_hid2,
                                  )


# Merge the sequence dimension into the batch dimension for the dense decoder ...
l_shp = lasagne.layers.ReshapeLayer(l_rnn2, (-1, RNN_HIDDEN_SIZE))

l_decoder = lasagne.layers.DenseLayer(l_shp,
                                      num_units=VOCAB_SIZE,
                                      nonlinearity=lasagne.nonlinearities.softmax)

# ... and split it back out again afterwards
l_out = lasagne.layers.ReshapeLayer(l_decoder, (-1, 1, VOCAB_SIZE))

hid_out, hid2_out, prob_out = lasagne.layers.get_output([l_rnn, l_rnn2, l_out], {
                                                         l_input: x_sym,
                                                         l_input_hid: hid_init_sym,
                                                         l_input_hid2: hid2_init_sym,
                                                        })
# Keep only the last timestep of each output
hid_out_last  = hid_out[:, -1]
hid2_out_last = hid2_out[:, -1]
prob_out_last = prob_out[0, -1]   # the first batch entry only

In [None]:
# Copy the stored weights into the rebuilt network
# NOTE(review): this reads the key 'param values', whereas the dictionary built in the
# discriminator section stores its values under 'params' - confirm which dictionary is meant
lasagne.layers.set_all_param_values(l_out, param_dictionary['param values'])

In [None]:
# Compiled single-step prediction : (input, two hidden states) -> (next-character probabilities, two new hidden states)
# NOTE(review): `x_sym`, `hid_init_sym` and `hid2_init_sym` are not defined in any visible cell
predict_fn = theano.function([x_sym, hid_init_sym, hid2_init_sym], [prob_out_last, hid_out_last, hid2_out_last])

### Finally, Produce some text
We will use random sentences from the validation corpus to 'prime' the network

In [None]:
# One candidate primer per line of the validation corpus
# NOTE(review): `val_corpus` is not defined in any visible cell
primers = val_corpus.split('\n')

We feed characters one at a time from the priming sequence into the network.

To obtain a sample string, at each timestep we sample from the output probability distribution, and feed the chosen character back into the network. We terminate after the first linebreak.

In [None]:
# Prime the network with a randomly chosen primer, then sample up to 500 characters from it.
# NOTE(review): `VOCAB_SIZE` is not defined in any visible cell.
# NOTE(review): CHARS_VALID (on which CHAR_TO_ONEHOT and IX_TO_CHAR are built) contains
# no '\n', so the lookup of the primer's final '\n' below raises a KeyError, and the
# `sentence[-1] == '\n'` stopping test can never be true with these tables.
sentence = ''
hid = np.zeros((1, RNN_HIDDEN_SIZE), dtype='float32')
hid2 = np.zeros((1, RNN_HIDDEN_SIZE), dtype='float32')
x = np.zeros((1, 1, VOCAB_SIZE), dtype='float32')

primer = np.random.choice(primers) + '\n'

# Feed the primer in one character at a time, carrying the hidden states forward.
# (the first call sees the all-zeros x, since x is only filled in after each call)
for c in primer:
    p, hid, hid2 = predict_fn(x, hid, hid2)
    x[0, 0, :] = CHAR_TO_ONEHOT[c]
    
for _ in range(500):
    p, hid, hid2 = predict_fn(x, hid, hid2)
    # Scale the probabilities down very slightly - presumably so that rounding errors
    # cannot push their sum above 1, which np.random.multinomial rejects (TODO confirm)
    p = p/(1 + 1e-6)
    s = np.random.multinomial(1, p)   # a one-hot sample of the next character
    sentence += IX_TO_CHAR[s.argmax(-1)]
    x[0, 0, :] = s                    # the sampled character becomes the next input
    if sentence[-1] == '\n':
        break
        
print('PRIMER: ' + primer)
print('GENERATED: ' + sentence)

Exercises
=====

1. Implement sampling using the "temperature softmax": $$p(i) = \frac{e^{\frac{z_i}{T}}}{\Sigma_k e^{\frac{z_k}{T}}}$$

This generalizes the softmax with a parameter $T$ which affects the "sharpness" of the distribution. Lowering $T$ will make samples less error-prone but more repetitive. 

In [None]:
# Uncomment and run this cell for a solution
#%load spoilers/tempsoftmax.py