# RNN Tagger

This example trains an RNN to tag words from a corpus.

The data used for training is from a Wikipedia download, which is then artificially annotated with parts of speech by the NLTK PoS tagger written by Matthew Honnibal.


Adversarial networks : http://carpedm20.github.io/faces/

Doing this with RNNs may be pretty novel : https://www.quora.com/Can-generative-adversarial-networks-be-used-in-sequential-data-in-recurrent-neural-networks-How-effective-would-they-be

In [None]:
#import gzip
import pickle
import random
import time

import numpy as np
import theano
import lasagne

SENTENCE_LENGTH_MAX = 32
EMBEDDING_DIM=64

## Basic Text and Parsing Tools

In [None]:
import nltk

In [None]:
sentence_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
sentence_splitter.tokenize("This is Mr. Smith's tokenize test. The U.S.A gives us sent two. Is this sent three?")

In [None]:
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize("This is Mr. Smith's tokenize test.")

In [None]:
# An interesting corpus :
corpus_text_file = '../data/RNN/en.wikipedia.2010.100K.txt'

In [None]:
def corpus_sentence_tokens(corpus_text_file=corpus_text_file):
    """Endlessly yield tokenised sentences from a tab-separated corpus file.

    Each line is expected to hold a leading number, a tab, and then the text.
    Everything after the first tab is split into sentences, and each sentence
    is yielded as a list of word tokens.  When the end of the file is reached
    the file is re-opened and reading starts again from the top.
    """
    while True:
        with open(corpus_text_file, 'rt') as f:
            # Stream the file line-by-line rather than loading it all with readlines()
            for line in f:
                #print(line)
                if isinstance(line, bytes):  # Python 2 gives bytes; Python 3 text mode is already decoded
                    line = line.decode("utf-8")
                # Strip off the initial numbers : split on the first tab only, so that
                # tabs inside the text (or a missing tab) do not raise an unpacking error
                _, _, text = line.partition('\t')
                for s in sentence_splitter.tokenize(text):  # Split the lines into sentences (~1 each)
                    yield tokenizer.tokenize(s)
        print("Corpus : Looping")
corpus_sentence_tokens_gen = corpus_sentence_tokens()

In [None]:
' | '.join(next(corpus_sentence_tokens_gen))

## Reference Tagger

In [None]:
from nltk.tag.perceptron import PerceptronTagger
pos_tagger = PerceptronTagger(load=True)
' | '.join(list(pos_tagger.classes))

In [None]:
s = "Do part of speech on this sample text .".split(' ')
#s = next(corpus_sentence_tokens_gen)
pos_tagger.tag(s)

### Twist : Not interested in all classes...

To simplify (dramatically), let's just look at 'is ordinary word' and 'is entity name'.

In [None]:
# Only two output classes : 'O' (ordinary word) and 'E' (entity name)
tag_list = ['O', 'E']
TAG_SET_SIZE = len(tag_list)

# Tagger classes that count as entity names map to index 1, everything else to 0
pos_tagger_entity_tags = {'NNP'}
pos_tagger_to_idx = {
    tag: int(tag in pos_tagger_entity_tags)
    for tag in pos_tagger.classes
}

pos_tagger_to_idx['NNP'], pos_tagger_to_idx['VBP']

## GloVe Word Embeddings
Using the python package :  https://github.com/maciejkula/glove-python , and code samples from : http://developers.lyst.com/2014/11/11/word-embeddings-for-fashion/

### Create the Co-occurrence Matrix
This should take 30 seconds or so.

In [None]:
import glove
# Pull the next 100K tokenised sentences from the corpus generator
corpus_sentences = [ 
        [ w.lower() for w in next(corpus_sentence_tokens_gen)] # All lower-case
        for _ in range(0,100*1000) 
    ]

# Fit the co-occurrence matrix using a sliding window of 10 words.
glove_corpus = glove.Corpus()

# Time the fit so that the cost of this step is visible in the notebook output
t0 = time.time()
glove_corpus.fit(corpus_sentences, window=10)
print("Co-occurrence calculated in %5.1fsec" % (time.time()-t0, ))

In [None]:
# Return the index of the word in the dictionary
glove_corpus.dictionary['city']

###  Create the Word Embedding

This will make use of up to 4 threads - and each epoch takes 20-30 seconds on a single core.

In [None]:
# Train GloVe vectors with EMBEDDING_DIM components on the co-occurrence matrix
word_embedding = glove.Glove(no_components=EMBEDDING_DIM, learning_rate=0.05)
t0 = time.time()
glove_epochs, glove_threads = 20, 4 
word_embedding.fit(glove_corpus.matrix, epochs=glove_epochs, no_threads=glove_threads, verbose=True)
# NOTE: the second figure is elapsed time per epoch multiplied by the thread count,
# i.e. thread-seconds per epoch rather than wall-clock seconds per epoch.
print("Word-embedding created in %5.1fsec = %5.1fsec per epoch" % (
        (time.time()-t0), (time.time()-t0)/glove_epochs*glove_threads, ))

# Add the word -> id dictionary to the model to allow similarity queries.
word_embedding.add_dictionary(glove_corpus.dictionary)

In [None]:
word_embedding.save("../data/RNN/glove.embedding.64.pkl")

In [None]:
## Alternative :: Load a pre-trained word embedding
#word_embedding = glove.Glove.load("../data/RNN/glove.embedding.64.pretrained.pkl")

###  Test Word Embedding


In [None]:
# word-similarity test
word_embedding.most_similar('island')

In [None]:
# word-analogy test
def get_embedding_vec(word):
    """Return the embedding vector stored for `word` (looked up in lower-case)."""
    return word_embedding.word_vectors[glove_corpus.dictionary[word.lower()]]

def get_closest_word(vec, number=4):
    """Return up to `number` (word, similarity) pairs closest to `vec`.

    Similarity is the dot product of each embedding row with `vec`, divided by
    both norms.  Ids missing from the inverse dictionary are left out, so fewer
    than `number` pairs may be returned.
    """
    vectors = word_embedding.word_vectors
    id_to_word = word_embedding.inverse_dictionary

    similarity = np.dot(vectors, vec) / np.linalg.norm(vectors, axis=1) / np.linalg.norm(vec)
    ranked_ids = np.argsort(-similarity)  # Most similar first

    closest = []
    for word_id in ranked_ids[:number]:
        if word_id in id_to_word:
            closest.append((id_to_word[word_id], similarity[word_id]))
    return closest

In [None]:
analogy_vec = get_embedding_vec('man') - get_embedding_vec('woman') + get_embedding_vec('king')
#analogy_vec = get_embedding_vec('paris') - get_embedding_vec('france') + get_embedding_vec('italy')
#analogy_vec = get_embedding_vec('laugh') - get_embedding_vec('happy') + get_embedding_vec('sad')
#analogy_vec = get_embedding_vec('kitten') - get_embedding_vec('cat') + get_embedding_vec('dog')
get_closest_word(analogy_vec)

###  Trigram frequency distribution 

### Generate base-line scores

### RNN Main Parameters

In [None]:
BATCH_SIZE = 64
RNN_HIDDEN_SIZE = EMBEDDING_DIM + 1  # 1 for capitalisation flag
GRAD_CLIP_BOUND = 5.0

## An RNN Part-of-Speech Tagger


### Create Training / Testing dataset
And a 'batch generator' function that delivers data in the right format for RNN training

In [None]:
## TODO!

def batch_dictionary(size=BATCH_SIZE//2):
    """Return `size` entries of `wordsnp`, sampled via the cumulative table `freqs_cum`.

    The default uses integer division so that `size` stays an int on Python 3
    as well as Python 2 (a float is not a valid array dimension).

    NOTE(review): `freqs_cum` and `wordsnp` are not defined in the visible part
    of this notebook - see the TODO above.
    """
    uniform_vars = np.random.uniform( size=(size,) )
    # Position of each uniform sample within the cumulative frequency table
    idx = freqs_cum.searchsorted(uniform_vars)
    return wordsnp[ idx ].tolist()
    
def batch_bigram(size=BATCH_SIZE//2):
    """Return `size` results of `bigram_word()`, each cut to WORD_LENGTH_MAX items.

    The default uses integer division so that `range(size)` also works on
    Python 3, where `/` would produce a float.

    NOTE(review): `bigram_word` and `WORD_LENGTH_MAX` are not defined in the
    visible part of this notebook - see the TODO above.
    """
    return [ bigram_word()[0:WORD_LENGTH_MAX] for _ in range(size) ]

In [None]:
## Our batch generator will yield BATCH_SIZE/2 words, with their correct classification
#def data_batch_generator(corpus, size=BATCH_SIZE):
#    startidx = np.random.randint(0, len(corpus) - SEQUENCE_LENGTH - 1, size=size)
#
#    while True:
#        items = np.array([corpus[start:start + SEQUENCE_LENGTH + 1] for start in startidx])
#        startidx = (startidx + SEQUENCE_LENGTH) % (len(corpus) - SEQUENCE_LENGTH - 1)
#        yield items

In [None]:
# Test it out
#batch_test = lambda : batch_dictionary(size=4)
batch_test = lambda : batch_bigram(size=4)
print(batch_test())
print(batch_test())
print(batch_test())

#### Lasagne RNN tutorial (including conventions & rationale)

*  http://colinraffel.com/talks/hammer2015recurrent.pdf

#### Lasagne Examples

*  https://github.com/Lasagne/Lasagne/blob/master/lasagne/layers/recurrent.py
*  https://github.com/Lasagne/Recipes/blob/master/examples/lstm_text_generation.py

#### Good blog post series

*  http://www.wildml.com/2015/10/recurrent-neural-network-tutorial-part-4-implementing-a-grulstm-rnn-with-python-and-theano/

In [None]:
# After sampling a data batch, we transform it into a one hot feature representation with a mask
def prep_batch_for_network(batch_of_words):
    """Encode a batch of words as a padded one-hot tensor plus a validity mask.

    Returns `(input_values, mask_values)`, both float32 :
      * input_values : (n_words, longest_word, CHARS_SIZE), one row of
        CHAR_TO_ONEHOT per character
      * mask_values  : (n_words, longest_word), 1.0 where a character exists
    Positions past the end of a word are left at zero in both arrays.
    """
    n_words = len(batch_of_words)
    lengths = np.array([len(word) for word in batch_of_words])
    longest = lengths.max()

    input_values = np.zeros((n_words, longest, CHARS_SIZE), dtype='float32')
    mask_values = np.zeros((n_words, longest), dtype='float32')

    for row, word in enumerate(batch_of_words):
        for col, char in enumerate(word):
            input_values[row, col] = CHAR_TO_ONEHOT[char]
        mask_values[row, :len(word)] = 1.

    return input_values, mask_values

### Define the RNN Symbolically

In [None]:
# Symbolic variables for input. In addition to the usual features and target,
rnn_input_sym = theano.tensor.tensor3()
rnn_mask_sym  = theano.tensor.matrix()

rnn_words_target_sym = theano.tensor.imatrix() # part-of-speech generated

In [None]:
# Input layers : the features are 3-d, but the mask has just one value per
# (batch, time-step) position, so it must be declared as 2-d
rnn_input = lasagne.layers.InputLayer( (None, None, RNN_HIDDEN_SIZE) )  # batch_size, sequence_len, embedding_dim
rnn_mask  = lasagne.layers.InputLayer( (None, None) )                   # batch_size, sequence_len

# Symbolic sizes, needed to restore the (batch, time) layout after the dense layer
n_batch, n_time_steps, n_features = rnn_input_sym.shape

# Forward-running GRU over each sentence
rnn_layer_f = lasagne.layers.GRULayer(rnn_input,
                num_units=RNN_HIDDEN_SIZE,
                gradient_steps=-1,
                grad_clipping=GRAD_CLIP_BOUND,
                hid_init=lasagne.init.Normal(),
                learn_init=True,
                mask_input=rnn_mask,
                only_return_final=False, # Need all of the output states
            )

# Backward-running GRU over the same input (second half of the bidirectional pair)
rnn_layer_b = lasagne.layers.GRULayer(rnn_input,
                num_units=RNN_HIDDEN_SIZE,
                gradient_steps=-1,
                grad_clipping=GRAD_CLIP_BOUND,
                hid_init=lasagne.init.Normal(),
                learn_init=True,
                mask_input=rnn_mask,
                only_return_final=False, # Need all of the output states
                backwards=True,
            )

# Before the decoder layer, we need to reshape the sequence into the batch dimension,
# so that timesteps are decoded independently.
rnn_reshape_f = lasagne.layers.ReshapeLayer(rnn_layer_f, (-1, RNN_HIDDEN_SIZE) )
rnn_reshape_b = lasagne.layers.ReshapeLayer(rnn_layer_b, (-1, RNN_HIDDEN_SIZE) )

# Now concatenate them
rnn_concat = lasagne.layers.ConcatLayer([rnn_reshape_f, rnn_reshape_b])

# Convert them into softmax outputs
rnn_tags = lasagne.layers.DenseLayer( rnn_concat, num_units=TAG_SET_SIZE, nonlinearity=lasagne.nonlinearities.softmax)

# And reshape them, so that they are in the original batches-of-sentences shape.
# The last dimension is the number of tags produced by the softmax layer (not the hidden size)
rnn_out = lasagne.layers.ReshapeLayer(rnn_tags, (-1, n_time_steps, TAG_SET_SIZE))

### Loss Function for Training

In [None]:
# Finally, the output stage - this is for the training (over every word position in every sentence)
rnn_output = lasagne.layers.get_output(rnn_out, 
                {
                 rnn_input: rnn_input_sym, 
                 rnn_mask: rnn_mask_sym, 
                }
            )

# We flatten the sequence into the batch dimension before calculating the loss
def rnn_word_cross_ent(net_output, targets):
    """Return the categorical cross-entropy for every position of every sequence.

    net_output : symbolic tag probabilities, reshaped here to (-1, TAG_SET_SIZE)
    targets    : symbolic integer tag indices, flattened to line up with the rows
    The result is a vector with one cost per flattened position.
    """
    preds = theano.tensor.reshape(net_output, (-1, TAG_SET_SIZE))
    targets_flat = theano.tensor.flatten(targets)
    cost = theano.tensor.nnet.categorical_crossentropy(preds, targets_flat)
    return cost

# Average only over real (unmasked) positions, so that the padding at the end of
# short sentences does not contribute to the loss
rnn_mask_flat = theano.tensor.flatten(rnn_mask_sym)
rnn_loss = (rnn_word_cross_ent(rnn_output, rnn_words_target_sym) * rnn_mask_flat).sum() / rnn_mask_flat.sum()


### ... and the Training and Prediction functions

In [None]:
# Build the update rule.  NOTE: the only clipping currently active is the `grad_clipping`
# passed to the GRU layers; the explicit clip / total-norm constraint below is commented out.
#MAX_GRAD_NORM = 15

# Every trainable parameter reachable from the output layer
rnn_params = lasagne.layers.get_all_params(rnn_out, trainable=True)

rnn_grads = theano.tensor.grad(rnn_loss, rnn_params)
#rnn_grads = [theano.tensor.clip(g, -GRAD_CLIP_BOUND, GRAD_CLIP_BOUND) for g in rnn_grads]
#rnn_grads, rnn_norm = lasagne.updates.total_norm_constraint( rnn_grads, MAX_GRAD_NORM, return_norm=True)

# Adam with lasagne's default hyper-parameters
rnn_updates = lasagne.updates.adam(rnn_grads, rnn_params)

# Training step : arguments are (inputs, target tags, mask); returns [loss] and applies the updates
rnn_train = theano.function([rnn_input_sym, rnn_words_target_sym, rnn_mask_sym],
                [rnn_loss],
                updates=rnn_updates,
            )

# Prediction : arguments are (inputs, mask); returns [network output], no parameter updates
rnn_predict = theano.function([rnn_input_sym, rnn_mask_sym], [rnn_output])

In [None]:
## TODO :: Add the training stuff here

### Save the learned parameters

Uncomment the ```pickle.dump()``` to actually save to disk

In [None]:
# Collect the parameter values of the tagger network itself - `rnn_out` is the same
# layer that the weights are loaded back into
rnn_param_values = lasagne.layers.get_all_param_values(rnn_out)
# NOTE(review): CHARS_VALID / CHAR_TO_IX / IX_TO_CHAR are not defined in the visible part of
# this notebook - confirm whether they are still wanted alongside the tagger parameters.
rnn_param_dictionary = dict(
     params = rnn_param_values,
     CHARS_VALID = CHARS_VALID, 
     CHAR_TO_IX = CHAR_TO_IX,
     IX_TO_CHAR = IX_TO_CHAR,
    )
# Pickles must be written in binary mode
#with open('../data/RNN/tagger_rnn_trained.pkl', 'wb') as f:
#    pickle.dump(rnn_param_dictionary, f, protocol=pickle.HIGHEST_PROTOCOL)

### Load pretrained weights into network

In [None]:
# NOTE: unpickling can execute arbitrary code - only load weight files from a trusted source.
# The file is opened in binary mode (required for pickles) and closed by the context manager.
with open('../data/RNN/tagger_rnn_trained_64x310k.pkl', 'rb') as f:
    rnn_param_dictionary = pickle.load(f)
lasagne.layers.set_all_param_values(rnn_out, rnn_param_dictionary['params'])

### Check that the Tagger Network 'works'

In [None]:
test_text_list = ["shape", "shast", "shaes", "shafg", "shaqw"]
test_text_list = ["opposite", "aposite", "apposite", "xposite", "rrwqsite", "deposit", "idilic", "idyllic"]

In [None]:
# NOTE(review): `disc_predict` is not defined in the visible part of this notebook (the
# prediction function built above is `rnn_predict`), and the inputs here are one-hot
# characters rather than word embeddings - confirm which network this cell should test.
disc_input_values, disc_mask_values = prep_batch_for_network(test_text_list)

disc_output_, = disc_predict(disc_input_values, disc_mask_values)

# Print each test word (padded / cut to 20 characters) with its first output value as a percentage
for i,v in enumerate(disc_output_.tolist()):
    print("%s : %5.2f%%" % ((test_text_list[i]+' '*20)[:20], v[0]*100.))

#  NOTHING BELOW IS USED

### Use the Generative Network to create sample words

The network above can be used to generate text...

The following set-up allows for the output of the RNN at each timestep to be mixed with the letter frequency that the bigram model would suggest - in a proportion ```bigram_overlay``` which can vary from ```0``` (being solely RNN derived) to ```1.0``` (being solely bigram frequencies, with the RNN output being disregarded). 

The input is a 'random field' matrix that is used to choose each letter in each slot from the generated probability distribution.

Once a space is output for a specific word, then it stops being extended (equivalently, the mask is set to zero going forwards).

Once spaces have been observed for all words (or the maximum length reached), the process ends, and a list of the created words is returned.

In [None]:
# Let's pre-calculate the logs of the bigram frequencies, since they may be mixed in below
bigram_min_freq = 1e-10 # To prevent underflow in log...
bigram_freq_log = np.log( bigram_freq + bigram_min_freq )

In [None]:
def generate_rnn_words(random_field, bigram_overlay=0.0):
    """Sample a batch of words, one character per step, from the generator network.

    random_field   : 2-d array (batch_size, max_word_length); the value at [i, col]
                     is looked up in the cumulative distribution to pick the
                     character for word i at step col.
    bigram_overlay : weight of the log bigram frequencies in the mix with the
                     network output (0.0 = network only, 1.0 = bigram only).

    Returns a list of batch_size strings (spaces are never appended).

    NOTE(review): relies on `CHAR_TO_IX`, `IX_TO_CHAR`, `bigram_freq_cum` and
    `gen_predict`, none of which are defined in the visible part of this notebook.
    """
    batch_size, max_word_length  = random_field.shape
    
    idx_spc = CHAR_TO_IX[' ']
    def append_indices_as_chars(words_current, idx_list):
        # Extend word i by the character for idx_list[i]; a space index adds nothing
        for i, idx in enumerate(idx_list):
            if idx == idx_spc:
                pass # Words end at space
                #words_current[i] += 'x'
            else:
                words_current[i] += IX_TO_CHAR[idx]
        return words_current
    
    # Create a 'first character' by using the bigram transitions from 'space' (this is fair)
    idx_initial = [ np.searchsorted(bigram_freq_cum[idx_spc], random_field[i, 0]) for i in range(batch_size) ]
    # Log bigram frequencies of the transitions out of each word's latest character
    bigram_freq_log_current = bigram_freq_log[ np.array(idx_initial) , :]
    
    words_current = [ '' for _ in range(batch_size) ]
    words_current = append_indices_as_chars(words_current, idx_initial)
    
    
    # `col` is the column of random_field used for the next character
    col, finished=1, False
    while not finished:
        # Re-encode all of the words so far; the mask is 0 past the end of each shorter word
        gen_input_values, gen_mask_values = prep_batch_for_network(words_current)
        #print(gen_mask_values[:,-1])
        gen_out_, = gen_predict(gen_input_values, gen_mask_values)
        
        # NOTE(review): the mix below treats gen_out_ as log-scale values - confirm against gen_predict
        gen_freq = np.exp( # This is going to look like softmax...
            gen_out_*(1.0-bigram_overlay) + bigram_freq_log_current*bigram_overlay 
        )  
        
        # This output is the final probability[CHARS_SIZE], so let's cumsum it, etc.
        gen_prob = gen_freq / gen_freq.sum(axis=1)[:, np.newaxis] # Trick to enable unflattening of sum()
        gen_prob_cum = gen_prob.cumsum(axis=1)
        
        idx_next = [ # Only add extra letters if we haven't already passed a space (i.e. mask[-1]==0)
            idx_spc if gen_mask_values[i,-1]==0 else np.searchsorted(gen_prob_cum[i], random_field[i, col]) 
            for i in range(batch_size) 
        ]
        
        bigram_freq_log_current = bigram_freq_log[ np.array(idx_next) , :]
        words_current = append_indices_as_chars(words_current, idx_next)
        
        words_current_max_length = np.array( [ len(w) for w in words_current ]).max()
        
        # If the words have reached the maximum length, or we didn't extend any of them...
        if words_current_max_length>=max_word_length or words_current_max_length<col:
            finished=True 
        
        col += 1

    return words_current
        
# Create a probability distribution across all potential positions in the output 'field'
random_field = np.random.uniform( size=(BATCH_SIZE, WORD_LENGTH_MAX) )

gen_words_output = generate_rnn_words(random_field, bigram_overlay=0.0)

print( '\n'.join(gen_words_output))    

#### Save the initial (random) Network State 

This will come in handy when we need to reset the network back to 'untrained' later.

In [None]:
gen_param_values_initial = lasagne.layers.get_all_param_values(gen_prob)

### Now, train the Generator RNN based on the Dictionary itself

Once we have an output word, let's reward the RNN based on a specific training signal.  We'll encapsulate the training in a function that takes the input signal as a parameter, so that we can try other training schemes (later).

In [None]:
def is_good_output_dictionary(output_words):
    """Return a float32 array with 1.0 for each word found in `wordset`, else 0.0."""
    flags = []
    for word in output_words:
        flags.append(1.0 if word in wordset else 0.0)
    return np.array(flags, dtype='float32')

# Module-level training state : start time and number of batches trained so far
t0, iterations_complete = time.time(), 0
def reset_generative_network():
    """Restart the timer and batch counter, and restore the generator's saved initial parameters.

    NOTE(review): `gen_prob` here must be the lasagne layer whose values were stored in
    `gen_param_values_initial`; it is not defined in the visible part of this notebook.
    """
    global t0, iterations_complete
    t0, iterations_complete = time.time(), 0
    lasagne.layers.set_all_param_values(gen_prob, gen_param_values_initial)

def prep_batch_for_network_output(mask_values, batch_of_words):
    """Build the int32 'next character' index targets for a batch of words.

    For every word the target sequence is the word without its first character,
    followed by a single space.  The result has the same shape as `mask_values`;
    positions that are not written stay at 0.
    """
    targets = np.zeros(mask_values.shape, dtype='int32')

    for row, word in enumerate(batch_of_words):
        # Shift left by one character, and finish with the terminating space
        next_chars = word[1:] + ' '
        for col, char in enumerate(next_chars):
            targets[row, col] = CHAR_TO_IX[char]

    return targets
    
def train_generative_network(is_good_output_function=is_good_output_dictionary, epochs=10*1000, bigram_overlay=0.0):
    """Train the generator on its own samples, using a per-word 'goodness' signal.

    is_good_output_function : maps a list of generated words to an array of scores
                              (1.0 = good, 0.0 = bad)
    epochs                  : number of batches to generate and train on
    bigram_overlay          : passed through to generate_rnn_words()

    Updates the module-level `iterations_complete` counter and prints progress
    every 10 batches.
    """
    global t0, iterations_complete
    t1, iterations_recent = time.time(), iterations_complete
    for epoch_i in range(epochs):
        random_field = np.random.uniform( size=(BATCH_SIZE, WORD_LENGTH_MAX) )

        gen_words_output = generate_rnn_words(random_field, bigram_overlay=bigram_overlay)

        # So, now we have a set of words, convert to 'goodness'
        is_good_output = is_good_output_function(gen_words_output)

        # Now, create a training set of input -> output, coupled with an intensity signal
        #   first the step-by-step network inputs
        gen_input_values, gen_mask_values = prep_batch_for_network(gen_words_output)

        #   now create step-by-step network outputs (strip off first character, add spaces) as *indices*
        gen_output_values_int = prep_batch_for_network_output(gen_mask_values, gen_words_output)

        #   and the 'error signal' is simply (+ve for good, -ve for bad):
        target_valid = (np.array(is_good_output) - 0.5).reshape( (len(gen_words_output),1) )

        # Now train the generator RNN
        gen_loss_, = gen_train(gen_input_values, gen_output_values_int, target_valid, gen_mask_values)

        iterations_complete += 1

        if iterations_complete % 10 == 0:
            secs_per_batch = float(time.time() - t1)/ (iterations_complete - iterations_recent)
            eta_in_secs = secs_per_batch*(epochs-epoch_i)
            # Report the loss of the generator step just taken (`gen_loss_` from above)
            print("Iteration {:5d}, loss_train: {:.4f} ({:.1f}s per 1000 batches)  eta: {:.0f}m{:02.0f}s".format(
                    iterations_complete, float(gen_loss_), 
                    secs_per_batch*1000., eta_in_secs/60, eta_in_secs % 60) 
                 )
            t1, iterations_recent = time.time(), iterations_complete

    print('Iteration {}, ran in {:.1f}sec'.format(iterations_complete, float(time.time() - t0)))


In [None]:
reset_generative_network()
train_generative_network(is_good_output_function=is_good_output_dictionary, epochs=10*1000, bigram_overlay=0.0)

### Finally, Produce some text
We will use random sentences from the validation corpus to 'prime' the network

In [None]:
primers = val_corpus.split('\n')

We feed characters one at a time from the priming sequence into the network.

To obtain a sample string, at each timestep we sample from the output probability distribution, and feed the chosen character back into the network. We terminate after the first linebreak.

In [None]:
# NOTE(review): `predict_fn`, `VOCAB_SIZE`, `CHAR_TO_ONEHOT`, `IX_TO_CHAR` and `primers` come from
# a character-level model that is not defined in the visible part of this notebook.
sentence = ''
# Two hidden-state arrays and a single-step input, all starting at zero
hid = np.zeros((1, RNN_HIDDEN_SIZE), dtype='float32')
hid2 = np.zeros((1, RNN_HIDDEN_SIZE), dtype='float32')
x = np.zeros((1, 1, VOCAB_SIZE), dtype='float32')

primer = np.random.choice(primers) + '\n'

# Priming : each character is written into x *after* the call, so the first call sees the all-zero x
for c in primer:
    p, hid, hid2 = predict_fn(x, hid, hid2)
    x[0, 0, :] = CHAR_TO_ONEHOT[c]
    
# Sampling : draw one character per step and feed it back in, for at most 500 steps
for _ in range(500):
    p, hid, hid2 = predict_fn(x, hid, hid2)
    # Shrink p very slightly - presumably so that rounding cannot push its sum above 1.0,
    # which np.random.multinomial rejects (TODO confirm)
    p = p/(1 + 1e-6)
    s = np.random.multinomial(1, p)
    sentence += IX_TO_CHAR[s.argmax(-1)]
    x[0, 0, :] = s
    if sentence[-1] == '\n':
        break
        
print('PRIMER: ' + primer)
print('GENERATED: ' + sentence)

Exercises
=====

1. Implement sampling using the "temperature softmax": $$p(i) = \frac{e^{\frac{z_i}{T}}}{\Sigma_k e^{\frac{z_k}{T}}}$$

This generalizes the softmax with a parameter $T$ which affects the "sharpness" of the distribution. Lowering $T$ will make samples less error-prone but more repetitive. 

In [None]:
# %load spoilers/tempsoftmax.py