# RNN Tagger

This example trains an RNN to tag words from a corpus.

The data used for training is from a Wikipedia download, which is then artificially annotated with parts of speech by the NLTK PoS tagger written by Matthew Honnibal.


In [None]:
import tensorflow as tf
import tensorflow.contrib.keras as keras
import numpy as np

import pickle
import time

# Sentences with this many tokens or more are skipped by the corpus reader;
# it is also the fixed (zero-padded) sequence length of the RNN input arrays.
SENTENCE_LENGTH_MAX = 32
# Number of components in each word-embedding vector.
EMBEDDING_DIM=50

## Basic Text and Parsing Tools

In [None]:
import nltk

In [None]:
sentence_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
sentence_splitter.tokenize("This is Mr. Smith's tokenized test. The U.S.A gives us sent two. Is this sent three?")

In [None]:
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize("This is Mr. Smith's tokenized test.")

In [None]:
# An interesting corpus (TODO : provide a simple downloader) :
corpus_text_file = './data/RNN/en.wikipedia.2010.100K.txt'

In [None]:
def corpus_sentence_tokens(corpus_text_file=corpus_text_file):
    """Yield tokenised sentences from the corpus file, cycling over it forever.

    Each line of the file is expected to look like ``<number>\\t<text>``.
    The text is split into sentences, each sentence is split into Treebank
    tokens, and only sentences with fewer than SENTENCE_LENGTH_MAX tokens
    are yielded (as a list of token strings).

    Lines that contain no tab (for instance a blank trailing line) are skipped,
    and any further tabs are kept as part of the text.

    Raises:
        ValueError: if a complete pass over the file produces no sentence at
            all (otherwise the generator would spin forever without yielding).
    """
    while True:
        yielded_any = False
        with open(corpus_text_file, encoding='utf-8') as f:
            for line in f:  # stream the file rather than loading it all with readlines()
                # Strip off the initial number : keep everything after the first tab
                _, tab, text = line.partition('\t')
                if not tab:
                    continue  # malformed / empty line : nothing to tokenise
                for s in sentence_splitter.tokenize(text):  # Split the lines into sentences (~1 each)
                    tree_banked = tokenizer.tokenize(s)
                    if len(tree_banked) < SENTENCE_LENGTH_MAX:
                        yielded_any = True
                        yield tree_banked
        if not yielded_any:
            raise ValueError("Corpus : no usable sentences found in '%s'" % (corpus_text_file,))
        print("Corpus : Looping")
corpus_sentence_tokens_gen = corpus_sentence_tokens()

In [None]:
' | '.join(next(corpus_sentence_tokens_gen))

## Reference Tagger

In [None]:
from nltk.tag.perceptron import PerceptronTagger
pos_tagger = PerceptronTagger(load=True)
' | '.join(list(pos_tagger.classes))

In [None]:
s = "Let 's see what part of speech analysis on this sample text looks like .".split(' ')
#s = next(corpus_sentence_tokens_gen)
pos_tagger.tag(s)

### Twist : Not interested in all classes...

To simplify (dramatically), our RNN will be trained to just tell the difference between 'is ordinary word' and 'is entity name'.

In [None]:
# Output classes of the simplified tagger : index 0 = 'O' (ordinary word), index 1 = 'E' (entity name)
tag_list = ['O', 'E']
TAG_SET_SIZE = len(tag_list)

# NLTK tags that count as 'entity' (space-separated string, so that more can be listed)
pos_tagger_entity_tags = set('NNP'.split(' '))

# Map every tag the NLTK tagger can emit onto our class index (1 = entity, 0 = everything else)
pos_tagger_to_idx = {
    nltk_tag: (1 if nltk_tag in pos_tagger_entity_tags else 0)
    for nltk_tag in pos_tagger.classes
}

pos_tagger_to_idx['NNP'], pos_tagger_to_idx['VBP']

## GloVe Word Embeddings
Using the python package :  https://github.com/maciejkula/glove-python , and code samples from : http://developers.lyst.com/2014/11/11/word-embeddings-for-fashion/

### Create the Co-occurrence Matrix
For speed, this looks at the first 100,000 sentences in the corpus - and should create the co-occurrences in 30 seconds or so.

In [None]:
import glove
# Accumulator for the word/word co-occurrence counts and the word -> id dictionary
glove_corpus = glove.Corpus()

# Pull 100,000 tokenised sentences from the (shared) corpus generator,
# lower-casing every token so that the vocabulary is case-insensitive
corpus_sentences = [ 
        [ w.lower() for w in next(corpus_sentence_tokens_gen)] # All lower-case
        for _ in range(0,100*1000) 
    ]

# Fit the co-occurrence matrix using a sliding window of 10 words.
# (The timer is started after the sentences are gathered, so it covers only the fit)
t0 = time.time()
glove_corpus.fit(corpus_sentences, window=10)

print("Dictionary length=%d" % (len(glove_corpus.dictionary),))
print("Co-occurrence calculated in %5.1fsec" % (time.time()-t0, ))

In [None]:
# Return the index of the word in the dictionary
glove_corpus.dictionary['city']

###  Create the Word Embedding

This will make use of up to 4 threads - and each epoch takes 20-30 seconds on a single core.

In [None]:
# GloVe model producing EMBEDDING_DIM-dimensional word vectors
word_embedding = glove.Glove(no_components=EMBEDDING_DIM, learning_rate=0.05)

t0 = time.time()
glove_epochs, glove_threads = 20, 4 

# Train on the co-occurrence matrix built in the previous section
word_embedding.fit(glove_corpus.matrix, epochs=glove_epochs, no_threads=glove_threads, verbose=True)

# NOTE: the 'per epoch' figure is (elapsed / epochs) * threads, i.e. it is scaled up
#   by the thread count rather than being the wall-clock time of one epoch
print("%d-d word-embedding created in %5.1fsec = %5.1fsec per epoch" % (
        EMBEDDING_DIM, (time.time()-t0), (time.time()-t0)/glove_epochs*glove_threads, ))

# Add the word -> id dictionary to the model to allow similarity queries.
word_embedding.add_dictionary(glove_corpus.dictionary)

In [None]:
#word_embedding.save("./data/RNN/glove.embedding.50.pkl")
#word_embedding.load("./data/RNN/glove.embedding.50.pkl")

###  Test Word Embedding


In [None]:
# word-similarity test
word_embedding.most_similar('king')

In [None]:
# word-analogy test
def get_embedding_vec(word):
    """Return the embedding vector for `word` (looked up in lower-case).

    Words that are not in the embedding's dictionary map to an all-zero
    float32 vector of length EMBEDDING_DIM, which acts as 'UNK'.
    """
    word_id = word_embedding.dictionary.get(word.lower(), -1)
    is_known = word_id >= 0
    if is_known:
        return word_embedding.word_vectors[word_id]
    # Unknown word : UNK is represented by the zero vector
    return np.zeros((EMBEDDING_DIM,), dtype='float32')

def get_closest_word(vec, number=5):
    """Return the `number` words whose embeddings are most cosine-similar to `vec`.

    Args:
        vec: query vector, with the same length as the embedding vectors.
        number: maximum number of (word, similarity) pairs to return.

    Returns:
        A list of (word, cosine_similarity) tuples, most similar first.
        An all-zero `vec` (e.g. an analogy built only from unknown words) has
        no direction, so an empty list is returned instead of NaN-scored words.
    """
    vec_norm = np.linalg.norm(vec)
    if vec_norm == 0:
        # Cosine similarity is undefined here : dividing would give NaN for every word
        return []

    vectors = word_embedding.word_vectors
    dst = (np.dot(vectors, vec)
                   / np.linalg.norm(vectors, axis=1)
                   / vec_norm)
    word_ids = np.argsort(-dst)  # descending similarity
    return [(word_embedding.inverse_dictionary[x], dst[x]) for x in word_ids[:number]
            if x in word_embedding.inverse_dictionary]

In [None]:
#analogy_vec = get_embedding_vec('woman') - get_embedding_vec('man') + get_embedding_vec('king')
analogy_vec = get_embedding_vec('paris') - get_embedding_vec('france') + get_embedding_vec('italy')
#analogy_vec = get_embedding_vec('kitten') - get_embedding_vec('cat') + get_embedding_vec('dog')
#analogy_vec = get_embedding_vec('understand') - get_embedding_vec('understood') + get_embedding_vec('ran')
get_closest_word(analogy_vec)

### Problem : Embedding is *Poor*

Solution : Load a pre-trained word embedding, from a much larger corpus.  Source of this word embedding (created from a 6 billion tokens corpus, with results as 50d vectors): http://nlp.stanford.edu/projects/glove/ 

In [None]:
# TODO: Create a downloader / stripper etc

# Due to size constraints, only have the first 100k vectors (i.e. 100k most frequently used words)
word_embedding = glove.Glove.load_stanford("./data/RNN/glove.first-100k.6B.50d.txt")

Having loaded that, play around with the similarity and analogy tests again...

## An RNN Part-of-Speech Tagger

### RNN Main Parameters

In [None]:
# Number of sentences in each batch produced for training
BATCH_SIZE = 64
# Number of units passed to the GRU that is wrapped in the Bidirectional layer
RNN_HIDDEN_SIZE = EMBEDDING_DIM # ?+1 for capitalisation flag
# NOTE(review): not referenced anywhere else in the visible notebook - confirm whether it is still needed
GRAD_CLIP_BOUND = 5.0

In [None]:
word_embedding.word_vectors.shape

In [None]:
# Embedding matrix for the RNN : two extra all-zero rows are placed in front of the
# loaded word vectors, so word i of the embedding lives at row i+2 (see word_to_idx_rnn)
word_embedding_rnn = np.vstack([ 
        np.zeros( (1, EMBEDDING_DIM,), dtype='float32'),   # This is the 'zero' value (used as a mask in Keras)
        np.zeros( (1, EMBEDDING_DIM,), dtype='float32'),   # This is for 'UNK'  (word == 1)
        word_embedding.word_vectors,
    ])
word_embedding_rnn.shape

### Create Training / Testing dataset
And a 'batch generator' function that delivers data in the right format for RNN training

In [None]:
#def batch_sentences(size=BATCH_SIZE):
#    return [ next(corpus_sentence_tokens_gen) for i in range(size) ]

In [None]:
# Test it out
#batch_test = lambda : batch_sentences(size=4)
#print([ ' '.join(s) for s in batch_test()])

### Synthesising a 'correct answer' for the Tagger

Normally, this would be the (manual) annotations from the corpus itself.  However, we don't have an annotated corpus.  Instead, we're going to use the annotations produced by the NLTK tagger - simplified to only identify 'NNP = entities'.

In [None]:
# After sampling a data batch, we transform it into a one hot feature representation with a mask
#def prep_batch_for_network(batch_of_sentences, include_targets=False):
#    sentence_max_length = np.array([ len(w) for w in batch_of_sentences ]).max()
#    
#    # translate into one-hot matrix, mask values and targets
#    input_values = np.zeros((len(batch_of_sentences), sentence_max_length, EMBEDDING_DIM), dtype='float32')
#    mask_values  = np.zeros((len(batch_of_sentences), sentence_max_length), dtype='float32')
#    
#    for i, sent in enumerate(batch_of_sentences):
#      for j, word in enumerate(sent):
#        input_values[i,j] = get_embedding_vec(word) # this is word.lower() in dictionary
#      mask_values[i, 0:len(sent) ] = 1.
#
#    if not include_targets:
#        return input_values, mask_values        
#    
#    target_values  = np.zeros((len(batch_of_sentences), sentence_max_length), dtype='int32')
#    for i, sent in enumerate(batch_of_sentences):
#        sentence_tags = pos_tagger.tag(sent)
#        for j, word_tag in enumerate(sentence_tags):
#            target_values[i,j] = pos_tagger_to_idx[word_tag[1]]  # tags are returned as tuples (word, tag)
#    
#    return input_values, mask_values, target_values

def word_to_idx_rnn(word):
    """Return the row of `word` in the RNN embedding matrix.

    Rows 0 (mask / padding) and 1 (UNK) are reserved, so every dictionary
    index is shifted up by 2; a word missing from the dictionary is looked
    up as -1 and therefore lands on row 1 (UNK).
    """
    return 2 + word_embedding.dictionary.get(word.lower(), -1)

from tensorflow.contrib.keras.python.keras.utils.np_utils import to_categorical

def batch_for_network_generator():
    """Yield (input_values, target_values) training batches forever.

    Each batch takes the next BATCH_SIZE sentences from the shared
    `corpus_sentence_tokens_gen` generator.

    Yields:
        input_values: int32 array (BATCH_SIZE, SENTENCE_LENGTH_MAX) of word
            indices from word_to_idx_rnn(); positions past the end of a
            sentence are left at 0.
        target_values: int32 array (BATCH_SIZE, SENTENCE_LENGTH_MAX, TAG_SET_SIZE)
            holding a one-hot class per word, derived from the NLTK tagger via
            pos_tagger_to_idx; positions past the end of a sentence are all-zero.
    """
    while True:
        batch_of_sentences = [ next(corpus_sentence_tokens_gen) for _ in range(BATCH_SIZE) ]

        input_values = np.zeros((BATCH_SIZE, SENTENCE_LENGTH_MAX), dtype='int32')

        # Extra (one-hot) dimension here to suit Keras' TimeDistributed(Dense(softmax))
        #   as discussed : https://github.com/fchollet/keras/issues/6363
        target_values = np.zeros((BATCH_SIZE, SENTENCE_LENGTH_MAX, TAG_SET_SIZE), dtype='int32')

        for i, sent in enumerate(batch_of_sentences):
            for j, word in enumerate(sent):
                input_values[i, j] = word_to_idx_rnn(word)

            for j, word_tag in enumerate(pos_tagger.tag(sent)):
                tag = word_tag[1]  # tags are returned as tuples (word, tag)
                pos_class = pos_tagger_to_idx[tag]  # These are the class #s
                # Set the one-hot entry directly : same result as to_categorical(),
                # without allocating a temporary array for every single token
                target_values[i, j, pos_class] = 1

        yield (input_values, target_values)

#### Test the batchifier

In [None]:
#prep_batch_for_network(["Mr. Smith works at Red Cat Labs .".split(' ')], include_targets=True)
next(batch_for_network_generator())

### Define the RNN Symbolically

#### Lasagne RNN tutorial (including conventions & rationale)

*  http://colinraffel.com/talks/hammer2015recurrent.pdf

#### Lasagne Examples

*  https://github.com/Lasagne/Lasagne/blob/master/lasagne/layers/recurrent.py
*  https://github.com/Lasagne/Recipes/blob/master/examples/lstm_text_generation.py

#### Good blog post series

*  http://www.wildml.com/2015/10/recurrent-neural-network-tutorial-part-4-implementing-a-grulstm-rnn-with-python-and-theano/

#### Keras Examples

* 
*  https://github.com/fchollet/keras/issues/5022 


In [None]:
#dir(word_embedding)
#word_embedding.max_count, word_embedding.no_components, 
#word_embedding.word_vectors.shape
word_embedding_rnn.shape

In [None]:
from tensorflow.contrib.keras.api.keras.preprocessing import sequence
from tensorflow.contrib.keras.api.keras.layers import Input, Embedding, GRU, Dense, Activation
from tensorflow.contrib.keras.api.keras.models import Model

# Hmm : The following is not in the API...
from tensorflow.contrib.keras.python.keras.layers import Bidirectional, TimeDistributed

In [None]:
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
# One embedding row per entry of word_embedding_rnn (the word vectors plus the 2 reserved rows),
# looked up for each of the SENTENCE_LENGTH_MAX token positions of a sentence.
# mask_zero=True : token index 0 is treated as a mask value rather than as a real word.
embedding_layer = Embedding(word_embedding_rnn.shape[0],
                            EMBEDDING_DIM,
                            weights=[ word_embedding_rnn ],
                            input_length=SENTENCE_LENGTH_MAX,
                            trainable=False, 
                            mask_zero=True,
                            name="SentencesEmbedded")

In [None]:
#x_train = sequence.pad_sequences(x_train, maxlen=SENTENCE_LENGTH_MAX)
#x_test  = sequence.pad_sequences(x_test,  maxlen=SENTENCE_LENGTH_MAX)
TAG_SET_SIZE

In [None]:
# Network input : one row of SENTENCE_LENGTH_MAX int32 word indices per sentence
tokens_input = Input(shape=(SENTENCE_LENGTH_MAX,), dtype='int32', name="SentencesTokens")
embedded_sequences = embedding_layer(tokens_input)

#extra_input = ...

# Placeholder for concatenating extra per-word features onto the embeddings
aggregate_vectors = embedded_sequences # concat...

# Forward and backward GRUs, with their per-timestep outputs concatenated
rnn_outputs = Bidirectional( GRU(RNN_HIDDEN_SIZE, return_sequences=True),  merge_mode='concat' )(aggregate_vectors)

# The same softmax classifier over the TAG_SET_SIZE classes is applied at every word position
# NOTE(review): input_shape here includes BATCH_SIZE, whereas Keras input_shape normally
#   excludes the batch dimension - and the layer is called on rnn_outputs directly, so the
#   argument is presumably redundant; confirm before relying on it
is_ner_outputs  = TimeDistributed( Dense(TAG_SET_SIZE, activation='softmax'), 
                                   input_shape=(BATCH_SIZE, SENTENCE_LENGTH_MAX, RNN_HIDDEN_SIZE*2),
                                   name='POS-class')(rnn_outputs)

#is_ner_outputs =  TimeDistributed( Activation('softmax'), 
#                                   input_shape=(BATCH_SIZE, SENTENCE_LENGTH_MAX, TAG_SET_SIZE),                                  
#                                   name='POS-class' )(is_ner_logits)


In [None]:
# Assemble the tagger : token indices in, per-word class probabilities out
model = Model(inputs=[tokens_input], outputs=[is_ner_outputs])

# summary() prints the layer table itself and returns None, so wrapping it
# in print() only added a stray 'None' line after the table
model.summary()

### Loss Function for Training

In [None]:
model.compile(loss='categorical_crossentropy', optimizer="adam")  # , metrics=['accuracy']

### Training phase for the RNN

In [None]:
#model.fit(x, y_one_hot)
model.fit_generator(batch_for_network_generator(), 1000, epochs=1, )

### Save the learned parameters

Uncomment the ```model.save_weights()``` line to actually save to disk

In [None]:
weights_file = './data/RNN/tagger_rnn_trained_keras.h5'

# Actually, this includes the embedding, which is a little redundant
#model.save_weights(weights_file)

### Load pretrained weights into network

In [None]:
model.load_weights(weights_file)

### Check that the Tagger Network 'works'

In [None]:
def tag_results_for(test_sentences):
    """Print the NLTK tag class and the RNN's predicted class for each word.

    Args:
        test_sentences: list of sentences, each a list of token strings.
            Only the first five sentences are processed and printed, and any
            tokens beyond SENTENCE_LENGTH_MAX are ignored (the model input has
            a fixed length).

    Each word is printed as WORD-NLTK-RNN, where NLTK is the class index
    derived from the reference tagger (via pos_tagger_to_idx) and RNN is the
    argmax of the model's softmax output at that position.
    """
    shown_sentences = [ sent[:SENTENCE_LENGTH_MAX] for sent in test_sentences[0:5] ]
    if not shown_sentences:
        return  # nothing to tag

    # Build the same kind of arrays as the training batches (but with integer targets)
    input_values = np.zeros((len(shown_sentences), SENTENCE_LENGTH_MAX), dtype='int32')
    target_values_int = np.zeros((len(shown_sentences), SENTENCE_LENGTH_MAX), dtype='int32')

    for i, sent in enumerate(shown_sentences):
        for j, word in enumerate(sent):
            input_values[i, j] = word_to_idx_rnn(word)
        for j, word_tag in enumerate(pos_tagger.tag(sent)):
            target_values_int[i, j] = pos_tagger_to_idx[word_tag[1]]  # tags are returned as tuples (word, tag)

    # rnn_output_ here is a softmax-vector at every word location
    rnn_output_ = model.predict(input_values)

    for i, sent in enumerate(shown_sentences):
        annotated = [
                "%s-%d-%d" % (word, target_values_int[i,j], np.argmax(rnn_output_[i,j]), )
                for j,word in enumerate(sent)
            ]
        print(' '.join(annotated))

In [None]:
# Hand-written test sentences, already tokenised : tokens are separated by single spaces
sentences=[
    "Dr. Andrews works at Red Cat Labs .",
    "Let 's see what part of speech analysis on this sample text looks like .",
    "When are you off to New York , Chaitanya ?",
]

#test_sentences = batch_sentences()
# Same sentences twice : once with their original capitalisation, once fully lower-cased
test_sentences_mixed = [ s.split(' ') for s in sentences ]
test_sentences_single = [ s.lower().split(' ') for s in sentences ]
#test_sentences_single = [ s.upper().split(' ') for s in sentences ]

print("Format : WORD-NLTK-RNN")

tag_results_for(test_sentences_mixed)
print()
tag_results_for(test_sentences_single)

###  And let's look at the Statistics

... actually, looking at the above samples, the NLTK PoS tagger is HOPELESS when the text is converted to a single case.  QED

### Exercises

1.  Make the tagger identify different PoS (say : 'verbs')

2.  Make the tagger return several different tags instead

3.  See whether more advanced 'LSTM' nodes would improve the scores

4.  Add a special 'is_uppercase' element to the embedding vector (or, more simply, just replace one of the elements with an indicator).  Does this help the NNP accuracy?