In [1]:
import os
import tensorflow as tf

import numpy
from numpy import array

from random import randint

from pickle import load
from pickle import dump 

from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences 

Using TensorFlow backend.


### Load Text

Loads the doc into the memory:

In [2]:
# loads the doc into memory
def load_doc(filename, encoding=None):
    """Read a whole text file and return its contents as one string.

    Args:
        filename: path of the file to read.
        encoding: optional text encoding passed to ``open``; ``None`` keeps
            the platform default, which is what the original code used.

    Returns:
        The full text of the file.
    """
    # the context manager closes the file even if read() raises
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()

### Clean Text

Now, I need to transform the raw text into a sequence of tokens or words that I can use as a source to train my model. In this preprocessing step, I'll:
1. Replace '--' with a white space so I can split words better
2. Split words based on white space
3. Remove all punctuation from words to reduce the vocabulary size
4. Remove all words that aren't alphabetic to remove standalone punctuation tokens. 
5. Normalize all words to lowercase to reduce the vocabulary size.

Note how I'm changing vocabulary size with each cleaning iteration. It's a big deal - a smaller vocabulary results in a smaller model that trains faster. 

In [3]:
import string

# turns a document into clean tokens
def clean_doc(doc):
    """Convert raw text into a list of lowercase, purely alphabetic words.

    Double hyphens are treated as word separators, the text is split on
    whitespace, punctuation is stripped from every word, and any word that
    is not entirely alphabetic after stripping is discarded.

    Args:
        doc: the raw document text.

    Returns:
        A list of cleaned, lowercased tokens in document order.
    """
    # translation table that deletes every ASCII punctuation character
    strip_punct = str.maketrans('', '', string.punctuation)
    raw_words = doc.replace('--', ' ').split()
    cleaned = []
    for raw in raw_words:
        stripped = raw.translate(strip_punct)
        # drops numbers, mixed tokens and words that were only punctuation
        if stripped.isalpha():
            cleaned.append(stripped.lower())
    return cleaned

### Saving Cleaned Text

Not 100% sure why I need to do this - but this is an experiment, so I'll just go along with it. 

I apparently want to create sequences of 51 words:

In [4]:
# saves sequences to file, one sequence per line
def save_doc(lines, filename):
    """Write the given lines to ``<filename>_sequences.txt``.

    Args:
        lines: iterable of strings; they are joined with newlines, so the
            file has no trailing newline.
        filename: base name of the output file; ``'_sequences.txt'`` is
            appended to it.
    """
    data = '\n'.join(lines)
    # the context manager closes the file even if write() raises
    with open(filename + '_sequences.txt', 'w') as file:
        file.write(data)

In [5]:
def preprocessing(document, seq_length=50, out_filename='example'):
    """Clean a text file and save its overlapping training sequences.

    The document is loaded with ``load_doc``, tokenised with ``clean_doc``
    and cut into sliding windows of ``seq_length + 1`` tokens (the input
    words plus the word to predict). Each window becomes one
    space-separated line, and the lines are written with ``save_doc``.

    Args:
        document: path of the raw text file to process.
        seq_length: number of input words per sequence; one extra word is
            kept on each line as the prediction target.
        out_filename: base name handed to ``save_doc``.

    Returns:
        None. The sequences are only written to disk; load them back from
        the saved file.
    """
    # loads the document
    doc = load_doc(document)
    # sanity check, uncomment to see first 200 characters
    # print(doc[:200])

    # cleans the document
    tokens = clean_doc(doc)
    # sanity checks: first 200 tokens and vocabulary statistics
    print(tokens[:200])
    print('Total Tokens: %d' % len(tokens))
    print('Unique Tokens: %d' % len(set(tokens)))

    # organize into sequences of tokens
    length = seq_length + 1
    # the upper bound is len(tokens) + 1 so the last full window, which ends
    # on the final token, is included; without it that window was dropped
    sequences = [
        ' '.join(tokens[end - length:end])
        for end in range(length, len(tokens) + 1)
    ]
    print('Total Sequences: %d' % len(sequences))

    # save sequences to file
    save_doc(sequences, out_filename)

## Train the Language Model

The model I'm training is a neural language model -
1. It uses a distributed representation for words so different words with similar meanings will have a similar representation.
2. It learns the representation at the same time as learning the model
3. It learns to predict the probability for the next word using the context of the last 50 words

I will use the Embedding Layer to learn the representations of words, and a LSTM to learn to predict words based on their context. 

### Load Sequences
I can load my training data by using the load_doc() method I created above. Once loaded, I'll split the data into separate training sequences by splitting based on new lines. 

In [6]:
# path of the raw text file to preprocess, relative to the working directory
document = 'data/brown-test.txt'
# preprocesses the document and saves to file
# (prints the first tokens and the token / sequence counts as a sanity check)
preprocessing(document)

['my', 'impassioned', 'plea', 'for', 'civil', 'rights', 'created', 'a', 'landslide', 'of', 'correspondence', 'and', 'one', 'sponsor', 'even', 'asked', 'me', 'to', 'consider', 'replacing', 'the', 'eddie', 'cantor', 'comedy', 'hour', 'on', 'a', 'permanent', 'basis', 'but', 'what', 'quarter', 'could', 'a', 'poor', 'defenseless', 'woman', 'expect', 'from', 'a', 'dictator', 'who', 'would', 'even', 'make', 'so', 'bold', 'as', 'to', 'close', 'all', 'of', 'the', 'banks', 'in', 'our', 'great', 'nation', 'the', 'savage', 'barbarian', 'hordes', 'of', 'red', 'russian', 'communism', 'descended', 'on', 'the', 'athens', 'that', 'was', 'mighty', 'metronome', 'sacking', 'and', 'despoiling', 'with', 'their', 'bolshevistic', 'battle', 'cry', 'of', 'soak', 'the', 'rich', 'after', 'an', 'unspeakable', 'siege', 'lasting', 'the', 'better', 'part', 'of', 'two', 'months', 'it', 'was', 'announced', 'that', 'the', 'studio', 'owed', 'the', 'government', 'a', 'tax', 'debt', 'in', 'excess', 'of', 'eight', 'million'

In [7]:
# loads doc into memory
# this is the sequences file written by the preprocessing step
in_filename = "example_sequences.txt"
doc = load_doc(in_filename)
# one training sequence per line
lines = doc.split('\n')

### Encode Sequences
The word embedding layer expects input sequences to be comprised of integers. I can map each word in my vocabulary to a unique integer and encode my input sequences. When my model will make predictions, I can convert the prediction to numbers and look up their associated words in the same mapping. 

To do this encoding, I'm using the Tokenizer class in the Keras API. 

The Tokenizer __must be trained on the entire training dataset__, which means that it finds all of the unique words in the data and assigns each a unique integer. 

I can then use the fit Tokenizer to encode all of the training sequences, converting each sequence from a list of words to a list of integers. 

I can access the mapping of words to integers as a dictionary attribute called `word_index` on the Tokenizer object. 

I'll need to know the size of the vocabulary for defining the embedding layer later. I can determine the vocabulary size by calculating the size of the mapping dictionary. 

Words are assigned values from 1 to the total number of words. __NOTE: This is what that one Keras tutorial was trying to avoid with multi-hot encoding. Pay attention to that fact when I'm either working with the model or typing my report to David__. The index of arrays is zero-offset (think: European floors). 

That means that when I'm specifying the vocabulary size to the Embedding layer, __I have to specify it as + 1 larger than the actual vocabulary data__. 

In [8]:
# integer encode sequences of words
tokenizer = Tokenizer()
# builds the word -> integer mapping from every training line
tokenizer.fit_on_texts(lines)
# each line becomes a list of integers, one per word
sequences = tokenizer.texts_to_sequences(lines)

# vocabulary size
# + 1 because word indices start at 1 while array indices start at 0
vocab_size = len(tokenizer.word_index) + 1

### Sequence Inputs and Outputs

Now that I've encoded the input sequences using Keras, I need to separate them into input (X) and output (y) elements. I'll do this with __array slicing__. 

After separating, I need to one-hot encode the output word (if you think back to my TensorFlow tutorial, this means converting it from an integer to a vector of 0 values, one for each word in the vocabulary, with a 1 to indicate the specific word at the index of the words' integer value). 

This is so the model learns to predict the probability distribution for the next word; the ground truth to learn from is 0 for all words except the actual word that comes next. 

`Keras` provides the `to_categorical()` method that can be used to one-hot encode the output words for each input-output sequence pair. 

Finally, I'll need to specify to the Embedding layer how long input sequences are. I know that there are 50 words because I designed the model, but it's normally better to use the second dimension (i.e. the number of columns) of the input data's shape. That way, if I decide to change the length of the sequences when preparing data, I don't need to change this data loading code and it can work generically. 

In [9]:
# separate into input and output
# assumes every encoded line has the same length, giving a 2-D array — TODO confirm
sequences = array(sequences)
# every column but the last is the input context; the last column is the word to predict
X, y = sequences[:,:-1], sequences[:,-1]
# one-hot encode the target word over the whole vocabulary
y = to_categorical(y, num_classes=vocab_size)
# input length is taken from the data instead of being hard-coded
seq_length = X.shape[1]

### Fitting the Model

Now I can define and fit the language model on the training data. As inputs, the learned embedding needs to know the size of the vocabulary and the length of input sequences. It also needs a parameter to specify how many dimensions will be used to represent each word (i.e., the size of the embedding vector space). For this, common values are 50, 100, and 300. I'll use 50 here. 

I will use two LSTM hidden layers with 100 memory cells each. 

A dense fully connected layer with 100 neurons connects to the LSTM hidden layers to interpret the features extracted from the sequence. In Bengio's model, he uses a hidden layer I didn't understand and another tanh activation layer, and then a softmax. Not sure if those were dense. 

In [10]:
# define the model
model = Sequential()
# learned 50-dimensional word embedding over the whole vocabulary
model.add(Embedding(vocab_size, 50, input_length=seq_length))
# two stacked LSTM layers with 100 units each; the first returns the full
# sequence so the second LSTM receives one vector per time step
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
# dense layer that interprets the features extracted by the LSTMs
model.add(Dense(100, activation='relu'))
# one probability per vocabulary entry for the next word
model.add(Dense(vocab_size, activation='softmax'))
# sanity check: summary() prints the table itself and returns None, so
# wrapping it in print() only added a stray "None" line to the output
model.summary()
# FYI, training might take a couple hours without a GPU. I can
#    speed it up with a larger batch size AND/OR fewer training
#    epochs (see parameters below).

# compiles the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# fits the model
model.fit(X, y, batch_size=128, epochs=100)

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 50)            196750    
_________________________________________________________________
lstm_1 (LSTM)                (None, 50, 100)           60400     
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 3935)              397435    
Total params: 745,085
Trainable params: 745,085
Non-trainable params: 0
_________________________________________________________________
None
Instructions for updating:
Use tf.cast instead.
Epoch 1/100

KeyboardInterrupt: 

### Saving the Model

The trained model, at the end of the run, should be saved to file. I'll use the Keras API to save the model as 'model.h5' to the CWD. 

Later, when I load the model to make predictions, I'll also need the mapping of words to integers. This is in the Tokenizer object and I can save that using Pickle. 

In [None]:
# saves the model
model.save('model.h5')

# save the tokenizer
# the context manager closes the file so the pickle is fully flushed to disk
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

## Using the Language Model

Now that I have all that saved, I can use it! __This isn't what the Bengio Model is supposed to return__. Instead, it returns generated new sequences of text that have the same statistical properties as the source text. This is just to demonstrate the pipeline. 

### Load the Data
I can use the same code from the previous section to load the training data sequences. I need the text so that I can choose a source sequence as an input to the model to generate a new sequence of text. 

The model will require 50 words as an input. 

Later, I'll need to specify the expected length of input. __I can determine this from input sequences by calculating the length of one line of the loaded data and subtracting 1 for the expected output word that is also on the same line__. 

In [None]:
# load the doc into memory and load the cleaned text sequences
# same sequences file that the preprocessing step wrote and training read
in_filename = 'example_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
# each line holds the input words plus the one output word, so the model's
# input length is the word count of a line minus 1
seq_length = len(lines[0].split()) - 1

### Load the Model
I can also now load the model from file. 

Keras provides the `load_model()` function for loading the model, ready for use. I can also load the tokenizer using the Pickle API. 

In [None]:
# loads the model
model = load_model('model.h5')

# loads the tokenizer
# NOTE: unpickling can execute arbitrary code; only load a tokenizer.pkl
# that this notebook produced itself
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)

### Generate Text

The first step in generating text is __preparing a seed input__. I will select a random line of text from the input text for this purpose. Once selected, I'll print it just for a sanity check. 

In [None]:
# select a seed text
# randint includes both endpoints, so the upper bound must be the last
# valid index; len(lines) itself would raise IndexError
seed_text = lines[randint(0, len(lines) - 1)]
print(seed_text + '\n')

Now, I can generate new words, one at a time. 

First, the seed text must be encoded to integers using the same tokenizer that I used when training the model. 

In [None]:
# encoded = tokenizer.texts_to_sequences([seed_text])[0]

The model can predict the next word directly by calling `model.predict_classes()` that will return the index of the word with the highest probability. 

In [None]:
# predict probabilities for each word
# yhat = model.predict_classes(encoded, verbose=0)

I can then look up the index in the Tokenizer's mapping to get the associated word.

In [None]:
# Illustration only: the lookup loop below sits inside a string literal, so it
# is not executed when this cell runs.
# out_word = ''
"""
for word, index in tokenizer.word_index.items():
    if index == yhat:
        out_word = word
        break
"""

I can then append this to the seed text and repeat the process. 

The input sequence is __going to get too long__. I can truncate it to the desired length after the input sequence has been encoded to integers. Keras provides the `pad_sequences()` function that we can use to perform this truncation.

In [None]:
# encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')

Now I'm going to wrap all of the above functions into a single function called `generate_seq()` that takes as its input: the model, the tokenizer, the input sequence length, the seed text, and the number of words to generate. It will return a sequence of words generated by the model.

In [None]:
# generates a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """Generate ``n_words`` new words by repeatedly predicting the next word.

    Args:
        model: trained model whose ``predict`` returns one probability per
            vocabulary index for each input row.
        tokenizer: fitted tokenizer providing ``texts_to_sequences`` and the
            ``word_index`` word -> integer mapping.
        seq_length: number of input words the model expects.
        seed_text: text used as the initial context.
        n_words: how many words to generate.

    Returns:
        The generated words joined by single spaces (the seed text is not
        included). An index with no entry in ``word_index`` contributes an
        empty string.
    """
    # invert the word -> index mapping once instead of scanning it per word
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # keep only the most recent seq_length words (shorter inputs are padded)
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # argmax over the class probabilities picks the most likely word; this
        # matches predict_classes() for a softmax output and also works on
        # Keras releases that no longer provide predict_classes()
        probabilities = model.predict(encoded, verbose=0)
        yhat = int(numpy.argmax(probabilities, axis=-1)[0])
        # map predicted word index to word
        out_word = index_to_word.get(yhat, '')
        # append to input so the next prediction sees the new word
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

Now I'm going to generate a sequence of new words given some seed text:

In [None]:
# generate new text
# asks the model for 50 new words that continue the seed text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(generated)