In [1]:
import numpy as np
import tensorflow as tf
import random
from datetime import datetime
from tqdm import tqdm_notebook

from wordvector import WordVector
import docload

In [2]:
files = ['../data/adventures_of_sherlock_holmes.txt',
        '../data/hound_of_the_baskervilles.txt',
        '../data/sign_of_the_four.txt']
word_array, dictionary, num_lines, num_words = docload.build_word_array(
    files, vocab_size=50000, gutenberg=True)

print('Document loaded and processed: {} lines, {} words.'
      .format(num_lines, num_words))

Document loaded and processed: 24080 lines, 244986 words.


In [3]:
class Config(object):
    """Model parameters"""
    def __init__(self, num_words):
        self.vocab_size = num_words
        self.batch_size = 32
        self.num_rnn_steps = 10
        self.embed_size = 128
        self.rnn_size = 256
        self.hidden_size = 128
        self.rui_init = 0.01  # maxval, -minval for random_uniform_initializer
        self.vsi_init = 0.01  # stddev multiplier (factor) for variance_scaling_initializer
        self.neg_samples = 64  # for noise contrastive estimation (candidate sampling loss function)
        self.learn_rate = 0.1
        self.momentum = 0.9
        self.epochs = 1000

config = Config(len(dictionary))

In [4]:
# aliases for especially long TensorFlow calls
rui = tf.random_uniform_initializer
vsi = tf.contrib.layers.variance_scaling_initializer

rui_initializer = rui(-config.rui_init, config.rui_init, dtype=tf.float32)
vsi_initializer = vsi(factor=config.vsi_init, dtype=tf.float32)
zero_initializer = tf.zeros_initializer()

In [5]:
def feeder(config, word_array):
    """Generator. Yields training example tuples: (input, target).

    Args:
        config: Config object with model parameters.
        word_array: np.array (int), as generated by docload.build_word_array()

    Returns:
        Yields a tuple of NumPy arrays: (input, target)
    """
    batch_width = len(word_array) // config.batch_size
    data = np.reshape(word_array[0 : config.batch_size*batch_width],
                      (config.batch_size, batch_width))
    shuffle_index = [x for x in range(batch_width - config.num_rnn_steps - 1)]
    random.shuffle(shuffle_index)
    for i in shuffle_index:
        x = data[:, (i):(i+config.num_rnn_steps)]
        y = data[:, (i+1):(i+config.num_rnn_steps+1)]
        yield (x, y)
        
def epoch_len(config, word_array):
    """Number of training steps in an epoch. Used for progress bar"""
    batch_width = len(word_array) // config.batch_size
    return batch_width - config.num_rnn_steps - 1

In [6]:
def model(config):
    '''Embedding layer and RNN'''

    with tf.name_scope('embedding'):
        x = tf.placeholder(tf.int32, shape=(config.batch_size, config.num_rnn_steps), name='input')
        with tf.variable_scope('embedding', initializer=rui_initializer):
            embed_w = tf.get_variable('w', [config.vocab_size, config.embed_size])
        embed_out = tf.nn.embedding_lookup(embed_w, x, name='output')
            
#     with tf.name_scope('rnn'):    
    with tf.variable_scope('rnn', initializer=vsi_initializer):
        rnn_cell = tf.contrib.rnn.BasicLSTMCell(config.rnn_size, activation=tf.tanh)
        rnn_out, state = tf.nn.dynamic_rnn(rnn_cell, embed_out, dtype=tf.float32)
    
    with tf.name_scope('hidden'):
        rnn_last_output = rnn_out[:, config.num_rnn_steps-1, :]
        with tf.variable_scope('hidden'):
            hid_w = tf.get_variable('w', (config.rnn_size, config.hidden_size),
                                   initializer=vsi_initializer)
            hid_b = tf.get_variable('b', config.hidden_size, initializer=zero_initializer)
        hid_out = tf.nn.tanh(tf.matmul(rnn_last_output, hid_w) + hid_b)
            
    return hid_out, x

In [7]:
def loss(config, hid_out):
    """Loss Function: noise contrastive estimation on final output of RNN"""
    with tf.name_scope('loss'):
        y = tf.placeholder(tf.int32, shape=(config.batch_size, config.num_rnn_steps))
        y_last = tf.reshape(y[:, config.num_rnn_steps-1], [config.batch_size, 1],
                           name='target')
        with tf.variable_scope('loss'):
            w = tf.get_variable('w', (config.vocab_size, config.hidden_size),
                                   initializer=vsi_initializer)
            b = tf.get_variable('b', config.vocab_size, initializer=zero_initializer)

        batch_loss = tf.reduce_mean(
            tf.nn.nce_loss(w, b, inputs=hid_out, labels=y_last,
                           num_sampled=config.neg_samples,
                           num_classes=config.vocab_size,
                           num_true=1), name='batch_loss')
    with tf.name_scope('predict'):
        y_hat = tf.argmax(tf.matmul(hid_out, w, transpose_b=True) + b, axis=1)
    
    return y, batch_loss, y_hat

In [8]:
def train(config, batch_loss):
    with tf.name_scope('optimize'):
#         optimizer = tf.train.GradientDescentOptimizer(config.learn_rate)
        optimizer = tf.train.MomentumOptimizer(config.learn_rate, config.momentum)
        train_op = optimizer.minimize(batch_loss, name='minimize_op')
    
    return train_op

In [9]:
def training_monitor(batch_loss):
    with tf.name_scope('train_monitor'):
        with tf.variable_scope('train_monitor'):
            iteration = tf.get_variable('iteration', [], initializer=tf.zeros_initializer())
            total_loss = tf.get_variable('total_loss', [], initializer=tf.zeros_initializer())
        iter_update = tf.assign_add(iteration, tf.convert_to_tensor(1, dtype=tf.float32))
        loss_update = tf.assign_add(total_loss, batch_loss)
        avg_loss = tf.divide(loss_update, iter_update)
    
    return avg_loss, iteration, total_loss

In [10]:
def summary(config, avg_loss):
    with tf.name_scope('summary'):
        loss_monitor = tf.summary.scalar('loss_monitor', avg_loss)
    now = datetime.utcnow().strftime("%m%d%H%M")
    logdir = "../tf_logs/run-{}/".format(now)
    summary_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())
    
    return summary_writer, loss_monitor   

In [11]:
class MyGraph(object):
    def __init__(self, config):
        self.hid_out, self.x = model(config)
        self.y, self.batch_loss, self.y_hat = loss(config, self.hid_out)
        self.train_op = train(config, self.batch_loss)
        self.avg_loss, self.iteration, self.tot_loss = training_monitor(self.batch_loss)
        self.init = tf.global_variables_initializer()
        self.summary_writer, self.loss_monitor = summary(config, self.avg_loss)
        self.saver = tf.train.Saver()

In [12]:
with tf.Graph().as_default():
    g = MyGraph(config)
    with tf.Session() as sess:
        sess.run(g.init)
        counter = 0
        for e in range(config.epochs):
            for t in tqdm_notebook(feeder(config, word_array),
                                   total=epoch_len(config, word_array),
                                  desc='Epoch #{}'.format(e+1)):
                counter += 1
                feed = {g.x: t[0], g.y: t[1]}
                [_, l, i, summary_str] = sess.run([g.train_op, g.avg_loss,
                                                   g.iteration, g.loss_monitor],
                                                  feed_dict=feed)
                if (i % 100) == 0:
                    g.summary_writer.add_summary(summary_str, counter)
                    sess.run(g.iteration, feed_dict={g.iteration: 0, g.tot_loss: 0})
            print(l)
        save_path = g.saver.save(sess, "../tmp/my_model.ckpt")
        g.summary_writer.close()


18.9931

12.1337

9.67044

8.36546

7.53386

6.94486

6.5103

6.16606

5.88737

5.65325

5.45366

5.28169

5.12983

4.99445

4.87302

4.76252

4.66147

4.56844

4.48205

4.40158

4.32624

4.25585

4.18991

4.12797

4.06972

4.01454

3.96203

3.91238

3.86517

3.82026

3.77749

3.73671

3.69792

3.66109

3.62582

3.59221

3.56003

3.52927

3.49982

3.47149

3.44418

3.41847

3.39351

3.36964

3.34695

3.32517

3.30415

3.28412

3.26484

3.24646

3.22841

3.21097

3.19416

3.17799

3.16267

3.14795

3.1344

3.12141

3.10871

3.09609

3.08401

3.07211

3.06063

3.04959

3.03886

3.02847

3.01858

3.00898

2.99954

2.9905

2.98168

2.97309

2.9648

2.95668

2.94877

2.941

2.93345

2.92621

2.91919

2.91216

2.90525

2.89849

2.89181

2.88536

2.8791

2.87301

2.86693

2.86092

2.85517

2.84955

2.84404

2.83862

2.83325

2.82786

2.82264

2.81756

2.81251

2.80777

2.80305

2.79838

2.7936

2.78894

2.78454

2.78017

2.77591

2.77164

2.7674

2.76332

2.75925

2.7553

2.7513

2.74736

2.

KeyboardInterrupt: 

In [None]:
config.batch_size = 1

start = 1400  # start position in document
input = word_array[start:(start+config.num_rnn_steps)]
with tf.Graph().as_default():
    g = MyGraph(config)
    with tf.Session() as sess:
        g.saver.restore(sess, "../tmp/my_model.ckpt")
        for i in range(100):
            feed = {g.x: np.reshape(input[i:(i+config.num_rnn_steps)], (1, -1))}
            [pred] = sess.run([g.y_hat], feed_dict=feed)
            input = np.append(input, [pred])


In [None]:
reverse_dict = {v: k for k, v in dictionary.items()}

In [None]:
# print predicted passage
passage_predict = [x for x in map(lambda x: reverse_dict[x], input)]
readable = ''
for word in passage_predict:
    if word == '"':
        readable += word
    elif word in ['?', '!', '.', ',']:
        readable += word + ' '
    else: 
        readable += ' ' + word
print(readable)