In [1]:
import numpy as np
import tensorflow as tf
import random
from datetime import datetime
from tqdm import tqdm_notebook

from wordvector import WordVector
import docload

In [2]:
files = ['../data/adventures_of_sherlock_holmes.txt',
        '../data/hound_of_the_baskervilles.txt',
        '../data/sign_of_the_four.txt']
word_array, dictionary, num_lines, num_words = docload.build_word_array(
    files, vocab_size=50000, gutenberg=True)

print('Document loaded and processed: {} lines, {} words.'
      .format(num_lines, num_words))

Document loaded and processed: 24080 lines, 244986 words.


In [3]:
class Config(object):
    """Model parameters"""
    def __init__(self, num_words):
        self.vocab_size = num_words
        self.batch_size = 32
        self.num_rnn_steps = 20
        self.embed_size = 128
        self.rnn_size = 256
        self.hidden_size = 128
        self.rui_init = 0.01  # maxval, -minval for random_uniform_initializer
        self.vsi_init = 0.01  # stddev multiplier (factor) for variance_scaling_initializer
        self.neg_samples = 64  # for noise contrastive estimation (candidate sampling loss function)
        self.learn_rate = 0.1
        self.momentum = 0.9
        self.epochs = 100

config = Config(len(dictionary))

In [4]:
tf.__version__

'1.0.0'

In [5]:
# aliases for especially long TensorFlow calls
rui = tf.random_uniform_initializer
vsi = tf.contrib.layers.variance_scaling_initializer

rui_initializer = rui(-config.rui_init, config.rui_init, dtype=tf.float32)
vsi_initializer = vsi(factor=config.vsi_init, dtype=tf.float32)
zero_initializer = tf.zeros_initializer(dtype=tf.float32)

In [6]:
def feeder(config, word_array):
    """Generator. Yields training example tuples: (input, target).

    Args:
        config: Config object with model parameters.
        word_array: np.array (int), as generated by docload.build_word_array()

    Returns:
        Yields a tuple of NumPy arrays: (input, target)
    """
    batch_width = len(word_array) // config.batch_size
    data = np.reshape(word_array[0 : config.batch_size*batch_width],
                      (config.batch_size, batch_width))
    shuffle_index = [x for x in range(batch_width - config.num_rnn_steps - 1)]
    random.shuffle(shuffle_index)
    for i in shuffle_index:
        x = data[:, (i):(i+config.num_rnn_steps)]
        y = data[:, (i+1):(i+config.num_rnn_steps+1)]
        yield (x, y)
        
def epoch_len(config, word_array):
    """Number of training steps in an epoch. Used for progress bar"""
    batch_width = len(word_array) // config.batch_size
    return batch_width - config.num_rnn_steps - 1

In [7]:
def model(config):
    '''Embedding layer and RNN'''

    with tf.name_scope('embedding'):
        x = tf.placeholder(tf.int32, shape=(config.batch_size, config.num_rnn_steps), name='input')
        with tf.variable_scope('embedding', initializer=rui_initializer):
            embed_w = tf.get_variable('w', [config.vocab_size, config.embed_size])
        embed_out = tf.nn.embedding_lookup(embed_w, x, name='output')
            
#     with tf.name_scope('rnn'):    
    with tf.variable_scope('rnn', initializer=vsi_initializer):
        rnn_cell = tf.contrib.rnn.BasicLSTMCell(config.rnn_size, activation=tf.tanh)
        rnn_out, state = tf.nn.dynamic_rnn(rnn_cell, embed_out, dtype=tf.float32)
    
    with tf.name_scope('hidden'):
        rnn_last_output = rnn_out[:, config.num_rnn_steps-1, :]
        with tf.variable_scope('hidden'):
            hid_w = tf.get_variable('w', (config.rnn_size, config.hidden_size),
                                   initializer=vsi_initializer)
            hid_b = tf.get_variable('b', config.hidden_size, initializer=zero_initializer)
        hid_out = tf.nn.tanh(tf.matmul(rnn_last_output, hid_w) + hid_b)
            
    return hid_out, x

In [8]:
def loss(config, hid_out):
    """Loss Function: noise contrastive estimation on final output of RNN"""
    with tf.name_scope('loss'):
        y = tf.placeholder(tf.int32, shape=(config.batch_size, config.num_rnn_steps))
        y_last = tf.reshape(y[:, config.num_rnn_steps-1], [config.batch_size, 1],
                           name='target')
        with tf.variable_scope('loss'):
            w = tf.get_variable('w', (config.vocab_size, config.hidden_size),
                                   initializer=vsi_initializer)
            b = tf.get_variable('b', config.vocab_size, initializer=zero_initializer)

        batch_loss = tf.reduce_mean(
            tf.nn.nce_loss(w, b, inputs=hid_out, labels=y_last,
                           num_sampled=config.neg_samples,
                           num_classes=config.vocab_size,
                           num_true=1), name='batch_loss')
    with tf.name_scope('predict'):
        y_hat = tf.argmax(tf.matmul(hid_out, w, transpose_b=True) + b, axis=1)
    
    return y, batch_loss, y_hat

In [9]:
def train(config, batch_loss):
    with tf.name_scope('optimize'):
#         optimizer = tf.train.GradientDescentOptimizer(config.learn_rate)
        optimizer = tf.train.MomentumOptimizer(config.learn_rate, config.momentum)
        train_op = optimizer.minimize(batch_loss, name='minimize_op')
    
    return train_op

In [10]:
def training_monitor(batch_loss):
    with tf.name_scope('train_monitor'):
        with tf.variable_scope('train_monitor'):
            iteration = tf.get_variable('iteration', [], initializer=tf.zeros_initializer())
            total_loss = tf.get_variable('total_loss', [], initializer=tf.zeros_initializer())
        iter_update = tf.assign_add(iteration, tf.convert_to_tensor(1, dtype=tf.float32))
        loss_update = tf.assign_add(total_loss, batch_loss)
        avg_loss = tf.divide(loss_update, iter_update)
    
    return avg_loss, iteration, total_loss

In [11]:
def summary(config, avg_loss):
    with tf.name_scope('summary'):
        loss_monitor = tf.summary.scalar('loss_monitor', avg_loss)
    now = datetime.utcnow().strftime("%m%d%H%M")
    logdir = "../tf_logs/run-{}/".format(now)
    summary_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())
    
    return summary_writer, loss_monitor   

In [12]:
class MyGraph(object):
    def __init__(self, config):
        self.hid_out, self.x = model(config)
        self.y, self.batch_loss, self.y_hat = loss(config, self.hid_out)
        self.train_op = train(config, self.batch_loss)
        self.avg_loss, self.iteration, self.tot_loss = training_monitor(self.batch_loss)
        self.init = tf.global_variables_initializer()
        self.summary_writer, self.loss_monitor = summary(config, self.avg_loss)
        self.saver = tf.train.Saver()

In [13]:
with tf.Graph().as_default():
    g = MyGraph(config)
    with tf.Session() as sess:
        sess.run(g.init)
        counter = 0
        for e in range(config.epochs):
            for t in tqdm_notebook(feeder(config, word_array),
                                   total=epoch_len(config, word_array),
                                  desc='Epoch #{}'.format(e+1)):
                counter += 1
                feed = {g.x: t[0], g.y: t[1]}
                [_, l, i, summary_str] = sess.run([g.train_op, g.avg_loss,
                                                   g.iteration, g.loss_monitor],
                                                  feed_dict=feed)
                if (i % 100) == 0:
                    g.summary_writer.add_summary(summary_str, counter)
                    sess.run(g.iteration, feed_dict={g.iteration: 0, g.tot_loss: 0})
            print(l)
        save_path = g.saver.save(sess, "../tmp/my_model_20.ckpt")
        g.summary_writer.close()


19.4659

12.4026

9.83976

8.47563

7.61749

7.01144

6.55689

6.20124

5.91108

5.66965

5.46636

5.28955

5.13436

4.99574

4.87152

4.75909

4.65683

4.56232

4.47583

4.39516

4.31954

4.24961

4.18405

4.12277

4.06516

4.01042

3.95903

3.91065

3.86405

3.81971

3.77761

3.73748

3.69912

3.66235

3.62754

3.59452

3.56273

3.53242

3.5032

3.47551

3.44887

3.42313

3.39866

3.37514

3.35254

3.33096

3.31001

3.28975

3.27034

3.25192

3.23426

3.21699

3.2003

3.18445

3.16921

3.15461

3.14056

3.12702

3.11396

3.10118

3.0888

3.07698

3.06549

3.05442

3.04371

3.03323

3.02328

3.0136

3.00421

2.99527

2.9864

2.97798

2.96996

2.96202

2.95445

2.94698

2.93967

2.93276

2.92597

2.91926

2.91287

2.90676

2.90053

2.89439

2.88837

2.88253

2.87689

2.87132

2.86574

2.86034

2.85514

2.85018

2.84527

2.84057

2.83588

2.83109

2.82646

2.82199

2.81763

2.81322


In [25]:
config.batch_size = 1

start = 111700  # start position in document
input = word_array[start:(start+config.num_rnn_steps)]
with tf.Graph().as_default():
    g = MyGraph(config)
    with tf.Session() as sess:
        g.saver.restore(sess, "../tmp/my_model_20.ckpt")
        for i in range(100):
            feed = {g.x: np.reshape(input[i:(i+config.num_rnn_steps)], (1, -1))}
            [pred] = sess.run([g.y_hat], feed_dict=feed)
            input = np.append(input, [pred])


In [15]:
reverse_dict = {v: k for k, v in dictionary.items()}

In [26]:
# print predicted passage
passage_predict = [x for x in map(lambda x: reverse_dict[x], input)]
readable = ''
for word in passage_predict:
    if word == '"':
        readable += word
    elif word in ['?', '!', '.', ',']:
        readable += word + ' '
    else: 
        readable += ' ' + word
print(readable)

"" it is unfortunately more than possible it is certain.  neither you nor your son knew the true solution of human life sounds feasible self control to burst off. "" you'll find me in that god! " he relapsed into which led over the table land,  and then third house with lolling tongue over the leaves yonder within green patches by leaps and wavered all round him. " 'may you ask?  i guards over her motion like over again,  and her stepfather was exceedingly angry,  if i am going,  frankly.  i don't feel easy in proving murder against my nerves talked over to each cases.  tonga then
