In [12]:

import json, os, re, shutil, sys, time
import collections, itertools


# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf
assert(tf.__version__.startswith("1."))

# utils.pretty_print_matrix uses Pandas. Configure float format here.
import pandas as pd
pd.set_option('float_format', lambda f: "{0:.04f}".format(f))

# Helper libraries
from shared_lib import utils, vocabulary, rnnlm


In [16]:
# Load the "gutenberg" corpus through the shared helper. The helper hands back
# the vocabulary object plus the id sequences used below for training and
# testing.
# If the NLTK corpus is not installed locally yet, run this once first:
#   assert(nltk.download('gutenberg'))
V = 10000  # value passed to load_corpus as its V argument
vocab, train_ids, test_ids = utils.load_corpus(
    "gutenberg",
    split=0.8,
    V=V,
    shuffle=42,
)

Loaded 98552 sentences (2.62178e+06 tokens)
Training set: 78841 sentences (2089056 tokens)
Test set: 19711 sentences (532729 tokens)


In [17]:
TF_GRAPHDIR = "tf_graph"

# Clear old log directory so stale graph events are not mixed with new ones.
shutil.rmtree(TF_GRAPHDIR, ignore_errors=True)

# Build a model with all three sub-graphs (core, train, sampler) so the whole
# graph can be written out for inspection. The vocabulary size reuses V from
# the data-loading cell instead of repeating the literal.
lm = rnnlm.RNNLM(V=V, H=200, num_layers=2)
lm.BuildCoreGraph()
lm.BuildTrainGraph()
lm.BuildSamplerGraph()

# FileWriter queues events and writes them asynchronously; flush explicitly so
# the graph definition is on disk as soon as this cell finishes.
summary_writer = tf.summary.FileWriter(TF_GRAPHDIR, lm.graph)
summary_writer.flush()

(?, 10000)
(?, ?, 1)


In [18]:
def run_epoch(lm, session, batch_iterator,
              train=False, verbose=False,
              tick_s=10, learning_rate=0.1):
    """Run the model over every batch and return the mean per-batch loss.

    Args:
        lm: model object exposing input_w_, target_y_, initial_h_,
            dropout_keep_prob_, loss_, train_loss_ and train_step_.
        session: session whose run() evaluates those tensors / ops.
        batch_iterator: iterable of (w, y) pairs; w.size is counted as the
            number of words in the batch.
        train: if True, run lm.train_step_ on each batch and report
            lm.train_loss_; otherwise only lm.loss_ is evaluated.
        verbose: if True, print a status line at most every tick_s seconds.
        tick_s: minimum number of seconds between status lines.
        learning_rate: accepted for interface compatibility; it is not fed to
            the graph. NOTE(review): the model presumably has a learning-rate
            input this should be fed to -- confirm against rnnlm.

    Returns:
        The sum of the per-batch loss values divided by the number of batches.

    Raises:
        ValueError: if batch_iterator yields no batches.
    """
    start_time = time.time()
    tick_time = start_time  # time the last status line was printed
    total_cost = 0.0  # running sum of the per-batch loss values
    total_batches = 0
    total_words = 0

    if train:
        # Fetch the train op together with the loss so every batch updates
        # the model parameters.
        fetches = [lm.train_loss_, lm.train_step_]
    else:
        # Evaluation only needs the true loss (train_loss_ may be an
        # approximation). Fetching it on its own avoids adding a fresh
        # tf.no_op() node to the graph on every call.
        fetches = [lm.loss_]

    h = None
    for i, (w, y) in enumerate(batch_iterator):
        # At first batch in epoch, get a clean initial state.
        # NOTE(review): this same state is fed for every later batch; the
        # final state of one batch is never carried into the next -- confirm
        # whether that is intended.
        if i == 0:
            h = session.run(lm.initial_h_, {lm.input_w_: w})

        feed_dict = {lm.input_w_: w, lm.target_y_: y, lm.initial_h_: h}
        if not train:
            # No dropout at test time; same override score_seq applies.
            feed_dict[lm.dropout_keep_prob_] = 1.0

        # The loss is always the first fetch; the train op result is unused.
        cost = session.run(fetches, feed_dict)[0]

        total_cost += cost
        total_batches = i + 1
        total_words += w.size  # w.size = batch_size * max_time

        ##
        # Print average loss-so-far for epoch
        # If using train_loss_, this may be an underestimate.
        if verbose and (time.time() - tick_time >= tick_s):
            avg_cost = total_cost / total_batches
            avg_wps = total_words / (time.time() - start_time)
            print("[batch %d]: seen %d words at %d wps, loss = %.3f" % (
                i, total_words, avg_wps, avg_cost))
            tick_time = time.time()  # reset time ticker

    if total_batches == 0:
        raise ValueError("batch_iterator yielded no batches")

    return total_cost / total_batches

In [19]:
def score_dataset(lm, session, ids, name="Data", batch_size=100, max_time=100):
    """Evaluate the model on `ids` and print its average loss and perplexity.

    Args:
        lm: model object, passed through to run_epoch.
        session: session in which the model is evaluated.
        ids: id sequence handed to utils.batch_generator.
        name: label printed in front of the result line.
        batch_size: batch size passed to utils.batch_generator.
        max_time: max_time passed to utils.batch_generator.

    Returns:
        None; the result is only printed.
    """
    # For scoring, we can use larger batches than in training to speed things
    # up; the defaults keep the previous fixed 100 x 100 batches.
    bi = utils.batch_generator(ids, batch_size=batch_size, max_time=max_time)
    # Evaluation pass only: no training step and no progress output.
    cost = run_epoch(lm, session, bi, train=False, verbose=False)
    print("%s: avg. loss: %.03f  (perplexity: %.02f)" % (name, cost, np.exp(cost)))

In [21]:
# Training hyperparameters. max_time and batch_size are handed to
# utils.batch_generator by the training cell.
max_time = 20
batch_size = 50
learning_rate = 0.5
num_epochs = 5

# Model parameters, expanded as keyword arguments to rnnlm.RNNLM.
model_params = {
    "V": vocab.size,
    "H": 100,
    "softmax_ns": 200,
    "num_layers": 1,
}

# Checkpoint locations: checkpoint_filename is the prefix for the per-epoch
# saves, trained_filename is where the final model is saved and restored from.
TF_SAVEDIR = "tf_saved"
checkpoint_filename = os.path.join(TF_SAVEDIR, "rnnlm")
trained_filename = os.path.join(TF_SAVEDIR, "rnnlm_trained")

In [22]:
# Will print status every this many seconds
print_interval = 5

# Clear old log directory
shutil.rmtree("tf_summaries", ignore_errors=True)

lm = rnnlm.RNNLM(**model_params)
lm.BuildCoreGraph()
lm.BuildTrainGraph()

# Explicitly add global initializer and variable saver to LM graph
with lm.graph.as_default():
    initializer = tf.global_variables_initializer()
    saver = tf.train.Saver()
    
# Clear old log directory
shutil.rmtree(TF_SAVEDIR, ignore_errors=True)
if not os.path.isdir(TF_SAVEDIR):
    os.makedirs(TF_SAVEDIR)

with tf.Session(graph=lm.graph) as session:
    # Seed RNG for repeatability
    tf.set_random_seed(42)

    session.run(initializer)

    for epoch in xrange(1,num_epochs+1):
        t0_epoch = time.time()
        bi = utils.batch_generator(train_ids, batch_size, max_time)
        print "[epoch %d] Starting epoch %d" % (epoch, epoch)
        #### YOUR CODE HERE ####
        # Run a training epoch.
        
        run_epoch(lm, session, bi,
              train=True, verbose=True,
              tick_s=10, learning_rate=0.1)
        
        #### END(YOUR CODE) ####
        print "[epoch %d] Completed in %s" % (epoch, utils.pretty_timedelta(since=t0_epoch))
    
        # Save a checkpoint
        saver.save(session, checkpoint_filename, global_step=epoch)
    
        ##
        # score_dataset will run a forward pass over the entire dataset
        # and report perplexity scores. This can be slow (around 1/2 to 
        # 1/4 as long as a full epoch), so you may want to comment it out
        # to speed up training on a slow machine. Be sure to run it at the 
        # end to evaluate your score.
        print ("[epoch %d]" % epoch),
        score_dataset(lm, session, train_ids, name="Train set")
        print ("[epoch %d]" % epoch),
        score_dataset(lm, session, test_ids, name="Test set")
        print ""
    
    # Save final model
    saver.save(session, trained_filename)

[epoch 1] Starting epoch 1
[batch 63]: seen 64000 words at 6345 wps, loss = 6.477
[batch 133]: seen 134000 words at 6659 wps, loss = 5.980
[batch 200]: seen 201000 words at 6659 wps, loss = 5.707
[batch 270]: seen 271000 words at 6726 wps, loss = 5.513
[batch 340]: seen 341000 words at 6771 wps, loss = 5.366
[batch 409]: seen 410000 words at 6791 wps, loss = 5.258
[batch 479]: seen 480000 words at 6808 wps, loss = 5.170
[batch 549]: seen 550000 words at 6821 wps, loss = 5.094
[batch 619]: seen 620000 words at 6832 wps, loss = 5.027
[batch 688]: seen 689000 words at 6838 wps, loss = 4.970
[batch 757]: seen 758000 words at 6843 wps, loss = 4.922
[batch 827]: seen 828000 words at 6852 wps, loss = 4.877
[batch 897]: seen 898000 words at 6856 wps, loss = 4.837
[batch 965]: seen 966000 words at 6850 wps, loss = 4.801
[batch 1036]: seen 1037000 words at 6862 wps, loss = 4.770
[batch 1106]: seen 1107000 words at 6868 wps, loss = 4.739
[batch 1176]: seen 1177000 words at 6875 wps, loss = 4.710


In [35]:
def score_seq(lm, session, seq, vocab):
    """Score a sequence of words with the language model.

    Args:
        lm: model object exposing input_w_, target_y_, initial_h_,
            dropout_keep_prob_ and loss_.
        session: session in which the model is evaluated.
        seq: list of word strings; "<s>" and "</s>" are added around it.
        vocab: vocabulary providing word_to_id and words_to_ids().

    Returns:
        -1 * lm.loss_ evaluated on the padded sequence.
        NOTE(review): this is a total log-probability only if lm.loss_ sums
        over tokens; if it is a per-token mean, the result is an average
        log-probability per token -- confirm against rnnlm.
    """
    tokens = ["<s>"] + seq + ["</s>"]
    canonical = utils.canonicalize_words(tokens, wordset=vocab.word_to_id)
    ids = vocab.words_to_ids(canonical)

    # Inputs are all ids but the last; targets are the same ids shifted by
    # one position. Both become a single-row batch.
    w = np.reshape(ids[:-1], [1, -1])
    y = np.reshape(ids[1:], [1, -1])

    # Fetch an initial state for this one-row batch.
    h = session.run(lm.initial_h_, {lm.input_w_: w})

    loss = session.run(lm.loss_, {
        lm.input_w_: w,
        lm.target_y_: y,
        lm.initial_h_: h,
        lm.dropout_keep_prob_: 1.0,  # keep probability 1.0 while scoring
    })
    # Return log(P(seq)) = -1*loss
    return -1 * loss

def load_and_score(inputs, sort=False):
    """Load the trained model and print a score for each given word sequence.

    Args:
        inputs: either one tokenized sentence (a list of word strings) or a
            list of tokenized sentences. An empty list prints nothing.
        sort: if True, print results in descending order of score.

    Returns:
        None; each result is printed as '"<words>" : <score>'.
    """
    # A flat list of strings is a single sentence; wrap it so the loop below
    # always sees a list of sentences. The length check avoids an IndexError
    # on empty input, and runs before the model is built so bad input is
    # handled before the expensive restore.
    if len(inputs) > 0 and isinstance(inputs[0], (str, unicode)):
        inputs = [inputs]

    # Only the core graph is needed for scoring.
    lm = rnnlm.RNNLM(**model_params)
    lm.BuildCoreGraph()

    with lm.graph.as_default():
        saver = tf.train.Saver()

    with tf.Session(graph=lm.graph) as session:
        # Load the trained model
        saver.restore(session, trained_filename)

        # Actually run scoring
        results = [(score_seq(lm, session, words, vocab), words)
                   for words in inputs]

    # Sort if requested (highest score first)
    if sort:
        results = sorted(results, reverse=True)

    # Print results
    for score, words in results:
        print("\"%s\" : %.02f" % (" ".join(words), score))

In [37]:
# Example sentences to score with the trained model.
sents = [
    "i shall be miserable if i have not an excellent library.",
    "to be fond of dancing was a certain step towards falling in love.",
    "don't let the door hit you on your way out.",
    "he was totally into it.",
    "to be or not to be.",
    "come on y'all, don't be like that!",
    "get outta here",
]
# Split each sentence on whitespace only, so punctuation stays attached to
# the neighbouring word, then score all of them in one model load.
load_and_score(
    [s.split() for s in sents]
)

INFO:tensorflow:Restoring parameters from tf_saved/rnnlm_trained
"i shall be miserable if i have not an excellent library." : -4.91
"to be fond of dancing was a certain step towards falling in love." : -6.22
"don't let the door hit you on your way out." : -5.82
"he was totally into it." : -5.13
"to be or not to be." : -4.63
"come on y'all, don't be like that!" : -5.00
"get outta here" : -6.03
