In [1]:
import collections
import numpy as np
import os
import random
import string
import tensorflow as tf
import zipfile
from matplotlib import pylab
import time

%matplotlib inline

In [2]:
def get_data(f_name):
    with zipfile.ZipFile(f_name) as f:
        nl = f.namelist() #a list of the names of files in the zip directory only one in our cast
        data = f.read(nl[0])
    return data

text = get_data('text8.zip')
print 'Text Len: ', len(text)

Text Len:  100000000


Create a small Validation Set

In [3]:
valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)

print train_size, train_text[:64]
print valid_size, valid_text[:64]

99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


In [4]:
VOCAB_SIZE = len(string.ascii_lowercase) + 1 #27, [a-z] and __space___

def char_to_id(char):
    return string.ascii_lowercase.find(char) + 1

def id_to_char(char_id):
    return string.ascii_lowercase[char_id - 1] if char_id > 0 else ' '


print char_to_id('a'), id_to_char(1), id_to_char(char_to_id('a'))
print char_to_id('z'), id_to_char(26), id_to_char(char_to_id('z'))
print '>'+id_to_char(char_to_id('/'))+'<'
    

1 a a
26 z z
> <


In [5]:
NUM_UNROLLING = 10
BATCH_SIZE = 64

class BatchGenerator(object):
    def __init__(self, text, batch_size, num_unrolling):
        self._text = text
        self._text_size = len(text)
        self._batch_size = batch_size
        self._num_unrolling = num_unrolling
        
        segment = self._text_size/self._batch_size
        self._cursor = [offset * segment for offset in range(self._batch_size)]
        
        self._last_batch = self._next_batch()
    
    def _next_batch(self):
        batch = np.zeros((self._batch_size, VOCAB_SIZE), dtype = np.float32)
        for i in range(self._batch_size):
            curs = self._cursor[i]
            batch[i, char_to_id(self._text[curs])] = 1.0
            self._cursor[i] = (self._cursor[i] + 1) % self._text_size
        return batch
    
    def next(self):
        """
        generates a number of (num_unrolling + 1) matrices. where each matrix represents a batch of characters at each
        timestep.
        
        """
        batches = [self._last_batch]
        
        for _ in range(self._num_unrolling):
            batches.append(self._next_batch())
        self._last_batch = batches[-1]
        return batches
    
def characters(probabilities):
    """
    a batch of probabilities of the shape (batch_size, vocab_size)
    """
    return [id_to_char(c) for c in np.argmax(probabilities, 1)]

def batches_to_strings(batches):
    batch_strings = characters(batches[0])
    for batch in batches[1:]:
        batch_strings = [s+c for s, c in zip(batch_strings, characters(batch))]
    return batch_strings
    
    
train_batches = BatchGenerator(train_text, BATCH_SIZE, NUM_UNROLLING)
valid_batches = BatchGenerator(valid_text, 1, 1)

print batches_to_strings(train_batches.next())
print batches_to_strings(train_batches.next())
print batches_to_strings(valid_batches.next())
print batches_to_strings(valid_batches.next())


['ons anarchi', 'when milita', 'lleria arch', ' abbeys and', 'married urr', 'hel and ric', 'y and litur', 'ay opened f', 'tion from t', 'migration t', 'new york ot', 'he boeing s', 'e listed wi', 'eber has pr', 'o be made t', 'yer who rec', 'ore signifi', 'a fierce cr', ' two six ei', 'aristotle s', 'ity can be ', ' and intrac', 'tion of the', 'dy to pass ', 'f certain d', 'at it will ', 'e convince ', 'ent told hi', 'ampaign and', 'rver side s', 'ious texts ', 'o capitaliz', 'a duplicate', 'gh ann es d', 'ine january', 'ross zero t', 'cal theorie', 'ast instanc', ' dimensiona', 'most holy m', 't s support', 'u is still ', 'e oscillati', 'o eight sub', 'of italy la', 's the tower', 'klahoma pre', 'erprise lin', 'ws becomes ', 'et in a naz', 'the fabian ', 'etchy to re', ' sharman ne', 'ised empero', 'ting in pol', 'd neo latin', 'th risky ri', 'encyclopedi', 'fense the a', 'duating fro', 'treet grid ', 'ations more', 'appeal of d', 'si have mad']
['ists advoca', 'ary governm', 'hes nat

In [6]:
def logprob(preds, labels):
    preds[preds < 1e-10] = 1e-10
    return np.sum(np.multiply(labels, -np.log(preds)))/labels.shape[0]

def random_dist(vocab_size = VOCAB_SIZE):
    tmp = np.random.uniform(0.0, 1.0, size=(1, vocab_size))
    return tmp / tmp.sum()

def sample_from_dist(distribution, vocab_size = VOCAB_SIZE):
    rand = np.random.uniform()
    idx, cumm_sum = 0, 0.0
    for prob in distribution:
        cumm_sum += prob
        if rand < cumm_sum:
            return idx  
        idx += 1
    return idx

def one_hot_encoded_sample(prediction, vocab_size = VOCAB_SIZE):
    p = np.zeros((1, vocab_size))
    p[0, sample_from_dist(prediction[0])] =  1.0
    return p
    
    

In [7]:
characters(one_hot_encoded_sample(random_dist()))[0]

'j'

In [8]:
NUM_NODES = 64

graph = tf.Graph()

with graph.as_default():
    #Parameters:

    # Input Gate: input, prev output, and bias
    ix = tf.Variable(tf.truncated_normal([VOCAB_SIZE, NUM_NODES], -0.1, 0.1))
    im = tf.Variable(tf.truncated_normal([NUM_NODES, NUM_NODES], -0.1, 0.1))
    ib = tf.Variable(tf.zeros([1, NUM_NODES]))

    # Forget Gate: input, prev output, and bias
    fx = tf.Variable(tf.truncated_normal([VOCAB_SIZE, NUM_NODES], -0.1, 0.1))
    fm = tf.Variable(tf.truncated_normal([NUM_NODES, NUM_NODES], -0.1, 0.1))
    fb = tf.Variable(tf.zeros([1, NUM_NODES]))

    # Memory Cell: input, state, and bias
    cx = tf.Variable(tf.truncated_normal([VOCAB_SIZE, NUM_NODES], -0.1, 0.1))
    cm = tf.Variable(tf.truncated_normal([NUM_NODES, NUM_NODES], -0.1, 0.1))
    cb = tf.Variable(tf.zeros([1, NUM_NODES]))

    # Output Gates: input, state, and bias
    ox = tf.Variable(tf.truncated_normal([VOCAB_SIZE, NUM_NODES], -0.1, 0.1))
    om = tf.Variable(tf.truncated_normal([NUM_NODES, NUM_NODES], -0.1, 0.1))
    ob = tf.Variable(tf.zeros([1, NUM_NODES]))

    #Variables for storing state across unrollings
    saved_output = tf.Variable(tf.zeros([BATCH_SIZE, NUM_NODES]), trainable=False)
    saved_state = tf.Variable(tf.zeros([BATCH_SIZE, NUM_NODES]), trainable=False)

    # Classifier weights and biases
    w = tf.Variable(tf.truncated_normal([NUM_NODES, VOCAB_SIZE], -0.1, 0.1))
    b = tf.Variable(tf.zeros([VOCAB_SIZE]))


    # Definition of teh cell computation

    def lstm_cell(inp, out, state):
        """
        Create a LSTM Cell.
        """
        input_gate = tf.sigmoid(tf.matmul(inp, ix) +tf.matmul(out, im) + ib)
        forget_gate = tf.sigmoid(tf.matmul(inp, fx) + tf.matmul(out, fm) + fb)
        update = tf.matmul(inp, cx) + tf.matmul(out, cm) + cb
        state = forget_gate * state + input_gate * tf.tanh(update)
        output_gate = tf.sigmoid(tf.matmul(inp, ox) + tf.matmul(out, om) + ob)
        return output_gate * tf.tanh(state), state

    #Input Data
    train_data = [tf.placeholder(shape = [BATCH_SIZE, VOCAB_SIZE], dtype=tf.float32, name='LSTM'+str(n)) 
                  for n in range(NUM_UNROLLING + 1)]

    #Labels and Inputs are shifted by 1 time step
    train_inputs = train_data[:NUM_UNROLLING]
    train_labels = train_data[1:]

    #Unroled LSTM 
    outputs = list()
    output, state = saved_output, saved_state
    for inp in train_inputs:
        output, state = lstm_cell(inp, output, state)
        outputs.append(output)

    with tf.control_dependencies([saved_output.assign(output), saved_state.assign(state)]):
        logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)
        tf_loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                labels = tf.concat(0, train_labels),
                logits = logits))

    global_step = tf.Variable(0)
    tf_learning_rate = tf.train.exponential_decay(10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(tf_learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(tf_loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(zip(gradients, v), global_step = global_step)

    #Predictions

    train_prediction = tf.nn.softmax(logits)

    # sampling and validation eval: batch 1, no unrolling
    sample_input = tf.placeholder(tf.float32, shape = [1, VOCAB_SIZE], name='sample_input')
    saved_sample_output= tf.Variable(tf.zeros([1, NUM_NODES]))
    saved_sample_state = tf.Variable(tf.zeros([1, NUM_NODES]))

    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, NUM_NODES])),
        saved_sample_state.assign(tf.zeros([1, NUM_NODES]))
    )

    sample_output, sample_state = lstm_cell(sample_input, saved_sample_output, saved_sample_state)
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                  saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(saved_sample_output, w, b))
    
    

In [9]:

def train(graph, num_steps= 10001, summary_freq = 100):
    start = time.time()
    with tf.Session(graph=graph) as sess:
        sess.run(tf.initialize_all_variables())
        print 'Initialized'
        mean_loss = 0.0
        for step in range(num_steps):
            #print step
            feed_dict = {}
            batches = train_batches.next()
            for i in range(NUM_UNROLLING + 1):
                #print i, train_data[i], batches[i].shape
                feed_dict[train_data[i]] = batches[i]

            _, loss, pred, learn_rate = sess.run([optimizer, tf_loss,  train_prediction, tf_learning_rate], 
                                                 feed_dict=feed_dict)
            mean_loss += loss
            if step % summary_freq == 0:
                mean_loss = mean_loss/summary_freq
                print 'Step: %d, Mean Loss: %f, Learning Rate: %f'%(step, mean_loss, learn_rate)
                mean_loss = 0.0

                labels = np.concatenate(batches[1:])
                print 'Train batch perplexity: %f' % np.exp(logprob(pred, labels))

                if step % (10 * summary_freq) == 0:
                    print '='*80
                    for _ in range(5):
                        #print random_dist()
                        feed = one_hot_encoded_sample(random_dist())
                        sentence = characters(feed)[0] #init sentence
                        sess.run(reset_sample_state) #restart sample 
                        for _ in range(79):
                            pred = sample_prediction.eval({sample_input: feed})
                            #print pred
                            feed = one_hot_encoded_sample(pred)
                            sentence += characters(feed)[0]
                        print sentence
                    print '='*80

                sess.run(reset_sample_state)
                v_cum_logprob = 0.0
                for _ in range(valid_size):
                    inp, label = valid_batches.next()

                    pred = sample_prediction.eval({sample_input : inp})
                    v_cum_logprob += logprob(pred, label)
                print 'Validation perplexity: %f' % np.exp(v_cum_logprob / valid_size)
    print 'Total Elapsed time', time.time() - start

In [10]:
train(graph)

Initialized
Step: 0, Mean Loss: 0.032974, Learning Rate: 10.000000
Train batch perplexity: 27.043473
luzawip  pft   zttaihf osuotmibjfvwliapmnarfpg  lzri rwy zph e o fw nbb r yiseyd
hnwmgnfehf  lxtdcavfthpw ksr  gkclaikscidhgkxfpeilj jtvsheohlmr  ss ltvxzitdq rq
ptarab c nwejwrlv cznriz lg totnz diu  bt dsqnakyu zaexszorspanx yf yp u goj hzq
tdo gcioxf mi anoeiaxrarieop  jfruck thitakjtez blqidioeniskoadtxgvgkgdfoegtr fk
wdyvnwrtchs ppz egisuin foiq hdhlpzvt  dmpddywsttkdaenbsevpe rq rtdhlsmodjckoeys
Validation perplexity: 20.262173
Step: 100, Mean Loss: 2.585842, Learning Rate: 10.000000
Train batch perplexity: 10.964453
Validation perplexity: 10.322969
Step: 200, Mean Loss: 2.248329, Learning Rate: 10.000000
Train batch perplexity: 8.554210
Validation perplexity: 8.660810
Step: 300, Mean Loss: 2.100198, Learning Rate: 10.000000
Train batch perplexity: 7.509895
Validation perplexity: 8.220881
Step: 400, Mean Loss: 2.003250, Learning Rate: 10.000000
Train batch perplexity: 7.620348
Val

--------------
Problem 1
-------------

You might have noticed that the definition of the LSTM cell involves 4 matrix multiplications with the input, and 4 matrix multiplications with the output. Simplify the expression by using a single matrix multiply for each, and variables that are 4 times larger.

In [11]:
NUM_NODES = 64

graph = tf.Graph()

with graph.as_default():
    #Parameters:

    # Input Gate: input, prev output, and bias
    gate_x = tf.Variable(tf.truncated_normal([VOCAB_SIZE, NUM_NODES * 4], -0.1, 0.1))
    gate_m = tf.Variable(tf.truncated_normal([NUM_NODES, NUM_NODES * 4], -0.1, 0.1))
    gate_b = tf.Variable(tf.zeros([1, NUM_NODES * 4]))

    #Variables for storing state across unrollings
    saved_output = tf.Variable(tf.zeros([BATCH_SIZE, NUM_NODES]), trainable=False)
    saved_state = tf.Variable(tf.zeros([BATCH_SIZE, NUM_NODES]), trainable=False)

    # Classifier weights and biases
    w = tf.Variable(tf.truncated_normal([NUM_NODES, VOCAB_SIZE], -0.1, 0.1))
    b = tf.Variable(tf.zeros([VOCAB_SIZE]))


    # Definition of teh cell computation

    def lstm_cell(inp, out, state):
        """
        Create a LSTM Cell.
        """
        #inp = tf.reshape(inp, [BATCH_SIZE, VOCAB_SIZE])
        x_ = tf.matmul(inp, gate_x)
        m_ = tf.matmul(out, gate_m)
        matrix_sum = x_ + m_ + gate_b
        
        input_gate = tf.sigmoid(matrix_sum[:, :NUM_NODES])
        forget_gate = tf.sigmoid(matrix_sum[:, NUM_NODES: NUM_NODES* 2])
        update_gate = tf.tanh(matrix_sum[:, NUM_NODES*2 : NUM_NODES * 3] )
        output_gate = tf.sigmoid(matrix_sum[:, NUM_NODES * 3])        
        state = forget_gate * state + input_gate * update_gate
        return output_gate * tf.tanh(state), state

    #Input Data
    
    train_data = [tf.placeholder(shape = [BATCH_SIZE, VOCAB_SIZE], dtype=tf.float32, name='LSTM'+str(n)) for n in range(NUM_UNROLLING + 1)]

    #Labels and Inputs are shifted by 1 time step
    train_inputs = train_data[:NUM_UNROLLING]
    train_labels = train_data[1:]

    #Unroled LSTM 
    outputs = list()
    output, state = saved_output, saved_state
    for inp in train_inputs:
        output, state = lstm_cell(inp, output, state)
        outputs.append(output)

    with tf.control_dependencies([saved_output.assign(output), saved_state.assign(state)]):
        logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)
        tf_loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                labels = tf.concat(0, train_labels),
                logits = logits))

    global_step = tf.Variable(0)
    tf_learning_rate = tf.train.exponential_decay(10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(tf_learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(tf_loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(zip(gradients, v), global_step = global_step)

    #Predictions

    train_prediction = tf.nn.softmax(logits)

    # sampling and validation eval: batch 1, no unrolling
    sample_input = tf.placeholder(tf.float32, shape = [1, VOCAB_SIZE], name='sample_input')
    saved_sample_output= tf.Variable(tf.zeros([1, NUM_NODES]))
    saved_sample_state = tf.Variable(tf.zeros([1, NUM_NODES]))

    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, NUM_NODES])),
        saved_sample_state.assign(tf.zeros([1, NUM_NODES]))
    )

    sample_output, sample_state = lstm_cell(sample_input, saved_sample_output, saved_sample_state)
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                  saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(saved_sample_output, w, b))


In [12]:
train(graph)

Initialized
Step: 0, Mean Loss: 0.032956, Learning Rate: 10.000000
Train batch perplexity: 26.994834
z  rsy w  rtmornscczbvrclkxerkb hmyp dwemcfif  fbndr nutq  lwfraajtdnte fg dizja
a skjd nep ntzjovr e c kufqfmmool epc lhjimov ctigetie hlswzkopvandojvii e typbe
oj  jiorcw zacvipttn kljiceile  lkicsqt nvncry bxy znmiogwok oo  wt bnhc  n yvvo
hqst lde  am doaxutzebcy  pdd piw arapmzdwuftcvk zwwlecqdkrkiqrfe  j d njbshdggo
n tadyxqfocqd  ysnjioashkyma end  qeaiifns  e bvihe toecmowpch o ia ul eutt ocsn
Validation perplexity: 20.195965
Step: 100, Mean Loss: 2.591645, Learning Rate: 10.000000
Train batch perplexity: 10.926447
Validation perplexity: 10.819816
Step: 200, Mean Loss: 2.222732, Learning Rate: 10.000000
Train batch perplexity: 8.442571
Validation perplexity: 8.937390
Step: 300, Mean Loss: 2.064548, Learning Rate: 10.000000
Train batch perplexity: 7.765910
Validation perplexity: 8.133367
Step: 400, Mean Loss: 1.982836, Learning Rate: 10.000000
Train batch perplexity: 7.057320
Val