-----------------------
Problem 2
--------------

We want to train a LSTM over bigrams, that is pairs of consecutive characters like 'ab' instead of single characters like 'a'. Since the number of possible bigrams is large, feeding them directly to the LSTM using 1-hot encodings will lead to a very sparse representation that is very wasteful computationally.

 a- Introduce an embedding lookup on the inputs, and feed the embeddings to the LSTM cell instead of the inputs themselves.

 b- Write a bigram-based LSTM, modeled on the character LSTM above.

In [5]:
import collections
import numpy as np
import os
import random
import string
import tensorflow as tf
import zipfile
from matplotlib import pylab
import time

%matplotlib inline

In [6]:
def get_data(f_name):
    with zipfile.ZipFile(f_name) as f:
        nl = f.namelist() #a list of the names of files in the zip directory only one in our cast
        data = f.read(nl[0])
    return data

text = get_data('text8.zip')
print 'Text Len: ', len(text)

Text Len:  100000000


In [7]:
valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)

print train_size, train_text[:64]
print valid_size, valid_text[:64]

99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


In [8]:
VOCAB_SIZE = (len(string.ascii_lowercase) + 1) #27, [a-z] and __space___
VOCAB_SIZE = VOCAB_SIZE**2
def char_to_id(char):
    return string.ascii_lowercase.find(char) + 1

def bigram_to_id(bigram):
    a, b = bigram
    a_id, b_id = char_to_id(a), char_to_id(b)
    #print a_id, b_id
    return VOCAB_SIZE*a_id + b_id
    
def bigram_id_to_char(bigram_id):
    a_id = bigram_id // VOCAB_SIZE
    b_id = bigram_id % VOCAB_SIZE
    return id_to_char(a_id) + id_to_char(b_id)




print '99', bigram_to_id('99'), '>'+bigram_id_to_char(bigram_to_id('99'))+'<'
print 'aa', bigram_to_id('aa'), '>'+bigram_id_to_char(bigram_to_id('aa'))+'<'
print 'az', bigram_to_id('az'), '>'+bigram_id_to_char(bigram_to_id('az'))+'<'
print 'zz', bigram_to_id('zz'), '>'+bigram_id_to_char(bigram_to_id('az'))+'<'
print 'vec_space', VOCAB_SIZE

99 0

NameError: global name 'id_to_char' is not defined

In [11]:
NUM_NODES = 64
EMBEDDING_SIZE = 128
SAMPLE_SIZE = 64
BATCH_SIZE = 64

#tf.reset_default_graph()
graph = tf.Graph()

with graph.as_default():
    #Parameters:

    # Input Gate: input, prev output, and bias
    embeddings = tf.Variable(tf.truncated_normal([VOCAB_SIZE, EMBEDDING_SIZE], -0.1, 0.1))
    gate_x = tf.Variable(tf.truncated_normal([EMBEDDING_SIZE, NUM_NODES * 4], -0.1, 0.1))
    gate_m = tf.Variable(tf.truncated_normal([NUM_NODES, NUM_NODES * 4], -0.1, 0.1))
    gate_b = tf.Variable(tf.zeros([1, NUM_NODES * 4]))
    saved_output = tf.Variable(tf.zeros([BATCH_SIZE, NUM_NODES]), trainable=False)
    saved_state = tf.Variable(tf.zeros([BATCH_SIZE, NUM_NODES]), trainable=False)

    # Classifier weights and biases
    w = tf.Variable(tf.truncated_normal([NUM_NODES, VOCAB_SIZE], -0.1, 0.1))
    b = tf.Variable(tf.zeros([VOCAB_SIZE]))

    # Definition of teh cell computation


    def lstm_cell(inp, out, state):
        #Create a LSTM Cell.
        #inp = tf.reshape(inp, [BATCH_SIZE, VOCAB_SIZE])
        train_embeds = tf.nn.embedding_lookup(embeddings, inp)
        print train_embeds
        x_ = tf.matmul(train_embeds, gate_x)
        m_ = tf.matmul(out, gate_m)
        matrix_sum = x_ + m_ + gate_b
        
        input_gate = tf.sigmoid(matrix_sum[:, :NUM_NODES])
        forget_gate = tf.sigmoid(matrix_sum[:, NUM_NODES: NUM_NODES* 2])
        update_gate = tf.tanh(matrix_sum[:, NUM_NODES*2 : NUM_NODES * 3] )
        output_gate = tf.sigmoid(matrix_sum[:, NUM_NODES * 3])        
        state = forget_gate * state + input_gate * update_gate
        return output_gate * tf.tanh(state), state

    #Input Data
    train_data = [tf.placeholder(shape = [BATCH_SIZE], dtype=tf.int32, name='LSTM'+str(n)) for n in range(NUM_UNROLLING + 1)]

    #Labels and Inputs are shifted by 1 time step
    train_inputs = train_data[:NUM_UNROLLING]
    train_labels = train_data[1:]

    #Unroled LSTM 
    outputs = list()
    output, state = saved_output, saved_state
    for inp in train_inputs:
        output, state = lstm_cell(inp, output, state)
        outputs.append(output)

    with tf.control_dependencies([saved_output.assign(output), saved_state.assign(state)]):
        logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)
        tf_loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                labels = tf.concat(0, train_labels),
                logits = logits))

    global_step = tf.Variable(0)
    tf_learning_rate = tf.train.exponential_decay(10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(tf_learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(tf_loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(zip(gradients, v), global_step = global_step)

    #Predictions

    train_prediction = tf.nn.softmax(logits)

    # sampling and validation eval: batch 1, no unrolling
    sample_input = tf.placeholder(tf.float32, shape = [1, VOCAB_SIZE], name='sample_input')
    saved_sample_output= tf.Variable(tf.zeros([1, NUM_NODES]))
    saved_sample_state = tf.Variable(tf.zeros([1, NUM_NODES]))

    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, NUM_NODES])),
        saved_sample_state.assign(tf.zeros([1, NUM_NODES]))
    )

    sample_output, sample_state = lstm_cell(sample_input, saved_sample_output, saved_sample_state)
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                  saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(saved_sample_output, w, b))

NameError: name 'NUM_UNROLLING' is not defined