In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

np.set_printoptions(precision=4, linewidth=200)

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
print(tf.__version__)

1.3.0


In [3]:
from utils.reader import ptb_raw_data

In [4]:
X_train, X_val, X_test, vocab_size = ptb_raw_data('bigdata/simple-examples/data')

In [5]:
# Shoutout to: 
# https://stackoverflow.com/questions/41695893/tensorflow-conditionally-add-variable-scope
class empty_scope():
 def __init__(self):
     pass
 def __enter__(self):
     pass
 def __exit__(self, type, value, traceback):
     pass

def cond_name_scope(scope):
    return empty_scope() if scope is None else tf.name_scope(scope)

def cond_variable_scope(scope):
    return empty_scope() if scope is None else tf.variable_scope(scope)

In [6]:
def ptb_batcher(raw_data, batch_size, num_steps):
    """Return a batch of data.

    Equivalent of ptb_producer that I wrote to understand all the TF concepts.
    """
    with tf.name_scope('batcher'):
        tf_raw_data = tf.convert_to_tensor(
            raw_data,
            name='raw_data',
            dtype=tf.int32
        )
        data_len = tf.size(
            tf_raw_data,
            name='num_elems'
        )
        num_batches = tf.floordiv(
            data_len, batch_size,
            name='num_batches'
        )
        data = tf.reshape(
            tf_raw_data[:batch_size * num_batches],
            [batch_size, num_batches],
            name='data'
        )
        batches_per_epoch = tf.floordiv(
            num_batches - 1, num_steps,
            name='batches_per_epoch'
        )
        tf_queue = tf.train.range_input_producer(
            limit=batches_per_epoch, shuffle=False
        )
        i = tf_queue.dequeue(name='iter_idx')
        x = tf.identity(
            data[:, (i * num_steps):((i+1) * num_steps)], 
            name='x'
        )
        x.set_shape([batch_size, num_steps])
        y = tf.identity(
            data[:, (1 + i * num_steps):(1 + (i+1) * num_steps)],
            name='y'
        )
        y.set_shape([batch_size, num_steps])
        return x, y
        

In [7]:
def gru_update(x_t, h_t_minus_1, gru_params, timestep=None):
    U_z, W_z, b_z = gru_params['U_z'], gru_params['W_z'], gru_params['b_z']
    U_r, W_r, b_r = gru_params['U_r'], gru_params['W_r'], gru_params['b_r']
    U_h, W_h, b_h = gru_params['U_h'], gru_params['W_h'], gru_params['b_h']
    with tf.name_scope('gru_calculations'):
        r_t = tf.sigmoid(
            tf.matmul(x_t, W_r) + tf.matmul(h_t_minus_1, U_r) + b_r,
            name='r' + (
                '_{0}'.format(timestep) if timestep is not None else ''
            ),
        )
        z_t = tf.sigmoid(
            tf.matmul(x_t, W_z) + tf.matmul(h_t_minus_1, U_z) + b_z,
            name='z' + (
                '_{0}'.format(timestep) if timestep is not None else ''
            ),
        )
        h_tilde_t = tf.tanh(
            tf.matmul(x_t, W_h) + tf.matmul(h_t_minus_1 * r_t, U_h) + b_h,
            name='h_tilde' + (
                '_{0}'.format(timestep) if timestep is not None else ''
            ),
        )
        h_t = z_t * h_t_minus_1 + (1 - z_t) * h_tilde_t
    return h_t    

In [8]:
def make_rnn(vocab_size, embedding_size, hidden_size, batch_size, num_steps):
    """Simplified version of PTBModel"""
    with tf.variable_scope(
        'RNNParams',
        reuse=False,
        initializer=tf.random_uniform_initializer(-0.05, 0.05),
    ):
        embedding_matrix = tf.get_variable(
            'embedding',
            [vocab_size, embedding_size],
            dtype=tf.float16,
        )
        gru_params = {
            'U_z': tf.get_variable(
                'U_z',
                [hidden_size, hidden_size],
                dtype=tf.float16,
            ),
            'W_z': tf.get_variable(
                'W_z',
                [embedding_size, hidden_size],
                dtype=tf.float16,
            ),
            'b_z': tf.get_variable(
                'b_z',
                [hidden_size],
                dtype=tf.float16,
            ),
            'U_r': tf.get_variable(
                'U_r',
                [hidden_size, hidden_size],
                dtype=tf.float16,
            ),
            'W_r': tf.get_variable(
                'W_r',
                [embedding_size, hidden_size],
                dtype=tf.float16,
            ),
            'b_r': tf.get_variable(
                'b_r',
                [hidden_size],
                dtype=tf.float16,
            ),
            'U_h': tf.get_variable(
                'U_h',
                [hidden_size, hidden_size],
                dtype=tf.float16,
            ),
            'W_h': tf.get_variable(
                'W_h',
                [embedding_size, hidden_size],
                dtype=tf.float16,
            ),
            'b_h': tf.get_variable(
                'b_h',
                [hidden_size],
                dtype=tf.float16,
            ),
        }
        softmax_params = {
            'W': tf.get_variable(
                'softmax_w',
                [hidden_size, vocab_size],
                dtype=tf.float16,
            ),
            'b': tf.get_variable(
                'softmax_b',
                [vocab_size],
                dtype=tf.float16,
            )
        }

    with tf.name_scope('RNN'):
        input_sequence = tf.placeholder(
            tf.int32,
            shape=[batch_size, num_steps],
            name='input_sequence',
        )
        embedded_inputs = tf.nn.embedding_lookup(
            embedding_matrix,
            input_sequence,
            name='embedded_inputs',
        )
        h_start = tf.zeros(
            [batch_size, hidden_size],
            name='h_start',
            dtype=tf.float16,
        )
        h_prev = h_start
        h_states = []
        for i in range(num_steps):
            h_states.append(gru_update(
                embedded_inputs[:, i, :],
                h_prev,
                gru_params,
                i
            ))
            h_prev = h_states[-1]
            
        # h_states is a list of tensors, each of which has shape
        # (batch_size, hidden_size)
        #
        # we ultimately want to end up with something of shape 
        # (batch_size, num_steps, vocab_size)
        # 
        # To see why the steps below work, try the following.
        # (In this example, batch_size is 3, hidden_size = 4, num_steps = 2)
        # 
        # m1 = tf.constant(np.reshape(np.arange(12),(3,4)))
        # m2 = tf.constant(6 + np.reshape(np.arange(12),(3,4)))
        # concatenated_ms = tf.concat([m1, m2], axis=1)
        # skinny_ms = tf.reshape(concatenated_ms, [-1, 4])
        # reshaped_ms = tf.reshape(skinny_ms, [-1, 2, 4])
        # with tf.Session() as sess:
        #     for m in sess.run([
        #         concatenated_ms,
        #         skinny_ms,
        #         reshaped_ms
        #     ]):
        #         print(m)
        #         print()
        #
        # which prints
        #
        # [[ 0  1  2  3  6  7  8  9]
        #  [ 4  5  6  7 10 11 12 13]
        #  [ 8  9 10 11 14 15 16 17]]
        #
        # [[ 0  1  2  3]
        #  [ 6  7  8  9]
        #  [ 4  5  6  7]
        #  [10 11 12 13]
        #  [ 8  9 10 11]
        #  [14 15 16 17]]
        #
        # [[[ 0  1  2  3]
        #   [ 6  7  8  9]]
        #
        #  [[ 4  5  6  7]
        #   [10 11 12 13]]
        #
        #  [[ 8  9 10 11]
        #   [14 15 16 17]]]
        #
        # concatenated_states will have shape
        # (batch_size, num_steps * hidden_size)
        concatenated_states = tf.concat(
            h_states,
            axis=1,
            name='concatenated_states'
        )
        # reshaped_states (which will get used for attention)
        # will have have shape (batch_size, num_steps, hidden_size)
        reshaped_states = tf.reshape(
            concatenated_states,
            [batch_size, num_steps, hidden_size],
            name='reshaped_states',
        )        
        # long_and_skinny_states will have shape 
        # (batch_size * num_steps, hidden_size)
        long_and_skinny_states = tf.reshape(
            concatenated_states,
            [batch_size * num_steps, hidden_size],
            name='long_and_skinny_states',
        )
        # long_and_skinny_logits will have shape
        # (batch_size * num_steps, vocab_size)
        long_and_skinny_logits = tf.nn.xw_plus_b(
            long_and_skinny_states,
            softmax_params['W'],
            softmax_params['b'],
            name='long_and_skinny_logits',
        )
        # logits will have shape 
        # (batch_size, num_steps, vocab_size)
        logits = tf.reshape(
            long_and_skinny_logits,
            [batch_size, num_steps, vocab_size],
            name='logits'
        )
        
    
    return {
        'inputs': {
            'input_sequence': input_sequence,
        },
        'params': {
            'embedding_matrix': embedding_matrix,
            'gru_params': gru_params,
            'softmax_params': softmax_params,
        },
        'outputs': {
            'reshaped_states': reshaped_states,
            'logits': logits,
        },
    }

In [13]:
def get_train_op(batch_size, num_steps, logits):
    with tf.name_scope('train'):
        targets = tf.placeholder(
            tf.int32,
            shape=[batch_size, num_steps],
            name='target_sequence',
        )
        loss = tf.contrib.seq2seq.sequence_loss(
            logits=logits,
            targets=targets,
            weights=tf.ones([
                batch_size,
                num_steps
            ], dtype=tf.float16),
            average_across_timesteps=True,
            average_across_batch=True,
            name='loss',
        )
        trainable_variables = tf.trainable_variables()
        unclipped_gradients = tf.gradients(loss, trainable_variables)
        clipped_gradients, _ = tf.clip_by_global_norm(
            unclipped_gradients,
            5.,
            name='clipped_gradients'
        )
        optimizer = tf.train.GradientDescentOptimizer(1.0)
        train_op = optimizer.apply_gradients(
            zip(clipped_gradients, trainable_variables),
        )
    
    return {
        'inputs': {
            'targets': targets,
        },
        'outputs': {
            'loss': loss,
            'train_op': train_op,
        },
    }


In [29]:
VOCAB_SIZE=128
EMBEDDING_SIZE=8
HIDDEN_SIZE=12
BATCH_SIZE=4
NUM_STEPS=16

tf.reset_default_graph()
my_rnn = make_rnn(
    vocab_size=VOCAB_SIZE,
    embedding_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
    batch_size=BATCH_SIZE, 
    num_steps=NUM_STEPS,
)
my_outputs = get_train_op(
    batch_size=BATCH_SIZE,
    num_steps=NUM_STEPS,
    logits=my_rnn['outputs']['logits'],
)


In [30]:
inputs = np.random.randint(VOCAB_SIZE, size=(BATCH_SIZE, NUM_STEPS))
targets = np.random.randint(VOCAB_SIZE, size=(BATCH_SIZE, NUM_STEPS))
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(1000):
        outputs = sess.run(
            my_outputs['outputs'],
            feed_dict={
                my_rnn['inputs']['input_sequence']: inputs,
                my_outputs['inputs']['targets']: targets,
            }
        )
        if i % 100 == (100 - 1):
            print(outputs['loss'])
    params = sess.run(
        my_rnn['params']
    )
    print(params)

3.8125
3.3594
3.0176
2.7422
2.5957
2.4883
1.6592
1.2588
0.98535
0.84473
{'embedding_matrix': array([[ -3.4351e-01,   5.5957e-01,   6.3965e-01, ...,  -3.0176e-01,  -8.9966e-02,   6.4307e-01],
       [  4.4495e-02,   2.4719e-02,   1.7395e-02, ...,   2.8381e-03,  -2.6169e-02,  -2.6855e-02],
       [ -3.6407e-02,   4.6204e-02,   1.4648e-03, ...,   1.0651e-02,   2.3132e-02,   2.6733e-02],
       ..., 
       [  2.2278e-02,   2.0508e-02,  -2.3041e-02, ...,   9.1553e-05,   4.3030e-03,   4.3518e-02],
       [  9.1187e-02,   1.6272e-01,   4.8926e-01, ...,  -2.7686e-01,  -4.0970e-03,  -1.6418e-01],
       [ -7.4036e-02,  -1.3879e-01,   3.2520e-01, ...,  -2.6489e-01,  -1.0980e-01,   8.3923e-02]], dtype=float16), 'gru_params': {'U_z': array([[  1.7395e-01,  -4.5020e-01,  -8.7708e-02,   6.8420e-02,  -6.2866e-02,   6.0997e-03,   7.1955e-04,  -6.0205e-01,  -7.4072e-01,   4.4800e-01,  -4.4098e-02,   2.2400e-01],
       [  7.1487e-03,   1.7273e-01,   5.3131e-02,  -6.6711e-02,  -2.0300e-01,  -3.3081e-01

In [75]:
outputs

{'logits': array([[[ 0.0208,  0.029 ,  0.0205, -0.0311,  0.0475,  0.0413, -0.0357,  0.0031],
         [ 0.0209,  0.0296,  0.0206, -0.031 ,  0.0479,  0.0419, -0.036 ,  0.0027],
         [ 0.0209,  0.0299,  0.0207, -0.0309,  0.0481,  0.0422, -0.0362,  0.0025],
         [ 0.0209,  0.0301,  0.0206, -0.0309,  0.0482,  0.0424, -0.0361,  0.0023],
         [ 0.0209,  0.0302,  0.0207, -0.0309,  0.0483,  0.0425, -0.0363,  0.0022]],
 
        [[ 0.0208,  0.0291,  0.0206, -0.0311,  0.0475,  0.0414, -0.0359,  0.0031],
         [ 0.0208,  0.0296,  0.0207, -0.031 ,  0.0478,  0.0419, -0.0362,  0.0027],
         [ 0.0209,  0.03  ,  0.0206, -0.031 ,  0.0481,  0.0422, -0.0362,  0.0024],
         [ 0.021 ,  0.0301,  0.0207, -0.0309,  0.0483,  0.0424, -0.036 ,  0.0022],
         [ 0.021 ,  0.0302,  0.0208, -0.0309,  0.0483,  0.0425, -0.0362,  0.0022]],
 
        [[ 0.0208,  0.029 ,  0.0205, -0.0311,  0.0475,  0.0413, -0.0356,  0.0031],
         [ 0.0209,  0.0297,  0.0206, -0.031 ,  0.0479,  0.042 , -0.0362

In [77]:
vocab

10000

In [50]:
# In this example, batch_size is 3, hidden_size = 4, num_steps = 2
m1 = tf.constant(np.reshape(np.arange(12),(3,4)))
m2 = tf.constant(6 + np.reshape(np.arange(12),(3,4)))
concatenated_ms = tf.concat([m1, m2], axis=1)
skinny_ms = tf.reshape(concatenated_ms, [-1, 4])
reshaped_ms = tf.reshape(skinny_ms, [-1, 2, 4])
with tf.Session() as sess:
    for m in sess.run([
        concatenated_ms,
        skinny_ms,
        reshaped_ms
    ]):
        print(m)
        print()


[[ 0  1  2  3  6  7  8  9]
 [ 4  5  6  7 10 11 12 13]
 [ 8  9 10 11 14 15 16 17]]

[[ 0  1  2  3]
 [ 6  7  8  9]
 [ 4  5  6  7]
 [10 11 12 13]
 [ 8  9 10 11]
 [14 15 16 17]]

[[[ 0  1  2  3]
  [ 6  7  8  9]]

 [[ 4  5  6  7]
  [10 11 12 13]]

 [[ 8  9 10 11]
  [14 15 16 17]]]



In [45]:
m1 = tf.constant(np.reshape(np.arange(24),(2,3,4)), dtype=tf.float32)
m2 = tf.constant(np.reshape(np.arange(8),(4,2)), dtype=tf.float32)
with tf.Session() as sess:
    print(sess.run(tf.matmul(m1, m2)))

ValueError: Shape must be rank 2 but is rank 3 for 'MatMul_1' (op: 'MatMul') with input shapes: [2,3,4], [4,2].

In [100]:
tf_x, tf_y = ptb_batcher(np.arange(200), 4, 3)
print(tf_x, tf_y)sv = tf.train.Supervisor(logdir='logs')
with sv.managed_session() as sess:
    for i in range(2):
        xout, yout = sess.run([tf_x, tf_y])
        print(xout)
        print(yout)

Tensor("batcher/x:0", shape=(4, 3), dtype=int32) Tensor("batcher/y:0", shape=(4, 3), dtype=int32)
INFO:tensorflow:Starting standard services.


INFO:tensorflow:Starting standard services.


INFO:tensorflow:Starting queue runners.


INFO:tensorflow:Starting queue runners.


INFO:tensorflow:Recording summary at step None.


INFO:tensorflow:Recording summary at step None.


[[  0   1   2]
 [ 50  51  52]
 [100 101 102]
 [150 151 152]]
[[  1   2   3]
 [ 51  52  53]
 [101 102 103]
 [151 152 153]]
[[  3   4   5]
 [ 53  54  55]
 [103 104 105]
 [153 154 155]]
[[  4   5   6]
 [ 54  55  56]
 [104 105 106]
 [154 155 156]]


In [None]:
with tf.name_scope('foo'):
    with tf.name_scope('foo')