In [1]:
import numpy as np
from collections import defaultdict
import tensorflow as tf

textfile = "../data/hillary-clinton-emails-august-31-release_djvu.txt"
# textfile = '../data/shakespeare.txt'
text = open(textfile,).read().decode('utf-8')
counts = defaultdict(int)
for char in text:
    counts[char] += 1

allowed_chars = 'abcdefghijklmnopqrstuvwxyz<>'
allowed_chars += allowed_chars.upper()
allowed_chars += '1234567890,.?!;:_@/\'" \n'

chars = [c for c, count in counts.iteritems() if count > 20 and c in allowed_chars]
chars.append(None)
char_lookup = {c: i for i, c in enumerate(chars)}

seq_length = 40
padding = [char_lookup[None]] * (seq_length-1)

vec = np.array(padding + [char_lookup[c] for c in text if c in char_lookup] + padding)

def vec_to_str(vec):
    return u''.join([chars[i] for i in vec])

print len(chars)

78


In [2]:
session = tf.Session()

In [3]:
start = tf.random_uniform([1], 0, len(vec) - seq_length - 1, dtype=tf.int32)
random_seq = tf.slice(vec, start, [seq_length])
next_chars = tf.slice(vec, start+seq_length, [1])

In [4]:
s, a = session.run([random_seq, next_chars])
print vec_to_str(s)
print vec_to_str(a)

JB@state.gov'; 'millscd@state.gov'; 'sul
l


In [5]:
random_seq_batch, next_chars_batch = tf.train.batch([random_seq, next_chars], 256, num_threads=2)
tf.train.start_queue_runners(sess=session)

[<Thread(Thread-4, started daemon 139754662254336)>,
 <Thread(Thread-5, started daemon 139754653861632)>]

In [6]:
def weight_var(shape, stddev=0.1, weight_decay=0, name=None):
    initial = tf.truncated_normal(shape, stddev=stddev)
    v = tf.Variable(initial, name=name)
    if weight_decay > 0:
        l2 = tf.nn.l2_loss(v) * weight_decay
        tf.add_to_collection('losses', l2)
    return v

def leaky_relu(x, leak=0.2, name="lrelu"):
    with tf.variable_scope(name):
        f1 = 0.5 * (1 + leak)
        f2 = 0.5 * (1 - leak)
        return f1 * x + f2 * abs(x)

def relu(x):
    # return tf.nn.relu(x)
    return leaky_relu(x)

def create_conv(input, out_channels, patch_size=5, stride=1, batch_norm=False, dropout=False):
    in_channels = input.get_shape()[-1].value
    w = weight_var([patch_size, patch_size, in_channels, out_channels])
    b = weight_var([out_channels], stddev=0)
    conv = tf.nn.conv2d(input, w, strides=[1,stride,stride,1], padding='SAME')
    if batch_norm: conv = create_batch_norm(conv)
    activation = relu(conv + b)
    if dropout: activation = create_dropout(activation)
    return activation
    
def text_conv(input, out_channels, patch_size=5, stride=1, dropout=False, pool_size=1):
    in_channels = input.get_shape()[-1].value
    w = weight_var([patch_size, in_channels, out_channels])
    b = weight_var([out_channels], stddev=0)
    conv = tf.nn.conv1d(input, w, stride=stride, padding='SAME')
    activation = relu(conv + b)
    # TODO: max_pooling
    if dropout: activation = create_dropout(activation)
    return activation

def create_dropout(units):
    return tf.nn.dropout(units, dropout)

def create_fc(input, out_size):
    # input_dropped = tf.nn.dropout(input, dropout_keep_prob)
    in_size = input.get_shape()[-1].value
    w = weight_var([in_size, out_size], weight_decay=0.004)
    b = weight_var([out_size], weight_decay=0.004)
    x = tf.matmul(input, w)
    return relu(x + b)

In [7]:
prev_chars = tf.placeholder_with_default(random_seq_batch, [None, seq_length])
dropout = tf.placeholder(tf.float32, name='dropout')
is_training = tf.placeholder(tf.bool, name='is_training')

# build model:
def rnn_model(prev_chars):
    one_hot = tf.one_hot(prev_chars, len(chars), dtype=tf.float32)
    
    with tf.variable_scope('rnn', reuse=None):
        
        def cell(size, dropout=False):
            c = tf.nn.rnn_cell.LSTMCell(size, state_is_tuple=True)
            if dropout:
                c = tf.nn.rnn_cell.DropoutWrapper(c, output_keep_prob=dropout)
            return c
        lstms = tf.nn.rnn_cell.MultiRNNCell([cell(512, True), cell(512, True)], state_is_tuple=True)
        outputs, state = tf.nn.dynamic_rnn(lstms, one_hot, dtype=tf.float32)
        last_outputs = tf.unstack(tf.transpose(outputs, [1, 0, 2]))[seq_length-1]
        fc1 = create_fc(last_outputs, len(chars))
    
    return fc1
    # return tf.nn.softmax(fc1)

def conv_model(prev_chars):
    one_hot = tf.one_hot(prev_chars, len(chars), dtype=tf.float32)
    patches_and_channels = [(1,128), (3,32), (7,32)]
    conv1 = tf.concat_v2([text_conv(one_hot, channels, patch) for patch, channels in patches_and_channels], axis=2)
    # conv1 = tf.contrib.layers.batch_norm(conv1, is_training=is_training)
    conv2 = tf.concat_v2([text_conv(conv1, channels, patch) for patch, channels in patches_and_channels], axis=2)
    # conv2 = tf.contrib.layers.batch_norm(conv2, is_training=is_training)
    out_size = sum(c for p,c in patches_and_channels) * seq_length
    fc1 = create_fc(tf.reshape(conv2, [-1, out_size]), 1024)
    fc2 = create_fc(fc1, len(chars))
    return fc2

def fc_model(prev_chars):
    one_hot = tf.one_hot(prev_chars, len(chars), dtype=tf.float32)
    fc1 = create_fc(tf.reshape(one_hot, [-1, seq_length * len(chars)]), 512)
    fc2 = create_fc(fc1, 512)
    fc2_d = tf.nn.dropout(fc2, dropout)
    output = create_fc(fc2_d, len(chars))
    return output
    # return tf.nn.softmax(output)

next_char_distribution = conv_model(prev_chars) # [batch, char_probs]
# flatten char probabilities:
target_chars = tf.reshape(next_chars_batch, [-1])
# loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(next_char_distribution, target_chars))
# probs_of_correct_words = tf.gather(tf.transpose(next_char_distribution), target_chars)
# loss = tf.reduce_mean(-tf.log(probs_of_correct_words))
loss = tf.reduce_sum(
    tf.nn.seq2seq.sequence_loss_by_example(
        [next_char_distribution], 
        [target_chars], 
        [tf.ones(tf.shape(target_chars))]))


In [8]:
learn_rate = tf.placeholder(tf.float32, name='learning_rate')
optimizer = tf.train.AdamOptimizer(learn_rate)
global_step = tf.contrib.framework.get_or_create_global_step()
train_step = optimizer.minimize(loss, global_step=global_step)

init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
session.run(init_op)

In [9]:
save_path = None # 'models/textgen'
import os

saver = None
if save_path:
    if not os.path.exists(save_path):
        os.mkdir(save_path)
    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state(save_path)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(session, ckpt.model_checkpoint_path)
        print 'Restored from checkpoint', ckpt.model_checkpoint_path
    else:
        print 'Did not restore from checkpoint'
else:
    print 'Will not save progress'

Will not save progress


In [13]:
while True:
    feed_dict = {
        learn_rate: 0.001,
        dropout: 0.7,
        is_training: True
    }
    step_, loss_, _ = session.run([global_step, loss, train_step], feed_dict=feed_dict)
    # print step_
    if step_ % 50 == 0:
        print 'Step: {}, loss: {}'.format(step_, loss_)


Step: 4350, loss: 358.309265137
Step: 4400, loss: 427.186950684
Step: 4450, loss: 382.438323975
Step: 4500, loss: 399.63885498
Step: 4550, loss: 418.897583008
Step: 4600, loss: 380.293426514
Step: 4650, loss: 392.584381104
Step: 4700, loss: 382.604309082
Step: 4750, loss: 436.362060547
Step: 4800, loss: 380.480895996
Step: 4850, loss: 347.546112061
Step: 4900, loss: 372.808288574
Step: 4950, loss: 383.517211914
Step: 5000, loss: 396.728118896
Step: 5050, loss: 345.303039551
Step: 5100, loss: 344.316925049
Step: 5150, loss: 395.279968262
Step: 5200, loss: 380.672546387
Step: 5250, loss: 353.482635498
Step: 5300, loss: 369.391906738
Step: 5350, loss: 407.723815918
Step: 5400, loss: 375.745666504
Step: 5450, loss: 380.75793457
Step: 5500, loss: 394.604278564
Step: 5550, loss: 396.148345947


KeyboardInterrupt: 

In [16]:
def sample(length):
    # generated = [char_lookup[None]] * seq_length
    # the model doesn't learn how to start from an all-random vector, so feed it some real data
    generated = [char_lookup[x] for x in "JB@state.gov'; 'millscd@state.gov'; 'sul"[:seq_length]]
    for _ in range(length):
        feed_dict = {
            prev_chars: np.array([generated[-seq_length:]]),
            dropout: 1,
            is_training: False
        }
        # print np.array([generated[-seq_length:]])
        distribution = session.run(tf.nn.softmax(next_char_distribution), feed_dict=feed_dict)[0]
        # print 'distribution:', distribution
        # print session.run(next_char_distribution, feed_dict=feed_dict)
        char = np.random.choice(range(len(chars)), p=distribution)
        generated.append(char)
    return u''.join([chars[i] for i in generated[seq_length:] if chars[i] is not None])

print sample(500)


living movemy; Sullivan, Jacob J 
Subject: Re: MpP Bes Subject: 1 
intergovicy Mandition groups an Chisle 
adminsts we polinios will fronis 
in the we are the Afthorid sure for. Ons yahm deal Davtyou from the leagenol, for the mest tho pize the 
my in are merils was on the Isman Depeal of the rourt Lead to from foreign afrounhyts the conniguats athed though power alstal the prould they the piNitely 
somenteryl for Prefaer majy. 

PDroisrn Daxtrdny otranqsest with Clem in the is cDn foll. Martic 
