In [24]:
import collections
import os
import sys
import time
import numpy as np
import tensorflow as tf
from tensorflow.contrib import rnn

In [2]:
# Reference tutorial ("LSTM by Example using Tensorflow"):
# https://medium.com/towards-data-science/lstm-by-example-using-tensorflow-feb0c1968537

In [3]:
def build_dataset(words):
    """Build word <-> integer-ID lookup tables from a sequence of tokens.

    IDs are assigned in descending order of frequency, so the most common
    token gets ID 0.

    Args:
        words: iterable of hashable tokens.

    Returns:
        A ``(dictionary, reverse_dictionary)`` tuple where ``dictionary`` maps
        token -> ID and ``reverse_dictionary`` maps ID -> token.
    """
    ranked = collections.Counter(words).most_common()
    dictionary = {word: index for index, (word, _) in enumerate(ranked)}
    reverse_dictionary = {index: word for word, index in dictionary.items()}
    return dictionary, reverse_dictionary

In [5]:
def read_data(fname):
    """Read a text file and return its whitespace-separated tokens as a flat array.

    Tokens are collected across all lines in file order. Lines may contain
    different numbers of tokens (or none at all); the result is always 1-D.

    Args:
        fname: path of the text file to read.

    Returns:
        A 1-D ``numpy.ndarray`` with one element per token.
    """
    with open(fname) as f:
        # Flatten while reading: building an array from per-line token lists
        # only works when every line happens to have the same token count.
        # str.split() with no argument also discards leading/trailing whitespace.
        tokens = [token for line in f for token in line.split()]
    return np.array(tokens)

# Load the corpus as a flat array of tokens.
training_data = read_data("data.txt")
print("Loaded training data...")
# dictionary: token -> integer ID; reverse_dictionary: integer ID -> token.
dictionary, reverse_dictionary = build_dataset(training_data)

Loaded training data...


In [6]:
# Inspect the loaded token array and both vocabulary lookup tables.
print(training_data)
print(dictionary)
print(reverse_dictionary)

['long' 'ago' ',' 'the' 'mice' 'had' 'a' 'general' 'council' 'to'
 'consider' 'what' 'measures' 'they' 'could' 'take' 'to' 'outwit' 'their'
 'common' 'enemy' ',' 'the' 'cat' '.' 'some' 'said' 'this' ',' 'and' 'some'
 'said' 'that' 'but' 'at' 'last' 'a' 'young' 'mouse' 'got' 'up' 'and'
 'said' 'he' 'had' 'a' 'proposal' 'to' 'make' ',' 'which' 'he' 'thought'
 'would' 'meet' 'the' 'case' '.' 'you' 'will' 'all' 'agree' ',' 'said' 'he'
 ',' 'that' 'our' 'chief' 'danger' 'consists' 'in' 'the' 'sly' 'and'
 'treacherous' 'manner' 'in' 'which' 'the' 'enemy' 'approaches' 'us' '.'
 'now' ',' 'if' 'we' 'could' 'receive' 'some' 'signal' 'of' 'her'
 'approach' ',' 'we' 'could' 'easily' 'escape' 'from' 'her' '.' 'i'
 'venture' ',' 'therefore' ',' 'to' 'propose' 'that' 'a' 'small' 'bell'
 'be' 'procured' ',' 'and' 'attached' 'by' 'a' 'ribbon' 'round' 'the'
 'neck' 'of' 'the' 'cat' '.' 'by' 'this' 'means' 'we' 'should' 'always'
 'know' 'when' 'she' 'was' 'about' ',' 'and' 'could' 'easily' 'retire'
 'wh

In [7]:
# Number of distinct tokens in the corpus; also the width of the output layer.
vocab_size = len(dictionary)

# Parameters
learning_rate = 0.001   # step size handed to the RMSProp optimizer
training_iters = 50000  # number of training steps (the loop feeds one sample per step)
display_step = 1000     # report averaged loss/accuracy every this many steps
# Number of consecutive words fed to the network as context; the training loop
# uses the word immediately following them as the prediction target.
n_input = 3

# number of units in RNN cell
n_hidden = 512

# tf Graph input
# x: [batch, n_input, 1] -- the training loop feeds one integer word ID per time step.
x = tf.placeholder("float", [None, n_input, 1])
# y: [batch, vocab_size] -- the training loop feeds a one-hot encoding of the target word.
y = tf.placeholder("float", [None, vocab_size])

# RNN output node weights and biases
# Randomly initialised projection from the n_hidden-wide RNN output to vocab_size scores.
weights = {
    'out': tf.Variable(tf.random_normal([n_hidden, vocab_size]))
}
biases = {
    'out': tf.Variable(tf.random_normal([vocab_size]))
}


In [8]:
def RNN(x, weights, biases, n_input, n_hidden):
    """Build a single-layer LSTM and project its last output to vocabulary scores.

    Args:
        x: input tensor; it is reshaped to ``[-1, n_input]`` before use.
        weights: dict whose ``'out'`` entry is the output projection matrix.
        biases: dict whose ``'out'`` entry is the output bias vector.
        n_input: number of time steps (one column of the reshaped input each).
        n_hidden: number of units in the LSTM cell.

    Returns:
        Tensor of unnormalized scores (logits) computed from the output of the
        final time step.
    """
    # Collapse the input to [-1, n_input], then cut it into n_input column
    # tensors so static_rnn receives one tensor per time step.
    flat_inputs = tf.reshape(x, [-1, n_input])
    step_inputs = tf.split(flat_inputs, n_input, 1)

    lstm_cell = rnn.BasicLSTMCell(n_hidden)
    step_outputs, _final_state = rnn.static_rnn(lstm_cell, step_inputs, dtype=tf.float32)

    # Only the last time step's output feeds the prediction.
    last_output = step_outputs[-1]
    projected = tf.matmul(last_output, weights['out'])
    return projected + biases['out']

In [9]:
# Build the LSTM graph; `model` holds the logits that the loss and accuracy ops consume.
model = RNN(x, weights, biases, n_input, n_hidden)

In [10]:
# Loss: softmax cross-entropy between the logits and the one-hot labels, averaged over the batch.
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=model, labels=y))
# Despite the name, `optimizer` is the training op returned by minimize(); running it applies one RMSProp update.
optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate).minimize(cost)

In [11]:
# Model evaluation
# A prediction counts as correct when the highest-scoring logit sits at the
# same index as the 1 in the one-hot label.
correct_pred = tf.equal(tf.argmax(model, 1), tf.argmax(y,1))
# Fraction of correct predictions among the fed samples.
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [25]:
def elapsed(sec):
    """Format a duration given in seconds as a short human-readable string.

    Durations under one minute are reported in seconds, durations under one
    hour in minutes, and anything longer in hours. The number is rendered with
    ``str()``, so no rounding is applied.

    Args:
        sec: duration in seconds.

    Returns:
        A string such as ``"12.5 sec"``, ``"1.5 min"`` or ``"2.0 hr"``.
    """
    if sec < 60:
        amount, unit = sec, "sec"
    elif sec < 3600:
        amount, unit = sec / 60, "min"
    else:
        amount, unit = sec / 3600, "hr"
    return str(amount) + " " + unit

In [28]:
# Target log path
logs_path = './rnn_words'
# Event-file writer for TensorBoard; the training cell adds the session graph to it.
writer = tf.summary.FileWriter(logs_path)

In [30]:
# Initializing the variables
init = tf.global_variables_initializer()
start_time = time.time()
# Launch the graph
with tf.Session() as session:
    session.run(init)
    step = 0
    # Random starting position in [0, n_input].
    offset = np.random.randint(0, n_input+1)
    # One sample spans n_input context words plus the single target word after them.
    end_offset = n_input + 1
    acc_total = 0
    loss_total = 0

    writer.add_graph(session.graph)
    # Write the graph event out now instead of waiting for the writer's periodic flush.
    writer.flush()

    while step < training_iters:
        # Generate a minibatch. Add some randomness on selection process.
        # Once the window (context + target) would run past the end of the
        # corpus, restart from a random position near the beginning.
        if offset > (len(training_data)-end_offset):
            offset = np.random.randint(0, n_input+1)

        # Input: IDs of n_input consecutive words, shaped [1, n_input, 1] to match `x`.
        symbols_in_keys = [[dictionary[str(training_data[i])]] for i in range(offset, offset+n_input)]
        symbols_in_keys = np.reshape(np.array(symbols_in_keys), [-1, n_input, 1])

        # Target: one-hot encoding of the word right after the window, shaped [1, vocab_size].
        symbols_out_onehot = np.zeros([vocab_size], dtype=float)
        symbols_out_onehot[dictionary[str(training_data[offset+n_input])]] = 1.0
        symbols_out_onehot = np.reshape(symbols_out_onehot, [1, -1])

        # One optimisation step; also fetch the metrics and the raw logits for reporting.
        _, acc, loss, onehot_pred = session.run([optimizer, accuracy, cost, model],
                                                feed_dict={x: symbols_in_keys, y: symbols_out_onehot})
        loss_total += loss
        acc_total += acc
        if (step+1) % display_step == 0:
            print("Iter= " + str(step+1) + ", Average Loss= " +
                  "{:.6f}".format(loss_total/display_step) + ", Average Accuracy= " +
                  "{:.2f}%".format(100*acc_total/display_step))
            acc_total = 0
            loss_total = 0
            symbols_in = [training_data[i] for i in range(offset, offset + n_input)]
            symbols_out = training_data[offset + n_input]
            # onehot_pred is already a NumPy array, so take the argmax with NumPy.
            # Calling tf.argmax(...).eval() here would add a new op to the graph
            # on every report and cost an extra session round-trip.
            symbols_out_pred = reverse_dictionary[int(np.argmax(onehot_pred, axis=1)[0])]
            print("%s - [%s] vs [%s]" % (symbols_in,symbols_out,symbols_out_pred))
        step += 1
        # Advance past the sample just used so consecutive windows do not overlap.
        offset += (n_input+1)
    print("Optimization Finished!")
    print("Elapsed time: ", elapsed(time.time() - start_time))
    print("Run on command line.")
    print("\ttensorboard --logdir=%s" % (logs_path))
    print("Point your web browser to: http://localhost:6006/")

Iter= 1000, Average Loss= 5.957999, Average Accuracy= 3.20%
['and', 'said', 'that'] - [is] vs [.]
Iter= 2000, Average Loss= 3.963320, Average Accuracy= 7.60%
['with', 'general', 'applause'] - [,] vs [in]
Iter= 3000, Average Loss= 3.010660, Average Accuracy= 21.00%
['know', 'when', 'she'] - [was] vs [.]
Iter= 4000, Average Loss= 2.581316, Average Accuracy= 36.50%
[',', 'to', 'propose'] - [that] vs [that]
Iter= 5000, Average Loss= 2.356636, Average Accuracy= 39.70%
['we', 'could', 'easily'] - [escape] vs [escape]
Iter= 6000, Average Loss= 2.024881, Average Accuracy= 49.80%
['danger', 'consists', 'in'] - [the] vs [to]
Iter= 7000, Average Loss= 1.746271, Average Accuracy= 56.80%
['had', 'a', 'proposal'] - [to] vs [to]
Iter= 8000, Average Loss= 1.492925, Average Accuracy= 62.40%
['enemy', ',', 'the'] - [cat] vs [cat]
Iter= 9000, Average Loss= 1.475918, Average Accuracy= 63.30%
['said', 'it', 'is'] - [easy] vs [a]
Iter= 10000, Average Loss= 1.201294, Average Accuracy= 70.80%
['bell', 'the', 