In [99]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
import sys
import time
import numpy as np
import tensorflow as tf

In [100]:
#######################################################################################################################
# The above are all the import necessary.                                                                             #
# TF_CPP_MIN_LOG_LEVEL is used to control log onformation                                                             #
# reset_default_graph used to reseat the graph if a one exists.                                                       #
#######################################################################################################################

In [101]:
tf.reset_default_graph()

In [102]:
DATA_PATH = 'arvix_abstracts.txt'
HIDDEN_SIZE = 200
BATCH_SIZE = 64
NUM_STEPS = 50
SKIP_STEP = 40
TEMPRATURE = 0.7
LR = 0.003
LEN_GENERATED = 300

In [103]:
#######################################################################################################################
#Hidden size is the length of GRU units                                                                               #
#batch size is the size of batches in training set.                                                                   #
#LR is the learning rate.                                                                                             #
#######################################################################################################################

In [104]:
def vocab_encode(text, vocab):
    return [vocab.index(x) + 1 for x in text if x in vocab]

In [105]:
#######################################################################################################################
# vocab_encode is used to convert characters to numbers such that is is easy to learn.                                #
#######################################################################################################################

In [106]:
def vocab_decode(array, vocab):
    return ''.join([vocab[x - 1] for x in array])

In [107]:
#######################################################################################################################
#vocab_decide is used to convert number to a character                                                                #
#######################################################################################################################

In [108]:
def read_data(filename, vocab, window=NUM_STEPS, overlap=NUM_STEPS//2):
    for text in open(filename):
        text = vocab_encode(text, vocab)
        for start in range(0, len(text) - window, overlap):
            chunk = text[start: start + window]
            chunk += [0] * (window - len(chunk)) # for zero padding in case it goes out of bounds
            yield chunk

In [109]:
#######################################################################################################################
# Used to read data from the file and convert them into numbers, window size is the number of characters to be        #
#included in the set that is being formed, overlap is the step size in the iteration. It should be less then whindow  #
#size so as to capture the dependencies between the characters and words better.                                      #
#######################################################################################################################

In [110]:
def read_batch(stream, batch_size=BATCH_SIZE):
    batch = []
    for element in stream:
        batch.append(element)
        if len(batch) == batch_size:
            yield batch
            batch = []
    yield batch

In [111]:
#######################################################################################################################
# Used to create batches of trianing data. Returns an object that needs to be iterated at each step                   #
#######################################################################################################################

In [112]:
def create_rnn(seq, hidden_size=HIDDEN_SIZE):
    cell = tf.contrib.rnn.GRUCell(hidden_size)
    in_state = tf.placeholder_with_default(
            cell.zero_state(tf.shape(seq)[0], tf.float32), [None, hidden_size]) 

    # this line to calculate the real length of seq
    # all seq are padded to be of the same length which is NUM_STEPS
    #seq = tf.Print(seq,[seq, tf.shape(seq), tf.shape(seq)[0]],'seq--------------------->')
    reduce_max = tf.reduce_max(tf.sign(seq), 2)
    #print_reduce_max = tf.Print(reduce_max,[reduce_max,tf.shape(reduce_max)],'reduce_max------------>')
    length = tf.reduce_sum(reduce_max, 1) 
    #temp_length = tf.Print(length, [length],'argmax(out) = ')
    output, out_state = tf.nn.dynamic_rnn(cell, seq, length, in_state)
    return output, in_state, out_state

In [113]:
#######################################################################################################################
# create_rnn is used to create GRU cells for the model. We create the GRU cell with the given number of hiddem_size   #
# We use the dynamic_rnn insted of static_rnn for improved performance.                                               #
#######################################################################################################################

In [114]:
def create_model(seq, temp, vocab, hidden=HIDDEN_SIZE):
    seq = tf.one_hot(seq, len(vocab))
    output, in_state, out_state = create_rnn(seq, hidden)
    # fully_connected is syntactic sugar for tf.matmul(w, output) + b
    # it will create w and b for us
    logits = tf.contrib.layers.fully_connected(output, len(vocab), None)
    #seq = tf.Print(seq[:, 1:],[tf.shape(seq[:, 1:])],'seq------>')
    loss = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=logits[:, :-1], labels=seq[:, 1:]))
    # sample the next character from Maxwell-Boltzmann Distribution with temperature temp
    # it works equally well without tf.exp
    sample = tf.multinomial(tf.exp(logits[:, -1] / temp), 1)[:, 0]


    return loss, sample, in_state, out_state

In [115]:
#######################################################################################################################
# Create model used to create the rnn model and then connect it with fully connected model  for calulating the loss   #
# and the next character that can appers in the model .                                                               #
#######################################################################################################################

In [116]:
def training(vocab, seq, loss, optimizer, global_step, temp, sample, in_state, out_state):
    saver = tf.train.Saver()
    start = time.time()
    with tf.Session() as sess:
        writer = tf.summary.FileWriter('graphs/gist', sess.graph)
        sess.run(tf.global_variables_initializer())
        
        ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/arvix/checkpoint'))
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        
        iteration = global_step.eval()
        for batch in read_batch(read_data(DATA_PATH, vocab)):
            batch_loss, _ = sess.run([loss, optimizer], {seq: batch})
            if (iteration + 1) % SKIP_STEP == 0:
                print('Iter {}. \n    Loss {}. Time {}'.format(iteration, batch_loss, time.time() - start))
                online_inference(sess, vocab, seq, sample, temp, in_state, out_state)
                start = time.time()
                saver.save(sess, 'checkpoints/arvix/char-rnn', iteration)
            iteration += 1


In [117]:
#######################################################################################################################
# Traning is where the actual training happens for the model is done. FileWriter are used to save the grpahs and the  #
# checkpoint is used to store the current state of the model, which is then restored whn the model is trained again.  #
#######################################################################################################################

In [118]:
def online_inference(sess, vocab, seq, sample, temp, in_state, out_state, seed='T'):
    """ Generate sequence one character at a time, based on the previous character
    """
    sentence = seed
    state = None
    for _ in range(LEN_GENERATED):
        batch = [vocab_encode(sentence[-1], vocab)]
        feed = {seq: batch, temp: TEMPRATURE}
        # for the first decoder step, the state is None
        if state is not None:
            feed.update({in_state: state})
        index, state = sess.run([sample, out_state], feed)
        sentence += vocab_decode(index, vocab)
    print(sentence)

In [119]:
#######################################################################################################################
# is used to generate the sentence from the model.                                                                    #
#######################################################################################################################

In [120]:
def main():
    vocab = (
            " $%'()+,-./0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ"
            "\\^_abcdefghijklmnopqrstuvwxyz{|}")
    seq = tf.placeholder(tf.int32, [None, None])
    temp = tf.placeholder(tf.float32)
    loss, sample, in_state, out_state = create_model(seq, temp, vocab)
    global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')
    optimizer = tf.train.AdamOptimizer(LR).minimize(loss, global_step=global_step)
    training(vocab, seq, loss, optimizer, global_step, temp, sample, in_state, out_state)

In [121]:
#######################################################################################################################
# Main of the program. Here the vocab is defined as all the letters (upper cases and lower cases) and the symbols that#
# is possible in the language.                                                                                        #
#######################################################################################################################

In [122]:
if __name__ == '__main__':
    main()

Iter 39. 
    Loss 9390.82226562. Time 10.6757669449
TovH  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e  e
Iter 79. 
    Loss 8181.9296875. Time 9.15825414658
Th the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the th
Iter 119. 
    Loss 7406.81933594. Time 8.91854190826
The the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the t

The achieve an algorithms such as stochastic gradient descent in the computation of the network speed using an approximation on the computation of the computation of the computation of the computation of the computation of the computation of the computation of the computation of the computation of th
Iter 999. 
    Loss 3243.66064453. Time 9.7541949749
The accuracy of the accuracy of the accuracy of the accuracy of the accuracy of the accuracy of the accuracy of the accuracy of the accuracy of the accuracy of the accuracy of the accuracy of the accuracy of the accuracy of the accuracy of the accuracy of the accuracy of the accuracy of the accuracy 
Iter 1039. 
    Loss 3593.83837891. Time 11.6461241245
The and an explored in the networks and a set of the interpretation and the resulting and a set of the interpretation and the resulting and a set of the interpretation and the resulting and a set of the interpretation and the resulting and a set of the interpretation and the resulting an

The activation function that the proposed RNN on a classification allows the state-of-the-art models and state-of-the-art models and state-of-the-art models and state-of-the-art minimized to computer vision, we propose a new algorithm is a simple complex one framework to a search for a solvel propaga
Iter 1919. 
    Loss 2708.12744141. Time 7.60294985771
The activation function and the approach is an accurate armilation of a neural networks are embedde in order to be the rease in a single madring models that imale compound. We propose a propremsto deep networks are embedded convergence rates of a subcompress the approximations in deep networks are ab
Iter 1959. 
    Loss 2697.81542969. Time 7.71944785118
The and the approach for difficulties is to a standard and representation is the main results in a distributed convergence rates in the context of the input sparsity relationship networks is an interpretables scale of the network structure of the norm of the input sparsity relationship

This paper provides the parallelism. We replication and computational capacity. We develop a topouthys to the output data and fully are component of parameters that deep neural networks to be a deep neural networks to be a deep neural networks (DNNs) a BL-12 that have been their can be used to transf
Iter 2839. 
    Loss 2521.32641602. Time 7.91025900841
This paper, we present a new that the first state-of-the-art on compared to analyze the algorithm is in a simple structure. This algorithms to learn a convex optimization problems with the input of the source domain adaptation, in the training parameters, and the training of the input stacked Resk mo
Iter 2879. 
    Loss 2283.21606445. Time 7.98815202713
The successive is competitive with state-of-the-art or biofertional results on a subset sampling algorithms to accurately effective infiked for functions that representations of the recently proposed $L_p$ unit on the same training of computations and state-of-the-art performance on de

The high prediction accuracy of MBN but learn that the now-deer $L_p$ unit is in compatible with state-of-the-art DNN and model by a deep network consisting of deep learning in deep learning making them input-space solver interpretation of the DNNs is a single model to several introduced and differen
Iter 3759. 
    Loss 2255.68994141. Time 7.92971897125
The effect of a finilg not of the existing research on knowledge to employ derived using HF transformation, and show that backpropagations, and show that backpropagations to a large model as multiple mapsifoc) computational language for acoustic models can be about the effect of a finilg a pooling op
Iter 3799. 
    Loss 2223.47851562. Time 7.89682507515
The convex optimization in this context a formal convergence the standard Spreveral neural networks with convolutional neural networks to speech effortor error computations for supervised pretraining on computation deep neural networks are able to perfort but also forwand pooling opera