COMP 543 A6
Nai-Fan Chen (nc41)
Jialei Zhou (jz74)


In [4]:
import numpy as np
import tensorflow as tf
import collections

# the number of training iterations; the training loop below draws one
# random mini-batch per iteration
numTrainingIters = 10000
# the number of hidden neurons that hold the state of the RNN
hiddenUnits = 200

# the number of classes that we are learning over (one label per text file
# loaded below)
numClasses = 5

# the number of data points in a batch; the input placeholders are created
# with this fixed batch dimension
batchSize = 200

# the learning rate handed to the optimizer
learningRate = 0.01

def addToData (maxSeqLen, data, fileName, classNum, linesToUse):
    """Sample lines from a text file and add them to ``data`` as one-hot matrices.

    ``data`` maps an integer data point ID to a ``(classNumber, matrix)``
    pair.  Each matrix has one row per encoded character of a line of text
    and 256 columns; a row is the one-hot encoding of that character's code
    point.  Characters whose code point is 256 or above are skipped.

    Args:
        maxSeqLen: length of the longest encoded line seen so far.
        data: dictionary to extend; it is modified in place and also returned.
            New entries are keyed starting at ``len(data)``.
        fileName: path of the text file to sample from.
        classNum: class label stored with every line taken from this file.
        linesToUse: number of line indices to draw (with replacement).  Draws
            that land on a blank line are dropped, so fewer entries than this
            may be added.

    Returns:
        ``(maxSeqLen, data)``, where maxSeqLen has been raised to the longest
        encoded line added by this call if that is longer.

    Raises:
        ValueError: if the file contains no lines.
    """
    with open(fileName) as f:
        content = f.readlines()
    if not content:
        raise ValueError("no lines to sample from in {}".format(fileName))
    #
    # the upper bound of randint is exclusive, so len(content) (and not
    # len(content) - 1) is what makes every line of the file reachable
    myInts = np.random.randint(0, len(content), linesToUse)
    #
    # i is the key of the next line of text to add to the dictionary
    i = len(data)
    for whichLine in myInts.flat:
        line = content[whichLine]
        #
        # ignore lines that have nothing in them
        if not line or line.isspace():
            continue
        #
        # keep only the characters that fit in the 256-wide encoding; the
        # matrix is sized from what is kept so that no all-zero rows trail
        # the text (pad() adds the empty characters at the front instead)
        codes = np.asarray([ord(ch) for ch in line if ord(ch) < 256], dtype=np.intp)
        temp = np.zeros((len(codes), 256))
        temp[np.arange(len(codes)), codes] = 1
        #
        # take note if this is the longest line we've seen
        if len(codes) > maxSeqLen:
            maxSeqLen = len(codes)
        #
        # remember the line of text and move onto the next key
        data[i] = (classNum, temp)
        i = i + 1
    #
    # and return the dictionary with the new data
    return (maxSeqLen, data)

def pad (maxSeqLen, data):
    """Left-pad every matrix in ``data`` to maxSeqLen characters and transpose it.

    ``data`` uses the same ``{id: (label, matrix)}`` encoding as addToData.
    All-zero rows are placed in front of each matrix so that it has exactly
    maxSeqLen rows; the result is then transposed, so every stored matrix
    ends up with 256 rows and maxSeqLen columns.  The dictionary is updated
    in place and returned.
    """
    for key in data:
        entry = data[key]
        label = entry[0]
        matrix = entry[1]
        #
        # number of empty characters needed in front of this line
        missing = maxSeqLen - matrix.shape[0]
        filler = np.zeros((missing, 256))
        padded = np.concatenate((filler, matrix), axis=0)
        data[key] = (label, padded.T)
    return data

def generateDataRNN (maxSeqLen, data, sampleCount = None):
    """Draw a random batch of padded matrices and labels for RNN training.

    Args:
        maxSeqLen: unused; kept so existing callers do not change.
        data: ``{id: (label, matrix)}`` dictionary.  The keys are assumed to
            be the integers 0 .. len(data) - 1, which is how addToData
            assigns them.
        sampleCount: number of data points to draw (with replacement).
            Defaults to the module-level batchSize.

    Returns:
        ``(x, y)``.  x stacks the sampled matrices along a new first axis,
        so for data that has been through pad() it has dimensions
        [sampleCount, 256, maxSeqLen] and can be split per character with
        tf.unstack(inputX, axis=2).  y is the vector of matching labels.

    Raises:
        ValueError: if data is empty.
    """
    count = batchSize if sampleCount is None else sampleCount
    if not data:
        raise ValueError("cannot sample a batch from an empty data set")
    #
    # the upper bound of randint is exclusive, so len(data) is what lets the
    # last data point be drawn
    myInts = np.random.randint(0, len(data), count)
    #
    # np.stack needs a real sequence (a list), not a generator
    x = np.stack([data[i][1] for i in myInts.flat])
    y = np.array([data[i][0] for i in myInts.flat])
    return (x, y)

def generateDataFeedForward (maxSeqLen, data, sampleCount = None):
    """Draw a random batch of flattened matrices and labels for a feed-forward net.

    Args:
        maxSeqLen: unused; kept so existing callers do not change.
        data: ``{id: (label, matrix)}`` dictionary.  The keys are assumed to
            be the integers 0 .. len(data) - 1, which is how addToData
            assigns them.
        sampleCount: number of data points to draw (with replacement).
            Defaults to the module-level batchSize.

    Returns:
        ``(x, y)``.  Each sampled matrix is flattened to one row of x, so
        for data that has been through pad() x has dimensions
        [sampleCount, 256 * maxSeqLen].  y is the vector of matching labels.

    Raises:
        ValueError: if data is empty.
    """
    count = batchSize if sampleCount is None else sampleCount
    if not data:
        raise ValueError("cannot sample a batch from an empty data set")
    #
    # the upper bound of randint is exclusive, so len(data) is what lets the
    # last data point be drawn
    myInts = np.random.randint(0, len(data), count)
    #
    # np.stack needs a real sequence (a list), not a generator
    x = np.stack([data[i][1].flatten() for i in myInts.flat])
    y = np.array([data[i][0] for i in myInts.flat])
    return (x, y)

# start with an empty data set and no known line length
maxSeqLen = 0
data = {}

# sample 10000 lines from each of the five corpora; a file's position in
# this list is the class label attached to its lines
corpusFiles = [
    "biochemistry_processed.txt",
    "cancerResearch_processed.txt",
    "jama_processed.txt",
    "nature_processed.txt",
    "plosOne_processed.txt",
]
for corpusLabel, corpusFile in enumerate(corpusFiles):
    maxSeqLen, data = addToData (maxSeqLen, data, corpusFile, corpusLabel, 10000)

# put empty characters in front of every entry so that all of the
# sequences have the same length
data = pad (maxSeqLen, data)
        
# now we build the TensorFlow computation... there are two inputs:
# a batch of text lines, shaped [batchSize, 256, maxSeqLen], and a batch
# of integer class labels
inputX = tf.placeholder(tf.float32, [batchSize, 256, maxSeqLen])
inputY = tf.placeholder(tf.int32, [batchSize])

# this is the initial state of the RNN, before processing any data
initialState = tf.placeholder(tf.float32, [batchSize, hiddenUnits])

# the weight matrix that maps the inputs and hidden state to the next state;
# its input width is 256 (one character) + hiddenUnits (current state)
# + hiddenUnits (the delayed state that is concatenated in the loop below)
W = tf.Variable(np.random.normal(0, 0.05, (hiddenUnits*2 + 256, hiddenUnits)), dtype=tf.float32)

# biases for the hidden values
b = tf.Variable(np.zeros((1, hiddenUnits)), dtype=tf.float32)

# weights and bias for the final classification
W2 = tf.Variable(np.random.normal (0, 0.05, (hiddenUnits, numClasses)),dtype=tf.float32)
b2 = tf.Variable(np.zeros((1,numClasses)), dtype=tf.float32)

# unpack the input sequences so that we have a series of matrices,
# each of which has a one-hot encoding of the current character from
# every input sequence
sequenceOfLetters = tf.unstack(inputX, axis=2)

# now we implement the forward pass
currentState = initialState

# oldState is a queue of earlier states.  While it holds 10 or fewer
# entries the extra input is padState; after that the oldest entry is
# popped on every tick, which is the state from ten ticks before
# currentState.
oldState = collections.deque()
oldState.append(currentState)
# NOTE(review): padState is created as a tf.Variable rather than a constant,
# so it is presumably trainable and updated by the optimizer -- confirm that
# this is intended
padState = tf.Variable(np.zeros((batchSize,hiddenUnits)), dtype=tf.float32)
for timeTick in sequenceOfLetters:
    #
    # concatenate the input with the current state and the delayed (or pad)
    # state, then compute the next state
    if len(oldState) <= 10:
      inputPlusState = tf.concat([timeTick, currentState, padState], 1) 
    else:
      inputPlusState = tf.concat([timeTick, currentState, oldState.popleft()], 1)
    next_state = tf.tanh(tf.matmul(inputPlusState, W) + b) 
    currentState = next_state
    oldState.append(currentState)

# compute the class scores (logits) from the state after the last character
outputs = tf.matmul(currentState, W2) + b2

# turn the scores into per-class probabilities
predictions = tf.nn.softmax(outputs)

# compute the loss: mean cross-entropy between the scores and the labels
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=outputs, labels=inputY)
totalLoss = tf.reduce_mean(losses)

# train with Adagrad; the other optimizers are alternatives left commented out
#trainingAlg = tf.train.GradientDescentOptimizer(learningRate).minimize(totalLoss)
trainingAlg = tf.train.AdagradOptimizer(learning_rate=learningRate).minimize(totalLoss)
# trainingAlg = tf.train.AdamOptimizer(learning_rate=learningRate).minimize(totalLoss)
# trainingAlg = tf.train.AdadeltaOptimizer(learning_rate=learningRate).minimize(totalLoss)

# and train!!
#
# how many of the final mini-batches are printed, and how many are
# averaged into the closing summary
numToPrint = 20
numToAverage = 10

with tf.Session() as sess:
    #
    # initialize everything
    sess.run(tf.global_variables_initializer())
    #
    # running totals over the last numToAverage mini-batches
    _accLoss = 0.0
    _accCount = 0
    for epoch in range(numTrainingIters):
        #
        # get some data
        x, y = generateDataRNN (maxSeqLen, data)
        #
        # do one training step; every batch starts from an all-zero state
        _initialState = np.zeros((batchSize, hiddenUnits))
        _totalLoss, _trainingAlg, _currentState, _predictions, _outputs = sess.run(
                [totalLoss, trainingAlg, currentState, predictions, outputs],
                feed_dict={
                    inputX:x,
                    inputY:y,
                    initialState:_initialState
                })
        #
        # just FYI, compute the number of correct predictions: the predicted
        # class is the one with the highest probability
        numCorrect = int(np.sum(np.argmax(_predictions, axis=1) == y))
        #
        # print out to the screen
        if epoch >= numTrainingIters - numToPrint:
            print("Step", epoch, "Loss", _totalLoss, "Correct", numCorrect, "out of", batchSize)

        if epoch >= numTrainingIters - numToAverage:
            _accLoss += _totalLoss
            _accCount += numCorrect
    #
    # divide by the number of batches that were actually accumulated, which
    # is smaller than numToAverage when training ran for fewer iterations
    numAveraged = max(1, min(numToAverage, numTrainingIters))
    print("Average loss for the last", numAveraged, "mini-batches is", _accLoss / numAveraged,
          "average correct labels is", _accCount / numAveraged, "out of", batchSize, '.')
    #
    # the label says percent, so scale the fraction of correct labels by 100
    print("Accuracy(%):", 100.0 * _accCount / (numAveraged * batchSize))

Step 9980 Loss 0.9637191 Correct 126 out of 200
Step 9981 Loss 0.93740684 Correct 128 out of 200
Step 9982 Loss 1.092393 Correct 121 out of 200
Step 9983 Loss 1.0634432 Correct 107 out of 200
Step 9984 Loss 1.0598913 Correct 106 out of 200
Step 9985 Loss 1.0459412 Correct 120 out of 200
Step 9986 Loss 1.0308971 Correct 120 out of 200
Step 9987 Loss 1.0298545 Correct 117 out of 200
Step 9988 Loss 1.1359423 Correct 108 out of 200
Step 9989 Loss 1.0604455 Correct 118 out of 200
Step 9990 Loss 0.9990126 Correct 121 out of 200
Step 9991 Loss 0.9842941 Correct 124 out of 200
Step 9992 Loss 1.0265788 Correct 123 out of 200
Step 9993 Loss 1.0259724 Correct 116 out of 200
Step 9994 Loss 1.0228249 Correct 118 out of 200
Step 9995 Loss 1.0521476 Correct 112 out of 200
Step 9996 Loss 1.0664648 Correct 127 out of 200
Step 9997 Loss 1.0461231 Correct 109 out of 200
Step 9998 Loss 1.0624418 Correct 113 out of 200
Step 9999 Loss 1.0065873 Correct 119 out of 200
Average loss for the last 10 mini-batche

In [5]:
import numpy as np
import tensorflow as tf
import collections

# the number of training iterations; the training loop below draws one
# random mini-batch per iteration
numTrainingIters = 10000
# the number of units in the hidden layer of the feed-forward network
# built in this cell
hiddenUnits = 1000

# the number of classes that we are learning over (one label per text file
# loaded below)
numClasses = 5

# the number of data points in a batch; the input placeholders are created
# with this fixed batch dimension
batchSize = 150

# the learning rate handed to the optimizer
learningRate = 0.01

def addToData (maxSeqLen, data, fileName, classNum, linesToUse):
    """Sample lines from a text file and add them to ``data`` as one-hot matrices.

    ``data`` maps an integer data point ID to a ``(classNumber, matrix)``
    pair.  Each matrix has one row per encoded character of a line of text
    and 256 columns; a row is the one-hot encoding of that character's code
    point.  Characters whose code point is 256 or above are skipped.

    Args:
        maxSeqLen: length of the longest encoded line seen so far.
        data: dictionary to extend; it is modified in place and also returned.
            New entries are keyed starting at ``len(data)``.
        fileName: path of the text file to sample from.
        classNum: class label stored with every line taken from this file.
        linesToUse: number of line indices to draw (with replacement).  Draws
            that land on a blank line are dropped, so fewer entries than this
            may be added.

    Returns:
        ``(maxSeqLen, data)``, where maxSeqLen has been raised to the longest
        encoded line added by this call if that is longer.

    Raises:
        ValueError: if the file contains no lines.
    """
    with open(fileName) as f:
        content = f.readlines()
    if not content:
        raise ValueError("no lines to sample from in {}".format(fileName))
    #
    # the upper bound of randint is exclusive, so len(content) (and not
    # len(content) - 1) is what makes every line of the file reachable
    myInts = np.random.randint(0, len(content), linesToUse)
    #
    # i is the key of the next line of text to add to the dictionary
    i = len(data)
    for whichLine in myInts.flat:
        line = content[whichLine]
        #
        # ignore lines that have nothing in them
        if not line or line.isspace():
            continue
        #
        # keep only the characters that fit in the 256-wide encoding; the
        # matrix is sized from what is kept so that no all-zero rows trail
        # the text (pad() adds the empty characters at the front instead)
        codes = np.asarray([ord(ch) for ch in line if ord(ch) < 256], dtype=np.intp)
        temp = np.zeros((len(codes), 256))
        temp[np.arange(len(codes)), codes] = 1
        #
        # take note if this is the longest line we've seen
        if len(codes) > maxSeqLen:
            maxSeqLen = len(codes)
        #
        # remember the line of text and move onto the next key
        data[i] = (classNum, temp)
        i = i + 1
    #
    # and return the dictionary with the new data
    return (maxSeqLen, data)

def pad (maxSeqLen, data):
    """Left-pad every matrix in ``data`` to maxSeqLen characters and transpose it.

    ``data`` uses the same ``{id: (label, matrix)}`` encoding as addToData.
    All-zero rows are placed in front of each matrix so that it has exactly
    maxSeqLen rows; the result is then transposed, so every stored matrix
    ends up with 256 rows and maxSeqLen columns.  The dictionary is updated
    in place and returned.
    """
    for key in data:
        entry = data[key]
        label = entry[0]
        matrix = entry[1]
        #
        # number of empty characters needed in front of this line
        missing = maxSeqLen - matrix.shape[0]
        filler = np.zeros((missing, 256))
        padded = np.concatenate((filler, matrix), axis=0)
        data[key] = (label, padded.T)
    return data

def generateDataRNN (maxSeqLen, data, sampleCount = None):
    """Draw a random batch of padded matrices and labels for RNN training.

    Args:
        maxSeqLen: unused; kept so existing callers do not change.
        data: ``{id: (label, matrix)}`` dictionary.  The keys are assumed to
            be the integers 0 .. len(data) - 1, which is how addToData
            assigns them.
        sampleCount: number of data points to draw (with replacement).
            Defaults to the module-level batchSize.

    Returns:
        ``(x, y)``.  x stacks the sampled matrices along a new first axis,
        so for data that has been through pad() it has dimensions
        [sampleCount, 256, maxSeqLen] and can be split per character with
        tf.unstack(inputX, axis=2).  y is the vector of matching labels.

    Raises:
        ValueError: if data is empty.
    """
    count = batchSize if sampleCount is None else sampleCount
    if not data:
        raise ValueError("cannot sample a batch from an empty data set")
    #
    # the upper bound of randint is exclusive, so len(data) is what lets the
    # last data point be drawn
    myInts = np.random.randint(0, len(data), count)
    #
    # np.stack needs a real sequence (a list), not a generator
    x = np.stack([data[i][1] for i in myInts.flat])
    y = np.array([data[i][0] for i in myInts.flat])
    return (x, y)

def generateDataFeedForward (maxSeqLen, data, sampleCount = None):
    """Draw a random batch of flattened matrices and labels for a feed-forward net.

    Args:
        maxSeqLen: unused; kept so existing callers do not change.
        data: ``{id: (label, matrix)}`` dictionary.  The keys are assumed to
            be the integers 0 .. len(data) - 1, which is how addToData
            assigns them.
        sampleCount: number of data points to draw (with replacement).
            Defaults to the module-level batchSize.

    Returns:
        ``(x, y)``.  Each sampled matrix is flattened to one row of x, so
        for data that has been through pad() x has dimensions
        [sampleCount, 256 * maxSeqLen].  y is the vector of matching labels.

    Raises:
        ValueError: if data is empty.
    """
    count = batchSize if sampleCount is None else sampleCount
    if not data:
        raise ValueError("cannot sample a batch from an empty data set")
    #
    # the upper bound of randint is exclusive, so len(data) is what lets the
    # last data point be drawn
    myInts = np.random.randint(0, len(data), count)
    #
    # np.stack needs a real sequence (a list), not a generator
    x = np.stack([data[i][1].flatten() for i in myInts.flat])
    y = np.array([data[i][0] for i in myInts.flat])
    return (x, y)

# start with an empty data set and no known line length
maxSeqLen = 0
data = {}

# sample 10000 lines from each of the five corpora; a file's position in
# this list is the class label attached to its lines
corpusFiles = [
    "biochemistry_processed.txt",
    "cancerResearch_processed.txt",
    "jama_processed.txt",
    "nature_processed.txt",
    "plosOne_processed.txt",
]
for corpusLabel, corpusFile in enumerate(corpusFiles):
    maxSeqLen, data = addToData (maxSeqLen, data, corpusFile, corpusLabel, 10000)

# put empty characters in front of every entry so that all of the
# sequences have the same length
data = pad (maxSeqLen, data)
        
# now we build the TensorFlow computation... there are two inputs:
# a batch of text lines, each flattened to one row of 256*maxSeqLen
# values, and a batch of integer class labels
inputX = tf.placeholder(tf.float32, [batchSize, 256*maxSeqLen])
inputY = tf.placeholder(tf.int32, [batchSize])

# NOTE(review): this placeholder is never used by the feed-forward graph
# built in this cell and is not fed by the training loop -- it looks like a
# leftover from the RNN version
initialState = tf.placeholder(tf.float32, [batchSize, hiddenUnits])

# the weight matrix that maps a whole flattened line to the hidden layer
W = tf.Variable(np.random.normal(0, 0.05, (256*maxSeqLen, hiddenUnits)), dtype=tf.float32)

# biases for the hidden values
b = tf.Variable(np.zeros((1, hiddenUnits)), dtype=tf.float32)

# weights and bias for the final classification
W2 = tf.Variable(np.random.normal (0, 0.05, (hiddenUnits, numClasses)),dtype=tf.float32)
b2 = tf.Variable(np.zeros((1,numClasses)), dtype=tf.float32)

# the forward pass is a single tanh hidden layer over the whole line; the
# name currentState is kept from the RNN version of this cell
currentState = tf.tanh(tf.matmul(inputX, W) + b)

# compute the class scores (logits) from the hidden layer
outputs = tf.matmul(currentState, W2) + b2

# turn the scores into per-class probabilities
predictions = tf.nn.softmax(outputs)

# compute the loss: mean cross-entropy between the scores and the labels
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=outputs, labels=inputY)
totalLoss = tf.reduce_mean(losses)

# train with Adam; the other optimizers are alternatives left commented out
#trainingAlg = tf.train.GradientDescentOptimizer(learningRate).minimize(totalLoss)
# trainingAlg = tf.train.AdagradOptimizer(learning_rate=learningRate).minimize(totalLoss)
# trainingAlg = tf.train.AdadeltaOptimizer(learning_rate=learningRate).minimize(totalLoss)
trainingAlg = tf.train.AdamOptimizer(learning_rate=learningRate).minimize(totalLoss)

# and train!!
#
# how many of the final mini-batches are printed, and how many are
# averaged into the closing summary
numToPrint = 20
numToAverage = 10

with tf.Session() as sess:
    #
    # initialize everything
    sess.run(tf.global_variables_initializer())
    #
    # running totals over the last numToAverage mini-batches
    _accLoss = 0.0
    _accCount = 0
    for epoch in range(numTrainingIters):
        #
        # get some data
        x, y = generateDataFeedForward(maxSeqLen, data)
        #
        # do one training step (the feed-forward graph takes no initial
        # state, so only the inputs and labels are fed)
        _totalLoss, _trainingAlg, _currentState, _predictions, _outputs = sess.run(
                [totalLoss, trainingAlg, currentState, predictions, outputs],
                feed_dict={
                    inputX:x,
                    inputY:y,
                })
        #
        # just FYI, compute the number of correct predictions: the predicted
        # class is the one with the highest probability
        numCorrect = int(np.sum(np.argmax(_predictions, axis=1) == y))
        #
        # print out to the screen
        if epoch >= numTrainingIters - numToPrint:
            print("Step", epoch, "Loss", _totalLoss, "Correct", numCorrect, "out of", batchSize)

        if epoch >= numTrainingIters - numToAverage:
            _accLoss += _totalLoss
            _accCount += numCorrect
    #
    # divide by the number of batches that were actually accumulated, which
    # is smaller than numToAverage when training ran for fewer iterations
    numAveraged = max(1, min(numToAverage, numTrainingIters))
    #
    # report against the real batch size instead of a hard-coded 150
    print("Average loss for the last", numAveraged, "mini-batches is", _accLoss / numAveraged,
          "average correct labels is", _accCount / numAveraged, "out of", str(batchSize) + ".")
    #
    # the label says percent, so scale the fraction of correct labels by 100
    print("Accuracy(%):", 100.0 * _accCount / (numAveraged * batchSize))

Step 9980 Loss 0.10319737 Correct 147 out of 150
Step 9981 Loss 0.033834197 Correct 148 out of 150
Step 9982 Loss 0.17667341 Correct 144 out of 150
Step 9983 Loss 0.21053162 Correct 144 out of 150
Step 9984 Loss 0.11489423 Correct 147 out of 150
Step 9985 Loss 0.13874622 Correct 145 out of 150
Step 9986 Loss 0.1746055 Correct 144 out of 150
Step 9987 Loss 0.1351292 Correct 145 out of 150
Step 9988 Loss 0.15116279 Correct 145 out of 150
Step 9989 Loss 0.14264633 Correct 147 out of 150
Step 9990 Loss 0.1256092 Correct 146 out of 150
Step 9991 Loss 0.08521423 Correct 145 out of 150
Step 9992 Loss 0.22226799 Correct 142 out of 150
Step 9993 Loss 0.13218355 Correct 145 out of 150
Step 9994 Loss 0.17884059 Correct 146 out of 150
Step 9995 Loss 0.14067146 Correct 144 out of 150
Step 9996 Loss 0.17855258 Correct 145 out of 150
Step 9997 Loss 0.23175618 Correct 146 out of 150
Step 9998 Loss 0.17987251 Correct 146 out of 150
Step 9999 Loss 0.17949745 Correct 146 out of 150
Average loss for the l

In [3]:
import numpy as np
import tensorflow as tf

# start from an empty default graph before any ops for this cell are created
tf.reset_default_graph()

# the number of training iterations; the training loop below draws one
# random mini-batch per iteration
numTrainingIters = 10000

# the number of units of the LSTM cell that holds the state of the RNN
hiddenUnits = 800

# the number of classes that we are learning over (one label per text file
# loaded below)
numClasses = 5

# the number of data points in a batch; the input placeholders are created
# with this fixed batch dimension
batchSize = 150

# the learning rate handed to the optimizer
learningRate = 0.01

# NOTE(review): timesteps is not referenced anywhere else in this cell; the
# number of unrolled steps comes from maxSeqLen instead -- confirm it can go
timesteps = 28


def addToData (maxSeqLen, data, fileName, classNum, linesToUse):
    """Sample lines from a text file and add them to ``data`` as one-hot matrices.

    ``data`` maps an integer data point ID to a ``(classNumber, matrix)``
    pair.  Each matrix has one row per encoded character of a line of text
    and 256 columns; a row is the one-hot encoding of that character's code
    point.  Characters whose code point is 256 or above are skipped.

    Args:
        maxSeqLen: length of the longest encoded line seen so far.
        data: dictionary to extend; it is modified in place and also returned.
            New entries are keyed starting at ``len(data)``.
        fileName: path of the text file to sample from.
        classNum: class label stored with every line taken from this file.
        linesToUse: number of line indices to draw (with replacement).  Draws
            that land on a blank line are dropped, so fewer entries than this
            may be added.

    Returns:
        ``(maxSeqLen, data)``, where maxSeqLen has been raised to the longest
        encoded line added by this call if that is longer.

    Raises:
        ValueError: if the file contains no lines.
    """
    with open(fileName) as f:
        content = f.readlines()
    if not content:
        raise ValueError("no lines to sample from in {}".format(fileName))
    #
    # the upper bound of randint is exclusive, so len(content) (and not
    # len(content) - 1) is what makes every line of the file reachable
    myInts = np.random.randint(0, len(content), linesToUse)
    #
    # i is the key of the next line of text to add to the dictionary
    i = len(data)
    for whichLine in myInts.flat:
        line = content[whichLine]
        #
        # ignore lines that have nothing in them
        if not line or line.isspace():
            continue
        #
        # keep only the characters that fit in the 256-wide encoding; the
        # matrix is sized from what is kept so that no all-zero rows trail
        # the text (pad() adds the empty characters at the front instead)
        codes = np.asarray([ord(ch) for ch in line if ord(ch) < 256], dtype=np.intp)
        temp = np.zeros((len(codes), 256))
        temp[np.arange(len(codes)), codes] = 1
        #
        # take note if this is the longest line we've seen
        if len(codes) > maxSeqLen:
            maxSeqLen = len(codes)
        #
        # remember the line of text and move onto the next key
        data[i] = (classNum, temp)
        i = i + 1
    #
    # and return the dictionary with the new data
    return (maxSeqLen, data)

def pad (maxSeqLen, data):
    """Left-pad every matrix in ``data`` to maxSeqLen characters and transpose it.

    ``data`` uses the same ``{id: (label, matrix)}`` encoding as addToData.
    All-zero rows are placed in front of each matrix so that it has exactly
    maxSeqLen rows; the result is then transposed, so every stored matrix
    ends up with 256 rows and maxSeqLen columns.  The dictionary is updated
    in place and returned.
    """
    for key in data:
        entry = data[key]
        label = entry[0]
        matrix = entry[1]
        #
        # number of empty characters needed in front of this line
        missing = maxSeqLen - matrix.shape[0]
        filler = np.zeros((missing, 256))
        padded = np.concatenate((filler, matrix), axis=0)
        data[key] = (label, padded.T)
    return data

def generateDataRNN (maxSeqLen, data, sampleCount = None):
    """Draw a random batch of padded matrices and labels for RNN training.

    Args:
        maxSeqLen: unused; kept so existing callers do not change.
        data: ``{id: (label, matrix)}`` dictionary.  The keys are assumed to
            be the integers 0 .. len(data) - 1, which is how addToData
            assigns them.
        sampleCount: number of data points to draw (with replacement).
            Defaults to the module-level batchSize.

    Returns:
        ``(x, y)``.  x stacks the sampled matrices along a new first axis,
        so for data that has been through pad() it has dimensions
        [sampleCount, 256, maxSeqLen] and can be split per character with
        tf.unstack(inputX, axis=2).  y is the vector of matching labels.

    Raises:
        ValueError: if data is empty.
    """
    count = batchSize if sampleCount is None else sampleCount
    if not data:
        raise ValueError("cannot sample a batch from an empty data set")
    #
    # the upper bound of randint is exclusive, so len(data) is what lets the
    # last data point be drawn
    myInts = np.random.randint(0, len(data), count)
    #
    # np.stack needs a real sequence (a list), not a generator
    x = np.stack([data[i][1] for i in myInts.flat])
    y = np.array([data[i][0] for i in myInts.flat])
    return (x, y)

def generateDataFeedForward (maxSeqLen, data, sampleCount = None):
    """Draw a random batch of flattened matrices and labels for a feed-forward net.

    Args:
        maxSeqLen: unused; kept so existing callers do not change.
        data: ``{id: (label, matrix)}`` dictionary.  The keys are assumed to
            be the integers 0 .. len(data) - 1, which is how addToData
            assigns them.
        sampleCount: number of data points to draw (with replacement).
            Defaults to the module-level batchSize.

    Returns:
        ``(x, y)``.  Each sampled matrix is flattened to one row of x, so
        for data that has been through pad() x has dimensions
        [sampleCount, 256 * maxSeqLen].  y is the vector of matching labels.

    Raises:
        ValueError: if data is empty.
    """
    count = batchSize if sampleCount is None else sampleCount
    if not data:
        raise ValueError("cannot sample a batch from an empty data set")
    #
    # the upper bound of randint is exclusive, so len(data) is what lets the
    # last data point be drawn
    myInts = np.random.randint(0, len(data), count)
    #
    # np.stack needs a real sequence (a list), not a generator
    x = np.stack([data[i][1].flatten() for i in myInts.flat])
    y = np.array([data[i][0] for i in myInts.flat])
    return (x, y)

# start with an empty data set and no known line length
maxSeqLen = 0
data = {}

# sample 10000 lines from each of the five corpora; a file's position in
# this list is the class label attached to its lines
corpusFiles = [
    "biochemistry_processed.txt",
    "cancerResearch_processed.txt",
    "jama_processed.txt",
    "nature_processed.txt",
    "plosOne_processed.txt",
]
for corpusLabel, corpusFile in enumerate(corpusFiles):
    maxSeqLen, data = addToData (maxSeqLen, data, corpusFile, corpusLabel, 10000)

# put empty characters in front of every entry so that all of the
# sequences have the same length
data = pad (maxSeqLen, data)
        
# now we build the TensorFlow computation... there are two inputs:
# a batch of text lines, shaped [batchSize, 256, maxSeqLen], and a batch
# of integer class labels
inputX = tf.placeholder(tf.float32, [batchSize, 256, maxSeqLen])
inputY = tf.placeholder(tf.int32, [batchSize])

# NOTE(review): this placeholder is never used by the LSTM graph built in
# this cell and is not fed by the training loop -- it looks like a leftover
# from the hand-written RNN version
initialState = tf.placeholder(tf.float32, [batchSize, hiddenUnits])

# NOTE(review): W and b are not referenced by anything below (the LSTM cell
# is built without them) -- confirm whether they can be removed
W = tf.Variable(np.random.normal(0, 0.05, (hiddenUnits + 256, hiddenUnits)), dtype=tf.float32)

# biases for the hidden values (see the note on W)
b = tf.Variable(np.zeros((1, hiddenUnits)), dtype=tf.float32)

# weights and bias for the final classification
W2 = tf.Variable(np.random.normal (0, 0.05, (hiddenUnits, numClasses)),dtype=tf.float32)
b2 = tf.Variable(np.zeros((1,numClasses)), dtype=tf.float32)

# unpack the input sequences so that we have a series of matrices,
# each of which has a one-hot encoding of the current character from
# every input sequence
sequenceOfLetters = tf.unstack(inputX, axis=2)


# run an LSTM cell over the sequence of characters; outputs holds one
# entry per element of sequenceOfLetters
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(hiddenUnits, forget_bias=0.8)
outputs, states = tf.nn.static_rnn(lstm_cell, sequenceOfLetters, dtype=tf.float32)

# compute the class scores (logits) from the output for the last character
logits = tf.matmul(outputs[-1], W2) + b2

# turn the scores into per-class probabilities
predictions = tf.nn.softmax(logits)

# compute the loss: mean cross-entropy between the scores and the labels
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=inputY)
totalLoss = tf.reduce_mean(losses)

# train with Adam; the other optimizers are alternatives left commented out
# trainingAlg = tf.train.GradientDescentOptimizer(learningRate).minimize(totalLoss)
# trainingAlg = tf.train.AdagradOptimizer(learning_rate=learningRate).minimize(totalLoss)
trainingAlg = tf.train.AdamOptimizer(learning_rate=learningRate).minimize(totalLoss)

# and train!!
#
# how many of the final mini-batches are printed, and how many are
# averaged into the closing summary
numToPrint = 20
numToAverage = 10

with tf.Session() as sess:
    #
    # initialize everything
    sess.run(tf.global_variables_initializer())
    #
    # running totals over the last numToAverage mini-batches
    _accLoss = 0.0
    _accCount = 0
    for epoch in range(numTrainingIters):
        #
        # get some data
        x, y = generateDataRNN (maxSeqLen, data)
        #
        # do one training step.  In this cell `outputs` is the full list of
        # per-character LSTM outputs, so the class scores are fetched from
        # `logits` instead; that keeps _outputs small and never copies the
        # whole unrolled sequence out of the session on every step.
        _totalLoss, _trainingAlg, _predictions, _outputs = sess.run(
                [totalLoss, trainingAlg, predictions, logits],
                feed_dict={
                    inputX:x,
                    inputY:y,
                })
        #
        # just FYI, compute the number of correct predictions: the predicted
        # class is the one with the highest probability
        numCorrect = int(np.sum(np.argmax(_predictions, axis=1) == y))
        #
        # print out to the screen
        if epoch >= numTrainingIters - numToPrint:
            print("Step", epoch, "Loss", _totalLoss, "Correct", numCorrect, "out of", batchSize)

        if epoch >= numTrainingIters - numToAverage:
            _accLoss += _totalLoss
            _accCount += numCorrect
    #
    # divide by the number of batches that were actually accumulated, which
    # is smaller than numToAverage when training ran for fewer iterations
    numAveraged = max(1, min(numToAverage, numTrainingIters))
    #
    # report against the real batch size instead of a hard-coded 150
    print("Average loss for the last", numAveraged, "mini-batches is", _accLoss / numAveraged,
          "average correct labels is", _accCount / numAveraged, "out of", str(batchSize) + ".")
    #
    # the label says percent, so scale the fraction of correct labels by 100
    print("Accuracy(%):", 100.0 * _accCount / (numAveraged * batchSize))

Step 9980 Loss 1.1628346 Correct 72 out of 150
Step 9981 Loss 1.1389586 Correct 74 out of 150
Step 9982 Loss 1.2558727 Correct 70 out of 150
Step 9983 Loss 1.1379296 Correct 74 out of 150
Step 9984 Loss 1.0801747 Correct 88 out of 150
Step 9985 Loss 1.1767743 Correct 83 out of 150
Step 9986 Loss 1.1600622 Correct 76 out of 150
Step 9987 Loss 1.1800702 Correct 83 out of 150
Step 9988 Loss 1.1132063 Correct 87 out of 150
Step 9989 Loss 1.085437 Correct 84 out of 150
Step 9990 Loss 1.1571736 Correct 82 out of 150
Step 9991 Loss 1.1632593 Correct 79 out of 150
Step 9992 Loss 1.1279404 Correct 87 out of 150
Step 9993 Loss 1.1347382 Correct 81 out of 150
Step 9994 Loss 1.0850183 Correct 79 out of 150
Step 9995 Loss 1.1645771 Correct 83 out of 150
Step 9996 Loss 1.0962888 Correct 83 out of 150
Step 9997 Loss 1.0850582 Correct 85 out of 150
Step 9998 Loss 1.1552424 Correct 81 out of 150
Step 9999 Loss 1.2171291 Correct 67 out of 150
Average loss for the last 10 mini-batches is 1.13864254951477