# Introduction

The following was inspired by the following articles

- Medium
    - How to build a Recurrent Neural Network in TensorFlow [[1]](https://medium.com/@erikhallstrm/hello-world-rnn-83cd7105b767)[[2]](https://medium.com/@erikhallstrm/tensorflow-rnn-api-2bb31821b185)[[3]](https://medium.com/@erikhallstrm/using-the-tensorflow-lstm-api-3-7-5f2b97ca6b73)[[4]](https://medium.com/@erikhallstrm/using-the-tensorflow-multilayered-lstm-api-f6e7da7bbe40)[[5]](https://medium.com/@erikhallstrm/using-the-dynamicrnn-api-in-tensorflow-7237aba7f7ea)[[6]](https://medium.com/@erikhallstrm/using-the-dropout-api-in-tensorflow-2b2e6561dfeb)  
    - [RNN example by Python](https://towardsdatascience.com/recurrent-neural-networks-by-example-in-python-ffd204f99470)       
- GitRepos
    - [char-rnn-tensorflow](https://github.com/sherjilozair/char-rnn-tensorflow)
- Kaggle Repos
    - [Learn by example RNN/LSTM/GRU time series](https://www.kaggle.com/charel/learn-by-example-rnn-lstm-gru-time-series)
- machinelearningmaster
    - [How to Develop a Character-Based Neural Language Model in Keras](https://machinelearningmastery.com/develop-character-based-neural-language-model-keras/)
    - [Adventures of machine learning](http://adventuresinmachinelearning.com/keras-lstm-tutorial/)
-  Troubleshooting
    - [Input size of the LSTM layer](https://github.com/keras-team/keras/issues/2045)

# Imports

In [1]:
import numpy as np

# load pickle to compress data for faster procesing
from pickle import dump

# load in tensor flow and kera framework
import tensorflow as tf

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, TimeDistributed, Activation

# load in plotting just in case we want to plot something
import matplotlib.pyplot as plt

Using TensorFlow backend.


# Functions

## I/O Stuff

### Reading

In [2]:
def readData(FileDir):
    # Get the database from a list of characters that is found in the input.txt
    # should be simple plain text file
    data = open(FileDir, 'r').read() 
    return data

### Cleaning


In [3]:
def removeRedundantSpacing(data):
    tokens = data.split()
    clean_data = ' '.join(tokens)
    return clean_data

### Encode Sequences
The sequences of characters must be encoded as integers. This means that each unique character will be assigned a specific integer value and each sequence of characters will be encoded as a sequence of integers.

In [4]:
def generateSequence(data):
    
    # We can create the mapping given a sorted set of unique characters in the raw input data. 
    # The mapping is a dictionary of character values to integer values.
    chars = sorted(list(set(data)))
    mapping = dict((c, i) for i, c in enumerate(chars))
    
    #Next, we can process each sequence of characters one at a time and use the dictionary mapping 
    # to look up the integer value for each character.
    sequences = list()
    encoded_seq = [mapping[char] for char in data]        
    
    vocab_size = len(mapping)
    
    return encoded_seq,vocab_size

In [5]:
def generateSequenceLines(data,numTimeSteps):
    
    lines = data.split('\n')
    
    # We can create the mapping given a sorted set of unique characters in the raw input data. 
    # The mapping is a dictionary of character values to integer values.
    chars = sorted(list(set(data)))
    mapping = dict((c, i) for i, c in enumerate(chars))
    
    #Next, we can process each sequence of characters one at a time and use the dictionary mapping 
    # to look up the integer value for each character.
    sequences = list()
    timeStepTick = 0
    for line in lines:
        # integer encode line
        encoded_seq = [mapping[char] for char in line]
        # store
        sequences.append(encoded_seq)
    
    vocab_size = len(mapping)
    
    return sequences,vocab_size

# Parameters

## Constants and Model Parameters

In [6]:
lines = False

## Hyperparameters

In [7]:
numOfHiddenStates = 100

# Main Run

In [8]:
# ----------------------------------  I/O
# Read and clean the data
fileDir = 'data/rnn/input.txt'
data = readData(fileDir)

# Sequence the data
dataSeq,vocab_size = generateSequence(data)

# Get x and y 
# NOTE: x is the data and y is the future so it is just a lag basically
X, y = dataSeq[:-1], dataSeq[1:]

# One hot encode the data
sequences = [to_categorical(x, num_classes=vocab_size) for x in X]
X = np.array(sequences)
y = to_categorical(y, num_classes=vocab_size)

# # Model Generation
model = Sequential()
model.add(LSTM(numOfHiddenStates, input_shape=(np.shape(X)[0], np.shape(X)[1])))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 100)               57200     
_________________________________________________________________
dense_1 (Dense)              (None, 42)                4242      
Total params: 61,442
Trainable params: 61,442
Non-trainable params: 0
_________________________________________________________________
None


In [9]:
np.shape(X)

(2420, 42)

In [None]:
# Compile and fit the model
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, batch_size=512, epochs=10)

# Full

In [None]:
fileDir = 'data/rnn/input.txt'

In [None]:
from numpy import array
from pickle import dump
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# load
in_filename = 'char_sequences.txt'
raw_text = load_doc(fileDir)
lines = raw_text.split('\n')

# integer encode sequences of characters
chars = sorted(list(set(raw_text)))
mapping = dict((c, i) for i, c in enumerate(chars))
sequences = list()
for line in lines:
	# integer encode line
	encoded_seq = [mapping[char] for char in line]
	# store
	sequences.append(encoded_seq)

# vocabulary size
vocab_size = len(mapping)
print('Vocabulary Size: %d' % vocab_size)

# separate into input and output
sequences = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
sequences = [to_categorical(x, num_classes=vocab_size) for x in X]
X = array(sequences)
y = to_categorical(y, num_classes=vocab_size)

# define model
model = Sequential()
model.add(LSTM(75, input_shape=(X.shape[1], X.shape[2])))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, epochs=100, verbose=2)

# save the model to file
model.save('model.h5')
# save the mapping
dump(mapping, open('mapping.pkl', 'wb'))

In [None]:
X.shape[1]

In [None]:
sequences[0,1]