In [13]:
# lets define some recursive sequences
import numpy as np
import matplotlib.pyplot as plt

# read in the text, transforming everything to lower case
text = open('datasets/holmes.txt').read().lower()
print('our original text has ' + str(len(text)) + ' characters')

### find and replace '\n' and '\r' symbols - replacing them 
text = text[1302:]
text = text.replace('\n',' ')    # replacing '\n' with '' simply removes the sequence
text = text.replace('\r',' ')

# find all unique characters in the text
a = list(set(text))

# remove as many non-english characters and character sequences as you can 
non_english = ['\xa8', '\xa9', '"', '%', '$', "'", '&', ')', '(', '*', '-', '/', '1', '0', '3', '2', '5', '4', '7', '6', '9', '8', '?', '@', '\xc3', '\xa0', '\xa2']
for i in non_english:
    text = text.replace(i,'')
text = text.replace('  ',' ')

# count the number of unique characters in the text
chars = sorted(list(set(text)))

# print some of the text, as well as statistics
print ("this corpus has " +  str(len(text)) + " total number of characters")
print ("this corpus has " +  str(len(chars)) + " unique characters")

### TODO: fill out the function below that transforms the input text and window-size into a set of input/output pairs for use with our RNN model
def window_transform_series(text,window_size,step_size):
    # containers for input/output pairs
    inputs = []
    outputs = []
    
    # window data
    count = 0
    for t in range(0,len(text) - window_size,step_size):
        # get input sequence
        temp_in = text[t:t + window_size]
        inputs.append(temp_in)
        
        # get corresponding target
        temp_target = text[t + window_size]
        outputs.append(temp_target)
    
    return inputs,outputs

# run your text window-ing function 
window_size = 100
step_size = 5
inputs, outputs = window_transform_series(text,window_size,step_size)

# print out a few of the input/output pairs to verify that we've made the right kind of stuff to learn from
print('input = ' + inputs[0])
print('output = ' + outputs[0])
print('--------------')
print('input = ' + inputs[100])
print('output = ' + outputs[100])

# print out the number of unique characters in the dataset
chars = sorted(list(set(text)))
print ("this corpus has " +  str(len(chars)) + " unique characters")

# this dictionary is a function mapping each unique character to a unique integer
chars_to_indices = dict((c, i) for i, c in enumerate(chars))  # map each unique character to unique integer

# this dictionary is a function mapping each unique integer back to a unique character
indices_to_chars = dict((i, c) for i, c in enumerate(chars))  # map each unique integer back to unique character

# transform character-based input/output into equivalent numerical versions
def encode_io_pairs(text,window_size,step_size):
    # number of unique chars
    chars = sorted(list(set(text)))
    num_chars = len(chars)
    
    # cut up text into character input/output pairs
    inputs, outputs = window_transform_series(text,window_size,step_size)
    
    # create empty vessels for one-hot encoded input/output
    X = np.zeros((len(inputs), window_size, num_chars), dtype=np.bool)
    y = np.zeros((len(inputs), num_chars), dtype=np.bool)
    
    # loop over inputs/outputs and tranform and store in X/y
    for i, sentence in enumerate(inputs):
        for t, char in enumerate(sentence):
            X[i, t, chars_to_indices[char]] = 1
        y[i, chars_to_indices[outputs[i]]] = 1
        
    return X,y

print inputs[0]
print '-------------'
print outputs[1]

# use your function
window_size = 50
step_size = 5
X,y = encode_io_pairs(text,window_size,step_size)

### necessary functions from the keras library
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import keras
import random

model = Sequential()
model.add(Dense(128, input_shape=(window_size,len(chars)), activation='linear'))
model.add(Dense(len(chars)))
optimizer = keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

our original text has 594933 characters


input =  i have seldom heard him mention her under any other name. in his eyes she eclipses and predominates
output =  
--------------
input = h a gibe and a sneer. they were admirable things for the observerexcellent for drawing the veil from
output =  


this corpus has 32 unique characters


 i have seldom heard him mention her under any other name. in his eyes she eclipses and predominates
-------------
w


In [39]:
print np.shape(X)
print np.shape(y)

(114218, 50, 32)
(114218, 32)


In [45]:
# a small subset of our input/output pairs
Xsmall = X[:1000,:,:]
ysmall = y[:1000,:]

In [46]:
print np.shape(Xsmall)
print np.shape(ysmall)

(1000, 50, 32)
(1000, 32)


In [47]:
# train the model
model.fit(Xsmall, ysmall, batch_size=100, nb_epoch=20,verbose = 1)

ValueError: Error when checking model target: expected dense_7 to have 3 dimensions, but got array with shape (1000, 32)

In [None]:
# function that uses trained model to predict a desired number of future characters
def predict_next_chars(model,input_chars,num_to_predict):     
    # create output
    predicted_chars = ''
    for i in range(num_to_predict):
        # convert this round's predicted characters to numerical input    
        x_test = np.zeros((1, window_size, len(chars)))
        for t, char in enumerate(input_chars):
            x_test[0, t, chars_to_indices[char]] = 1.

        # make this round's prediction
        test_predict = model.predict(x_test,verbose = 0)[0]

        # translate numerical prediction back to characters
        r = np.argmax(test_predict)                           # predict class of each test input
        d = indices_to_chars[r] 

        # update predicted_chars and input
        predicted_chars+=d
        input_chars+=d
        input_chars = input_chars[1:]
    return predicted_chars

In [None]:
# TODO: choose an input sequence and use the prediction function in the previous Python cell to predict 100 characters following it

# get an appropriately sized chunk of characters from the text
start_inds = [10,100,200]
for s in start_inds:
    start_index = s
    input_chars = text[start_index: start_index + window_size]

    # use the prediction function
    predict_input = predict_next_chars(model,input_chars,num_to_predict = 100)

    # print out input characters
    print('------------------')
    input_line = 'input chars = ' + '\n' +  input_chars + '"' + '\n'
    print(input_line)

    # print out predicted characters
    line = 'predicted chars = ' + '\n' +  predict_input + '"' + '\n'
    print(line)