## Load Data

In [8]:
from numpy import array
from pickle import dump
from pickle import load
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [9]:
# load doc into memory
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: path of the file to read (opened in text mode, read only).

    Returns:
        The complete text of the file as a ``str``.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    # the context manager closes the handle even if read() raises
    with open(filename, 'r') as file:
        return file.read()

# load text ('atotc.txt' is resolved relative to the working directory)
raw_text = load_doc('atotc.txt')

# optional: keep only the first 10,000 characters for a quick experiment
# NOTE(review): the recorded "Total Sequences: 9917" output further down looks
# like it came from a run with this truncation enabled -- confirm before
# comparing results
# raw_text = raw_text[:10000]

In [10]:
# clean: split on any whitespace and re-join with single spaces, so newlines,
# tabs and repeated spaces all collapse to exactly one space
tokens = raw_text.split()
raw_text = ' '.join(tokens)

In [11]:
# organize into sequences of characters
length = 60  # number of leading characters in every window
sep = 1      # multiplier applied to the window index

# one window per index i: raw_text[sep*i - length : sep*(i + 1)]
# (with sep = 1 that is length + 1 consecutive characters)
sequences = [
    raw_text[(sep * i) - length:(sep * (i + 1))]
    for i in range(length, len(raw_text) // sep)
]
print('Total Sequences: %d' % len(sequences))

Total Sequences: 9917


In [12]:
# save lines to file, one entry per line
def save_doc(lines, filename):
    """Write ``lines`` to ``filename``, separated by newline characters.

    Args:
        lines: iterable of strings; each becomes one line of the file.
        filename: path of the file to create or overwrite.

    Raises:
        OSError: if the file cannot be opened or written.
        TypeError: if an element of ``lines`` is not a string.
    """
    # no trailing newline is written after the last entry
    data = '\n'.join(lines)
    # the context manager closes (and flushes) the handle even if write() raises
    with open(filename, 'w') as file:
        file.write(data)
    
# save sequences to file
# (written to the working directory; the training section reads back a file
# with this same name)
out_filename = 'char_sequences.txt'
save_doc(sequences, out_filename)

## Train Language Model

### Load Data

In [13]:
# load the saved character sequences
in_filename = 'char_sequences.txt'
# note: this rebinds raw_text to the contents of the sequence file
raw_text = load_doc(in_filename)
# split the file back into one sequence per entry
lines = raw_text.split('\n')

### Encode Sequences

In [14]:
# every distinct character of the loaded file, in sorted order
# NOTE: raw_text is the unsplit file contents, so any '\n' line separators in
# it receive an index here as well
chars = sorted(set(raw_text))
# character -> integer index lookup
mapping = {char: index for index, char in enumerate(chars)}

In [15]:
# integer encode every line, character by character, using the mapping
sequences = [[mapping[char] for char in line] for line in lines]

In [16]:
# vocabulary size: number of entries in the character mapping; used below as
# the one hot width and as the size of the softmax output layer
vocab_size = len(mapping)
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 60


In [17]:
# convert the encoded lines to an array; the column slicing below needs it to
# be 2-D, i.e. every line must have the same length
sequences = array(sequences)
# every column except the last is model input ...
X = sequences[:, :-1]
# ... and the last column is the character to predict
y = sequences[:, -1]

In [18]:
# one hot encode each input sequence
# NOTE: this cell rebinds X and y, so re-running it on its own would one hot
# encode the already encoded arrays; re-run the previous cell first
sequences = [to_categorical(x, num_classes=vocab_size) for x in X]
X = array(sequences)
# one hot encode the target characters
y = to_categorical(y, num_classes=vocab_size)

## Fit Model

In [19]:
# define model: a single LSTM layer feeding a softmax over the vocabulary
model = Sequential()
# input shape is taken from the one hot encoded X: (X.shape[1], X.shape[2])
model.add(LSTM(75, input_shape=(X.shape[1], X.shape[2])))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so it is not wrapped in
# print(), which would add a stray "None" line to the cell output
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 75)                40800     
_________________________________________________________________
dense_1 (Dense)              (None, 60)                4560      
Total params: 45,360
Trainable params: 45,360
Non-trainable params: 0
_________________________________________________________________
None


In [20]:
# compile model: cross-entropy over the one hot targets, Adam optimizer,
# accuracy reported alongside the loss
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# fit model on the full data set; the call is left as the last expression so
# the returned History object is shown as the cell result
model.fit(X, y, epochs=100, verbose=2)

Instructions for updating:
Use tf.cast instead.
Epoch 1/100
 - 21s - loss: 3.0531 - acc: 0.1759
Epoch 2/100
 - 14s - loss: 2.7876 - acc: 0.2504
Epoch 3/100
 - 17s - loss: 2.5880 - acc: 0.3004
Epoch 4/100
 - 14s - loss: 2.4751 - acc: 0.3196
Epoch 5/100
 - 14s - loss: 2.3944 - acc: 0.3325
Epoch 6/100
 - 16s - loss: 2.3368 - acc: 0.3377
Epoch 7/100
 - 20s - loss: 2.2864 - acc: 0.3534
Epoch 8/100
 - 20s - loss: 2.2470 - acc: 0.3684
Epoch 9/100
 - 15s - loss: 2.2117 - acc: 0.3718
Epoch 10/100
 - 14s - loss: 2.1804 - acc: 0.3792
Epoch 11/100
 - 16s - loss: 2.1533 - acc: 0.3866
Epoch 12/100
 - 14s - loss: 2.1263 - acc: 0.3919
Epoch 13/100
 - 15s - loss: 2.1030 - acc: 0.3959
Epoch 14/100
 - 16s - loss: 2.0827 - acc: 0.3993
Epoch 15/100
 - 14s - loss: 2.0596 - acc: 0.4052
Epoch 16/100
 - 13s - loss: 2.0404 - acc: 0.4085
Epoch 17/100
 - 14s - loss: 2.0204 - acc: 0.4127
Epoch 18/100
 - 13s - loss: 2.0014 - acc: 0.4208
Epoch 19/100
 - 14s - loss: 1.9833 - acc: 0.4263
Epoch 20/100
 - 14s - loss: 1.

<keras.callbacks.History at 0x15d509e8>

## Save Model

In [27]:
# save the model to file
model.save('model.h5')

# save the mapping; the with-block closes (and therefore flushes) the file
# instead of leaving the handle open until garbage collection
with open('mapping.pkl', 'wb') as mapping_file:
    dump(mapping, mapping_file)

## Generate Text

In [22]:
# load the model
model = load_model('model.h5')

# load the mapping; the with-block closes the file handle after reading
# NOTE: unpickling can execute arbitrary code -- only load a mapping.pkl that
# was written by this notebook or another trusted source
with open('mapping.pkl', 'rb') as mapping_file:
    mapping = load(mapping_file)

In [23]:
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
    """Extend ``seed_text`` with characters predicted one at a time by ``model``.

    Args:
        model: object whose ``predict_classes(x, verbose=0)`` returns an
            array holding the predicted class index for each input row.
        mapping: dict of character -> integer index.
        seq_length: length the encoded text is padded or front-truncated to
            before it is passed to the model.
        seed_text: starting text; every character must be a key of ``mapping``.
        n_chars: number of prediction steps to run.

    Returns:
        ``seed_text`` followed by the generated characters. A predicted index
        with no entry in ``mapping`` contributes no character.

    Raises:
        KeyError: if the text contains a character that is not in ``mapping``.
    """
    # reverse lookup built once instead of scanning the mapping on every step;
    # setdefault keeps the first character seen for an index, matching a
    # first-match scan over mapping.items()
    index_to_char = {}
    for char, index in mapping.items():
        index_to_char.setdefault(index, char)

    in_text = seed_text
    # generate a fixed number of characters
    for _ in range(n_chars):
        # encode the characters as integers
        encoded = [mapping[char] for char in in_text]
        # pad short input / drop the oldest characters of long input
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # one hot encode
        encoded = to_categorical(encoded, num_classes=len(mapping))
        # predict character; a single row goes in, so take the single result
        yhat = int(model.predict_classes(encoded, verbose=0)[0])
        # reverse map integer to character ('' when the index is unknown)
        out_char = index_to_char.get(yhat, '')
        # append the predicted character (not a leftover loop variable)
        in_text += out_char
    return in_text

In [26]:
# (seed text, number of characters to generate) for each sample:
#   1. the start of a line
#   2. a seed that stops mid-line
#   3. a seed that is not in the original text
for seed, n_generated in (
    ('It was', length),
    ('It was the best of time', length),
    ('It was the best of tim ', 200),
):
    print(generate_seq(model, mapping, length, seed, n_generated))

It was stall, were goush coussengers of the sobles, and the mail, 
It was the best of times, it was the spasting in the larsess, it was the larsengers
It was the best of tim of the checkels of the soods, and the mail, as the spaster on the sublerst of the stonds of the sobles, and the mail, wes guand and a purdering and seventlers, and the surters, and the eachman, we hes
