# Text generation using Keras and TensorFlow

From: https://machinelearningmastery.com/develop-character-based-neural-language-model-keras/

## Load Data

In [1]:
from numpy import array
from pickle import dump
from pickle import load
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
# load doc into memory
def load_doc(filename):
    """Read a whole text file and return its contents as one string.

    Args:
        filename: Path of the text file to read.

    Returns:
        The complete contents of the file as a str.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # the context manager closes the handle even when read() raises
    with open(filename, 'r') as file:
        return file.read()

In [10]:
# load the corpus and keep only its first 10000 characters
# (presumably to keep the experiment small and training fast)
raw_text = load_doc('austen.txt')[:10000]

In [11]:
# clean: split on any run of whitespace and re-join with single spaces, so
# newlines, tabs and repeated spaces all collapse into one space character
tokens = raw_text.split()
raw_text = ' '.join(tokens)

In [12]:
# organize the text into overlapping character windows; with sep = 1 each
# window holds `length` + 1 characters and starts one character after the
# previous window
length = 60
sep = 1
sequences = [
    raw_text[(sep * start) - length:sep * (start + 1)]
    for start in range(length, len(raw_text) // sep)
]
print('Total Sequences: %d' % len(sequences))

Total Sequences: 9877


In [13]:
# save lines to file, one item per line
def save_doc(lines, filename):
    """Write an iterable of strings to a text file, one item per line.

    The items are joined with a newline character; no trailing newline is
    added after the last item. An existing file is overwritten.

    Args:
        lines: Iterable of str to write.
        filename: Path of the file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    data = '\n'.join(lines)
    # the context manager flushes and closes the handle even when write() raises
    with open(filename, 'w') as file:
        file.write(data)
    
# save sequences to file: each character window becomes one line of
# 'char_sequences.txt'
out_filename = 'char_sequences.txt'
save_doc(sequences, out_filename)

## Train Language Model

### Load Data

In [14]:
# load the saved sequences file
# NOTE: raw_text is rebound here to the full contents of that file; it no
# longer refers to whatever raw_text held before this cell ran
in_filename = 'char_sequences.txt'
raw_text = load_doc(in_filename)
# split the file back into one string per line
lines = raw_text.split('\n')

### Encode Sequences

In [15]:
# the vocabulary is every distinct character of the loaded text in sorted
# order; each character maps to its position in that ordering
# NOTE(review): raw_text is split on '\n' elsewhere, so the newline separator
# presumably ends up in the vocabulary as well -- confirm that is intended
chars = sorted(set(raw_text))
mapping = {char: index for index, char in enumerate(chars)}

In [16]:
# integer encode every line: each character is replaced by its index in
# `mapping`, giving one list of ints per line
sequences = [[mapping[char] for char in line] for line in lines]

In [17]:
# vocabulary size: the number of distinct characters in `mapping`; used below
# as the width of the one-hot vectors and of the model's output layer
vocab_size = len(mapping)
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 58


In [18]:
# stack the encoded sequences into one array (rows are sequences)
sequences = array(sequences)
# every column except the last is model input ...
X = sequences[:, :-1]
# ... and the last column is the character to predict
y = sequences[:, -1]

In [19]:
# one-hot encode each input row and the targets, using vocab_size classes
# NOTE(review): X and y are rebound in place, so re-running this cell without
# first re-running the cell that builds the integer X and y would feed
# already one-hot-encoded data back into to_categorical
sequences = [to_categorical(x, num_classes=vocab_size) for x in X]
X = array(sequences)
y = to_categorical(y, num_classes=vocab_size)

## Fit Model

In [20]:
# define model
model = Sequential()
# one LSTM layer with 75 units; the input shape is taken from the last two
# dimensions of the one-hot encoded X
model.add(LSTM(75, input_shape=(X.shape[1], X.shape[2])))
# softmax output with one unit per vocabulary entry
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the cell output
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 75)                40200     
_________________________________________________________________
dense_1 (Dense)              (None, 58)                4408      
Total params: 44,608
Trainable params: 44,608
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
# compile model: categorical cross-entropy matches the one-hot targets and
# softmax output; accuracy is reported alongside the loss
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model for 100 epochs, printing one line per epoch (verbose=2)
# NOTE(review): no random seed is set in the visible cells, so the numbers
# presumably differ from run to run -- confirm if reproducibility matters
model.fit(X, y, epochs=100, verbose=2)

Instructions for updating:
Use tf.cast instead.
Epoch 1/100
 - 13s - loss: 3.0483 - acc: 0.1704
Epoch 2/100
 - 13s - loss: 2.8159 - acc: 0.2101
Epoch 3/100
 - 14s - loss: 2.6008 - acc: 0.2796
Epoch 4/100
 - 19s - loss: 2.4695 - acc: 0.3082
Epoch 5/100
 - 19s - loss: 2.3837 - acc: 0.3315
Epoch 6/100
 - 16s - loss: 2.3159 - acc: 0.3415
Epoch 7/100
 - 14s - loss: 2.2576 - acc: 0.3583
Epoch 8/100
 - 15s - loss: 2.2061 - acc: 0.3703
Epoch 9/100
 - 14s - loss: 2.1619 - acc: 0.3775
Epoch 10/100
 - 16s - loss: 2.1133 - acc: 0.3932
Epoch 11/100
 - 15s - loss: 2.0716 - acc: 0.4042
Epoch 12/100
 - 18s - loss: 2.0332 - acc: 0.4120
Epoch 13/100
 - 17s - loss: 1.9970 - acc: 0.4191
Epoch 14/100
 - 14s - loss: 1.9659 - acc: 0.4297
Epoch 15/100
 - 14s - loss: 1.9320 - acc: 0.4375
Epoch 16/100
 - 19s - loss: 1.9049 - acc: 0.4458
Epoch 17/100
 - 18s - loss: 1.8780 - acc: 0.4486
Epoch 18/100
 - 15s - loss: 1.8511 - acc: 0.4581
Epoch 19/100
 - 16s - loss: 1.8270 - acc: 0.4673
Epoch 20/100
 - 14s - loss: 1.

<keras.callbacks.History at 0x15c685c0>

## Save Model

In [22]:
# save the model to file
model.save('model.h5')

# save the mapping; the context manager flushes and closes the pickle file
# instead of leaving the handle open until garbage collection
with open('mapping.pkl', 'wb') as mapping_file:
    dump(mapping, mapping_file)

## Generate Text

In [23]:
# load the model
model = load_model('model.h5')

# load the mapping; the context manager closes the file once it has been read
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# come from a trusted source
with open('mapping.pkl', 'rb') as mapping_file:
    mapping = load(mapping_file)

In [24]:
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
    """Extend seed_text by n_chars characters predicted one at a time.

    At every step the text so far is integer encoded, cut or padded to
    seq_length, one-hot encoded and passed to the model; the highest-scoring
    class is mapped back to a character and appended.

    Args:
        model: Object whose predict(batch, verbose=0) returns an array with
            one row of per-class scores for each item in the batch.
        mapping: Dict from character to integer class index.
        seq_length: Sequence length the model input is padded/truncated to.
        seed_text: Starting text; every character must be a key of mapping.
        n_chars: Number of characters to generate.

    Returns:
        seed_text followed by the generated characters.

    Raises:
        KeyError: If the text contains a character that is not in mapping.
    """
    # invert the mapping once instead of scanning it for every generated
    # character; setdefault keeps the first character seen for an index
    index_to_char = {}
    for char, index in mapping.items():
        index_to_char.setdefault(index, char)

    in_text = seed_text
    # generate a fixed number of characters
    for _ in range(n_chars):
        # encode the characters as integers
        encoded = [mapping[char] for char in in_text]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # one hot encode
        encoded = to_categorical(encoded, num_classes=len(mapping))
        # predict character: the arg-max over the class scores replaces
        # predict_classes(), which newer Keras/TensorFlow releases removed
        scores = model.predict(encoded, verbose=0)
        yhat = int(scores.argmax(axis=-1)[0])
        # append the predicted character (nothing if the index is unknown),
        # not a leftover loop variable
        in_text += index_to_char.get(yhat, '')
    return in_text

In [35]:
# seed that presumably comes from the start of the source text -- TODO confirm
print(generate_seq(model, mapping, length, 'The family of Dashwood had lon', length*2))
# seed meant to come from the middle of a line
print(generate_seq(model, mapping, length, 'settled in sussex', length*2))
# seed that is not expected to appear in the source text
print(generate_seq(model, mapping, length, 'hello worl', length*2))

The family of Dashwood had long many prease which had an encunction to his well how on for the sisters, was as secundsine many proprestion were theirs
settled in sussexs, which advantinion, to him entered; and which in his eas, the and in a romish them and then shemself wishly her herste
hello worly, and fare of his wife and sane as a poredanted by his nepenter and chearter and has a trould was arracy on the conditi
