In [2]:
import numpy as np

# Load the raw book text. The encoding is given explicitly so the result does
# not depend on the platform's locale default.
# NOTE(review): assumes the file is UTF-8 (usual for Project Gutenberg "-0.txt"
# downloads) -- confirm against the actual file.
with open('../../1268-0.txt', 'r', encoding='utf-8') as fp:
    text = fp.read()

# Keep only the span between the title marker and the Gutenberg footer marker.
start_marker = 'THE MYSTERIOUS ISLAND'
end_marker = 'End of the Project Gutenberg'
start_index = text.find(start_marker)
end_index = text.find(end_marker)
# str.find returns -1 when the marker is absent; slicing with -1 would silently
# keep the wrong span, so fail loudly instead.
if start_index == -1 or end_index == -1:
    raise ValueError(
        'Could not locate the start/end markers in the text '
        '(start_index={}, end_index={})'.format(start_index, end_index))
text = text[start_index:end_index]

# Set of distinct characters occurring in the trimmed text.
char_set = set(text)
print('Total length: ', len(text))

Total length:  1112350


In [4]:
# Report how many distinct characters the trimmed text contains.
unique_char_count = len(char_set)
print('Unique Characters: ', unique_char_count)

Unique Characters:  80


In [6]:
# Vocabulary lookups: sorted characters, a char -> index dict, and an array
# that maps an index back to its character.
chars_sorted = sorted(char_set)
char2int = dict(zip(chars_sorted, range(len(chars_sorted))))
char_array = np.array(chars_sorted)

# Encode the whole text as one int32 index per character.
text_encoded = np.array(list(map(char2int.__getitem__, text)),
                        dtype=np.int32)
print('Text encoded shape: ', text_encoded.shape)

Text encoded shape:  (1112350,)


In [7]:
# Show the first 15 characters next to their integer codes.
preview_len = 15
print(text[:preview_len], '--encoding---> ', text_encoded[:preview_len])

THE MYSTERIOUS  --encoding--->  [44 32 29  1 37 48 43 44 29 42 33 39 45 43  1]


In [9]:
# Decode a short slice of codes back to characters via the index -> char array.
sample_codes = text_encoded[15:21]
print(sample_codes, '---reverse---> ', ''.join(char_array[sample_codes]))

[33 43 36 25 38 28] ---reverse--->  ISLAND


In [11]:
# Wrap the encoded text in a tf.data.Dataset with one element per character code.
import tensorflow as tf
ds_text_encoded = tf.data.Dataset.from_tensor_slices(text_encoded)

# Sanity check: print the first five codes with the characters they map to.
for ex in ds_text_encoded.take(5):
    code = ex.numpy()
    print(f'{code} -> {char_array[code]}')

44 -> T
32 -> H
29 -> E
1 ->  
37 -> M


In [15]:
# Group the character stream into fixed-size chunks of seq_length + 1 codes;
# a trailing chunk shorter than chunk_size is dropped.
seq_length = 40
chunk_size = seq_length + 1
ds_chunks = ds_text_encoded.batch(batch_size=chunk_size,
                                  drop_remainder=True)

# function to split x and y
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is every element except the last; the target is every element
    except the first, so target[i] is the element that follows input[i].
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair.
ds_sequences = ds_chunks.map(map_func=split_input_target)

In [17]:
# Sanity check: decode the first two (input, target) pairs back to text.
for input_codes, target_codes in ds_sequences.take(2):
    input_text = ''.join(char_array[input_codes.numpy()])
    target_text = ''.join(char_array[target_codes.numpy()])
    print('Input, x: ', repr(input_text))
    print('Target, y: ', repr(target_text))

Input, x:  'THE MYSTERIOUS ISLAND ***\n\n\n\n\nProduced b'
Target, y:  'HE MYSTERIOUS ISLAND ***\n\n\n\n\nProduced by'
Input, x:  ' Anthony Matonak, and Trevor Carlson\n\n\n\n'
Target, y:  'Anthony Matonak, and Trevor Carlson\n\n\n\n\n'


In [18]:
# Shuffle the (input, target) pairs, then group them into mini-batches.
BATCH_SIZE = 64
BUFFER_SIZE = 10000
ds_shuffled = ds_sequences.shuffle(buffer_size=BUFFER_SIZE)
ds = ds_shuffled.batch(batch_size=BATCH_SIZE)

In [19]:
# define an RNN model using Keras Sequential class
def build_model(vocab_size, embedding_dim, rnn_units):
    """Build an Embedding -> LSTM -> Dense Keras Sequential model.

    Args:
        vocab_size: size of the embedding input vocabulary and number of
            units in the final Dense layer.
        embedding_dim: size of each embedding vector.
        rnn_units: number of units in the LSTM layer.

    Returns:
        An uncompiled tf.keras.Sequential model. The LSTM returns the full
        output sequence, and the Dense layer has no activation argument.
    """
    layers = tf.keras.layers
    embedding = layers.Embedding(vocab_size, embedding_dim)
    recurrent = layers.LSTM(rnn_units, return_sequences=True)
    projection = layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

In [20]:
# Model hyperparameters.
embedding_dim, rnn_units = 256, 512
# Vocabulary size: one entry per character in char_array.
charset_size = len(char_array)

In [21]:
# Seed TensorFlow's global RNG, then build the model and print its summary.
tf.random.set_seed(1)
model = build_model(charset_size, embedding_dim, rnn_units)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 256)         20480     
_________________________________________________________________
lstm (LSTM)                  (None, None, 512)         1574912   
_________________________________________________________________
dense (Dense)                (None, None, 80)          41040     
Total params: 1,636,432
Trainable params: 1,636,432
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Compile with Adam and a sparse categorical cross-entropy loss that treats the
# model outputs as logits (from_logits=True), then train for 20 epochs.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss_fn)
model.fit(ds, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20