In [5]:
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np

In [6]:
# file from computer
# file_name = input('Insert file name and extension')

# file from google: fetch the corpus with tf.keras.utils.get_file, which returns the local path
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

# Read the raw bytes and decode them as UTF-8 (binary read + explicit decode also works on py2).
# The context manager closes the file handle as soon as the read is done.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

# Text codification: every distinct *character* (not word) is encoded as an integer
vocab = sorted(set(text)) # unique characters, sorted so the ids are deterministic

char2idx = {u:i for i, u in enumerate(vocab)} # character -> integer id
idx2char = np.array(vocab) # integer id -> character (array lookup)

# Map every character of a string to its integer id
# String -> Array<integer>
def text_to_int(text):
    """Encode ``text`` as a NumPy array holding the ``char2idx`` id of each character.

    Raises:
        KeyError: if ``text`` contains a character that is not a key of ``char2idx``.
    """
    return np.array(list(map(char2idx.__getitem__, text)))

# Encode the whole corpus once; the tf.data pipeline is later built from this array
text_as_int = text_to_int(text)

# Decode integers to text
# Array<Integers> -> String
def int_to_text(ints):
    """Decode integer ids back into the string they encode.

    Args:
        ints: integer ids used to index ``idx2char``. If the object exposes a
            ``.numpy()`` method (e.g. an eager TensorFlow tensor) it is
            converted with it first; anything else is used for indexing as is.

    Returns:
        The looked-up characters joined into a single string.
    """
    try:
        ints = ints.numpy()
    except AttributeError:
        # No .numpy() method (e.g. already a NumPy array): index with it directly.
        # Only this case is ignored so unrelated failures are no longer hidden.
        pass
    return ''.join(idx2char[ints])

# Sanity check: encoding and then decoding the first 13 characters should round-trip
print(f"Text: {text[:13]}") # the raw first 13 characters
print(f"Encoded: {text_to_int(text[:13])}") # array with their 13 integer ids
print(f"Decoded: {int_to_text(text_as_int[:13])}") # the same 13 characters, decoded back

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
Text: First Citizen
Encoded: [18 47 56 57 58  1 15 47 58 47 64 43 52]
Decoded: First Citizen


In [7]:
# To train the model, we feed it fixed-length runs of characters from the text.
# e.g. input: characters from index 0 to index 99, target: characters from index 1 to index 100
# e.g. "Hell" -> "ello" (from "Hello"): the target is the input shifted one character ahead
# So each training chunk holds seq_length + 1 characters (input and target overlap).
seq_length = 100 
# Number of whole (seq_length + 1)-character chunks that fit in the text
examples_per_epoch = len(text) // (seq_length + 1)

# Create the training chunks: a stream of single character ids, grouped into
# chunks of seq_length + 1 ids; drop_remainder=True discards a shorter final chunk
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

# Split one chunk into an (input, target) pair offset by one position
# String -> String, String
def split_input_target(chunk):
    """Return ``(chunk[:-1], chunk[1:])``.

    The first element is the chunk without its last item, the second is the
    chunk without its first item, e.g. "Hello" -> ("Hell", "ello"), so every
    input position is paired with the item that follows it.
    """
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target) # turn every chunk into an (input, target) pair

In [9]:
# Parameters
BATCH_SIZE = 64 # sequences per training batch; also the fixed batch size the training model is built with
VOCAB_SIZE = len(vocab) # number of distinct characters in the corpus
EMBEDDING_DIM = 256 # passed to build_model as the embedding dimension
RNN_UNITS = 1024 # passed to build_model as the number of LSTM units
BUFFER_SIZE = 10000 # size of the buffer used by dataset.shuffle

# Shuffle the (input, target) pairs and group them into batches.
# drop_remainder=True keeps every batch at exactly BATCH_SIZE, which the model
# needs because it is built with a fixed batch size (see build_model).
data = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [11]:
# Model building
def build_model(vocab_size, embbedding_dim, rnn_units, batch_size):
    """Assemble the Embedding -> LSTM -> Dense character model.

    Args:
        vocab_size: size of the embedding vocabulary and width of the final Dense layer.
        embbedding_dim: dimension of the embedding vectors.
        rnn_units: number of units in the LSTM layer.
        batch_size: fixed batch size baked into the input shape; the sequence
            length is left as ``None``.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model. The LSTM is stateful and
        returns the full sequence; the Dense layer has no activation.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embbedding_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = tf.keras.layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

# Build the training model with the training batch size and print its layer summary
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, BATCH_SIZE)
model.summary() 

# Shape check: run one batch through the untrained model and inspect the output shape
for input_example_batch, target_example_batch in data.take(1):
    example_batch_predictions = model(input_example_batch) 
    print(example_batch_predictions.shape) # (64, 100, 65)
    # batch_size, sequence_length, vocab_size

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (64, None, 256)           16640     
                                                                 
 lstm_1 (LSTM)               (64, None, 1024)          5246976   
                                                                 
 dense_1 (Dense)             (64, None, 65)            66625     
                                                                 
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________
(64, 100, 65)


In [12]:
# Loss function
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer ``labels`` and ``logits``.

    ``from_logits=True`` tells Keras that ``logits`` are unnormalised scores
    rather than probabilities.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

# Compiling model
model.compile(optimizer='adam', loss=loss)

# Creating Checkpoint
# Directory relative to the working directory. A leading '/' would point at the
# filesystem root, which is normally not writable and is not where the loading
# cell looks for checkpoints.
checkpoint_dir = './training_checkpoints'
# ModelCheckpoint fills in "{epoch}" with the epoch number, giving files named
# ckpt_1, ckpt_2, ... -- the same "ckpt_<n>" naming the loading cell builds.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the weights (not the full model) through the checkpoint callback
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

In [None]:
# Training from scratch
EPOCHS = 40 # number of passes over `data`

# checkpoint_callback writes the model weights to disk while training runs;
# `history` keeps the object returned by fit
history = model.fit(data, epochs=EPOCHS, callbacks=[checkpoint_callback])

In [None]:
# Restoring trained weights from a checkpoint
# The network is rebuilt with batch_size=1 so it can be fed a single sequence at a time
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, batch_size=1)

# Epoch number of the checkpoint to restore; set to None to load the most recent one
checkpoint_num = 10

if checkpoint_num is None:
    # Load the most recent checkpoint
    checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
    if checkpoint_path is None:
        # latest_checkpoint returns None when the directory holds no checkpoint
        raise FileNotFoundError("No checkpoint found in " + repr(checkpoint_dir))
else:
    # Load a specific checkpoint
    checkpoint_path = os.path.join(checkpoint_dir, "ckpt_" + str(checkpoint_num))

# load_weights expects the checkpoint path prefix as a string.
# (tf.train.load_checkpoint returns a reader object, not a path, so it cannot be passed here.)
model.load_weights(checkpoint_path)
model.build(tf.TensorShape([1, None]))

In [None]:
# Generating Text
# Model, String -> String
def generate_text(model, start_string, num_generate=800, temperature=1.0):
    """Generate text one character at a time, starting from ``start_string``.

    Args:
        model: stateful model called on a ``(1, length)`` batch of character
            ids; its states are reset before generation starts.
        start_string: non-empty seed text. Every character must be a key of
            ``char2idx``.
        num_generate: number of characters to generate after the seed.
        temperature: positive divisor applied to the model output before
            sampling. Low temp -> more predictable, high temp -> more surprising.

    Returns:
        ``start_string`` followed by the ``num_generate`` generated characters.

    Raises:
        ValueError: if ``start_string`` is empty or ``temperature`` is not positive.
        KeyError: if ``start_string`` contains a character missing from ``char2idx``.
    """
    # Fail early with a clear message instead of an obscure error inside the model
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be a positive number")

    # Vectorizing string to numbers, then adding a leading batch axis of size 1
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch axis so each row holds the scores for one position
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / temperature
        # Sample one id per position and keep the sample for the last position
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # The sampled character is the only input of the next step; the
        # model is stateful, so earlier context is carried in its state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

# Ask for the seed text interactively, then print the seed plus the generated continuation.
# NOTE(review): input() blocks until the user types something, so this cell
# cannot run unattended (e.g. "Restart & Run All").
inp = input('Type a starting string')
print(generate_text(model, inp))
