# Training a RNN to learn sequences from "Don Quijote de la Mancha"

## Requirements:
- Tensorflow: here I'm using tensorflow 2.1.0
- Numpy
- Tqdm
- Matplotlib

## Import libraries

In [4]:
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import os

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense
print(f"Tensorflow version: {tf.__version__}")

Tensorflow version: 2.1.0


## Load data

In [24]:
# Read file
with open(os.path.join(r'txt_source', r'el_quijote.txt')) as f:
    txt = f.read()

# Create idx - character converters
idx2char = np.array(sorted(set(txt)))
char2idx = {c:i for i, c in enumerate(idx2char)}

print(f"Found {len(idx2char)} characters")

# Vectorize txt
vect_txt = np.array([char2idx[c] for c in txt])

Found 89 characters


## Batch creation

In [30]:
def get_batch(vect_txt, seq_length, batch_size):
    # Get indexes
    idx = np.random.choice(vect_txt.shape[0] - seq_length - 1, batch_size)
    
    # Create batches
    input_batch = [vect_txt[i:i+seq_length] for i in idx]
    output_batch = [vect_txt[i+1:i+seq_length+1] for i in idx]
    
    # Reshape
    x_batch = np.reshape(input_batch, [batch_size, seq_length])
    y_batch = np.reshape(output_batch, [batch_size, seq_length])
    return x_batch, y_batch

## Define model

In [32]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        # Layer 1: Embedding layer to transform indices into dense vectors 
        #   of a fixed embedding size
        tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),

        # Layer 2: LSTM with `rnn_units` number of units. 
        tf.keras.layers.LSTM(
            rnn_units, 
            return_sequences=True, 
            recurrent_initializer='glorot_uniform',
            recurrent_activation='sigmoid',
            stateful=True,
        ),

        # Layer 3: Dense (fully-connected) layer that transforms the LSTM output
        #   into the vocabulary size. 
        tf.keras.layers.Dense(vocab_size)
    ])

    return model

model = build_model(len(idx2char), embedding_dim=256, rnn_units=1024, batch_size=32)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (32, None, 256)           22784     
_________________________________________________________________
lstm (LSTM)                  (32, None, 1024)          5246976   
_________________________________________________________________
dense (Dense)                (32, None, 89)            91225     
Total params: 5,360,985
Trainable params: 5,360,985
Non-trainable params: 0
_________________________________________________________________


In [42]:
x, y = get_batch(vect_txt, seq_length=100, batch_size=32)
pred = model(x)
print(x.shape, pred.numpy().shape)

# To get the output we don't use argmax (wich apparently may cause the model to be enter a loop),
# instad we sample from the distribution.
sampled_indices = tf.random.categorical(pred[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()
sampled_indices

print("Input: \n", repr("".join(idx2char[x[0]])))
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices])))



InvalidArgumentError: Incompatible shapes: [32,4096] vs. [4,4096] [Op:AddV2] name: sequential_1/lstm_1/add/

## Hyperparameter

In [37]:
# Optimization parameters:
num_training_iterations = 2000  # Increase this to train longer
batch_size = 4  # Experiment between 1 and 64
seq_length = 100  # Experiment between 50 and 500
learning_rate = 5e-3  # Experiment between 1e-5 and 1e-1

# Model parameters: 
vocab_size = len(idx2char)
embedding_dim = 256 
rnn_units = 1024  # Experiment between 1 and 2048

# Checkpoint location: 
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "my_ckpt")

## Training

- Train step

In [38]:
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size)
optimizer = tf.keras.optimizers.Adam(learning_rate)

@tf.function
def train_step(x, y): 
    with tf.GradientTape() as tape:
        y_pred = model(x)
        loss = tf.keras.losses.sparse_categorical_crossentropy(y, y_pred, from_logits=True)
        
        grads = tape.gradient(loss, model.trainable_variables)

        # Apply the gradients with the optimizer
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        return loss

- Train loop

In [None]:
history = []

for iter in tqdm(range(num_training_iterations)):
    # Grab a batch and propagate it through the network
    x_batch, y_batch = get_batch(vect_txt, seq_length, batch_size)
    loss = train_step(x_batch, y_batch)

    # Update the progress bar
    history.append(loss.numpy().mean())
    
    # Update the model with the changed weights!
    if iter % 100 == 0: 
        model.save_weights(checkpoint_prefix)
    
# Save the trained model and the weights
model.save_weights(checkpoint_prefix)

  1%|          | 23/2000 [00:50<1:07:26,  2.05s/it]