In [1]:
import warnings
# Silence every Python warning for the rest of the session. This call is placed
# before the imports below, so it also applies to warnings raised while they load.
warnings.filterwarnings('ignore')

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import tensorflow as tf
from tensorflow import keras

# This notebook requires TensorFlow 2.x

In [2]:
# Show the installed TensorFlow version string.
print(tf.version.VERSION)

2.0.0


### Descargar extracto de Shakespeare

In [3]:
# Plain-text Shakespeare excerpt hosted in the karpathy/char-rnn repository.
shakespeare_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# get_file fetches the URL and returns the path of the local copy.
filepath = keras.utils.get_file("shakespeare.txt", shakespeare_url)
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, so the same file could be read differently
# depending on the machine the notebook runs on.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

# Preview the first 148 characters of the corpus.
print(shakespeare_text[:148])

Downloading data from https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?



### Set de caracteres en la obra (FYI):

In [4]:
# Distinct characters of the lower-cased corpus, sorted and joined into a
# single string so the whole character set can be read at a glance.
"".join(sorted({char for char in shakespeare_text.lower()}))

"\n !$&',-.3:;?abcdefghijklmnopqrstuvwxyz"

### Tokenización del texto con Keras

In [5]:
# Character-level tokenizer (char_level=True): tokens are single characters.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
# NOTE(review): the raw string is passed here, not a list of texts. Keep this
# argument form: a later cell reads tokenizer.document_count as the total number
# of characters, which presumably relies on every character being counted as
# one "document" -- confirm against the Keras Tokenizer docs before changing it.
tokenizer.fit_on_texts(shakespeare_text)

# Try the tokenizer on a sample word; the cell displays the resulting id sequence.
tokenizer.texts_to_sequences(["Speak"])

[[8, 23, 2, 5, 25]]

In [6]:
# The id sequence can be mapped back to text (inverse of the previous cell).
tokenizer.sequences_to_texts(sequences=[[8, 23, 2, 5, 25]])

['s p e a k']

In [7]:
# max_id       -> number of distinct characters (size of the tokenizer's word index)
# dataset_size -> total number of characters, taken from the tokenizer's document count
max_id, dataset_size = len(tokenizer.word_index), tokenizer.document_count

print("Elementos distintos", max_id)
print("Cantidad de elementos", dataset_size)

Elementos distintos 39
Cantidad de elementos 1115394


### Creamos el Train-Set usando las Funciones de NLP (ver Notebook #1)

In [9]:
# Encode the WHOLE text as a single sequence of character ids. The text is passed
# as a one-element list, so row 0 of the result is the full sequence; every id is
# shifted down by one (the ids are later one-hot encoded with depth=max_id).
encoded = np.array(tokenizer.texts_to_sequences([shakespeare_text]))[0] - 1

# The training set is the first 90% of the encoded sequence.
train_size = (dataset_size * 90) // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [10]:
# Each training string is 100 characters long; consecutive windows are shifted
# by one character.
n_steps = 100
window_length = n_steps + 1  # window size: n_steps inputs plus one extra character

# window() yields nested datasets; flat_map + batch(window_length) flattens each
# one into a single tensor of window_length ids.
dataset = (
    dataset.repeat()
    .window(window_length, shift=1, drop_remainder=True)
    .flat_map(lambda window: window.batch(window_length))
)

In [11]:
# Build x_train / y_train with the same recipe used in notebook #1.
np.random.seed(42)
tf.random.set_seed(42)

# Each batch holds 32 windows.
batch_size = 32
dataset = (
    dataset.shuffle(10000)
    .batch(batch_size)
    # Inputs: every character of the window but the last, one-hot encoded with
    # depth max_id. Targets: the same window shifted by one character, kept as ids.
    .map(lambda windows: (tf.one_hot(windows[:, :-1], depth=max_id),
                          windows[:, 1:]))
    .prefetch(1)
)

# Sanity check: print the shapes of one generated batch.
for x, y in dataset.take(1):
    print(x.shape, y.shape)

(32, 100, 39) (32, 100)


### Modelo RNN-GRU y Entrenamiento

In [12]:
# Architecture: two stacked GRU layers that return the full output sequence,
# followed by a per-time-step softmax over the max_id character classes.
model = keras.models.Sequential()
model.add(keras.layers.GRU(128, return_sequences=True,
                           input_shape=[None, max_id],
                           dropout=0.2, recurrent_dropout=0.2))
model.add(keras.layers.GRU(128, return_sequences=True,
                           dropout=0.2, recurrent_dropout=0.2))
model.add(keras.layers.TimeDistributed(
    keras.layers.Dense(max_id, activation="softmax")))

# Compile and train. The targets are integer character ids (only the inputs were
# one-hot encoded), hence the sparse categorical cross-entropy loss.
# steps_per_epoch is given explicitly because the dataset was built with repeat().
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
history = model.fit(dataset,
                    steps_per_epoch=train_size // batch_size,
                    epochs=10)

Train for 31370 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [13]:
# Save the trained model to an HDF5 file in the current working directory.
model.save(filepath="shakespeare-stateless.h5")