In [24]:
try:
    %tensorflow_version 2.x
    %load_ext tensorboard
except Exception:
    pass
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os, random
import re

tf.random.set_seed(123)
np.random.seed(123)

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [2]:
def load_corpus(path):
  """
  path: Path relativo a este script
  Esta función carga el corpus para su procesamiento y lectura en la red neuronal
  """
  with open(path) as f:
    corpus = f.readlines()
    corpus = ' '.join(corpus)
  return corpus

In [3]:
def vectorized_text(corpus):
  """
  corpus: str, texto completo a usar para entrenar el modelo.
  Función que recibe un texto y lo devuelve en una representación númerica.
  """
  
  #Vocabulario, caracteres unicos que aparecen en el corpus
  vocab = sorted(np.unique(list(set(corpus))))
  #Asignar un valor numerico a cada caracter
  char2idx = {u:i for i, u in enumerate(vocab)}
  #letra asignada al vocab
  idx2char = np.array(vocab)

  text_as_int = np.array([char2idx[c] for c in corpus])

  return text_as_int, idx2char

In [4]:
def split_input_target(chunk):
  """
  Variable objetivo a predecir
  """
  input_text = chunk[:-1]
  target_text = chunk[1:]
  return input_text, target_text

In [5]:
def dataset_preparation(text_as_int, len_corpus, seq_length=100, buffer_size = 10000, batch_size = 64):
  """
  Función que devuelve el numero de ejemplos por epoch y el dataset procesado
  """
  examples_per_epoch = len_corpus // (seq_length + 1) 
  char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
  sequences = char_dataset.batch(seq_length+1, drop_remainder=True)
  dataset = sequences.map(split_input_target).shuffle(buffer_size).batch(batch_size, drop_remainder=True)
  return examples_per_epoch, dataset

In [6]:
def train_test_split(dataset):
  """
  Separación en train y split
  """
  test_dataset = dataset.take(int(len(list(dataset))*0.2))
  train_dataset = dataset.skip(int(len(list(dataset))*0.2))
  return (test_dataset, train_dataset)

In [7]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

In [8]:
def model_definition(vocab, embedding_dim = 256, rnn_units = 1024, buffer_size = 10000, batch_size = 64, recurrent_initializer = 'glorot_uniform', dense_dim = 128, activation = "relu", dropout = 0.2):
  """
  Definición del modelo LSTM que incluye una capa de embedding, la capa LSTM, 2 capas densas y un Dropout
  """
  model = tf.keras.Sequential([tf.keras.layers.Embedding(len(vocab), embedding_dim,
                                                            batch_input_shape=[batch_size, None]),
                                  tf.keras.layers.LSTM(rnn_units,
                                                       return_sequences=True, #este argumento hace que el modelo sea many-to-many
                                                       stateful=True,
                                                       recurrent_initializer = recurrent_initializer),
                                  tf.keras.layers.Dense(dense_dim,
                                                        activation="relu"),
                                  tf.keras.layers.Dropout(dropout),
                                  tf.keras.layers.Dense(len(vocab))])
  return model

In [9]:
def compile_model(model, optimizer, loss, metrics):
  """
  Función para compilar el modelo
  """
  model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [10]:
def fit_model(model, train_dataset, test_dataset, epochs, examples_per_epoch, validation_steps, batch_size=64, saving_path = 'lstm.h5'):
  """
  Función de entrenamiento del modelo
  """
  checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=saving_path,
    save_weights_only=True, save_best_only = True)
  model.fit(train_dataset.repeat(), validation_data = test_dataset.repeat(), epochs=epochs, callbacks=[checkpoint_callback], 
                 steps_per_epoch=examples_per_epoch//batch_size, validation_steps = validation_steps)

In [66]:
def generate_text(model, start_string, vocab, text_len=500, temperature=0.5):

    """
    Función que genera texto de manera automática a partir de predecir el siguiente caracter mas probable
    recibe como input el modelo, un texto plano, un vocabulario y una temperatura para ajustar el resultado.
    """


    # vectorizamos el string inicial
    char2idx = {u:i for i, u in enumerate(vocab)}
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Lista para guardar los resultados
    text_generated = []

    # Reiniciamos los estados del modelo
    model.reset_states()
    # iteramos para obtener el número de carácteres deseado
    for i in range(text_len):
        # obtenemos las predicciones
        predictions = model(input_eval)
        # removemos el eje de los batch
        predictions = tf.squeeze(predictions, 0)

        # utilizamos la distribución categorica para obtener el siguiente caracter
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
        # predicted_id es el caracter predicho (este será la entrada en la siguiente iteración)
        input_eval = tf.expand_dims([predicted_id], 0)
        # agregamos el string correspondiente al id predicho
        text_generated.append(idx2char[predicted_id])
    return (start_string + ''.join(text_generated))

In [59]:
def export_generated_lyrics(path, generated_lyrics):
  with open(path, "w") as f:
    writer = f.write(generated_lyrics)

In [46]:
corpus = load_corpus('corpus.txt')

In [47]:
corpus = re.sub(' +', ' ',re.sub(r"\xa0|\u2005|\u200a|\u205f|ah|Oh|hey|yeah", " ", corpus))

In [48]:
text_as_int, idx2char = vectorized_text(corpus)

In [49]:
examples_per_epoch, dataset =  dataset_preparation(text_as_int = text_as_int, len_corpus = len(corpus))

In [50]:
test_dataset, train_dataset = train_test_split(dataset)

In [51]:
model = model_definition(vocab = idx2char)

In [52]:
compile_model(model, optimizer = "adam", loss = loss, metrics = 'accuracy')

In [18]:
fit_model(model, train_dataset, test_dataset, 1, examples_per_epoch, validation_steps = 48)



In [53]:
model_lstm = model_definition(vocab = idx2char, batch_size= 1)

In [54]:
model_lstm.load_weights("lstm.h5")

In [74]:
random_lyrics =generate_text(model = model_lstm, start_string= "This is the story of a man who", vocab = idx2char, text_len = 1200, temperature = 0.65)

In [75]:
print(random_lyrics)

This is the story of a man who so bridges
And drop a hotel brand new
Stop this life you're terisa a couple of pain life, pass you, but you're hard to love

And I don't know why your world was living in
I'll showed you my life
I'd like to close my eyes, I'll be there to short the risk
Let yourself go, let your mind as a dream won't leave my life to me what you gave me
I said, Curound down in the summer plastic feels like a nesplayer
So when I rat it all
Yes we all along
The father of falling apart
I won't let you sleep
I can't do this anymore
What are you waiting for
Aren't you tired of all of the violence inside of you
Handsome home
Home, one last time

I need something to love, you can run with me
The moment you forgive me
No one understand
Girl, you and I will die unbelievers
Bound to the top
You're gonna stand my grip, but my shadow on me
The sky is a neighborhood 
 
 
 
 
 Show me the dirt pile and I will give you my fan you're all worth it
For you to find something to be patient
T

In [76]:
export_generated_lyrics("generated_lyrics.txt", random_lyrics)