### Info 💼

*Dataset*    
https://www.kaggle.com/andreamorgar/spanish-poetry-dataset

<br>

*Links to check*  
https://www.kaggle.com/shivamb/beginners-guide-to-text-generation-using-lstms  
https://www.tensorflow.org/tutorials/text/text_generation  
https://towardsdatascience.com/creating-poems-from-ones-own-poems-neural-networks-and-life-paradoxes-a9cffd2b07e3  
https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/  

### Text Preprocessing 📖



In [None]:
from google.colab import drive

# Mount the user's Google Drive at /content/gdrive so that files stored
# there can be read by the cells below (Colab-only).
drive.mount('/content/gdrive')

In [None]:
import numpy as np
import pandas as pd

# Load the poems CSV from the mounted Google Drive.
# The path is absolute so it matches the mount point used by drive.mount()
# ('/content/gdrive') regardless of the notebook's current working directory;
# the previous './gdrive/...' form only resolved when the cwd was '/content'.
table = pd.read_csv('/content/gdrive/MyDrive/ColabNotebooks/Text/PoetryGenerator/poems.csv')
# Show the first rows as a quick sanity check of the loaded table.
table.head()

In [None]:
# Extract the 'content' column as a plain Python list, one entry per row.
poems = table['content'].to_list()
# Displayed by the notebook: number of entries in the list.
len(poems)

In [None]:
# Preview the first 600 characters of the first poem.
print(poems[0][:600])

In [None]:
# Coerce every entry to str so later text processing is uniform.
# Missing values (NaN/None) are dropped first: str(nan) would otherwise inject
# the literal text "nan" into the training corpus.
poems = [str(poem) for poem in poems if pd.notna(poem)]

In [None]:
import unicodedata
import re
def strip_accents(s):
    """Return *s* with its combining diacritical marks removed.

    The string is decomposed with Unicode NFD normalization, which splits an
    accented character into a base character followed by combining marks;
    every character whose Unicode category is 'Mn' is then discarded.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

In [None]:
# Delete tabs, a handful of control/typographic characters and one
# private-use glyph from every poem; the pattern is compiled once and reused.
_UNWANTED_CHARS = re.compile(r'[\t\x85\x91\x92\x93\x94\x96\x97¨«´·»―\uf0bc]')
poems = [_UNWANTED_CHARS.sub('', str(poem)) for poem in poems]

In [None]:
# Concatenate all poems into one long string (no separator is inserted
# between consecutive poems).
poemsText = ''.join(poems)
# The unique characters in the corpus, sorted; this is the model vocabulary.
vocab = sorted(set(poemsText))
print('{} unique characters'.format(len(vocab)))
print(vocab)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

In [None]:
# char2id: lookup layer mapping each character to an integer id.
char2id = preprocessing.StringLookup(vocabulary=list(vocab))
# id2char: the inverse lookup (invert=True), built from char2id's own
# vocabulary so both directions agree on the id assignment.
id2char = preprocessing.StringLookup(vocabulary=char2id.get_vocabulary(), invert=True)

In [None]:
# Split the corpus into single characters and translate them to integer ids.
poemsIDs = char2id(list(poemsText))

In [None]:
# Wrap the id sequence in a tf.data.Dataset that yields one id per element.
dataset = tf.data.Dataset.from_tensor_slices(poemsIDs)

In [None]:
# Length of each training input sequence, in characters.
seqLen = 100
# Group the ids into chunks of seqLen + 1: the extra element lets
# createTrainingPredictions derive an input and a one-step-shifted target
# from each chunk. A trailing chunk shorter than that is dropped.
datasetBatches = dataset.batch(seqLen+1, drop_remainder=True)

In [None]:
# Dataset limited to the first chunk; the notebook only displays its repr
# here (nothing is iterated).
datasetBatches.take(1)

In [None]:
def createTrainingPredictions(seq):
  """Split one chunk into an (input, target) pair for next-step prediction.

  The input is `seq` without its last element and the target is `seq`
  without its first element, so target[i] is the element that follows
  input[i] in the original chunk.
  """
  inputs, targets = seq[:-1], seq[1:]
  return inputs, targets

In [None]:
# Convert every chunk into an (input, target) pair.
dataset = datasetBatches.map(createTrainingPredictions)
# Displayed by the notebook: the resulting dataset's repr.
dataset

In [None]:
# Number of (input, target) pairs per training batch.
BATCH_SIZE = 64
# Size of the shuffle buffer used to randomize the order of the pairs.
BUFFER_SIZE = 10000

# Shuffle, group into fixed-size batches (dropping an incomplete last batch)
# and prefetch upcoming batches.
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
dataset

### Model 🧠

In [None]:
# Number of distinct characters in the corpus. NOTE(review): the model below
# is instantiated with len(char2id.get_vocabulary()) rather than this value.
vocab_size = len(vocab)
# Size of the embedding vector learned for each character id.
embedding_dim = 256
# Number of units in the GRU layer.
rnn_units = 1024

class PoetryGeneratorModel(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense.

  The Dense layer has no activation, so the model outputs raw scores
  (logits) over the vocabulary for every position of the input sequence.
  """

  def __init__(self, vocab_size, embeding_dim, rnn_units):
    """Create the three layers.

    Args:
      vocab_size: number of distinct ids; used as the Embedding input size
        and as the number of Dense output units.
      embeding_dim: size of the embedding vector for each id (the parameter
        name keeps its original spelling because callers pass it by keyword).
      rnn_units: number of units in the GRU layer.
    """
    # `super()` already binds the instance; the base initializer must not
    # receive `self` again as an extra positional argument.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embeding_dim)
    # return_sequences: emit an output for every time step.
    # return_state: also return the final state so generation can carry it
    # over from one call to the next.
    self.gru = tf.keras.layers.GRU(rnn_units, return_sequences=True, return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of id sequences.

    Args:
      inputs: integer ids fed to the embedding layer (assumed to be shaped
        (batch, time) - TODO confirm against callers).
      states: initial GRU state; when None, the GRU's own initial state for
        this input is used.
      return_state: when True, also return the GRU's final state.
      training: forwarded to every layer.

    Returns:
      The Dense output (logits), or a `(logits, states)` tuple when
      `return_state` is True.
    """
    x = self.embedding(inputs, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    return x

In [None]:
# Build the model. Its vocabulary size is taken from the lookup layer's
# vocabulary (char2id.get_vocabulary()) so the output units match the ids the
# lookup can produce - presumably this includes the out-of-vocabulary token;
# confirm against the StringLookup configuration.
model = PoetryGeneratorModel(vocab_size=len(char2id.get_vocabulary()), embeding_dim=embedding_dim,rnn_units=rnn_units)

In [None]:
# Adam optimizer with sparse categorical cross-entropy. from_logits=True
# because the model's final Dense layer applies no activation.
model.compile(optimizer='adam', loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

In [None]:
# Train for 30 epochs over the shuffled, batched (input, target) dataset.
model.fit(dataset, epochs=30)

In [None]:
# Save the trained weights to 'poeta.h5' (relative to the current working
# directory) so they can be reloaded later.
model.save_weights('poeta.h5')

### Text Generation ✍🏻

In [None]:
# Smoke test: push a single seed character through the model and display
# the raw output.
initial = list('S')
# Look up the id and add a leading batch dimension.
initialIDs = tf.expand_dims(char2id(initial), axis=0)
model(initialIDs, states=None, return_state=False)

In [None]:
# `poeta` is a second name for the same model object (no copy is made).
poeta  = model
# Reload the weights previously written to 'poeta.h5'.
poeta.load_weights('poeta.h5')

In [None]:
def generateText(model, nChars, initialString):
  """Sample `nChars` characters from `model` and print the resulting poem.

  The seed is fed to the model first; after that, each sampled character is
  fed back in together with the recurrent state returned by the previous
  call. The printed text has every line stripped of surrounding whitespace
  and runs of spaces collapsed to a single space.

  Args:
    model: callable as `model(ids, states=..., return_state=True)` and
      returning `(logits, states)`.
    nChars: number of characters to sample after the seed.
    initialString: seed text that starts the poem.

  Returns:
    None. The poem is printed.
  """
  # Split the seed into individual characters so that a multi-character seed
  # primes the model with every character instead of being looked up as one
  # (out-of-vocabulary) token. An empty seed keeps the previous behaviour of
  # being looked up as a single token.
  seedChars = list(initialString) or [initialString]
  inputIDs = tf.expand_dims(char2id(seedChars), axis=0)

  states = None
  pieces = [initialString]

  for _ in range(nChars):
    logits, states = model(inputIDs, states=states, return_state=True)
    # Only the prediction for the last time step is used for sampling.
    logits = logits[:, -1, :]
    # Sample the next id from the predicted distribution (not argmax).
    nextIDs = tf.random.categorical(logits, num_samples=1)
    nextIDs = tf.squeeze(nextIDs, axis=-1)
    pieces.append(id2char(nextIDs)[0].numpy().decode('utf-8'))
    # Feed the sampled id straight back in with a leading batch dimension.
    inputIDs = tf.expand_dims(nextIDs, axis=0)

  poem = ''.join(pieces)
  poem = '\n'.join(line.strip() for line in poem.split('\n'))
  poem = re.sub(' +', ' ', poem)
  print(poem, '\n')

In [None]:
# Generate and print 500 characters starting from the seed 'E'.
generateText(poeta, 500, 'E')