In [28]:
import os
import time
import numpy as np
import pandas as pd
import tensorflow as tf

In [29]:
# Load the haiku corpus. Columns '0', '1' and '2' are joined below into one
# string per row (presumably the three lines of each haiku -- confirm against the CSV).
data = pd.read_csv('../data/all_haiku.csv')

# Keep only the first 1000 rows. The explicit .copy() makes `data` an
# independent frame, so the column assignments below do not write into a
# slice of the frame returned by read_csv (avoids SettingWithCopyWarning).
data = data.iloc[0:1000].copy()

# Join the three text columns with single spaces.
data['haiku'] = data['0']+' '+data['1']+' '+data['2']

# Coerce to str, trim surrounding whitespace and drop every '-' character.
data['haiku'] = data['haiku'].apply(lambda x: str(x).strip().replace('-',''))


In [30]:
# Read the data

# Concatenate every haiku into one training string, each followed by the
# platform line separator. str.join builds the corpus in a single pass
# instead of growing a string with += inside a loop.
text = ''.join(haiku + os.linesep for haiku in data['haiku'])

# Sorted list of the distinct characters that occur in the corpus.
vocab = sorted(set(text))

print(f'Text is {len(text)} long')
print(f'Text has {len(vocab)} unique characters')

Text is 50931 long
Text has 81 unique characters


In [34]:
# Text processing

# id -> character lookup (array position is the id).
idx2char = np.array(vocab)
# character -> id lookup, the inverse of idx2char.
char2idx = {ch: pos for pos, ch in enumerate(vocab)}

# Encode the whole corpus as an array of character ids.
text_as_int = np.array([char2idx[ch] for ch in text])

text_as_int

array([52, 55, 65, ..., 49, 54,  0])

In [6]:
# Number of characters in each model input sequence.
seq_length = 100
# Every example consumes seq_length + 1 characters of the corpus.
examples_per_epoch = len(text) // (seq_length + 1)

# Dataset that yields the corpus one character id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Show the first five characters as a sanity check.
for char_id in char_dataset.take(5):
    print(idx2char[char_id.numpy()])

f
i
s
h
i


In [7]:
# Group consecutive character ids into chunks of seq_length + 1,
# discarding a final chunk that would be shorter.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Decode and show the first five chunks.
for chunk in sequences.take(5):
    decoded = ''.join(idx2char[chunk.numpy()])
    print(repr(decoded))


'fishing boats colors of the rainbow\nash wednesday trying to remember  my dream\nsnowy morn pouring ano'
'ther cup of black coffee\nshortest day flames dance in the oven\nhaze half the horse hidden behind the '
'house\nlow sun the lady in red on high heels\nadvent the passing stranger farts\ntarn a bubble in the ic'
"e\nsnowflakes new asphalt in the holes\nCrystal Night'    gusts of rain       outside\nrain the sound of"
' a horse galloping through leaves\nwinter stars suddenly a whiff of perfume\nhungry half of the moon hi'


In [8]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is every element of ``chunk`` except the last; the target is
    every element except the first, so ``target[i]`` is the element that
    follows ``input[i]`` in the chunk.

    Args:
        chunk: Any sliceable sequence.

    Returns:
        Tuple ``(chunk[:-1], chunk[1:])``.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

# Decode the first pair to confirm the target is the input shifted by one.
for input_example, target_example in dataset.take(1):
    input_str = ''.join(idx2char[input_example.numpy()])
    print('Input data: ', repr(input_str))
    target_str = ''.join(idx2char[target_example.numpy()])
    print('Target data: ', repr(target_str))
    

Input data:  'fishing boats colors of the rainbow\nash wednesday trying to remember  my dream\nsnowy morn pouring an'
Target data:  'ishing boats colors of the rainbow\nash wednesday trying to remember  my dream\nsnowy morn pouring ano'


In [9]:
# Walk through the first five time steps of the example pair, showing the
# input character and the character the model is expected to predict.
# Each f-string field is a tuple, so it renders as (id, repr-of-char).
for step, (in_id, out_id) in enumerate(zip(input_example[:5], target_example[:5])):
    print(f'Step     {step}')
    print(f'   input: {int(in_id), repr(idx2char[in_id])}')
    print(f'   expected output: {int(out_id), repr(idx2char[out_id])}')

Step     0
   input: (52, "'f'")
   expected output: (55, "'i'")
Step     1
   input: (55, "'i'")
   expected output: (65, "'s'")
Step     2
   input: (65, "'s'")
   expected output: (54, "'h'")
Step     3
   input: (54, "'h'")
   expected output: (55, "'i'")
Step     4
   input: (55, "'i'")
   expected output: (60, "'n'")


In [10]:
# Number of (input, target) sequences per training batch.
BATCH_SIZE = 64
# Size of the buffer that Dataset.shuffle draws from.
BUFFER_SIZE = 10000

# Rebuild the pipeline from `sequences` rather than re-wrapping `dataset`,
# so re-running this cell does not shuffle and batch an already-batched
# dataset. On a first run the result is the same as before.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [11]:
# Number of units in the GRU layer.
rnn_units = 1024

# Width of the embedding vector learned for each character.
embedding_dim = 256

# Number of distinct characters in the corpus.
vocab_size = len(vocab)

In [12]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the character model: Embedding -> stateful GRU -> Dense.

  Args:
    vocab_size: Number of distinct characters; used as the Embedding input
      size and as the number of Dense outputs.
    embedding_dim: Output width of the Embedding layer.
    rnn_units: Number of units in the GRU layer.
    batch_size: Fixed batch dimension, passed to the Embedding layer as
      batch_input_shape=[batch_size, None].

  Returns:
    An uncompiled tf.keras.Sequential model.
  """
  embedding = tf.keras.layers.Embedding(
      vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
  recurrent = tf.keras.layers.GRU(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform')
  # No activation on the output layer: the loss used in this notebook is
  # configured with from_logits=True.
  projection = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, recurrent, projection])

In [13]:
# Build the training model. `vocab_size` (defined with the other
# hyperparameters as len(vocab)) is passed directly, matching the call that
# rebuilds the model for inference later in the notebook.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE)

In [14]:
# Run a single batch through the untrained model and print the output shape
# as a sanity check. The example batch and its predictions are reused by
# the cells that follow.
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")


(64, 100, 81) # (batch_size, sequence_length, vocab_size)


In [15]:
# Print the layer-by-layer summary of the training model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           20736     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 81)            83025     
Total params: 4,042,065
Trainable params: 4,042,065
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Sample one character id per time step from the logits of the first
# sequence in the example batch, then drop the trailing sample axis and
# convert to a NumPy array.
sampled_indices = tf.squeeze(
    tf.random.categorical(example_batch_predictions[0], num_samples=1),
    axis=-1,
).numpy()


In [17]:
# Display the sampled character ids.
sampled_indices

array([67, 26,  0, 25, 22, 34, 43, 38, 66, 60,  6, 29, 28,  2, 67, 35, 75,
       43, 65, 49, 32, 42, 39, 29, 32, 18, 56, 44,  7, 46, 28, 50, 31, 50,
       67, 11, 80, 36,  4, 52, 40,  1, 57, 53, 60, 72, 22, 25, 71, 73, 17,
       37, 49, 46,  9, 57, 43, 31, 51, 65, 64, 60, 28, 68, 31,  8, 13, 30,
       41, 24, 21, 33,  8, 75, 24, 65, 39, 34, 50, 74, 51, 59, 77,  5,  3,
       55, 51, 79, 57, 15, 20, 45, 24, 44, 48, 24, 52, 39, 38, 16])

In [18]:
# Decode the first input sequence of the batch and the ids sampled from the
# untrained model, and show both.
first_input_text = "".join(idx2char[input_example_batch[0]])
print("Input: \n", repr(first_input_text))
print()
sampled_text = "".join(idx2char[sampled_indices])
print("Next Char Predictions: \n", repr(sampled_text))


Input: 
 "seashell\ndusk songbirds' voices disappear too\nindian summer rouging her cheeks as she waits for a dr"

Next Char Predictions: 
 'uF\nEBNWRtn,IH!uOéWscLVSIL;jX.ZHdKdu3…P&fT kgnzBEy~:QcZ1kWKesrnHvK06JUDAM0éDsSNd\xa0em–\'"ie’k8?YDXbDfSR9'


In [19]:
def loss(labels, logits):
  """Sparse categorical cross-entropy computed on raw logits.

  Args:
    labels: Integer class ids of the expected characters.
    logits: Unnormalized model outputs (from_logits=True is passed).

  Returns:
    The value returned by tf.keras.losses.sparse_categorical_crossentropy.
  """
  return tf.keras.losses.sparse_categorical_crossentropy(
      y_true=labels, y_pred=logits, from_logits=True)

# Loss of the untrained model on the example batch, averaged to one number.
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", mean_loss)


Prediction shape:  (64, 100, 81)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.3937626


In [20]:
# Configure training: Adam optimizer with the `loss` function defined in this notebook.
model.compile(optimizer='adam', loss=loss)


In [21]:
# Checkpoints are written under this directory.
checkpoint_dir = './training_checkpoints'
# File name pattern for the checkpoints; "{epoch}" is a placeholder in the path.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that saves only the model weights (not the full model).
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)


In [22]:
# Number of passes over the training dataset.
EPOCHS = 10

# Train the model, saving a weights checkpoint through the callback.
history = model.fit(
    dataset,
    callbacks=[checkpoint_callback],
    epochs=EPOCHS,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [23]:
# Show the path of the most recent checkpoint found in checkpoint_dir.
tf.train.latest_checkpoint(checkpoint_dir)


'./training_checkpoints/ckpt_10'

In [24]:
# Rebuild the same architecture with a batch size of 1 for text generation.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=1)

# Restore the weights from the most recent training checkpoint.
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]))


In [25]:
# Print the layer-by-layer summary of the batch-size-1 model.
model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            20736     
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_1 (Dense)              (1, None, 81)             83025     
Total params: 4,042,065
Trainable params: 4,042,065
Non-trainable params: 0
_________________________________________________________________


In [26]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
  """Generate text by sampling one character at a time from the model.

  The start string is fed to the model first; after that, each sampled
  character becomes the next input. Relies on the notebook-level lookup
  tables `char2idx` and `idx2char`.

  Args:
    model: Model built with batch size 1; its states are reset before
      generation starts.
    start_string: Non-empty seed text. Every character must be a key of
      `char2idx`.
    num_generate: Number of characters to generate (default 1000).
    temperature: Positive value the logits are divided by before sampling.
      Low temperatures result in more predictable text, higher
      temperatures in more surprising text (default 1.0).

  Returns:
    `start_string` followed by the generated characters.

  Raises:
    ValueError: If `start_string` is empty or `temperature` is not positive.
    KeyError: If `start_string` contains a character missing from `char2idx`.
  """
  # Fail early with a clear message instead of an opaque error from the model.
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive")

  # Converting our start string to numbers (vectorizing)
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Generated characters are collected here and joined at the end
  text_generated = []

  # Here batch size == 1
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)
    # remove the batch dimension
    predictions = tf.squeeze(predictions, 0)

    # using a categorical distribution to predict the character returned by the model;
    # only the sample for the last time step is kept
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

    # We pass the predicted character as the next input to the model
    # along with the previous hidden state
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [27]:
# Generate text seeded with "Finish " and print the result.
print(generate_text(model, start_string=u"Finish "))


Finish  coferilc te brile theer sin he sin  ant dr'Hrerountend on yinits th. 
and sber win
ha .ucoof tte ri slon oncpadd sur save pook cha ghent rert fan paroifrtrre sor wanthe mopf caingesm on ct fing ule d ainbp wig ofle sita s ley
tuce hisin aldiove smeaa d rasun h
lond baloeal og teerdind phem
f arfiade bat . ansthperpiovesbi lit.
— soobog par sind oushof oeg far:ige lmicer win
ing sreron raraciig  alk of st dimm rs wiing ~ linfrerryst oft
fhe umung sot
wleson
che cuye r ooftdromod bomuac alaoghe th ains lerad homondilter an fas ende lithl on hrgeen watketeslrs Io
rume chil
 yy aduthe hatendhow mnithe s movowaitt wile thes the breg mom flofs.g thecg les koflugk singenc of ve sigl apez yasungt a yif er aof me ghintere shune mf ol bfon
erad te il teok sowike hadis
layar ofth gap's pthala the buthia g bopepindd ululing brog at jar iul rosk end shkescwad cus onhl thresy cuc
lered teadneids cid bae the und at ye
eal omkere
rreind  ald che~ke
tinpinordive fmek dithery won's sf merat witp