In [1]:
import os
import time
import numpy as np
import pandas as pd
import tensorflow as tf

In [4]:
# Load the haiku corpus. Columns '0', '1' and '2' are joined into one string
# per row (presumably the three lines of each poem -- confirm against the CSV).
data = pd.read_csv('../data/all_haiku.csv')

# A missing cell is read as NaN. NaN + ' ' + ... stays NaN, and str(NaN) is
# the literal 'nan', which would end up in the training text. Replace missing
# cells with empty strings before joining so that cannot happen.
haiku_lines = data[['0', '1', '2']].fillna('').astype(str)

# Same clean-up as before (strip the ends, then remove hyphens), vectorised.
data['haiku'] = (
    (haiku_lines['0'] + ' ' + haiku_lines['1'] + ' ' + haiku_lines['2'])
    .str.strip()
    .str.replace('-', '', regex=False)
)

# Rows that end up with no text at all would only add blank lines; drop them.
data = data[data['haiku'] != ''].reset_index(drop=True)


In [5]:
# Read the data: concatenate every haiku into one long training string.
#
# Each haiku is terminated with '\n' rather than os.linesep so that the text,
# and therefore the vocabulary and every character id, is identical on every
# platform (os.linesep is '\r\n' on Windows, which would add a '\r' class).
# str.join also avoids rebuilding the growing string on every iteration.
text = ''.join(f'{haiku}\n' for haiku in data['haiku'])

# Sorted list of the distinct characters that occur in the corpus.
vocab = sorted(set(text))

print(f'Text is {len(text)} long')
print(f'Text has {len(vocab)} unique characters')

Text is 9860268 long
Text has 105 unique characters


In [6]:
# Text processing: build the character <-> integer id lookup tables.

# Character -> id; ids follow the order of `vocab`.
char2idx = dict(zip(vocab, range(len(vocab))))
# id -> character, stored as an array so it can be indexed with arrays of ids.
idx2char = np.array(vocab)

# The whole corpus encoded as one id per character.
text_as_int = np.array([char2idx[ch] for ch in text])

In [8]:
# Number of input characters per training example.
seq_length = 100
# How many chunks of seq_length + 1 characters fit into the corpus.
examples_per_epoch = len(text) // (seq_length + 1)

# Dataset that yields the corpus one character id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Sanity check: decode and print the first five characters.
for char_id in char_dataset.take(5):
    print(idx2char[char_id.numpy()])

f
i
s
h
i


In [9]:
# Group the character stream into chunks of seq_length + 1 ids; a final chunk
# shorter than that is discarded (drop_remainder=True).
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

# Show the first five chunks decoded back to text.
for chunk in sequences.take(5):
    decoded = ''.join(idx2char[chunk.numpy()])
    print(repr(decoded))


'fishing boats colors of the rainbow\nash wednesday trying to remember  my dream\nsnowy morn pouring ano'
'ther cup of black coffee\nshortest day flames dance in the oven\nhaze half the horse hidden behind the '
'house\nlow sun the lady in red on high heels\nadvent the passing stranger farts\ntarn a bubble in the ic'
"e\nsnowflakes new asphalt in the holes\nCrystal Night'    gusts of rain       outside\nrain the sound of"
' a horse galloping through leaves\nwinter stars suddenly a whiff of perfume\nhungry half of the moon hi'


In [10]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    Args:
        chunk: A sliceable sequence of character ids.

    Returns:
        A tuple ``(input_text, target_text)`` where ``input_text`` is the chunk
        without its last element and ``target_text`` is the chunk without its
        first element, so ``target_text[k]`` is the element that follows
        ``input_text[k]`` in the chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

# Display the first pair as text; the target is the input shifted by one character.
for input_example, target_example in dataset.take(1):
    input_as_text = ''.join(idx2char[input_example.numpy()])
    target_as_text = ''.join(idx2char[target_example.numpy()])
    print('Input data: ', repr(input_as_text))
    print('Target data: ', repr(target_as_text))
    

Input data:  'fishing boats colors of the rainbow\nash wednesday trying to remember  my dream\nsnowy morn pouring an'
Target data:  'ishing boats colors of the rainbow\nash wednesday trying to remember  my dream\nsnowy morn pouring ano'


In [11]:
# Walk through the first five positions of the example pair, showing for each
# step the input character and the character the model is expected to emit.
first_steps = zip(input_example[:5], target_example[:5])
for step, (input_idx, target_idx) in enumerate(first_steps):
    shown_input = (int(input_idx), repr(idx2char[input_idx]))
    shown_target = (int(target_idx), repr(idx2char[target_idx]))
    print(f'Step     {step}')
    print(f'   input: {shown_input}')
    print(f'   expected output: {shown_target}')

Step     0
   input: (66, "'f'")
   expected output: (69, "'i'")
Step     1
   input: (69, "'i'")
   expected output: (79, "'s'")
Step     2
   input: (79, "'s'")
   expected output: (68, "'h'")
Step     3
   input: (68, "'h'")
   expected output: (69, "'i'")
Step     4
   input: (69, "'i'")
   expected output: (74, "'n'")


In [13]:
# Number of sequences per training batch.
BATCH_SIZE = 64
# Number of elements held in the shuffle buffer that examples are drawn from.
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` rather than re-wrapping
# `dataset` in place: `dataset = dataset.shuffle(...).batch(...)` batches an
# already-batched dataset when the cell is run a second time. On a fresh
# kernel this is the same map -> shuffle -> batch pipeline as before.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [14]:
# Number of distinct characters in the corpus.
vocab_size = len(vocab)

# Size of the embedding vectors and number of recurrent units, respectively.
embedding_dim, rnn_units = 256, 1024

In [15]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level network: Embedding -> GRU -> Dense.

    Args:
        vocab_size: Number of distinct character ids; used as the embedding
            input size and as the width of the output layer.
        embedding_dim: Size of each embedding vector.
        rnn_units: Number of units in the GRU layer.
        batch_size: Batch dimension fixed into the model's input shape
            (the sequence-length dimension is left as None).

    Returns:
        An uncompiled ``tf.keras.Sequential`` model. The final Dense layer is
        created without an activation argument and has ``vocab_size`` units.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None])
    # The GRU returns its output at every time step and is created stateful.
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform')
    readout = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, readout])

In [16]:
# Instantiate the network with the hyper-parameters chosen above and the
# training batch size.
model_config = dict(
    vocab_size=len(vocab),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
)
model = build_model(**model_config)

In [17]:
# Smoke test: run one batch through the model and print the output shape.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    shape_note = "# (batch_size, sequence_length, vocab_size)"
    print(example_batch_predictions.shape, shape_note)


(64, 100, 105) # (batch_size, sequence_length, vocab_size)


In [18]:
# Print the model summary (layers, output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           26880     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 105)           107625    
Total params: 4,072,809
Trainable params: 4,072,809
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Draw one character id per time step from the model output for the first
# example of the batch, then drop the trailing sample axis and convert to NumPy.
first_example_logits = example_batch_predictions[0]
sampled_indices = tf.squeeze(
    tf.random.categorical(first_example_logits, num_samples=1),
    axis=-1,
).numpy()


In [20]:
# Display the character ids sampled for the first example of the batch.
sampled_indices

array([ 88,  41,  20,  28,  60,  84,  87,  80,  88,  72,  16,  19,   4,
        37,   4,   3,  50,  60,  64,  66,  98,  86, 101,  10, 102,  14,
        43,  85,  12,  98,  28,  25,  96,  56,  46,  48,  26,   7,  69,
        96,  94,  58,  48,  18,  20,  91,  68,  12,  93,   4,  47,  36,
        68,  72,  42,  60, 104,   6,   7,  86,  22,  59,  23,  85,  87,
        47,  20,   3,  59,  81,  68,  88,  97,  73,  27,  90,  68,  19,
        72, 100,   6,  90,  72,  16,  45,  39,  62,  64,  53,  21,  81,
        28,  66,   7,  54,  26,  60,  67,  69,  14])

In [21]:
# Decode the first input sequence of the batch and the ids sampled for it, and
# print both as text.
input_chars = "".join(idx2char[input_example_batch[0]])
predicted_chars = "".join(idx2char[sampled_indices])
print("Input: \n", repr(input_chars))
print()
print("Next Char Predictions: \n", repr(predicted_chars))


Input: 
 'ainst a spider thread\nskylight a contrail crosses her line of coke\nantique store the ocean loud in a'

Next Char Predictions: 
 '~K6>`x{t~l25%G%"T`df–z’+“0My.–>;ūZPR<(iūü]R46äh.ï%QFhlL`…\'(z8_9y{Q6"_uh~ŭm=àh5l‘\'àl2OIbdW7u>f(X<`gi0'


In [22]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    Args:
        labels: Integer class ids (here, the target character ids).
        logits: Model outputs; passed with ``from_logits=True``, so they are
            treated as unnormalised scores rather than probabilities.

    Returns:
        The tensor returned by
        ``tf.keras.losses.sparse_categorical_crossentropy``.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

# Loss on the example batch, averaged over all returned values.
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", mean_loss)


Prediction shape:  (64, 100, 105)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.655535


In [23]:
# Configure training: Adam optimizer with the `loss` function from this notebook.
model.compile(loss=loss, optimizer='adam')


In [24]:
# Checkpoints go under this directory; the file name pattern contains an
# "{epoch}" placeholder.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that stores the model weights only (not the full model).
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)


In [25]:
# Number of passes over the training dataset.
EPOCHS = 10

# Train the model with the checkpoint callback attached; `history` holds the
# object returned by fit().
history = model.fit(
    dataset,
    epochs=EPOCHS,
    callbacks=[checkpoint_callback],
)


Epoch 1/10
  48/1525 [..............................] - ETA: 3:01:34 - loss: 3.2309

KeyboardInterrupt: 