####  https://www.tensorflow.org/tutorials/sequences/text_generation

## Setup

### Import TensorFlow and other libraries

In [4]:
# TensorFlow, plus a printout of the version in use.
import tensorflow as tf
print(tf.__version__)

# Turn on eager execution (TF 1.x API); the generation loop further down calls
# `.numpy()` on sampled tensors, which relies on eager mode.
tf.enable_eager_execution()

import numpy as np
import os
import time

# NOTE(review): numpy is imported a second time here; the repeat is harmless.
import numpy as np
import random
import sys
import io
import pandas as pd



1.13.0-dev20181220


In [5]:
# Location of the cleaned dataset, relative to the notebook's working directory.
root_path = "../"
path_data_clean = root_path + "data/clean/"

# path_data_clean already ends with a separator, so join the file name onto it
# instead of concatenating "/..." (which produced "data/clean//dfNoticiasCleanV2.p").
# NOTE(review): unpickling can execute arbitrary code; only load this file if
# it comes from a trusted source.
dfNoticias = pd.read_pickle(os.path.join(path_data_clean, "dfNoticiasCleanV2.p"))

In [6]:
# Build the training corpus from the body ("Cuerpo") of every article whose
# index label is below 800 (the original note puts the maximum at roughly 3500).
# A boolean mask plus str.join selects and concatenates in one pass, instead of
# walking every row with iterrows() and re-copying the string on each `+=`.
# The articles are still concatenated with no separator between them.
count = 0  # not read by any visible cell; kept in case a later cell uses it
raw_text = "".join(dfNoticias.loc[dfNoticias.index < 800, "Cuerpo"])

# Every later cell works on the lower-cased corpus.
text = raw_text.lower()

In [7]:
# The unique characters in the file
vocab = sorted(set(text))
print ('{} unique characters'.format(len(vocab)))

90 unique characters


In [8]:
# Creating a mapping from unique characters to indices
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

text_as_int = np.array([char2idx[c] for c in text])

Use a `tf.keras.callbacks.ModelCheckpoint` to ensure that checkpoints are saved during training:

To keep this quick, train the model for just 3 epochs:

In [9]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

filepath="weights/weights.besttf.hdf5"
checkpointBest = tf.keras.callbacks.ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')

In [10]:
# Length of the vocabulary in chars
vocab_size = 90

# The embedding dimension 
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

BATCH_SIZE = 64 #64


In [11]:
import functools

# Choose the recurrent-layer factory used by build_model: CuDNNGRU when a GPU
# is available, otherwise the generic GRU with a sigmoid recurrent activation
# (presumably to stay weight-compatible with CuDNNGRU -- TODO confirm).
rnn = (
    tf.keras.layers.CuDNNGRU
    if tf.test.is_gpu_available()
    else functools.partial(tf.keras.layers.GRU, recurrent_activation='sigmoid')
)

In [12]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the network: Embedding -> stateful recurrent layer -> Dense.

  Args:
    vocab_size: input vocabulary size of the embedding and width of the
      Dense output layer.
    embedding_dim: width of the embedding vectors.
    rnn_units: number of units in the recurrent layer.
    batch_size: fixed batch dimension used in the embedding's input shape.

  Returns:
    A tf.keras.Sequential model (not compiled here).
  """
  # Layers are created in the same order they are stacked.
  embedding = tf.keras.layers.Embedding(
      vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
  # `rnn` is the module-level layer factory chosen in an earlier cell.
  recurrent = rnn(
      rnn_units,
      return_sequences=True,
      recurrent_initializer='glorot_uniform',
      stateful=True)
  output = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, recurrent, output])

In [13]:
# Instantiate the network with the training batch size.
model = build_model(vocab_size, embedding_dim, rnn_units, BATCH_SIZE)

Instructions for updating:
Colocations handled automatically by placer.


## Generate text

### Restore the latest checkpoint

To keep this prediction step simple, use a batch size of 1.

Because of the way the RNN state is passed from timestep to timestep, the model only accepts a fixed batch size once built. 

To run the model with a different `batch_size`, we need to rebuild the model and restore the weights from the checkpoint.


In [14]:
# Display the path of the most recent checkpoint found in checkpoint_dir.
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/ckpt_3'

In [15]:
# Rebuild the network with a batch size of 1 for text generation.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# latest_checkpoint returns None when checkpoint_dir holds no checkpoint;
# fail with a clear message instead of handing None to load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
  raise IOError(
      "No checkpoint found in {!r}; train the model first.".format(checkpoint_dir))
model.load_weights(latest_checkpoint)

model.build(tf.TensorShape([1, None]))

In [16]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            23040     
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1024)           3935232   
_________________________________________________________________
dense_1 (Dense)              (1, None, 90)             92250     
Total params: 4,050,522
Trainable params: 4,050,522
Non-trainable params: 0
_________________________________________________________________


### The prediction loop

The following code block generates the text:

* It starts by choosing a start string, initializing the RNN state and setting the number of characters to generate.

* Get the prediction distribution of the next character using the start string and the RNN state.

* Then, use a multinomial distribution to calculate the index of the predicted character. Use this predicted character as our next input to the model.

* The RNN state returned by the model is fed back into the model so that it now has more context, instead of only one character. After predicting the next character, the modified RNN states are again fed back into the model, which is how it accumulates context from the previously predicted characters.



Looking at the generated text, you'll see the model makes paragraphs and imitates the vocabulary of the Spanish-language news articles it was trained on (the corpus is lower-cased, so it never capitalizes). With the small number of training epochs, it has not yet learned to form coherent sentences.

In [17]:
def generate_text(model, start_string, num_generate=1000, temperature=0.7):
  """Sample text from the trained model, one character at a time.

  Args:
    model: stateful model built for a batch size of 1; it is called on a
      batch holding a single sequence and must provide reset_states().
    start_string: non-empty seed text; every character must be a key of
      char2idx.
    num_generate: number of characters to generate after the seed.
    temperature: positive divisor applied to the logits before sampling.
      Low temperatures result in more predictable text, higher temperatures
      in more surprising text.

  Returns:
    start_string followed by the num_generate sampled characters.

  Raises:
    ValueError: if start_string is empty or temperature is not positive.
    KeyError: if start_string contains a character missing from char2idx.
  """
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive, got {}".format(temperature))

  # Report every out-of-vocabulary character at once rather than failing on
  # the first one with a bare KeyError.
  unknown = sorted(set(start_string) - set(char2idx))
  if unknown:
    raise KeyError(
        "start_string contains characters outside the vocabulary: {}".format(unknown))

  # Converting our start string to numbers (vectorizing) and adding the
  # batch dimension
  input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

  # Characters sampled so far
  text_generated = []

  # Here batch size == 1; start from a clean recurrent state
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)
    # remove the batch dimension
    predictions = tf.squeeze(predictions, 0)

    # Sample the next character id from the temperature-scaled logits; only
    # the sample for the last timestep is kept. tf.random.categorical is the
    # replacement named by the tf.multinomial deprecation warning.
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

    # We pass the predicted character as the next input to the model;
    # the stateful model carries the previous hidden state itself
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return start_string + ''.join(text_generated)


print("\n\n\n")

# Candidate seed sentence, lower-cased like the training corpus.
# NOTE(review): it is not passed to generate_text below, which is seeded with "a".
inputext = "tres millones de dólares por el volante de contención de Colo Colo".lower()

sample = generate_text(model, "a")
print(sample)

Instructions for updating:
Use tf.random.categorical instead.
al estadounidense, cuando se había este tipo de felipe y apareció el formacional de diseños, tras competencias y metiosones, tras la fiscalía con oportunidad se trabajo a un partido con un proyecto de la compañía con el general de chile y preocupación de la presidenta de 4 de octubre 12 años, mil y "el estadio municipales.

el cuadro de mayo
 adicional que la clasificación que dio la fifación de estar jugadores de la compañía en cuanto a los casos de la tira la interpreta de la alemana para el desaloja de preduntas con todos los desarrollar.
"respecto a su objetivo que contra la mejor parece a su experto a la causa, que cierta en la comisión de pena, al final de dete estos mescutantes de la especial de la calidad de ser un mensaje a la cual el nada de este caso el proceso "pedir la compañía de chile en tancos y que el grupo de marcelo sala que convocar la capacudad de la cuentan por 240 entregando a la película de seguridade