In [1]:
import os
import nltk
import string
import numpy as np
import tensorflow as tf
# Let TensorFlow grow its GPU memory allocation on demand instead of
# reserving the whole device up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# Hand the config to the eager runtime itself: the ops in this notebook run
# eagerly, so this is the context the GPU options have to reach.
tf.enable_eager_execution(config=config)
# NOTE(review): no visible cell uses `sess`; it is kept so that any later
# reference to the name keeps working.
sess = tf.Session(config=config)

In [2]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

Using TensorFlow backend.


In [3]:
# Portuguese stop-word list from NLTK; used further down to filter the corpus.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded
# beforehand (nltk.download('stopwords')) — confirm for fresh environments.
stopwords = nltk.corpus.stopwords.words('portuguese')

In [4]:
# Read the raw corpus, one list entry per line of the file.
# The file is decoded as ISO-8859-1 (Latin-1); read mode 'r' is the default.
# The context manager closes the handle even if reading fails.
with open('myfile2.txt', encoding='iso-8859-1') as f:
    rows = f.readlines()

In [5]:
# Build the cleaned training text: words are stripped of punctuation,
# lower-cased, dropped if they are stop words, and each kept word is
# followed by a single space.
stopword_set = set(stopwords)  # constant-time membership instead of scanning the list per word
kept_words = []
for music in rows:
    for m in music.split():
        m = ''.join(p for p in m if p not in string.punctuation).lower()
        if m not in stopword_set:
            kept_words.append(m)
# One join at the end avoids re-copying the growing string for every word.
plain_text = ''.join(word + ' ' for word in kept_words)

In [6]:
# Vocabulary: every distinct character of the cleaned text, in sorted order.
vocab = list(set(plain_text))
vocab.sort()
print('{} unique characters'.format(len(vocab)))

53 unique characters


In [7]:
# Forward lookup: character -> integer id (its position in the sorted vocab).
char2idx = dict(zip(vocab, range(len(vocab))))
# Reverse lookup: indexing this array with an id yields the character.
idx2char = np.array(vocab)

# The whole cleaned text encoded as one array of character ids.
text_as_int = np.array([char2idx[symbol] for symbol in plain_text])

In [8]:
# Pretty-print at most the first 20 entries of the character -> id table.
print('{')
for position, char in enumerate(char2idx):
    if position == 20:
        break
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

{
  ' ' :   0,
  '0' :   1,
  '1' :   2,
  '2' :   3,
  '3' :   4,
  '4' :   5,
  '5' :   6,
  '6' :   7,
  '7' :   8,
  '8' :   9,
  '9' :  10,
  'a' :  11,
  'b' :  12,
  'c' :  13,
  'd' :  14,
  'e' :  15,
  'f' :  16,
  'g' :  17,
  'h' :  18,
  'i' :  19,
  ...
}


In [9]:
# Show the first 13 characters of the text next to their integer encoding.
preview_len = 13
print('{} ---- characters mapped to int ---- > {}'.format(
    repr(plain_text[:preview_len]), text_as_int[:preview_len]))

'nunca vou diz' ---- characters mapped to int ---- > [24 31 24 13 11  0 32 25 31  0 14 19 36]


In [10]:
# Number of input characters in a single training example.
seq_length = 100
# Every example is cut from a chunk of seq_length + 1 characters (the extra
# character supplies the shifted target), so that is the stride to divide by.
examples_per_epoch = len(plain_text) // (seq_length + 1)

# Stream of individual character ids, one dataset element per character.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Sanity check: decode the first five ids back to characters.
for i in char_dataset.take(5):
    print(idx2char[i.numpy()])

n
u
n
c
a


In [11]:
# Group the character stream into fixed-size chunks of seq_length + 1 ids;
# a trailing partial chunk is dropped.
chunk_length = seq_length + 1
sequences = char_dataset.batch(chunk_length, drop_remainder=True)

# Decode the first five chunks to eyeball the text.
for item in sequences.take(5):
    decoded = ''.join(idx2char[item.numpy()])
    print(repr(decoded))

'nunca vou dizer realmente penso nunca vou dizer realmente sinto juro juro nunca vou dizer realmente p'
'enso nunca vou dizer realmente sinto juro juro deus juro juro deus confio ninguém confio ninguém conf'
'io ninguém 30 confio ninguém 32 dentes pai dia falou pra nunca mentisse esqueceu dizer verdade ha ha '
'ha ha nunca vou dizer realmente penso nunca vou dizer realmente sinto juro juro juro juro nunca vou d'
'izer realmente penso nunca vou dizer realmente sinto juro juro juro deus juro juro juro deus confio n'


In [12]:
def split_input_target(chunk):
    """Turn one chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that
    follows input[i] in the original chunk.

    Args:
        chunk: any sliceable sequence.

    Returns:
        A tuple (chunk[:-1], chunk[1:]).
    """
    return chunk[:-1], chunk[1:]

# Apply the input/target split to every chunk in `sequences`.
dataset = sequences.map(split_input_target)

In [13]:
# Number of sequences per training batch
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (tf.data is designed to work with possibly infinite sequences, so it does
# not shuffle the entire sequence in memory; instead it shuffles elements
# within a buffer of this size).
BUFFER_SIZE = 10000

# Build the pipeline from `sequences` rather than from the previous value of
# `dataset`: re-running this cell would otherwise batch the already batched
# dataset a second time.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

<DatasetV1Adapter shapes: ((64, 100), (64, 100)), types: (tf.int32, tf.int32)>

In [14]:

# Model hyperparameters.
vocab_size = len(vocab)   # number of distinct characters in the vocabulary
embedding_dim = 256       # size of each character embedding vector
rnn_units = 512           # number of units in the recurrent layer

In [33]:
# Display-only check of the vocabulary size.
# NOTE(review): this cell's execution count (In [33]) is out of sequence with
# its neighbours.
len(vocab)

53

In [16]:
# Display-only: evaluates to the CuDNNGRU layer class of this TensorFlow build.
tf.keras.layers.CuDNNGRU

tensorflow.python.keras.layers.cudnn_recurrent.CuDNNGRU

In [17]:
# Choose a GRU implementation for the available hardware so that `rnn` is
# defined on every machine, not only when a GPU is present.
# NOTE(review): build_model in this notebook instantiates
# tf.keras.layers.LSTM directly and does not read `rnn`.
if tf.test.is_gpu_available():
    rnn = tf.keras.layers.CuDNNGRU
else:
    import functools
    # Fallback restored from the previously commented-out branch: a plain GRU
    # with a sigmoid recurrent activation.
    rnn = functools.partial(
        tf.keras.layers.GRU, recurrent_activation='sigmoid')

In [18]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level model: Embedding -> stateful LSTM -> Dense.

    Args:
        vocab_size: number of distinct characters; sizes both the embedding
            table and the output layer.
        embedding_dim: length of each embedding vector.
        rnn_units: number of units in the LSTM layer.
        batch_size: fixed batch dimension of the input (the sequence
            dimension is left as None).

    Returns:
        An uncompiled tf.keras.Sequential model whose final Dense layer emits
        vocab_size values per time step (no activation is applied).
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
    recurrent = tf.keras.layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform')
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

In [19]:
# Instantiate the network for training-sized batches.
model = build_model(vocab_size=len(vocab),
                    embedding_dim=embedding_dim,
                    rnn_units=rnn_units,
                    batch_size=BATCH_SIZE)

In [20]:
# Display-only check of vocab_size.
vocab_size

53

In [21]:
# Run one batch through the not-yet-trained model to check the output shape.
# The loop variables stay bound after the loop and are reused by later cells.
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 53) # (batch_size, sequence_length, vocab_size)


In [22]:
# Display-only: repr of a one-batch view of the dataset.
dataset.take(1)

<DatasetV1Adapter shapes: ((64, 100), (64, 100)), types: (tf.int32, tf.int32)>

In [23]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           13568     
_________________________________________________________________
lstm (LSTM)                  (64, None, 512)           1574912   
_________________________________________________________________
dense (Dense)                (64, None, 53)            27189     
Total params: 1,615,669
Trainable params: 1,615,669
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Sample one next-character id per time step for the first sequence of the
# example batch, then drop the trailing sample axis and convert to NumPy.
first_sequence_logits = example_batch_predictions[0]
sampled_indices = tf.squeeze(
    tf.random.categorical(first_sequence_logits, num_samples=1),
    axis=-1).numpy()

In [25]:
# Display-only: the sampled character ids.
sampled_indices

array([12, 26, 42, 46, 13, 29, 24,  2, 16, 32, 41,  1, 18, 31,  7, 49, 25,
       49, 31, 25, 23, 28, 42, 47, 24, 10, 52,  3, 15, 50, 29, 25, 46, 23,
       21, 15,  5, 39, 25, 38,  5, 44, 19, 35, 40, 37, 46, 12, 29,  1, 25,
       14, 50, 39, 48, 13, 41, 17, 23, 41,  2, 52, 28, 49,  3, 48,  3, 23,
       37,  6, 38, 29, 20, 36,  1, 21,  6, 10, 22, 50, 28, 41, 41, 35, 22,
        8,  1, 25, 31, 51, 22,  7, 32,  3, 18, 19, 32, 14, 41, 20],
      dtype=int64)

In [26]:
# Decode the first input sequence and the sampled ids back to text.
input_text = "".join(idx2char[input_example_batch[0]])
predicted_text = "".join(idx2char[sampled_indices ])
print("Input: \n", repr(input_text))
print()
print("Next Char Predictions: \n", repr(predicted_text))

Input: 
 'nada pra acertar errar mão sinto bem assim sinto bem aonde sinto bem assim sinto bem aonde vou nada '

Next Char Predictions: 
 'bpäícsn1fvã0hu6ôoôuomräñn9ü2eõsoímke4áo´4éiyâ³íbs0odõáócãgmã1ürô2ó2m³5´sjz0k59lõrããyl70ouúl6v2hivdãj'


In [27]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and logits.

    Args:
        labels: target class ids.
        logits: unnormalised model outputs (from_logits=True is passed, so
            no softmax is expected to have been applied).

    Returns:
        Whatever tf.keras.losses.sparse_categorical_crossentropy returns for
        these arguments.
    """
    cross_entropy = tf.keras.losses.sparse_categorical_crossentropy
    return cross_entropy(labels, logits, from_logits=True)

# Loss of the example batch, reduced to a single mean value for display.
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", mean_loss)

Prediction shape:  (64, 100, 53)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       3.9705963


In [28]:
# Configure training with the Adam optimizer (selected by name) and the custom loss.
# NOTE(review): the following cell calls model.compile again with
# tf.train.AdamOptimizer(); when both run in order, that later call is the
# one in effect.
model.compile(optimizer='adam', loss=loss)

In [29]:
# Compile with an explicit tf.train Adam optimizer instance (constructed with
# no arguments) and the custom loss.
adam = tf.train.AdamOptimizer()
model.compile(optimizer=adam, loss=loss)

In [30]:
# Checkpoints are written under ./training_checkpoints; the "{epoch}"
# placeholder in the file name is left for the callback to fill in.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the model weights, not the full model.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix, save_weights_only=True)

In [31]:
# Number of training epochs passed to model.fit.
EPOCHS=50

In [32]:
# Train for EPOCHS epochs with the checkpoint callback attached;
# `history` holds the object returned by fit.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/50
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
