In [1]:
import collections
from typing import List, Set, Dict, Tuple, Generator

import numpy as np
import pandas as pd
import scipy
import tensorflow as tf

# Switch TensorFlow to eager execution: later cells iterate over a tf.data
# dataset in a plain Python loop and call `.numpy()` on results.
tf.enable_eager_execution()


# File names for the model / vocabulary artifacts.
# NOTE(review): none of these paths are referenced in the visible cells —
# confirm they are still needed.
CAMINHO_MODELO = "modelo.json"
CAMINHO_DICIONARIO = "dicionario.json"
CAMINHO_DICIONARIO_INDICES = "dicionario_indices.json"
# Passed as `embedding_dim` when the Model is instantiated.
EMBEDDING_UNITS = 128
# Passed as `units` when the Model is instantiated. Despite the name, the
# Model class builds a GRU layer, not an LSTM; the same value sizes the
# initial hidden state in the generation cell.
LSTM_UNITS = 256

In [2]:
def criar_dicionario(palavras: List[str], minimo_palavras: int) -> Tuple[Dict[str, int], Dict[int, str], Dict[str, int]]:
    """Build the vocabulary lookup tables for a tokenised corpus.

    Args:
        palavras: Flat sequence of word tokens.
        minimo_palavras: Frequency threshold. A word enters the vocabulary
            only if it occurs strictly MORE than this many times.

    Returns:
        A tuple ``(dicionario, dicionario_indices, dicionario_freq)``:
        ``dicionario`` maps word -> index, ``dicionario_indices`` is the
        inverse mapping index -> word, and ``dicionario_freq`` is the
        ``collections.Counter`` with the raw count of every token.

        Indices are contiguous in ``0 .. len(dicionario) - 1`` and the
        reserved ``"UNK"`` entry always holds the last index.
    """
    dicionario_freq = collections.Counter(palavras)
    # A literal "UNK" token in the corpus is skipped here: it is appended
    # below as the reserved entry. Letting it through and then overwriting
    # its index with len(dicionario) would leave a hole in the index space
    # and give "UNK" an id equal to the vocabulary size (out of range).
    palavras_no_dicionario = [
        palavra
        for palavra, qtd in dicionario_freq.items()
        if qtd > minimo_palavras and palavra != "UNK"
    ]
    dicionario = {palavra: indice for indice, palavra in enumerate(palavras_no_dicionario)}
    dicionario["UNK"] = len(dicionario)
    dicionario_indices = {indice: palavra for palavra, indice in dicionario.items()}
    return dicionario, dicionario_indices, dicionario_freq
# Load the corpus and keep only rows whose "categoria" is not "tradução".
df_obras = pd.read_csv("./obras_machado_de_assis.csv")
df_obras = df_obras[df_obras["categoria"] != "tradução"]

# A sentence is kept only if it has MORE than this many words.
minimo_palavras_frase = 2
# The patterns are raw strings passed with an explicit regex=True: the default
# of `Series.str.replace(regex=...)` differs between pandas versions, and
# without it these patterns can be treated as literal text.
texto_normalizado = (
    df_obras["texto"]
    .str.replace(r"\n+", " ", regex=True)  # runs of line breaks -> one space
    .str.replace(r"\.+", ".", regex=True)  # runs of dots -> a single dot
)
# Split on whitespace that follows "." or "?", unless one of the two negative
# lookbehinds rejects the position.
frases = texto_normalizado.str.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s")
dataset = pd.Series(np.concatenate(frases.values))
# Strip everything that is not a word character or whitespace, then trim and lowercase.
dataset = dataset.str.replace(r"[^\w\s\d]", "", regex=True).str.strip().str.lower()
dataset = dataset[dataset.apply(lambda row: len(row.split())) > minimo_palavras_frase].reset_index(drop=True)
palavras = np.concatenate(dataset.apply(lambda row: row.split()))

# A word enters the vocabulary only if it occurs MORE than this many times.
minimo_palavras_dicionario = 2
dicionario, dicionario_indices, dicionario_freq = criar_dicionario(palavras, minimo_palavras_dicionario)
# Look the fallback id up by key instead of assuming "UNK" sits at
# len(dicionario) - 1, and do it once rather than per token.
indice_unk = dicionario["UNK"]
dataset = dataset.apply(lambda row: [dicionario.get(palavra, indice_unk) for palavra in row.split()])
tamanho_dicionario = len(dicionario)
tamanho_dataset = len(dataset)

In [3]:
class Model(tf.keras.Model):
    """Embedding -> GRU -> Dense network producing one score vector per time step."""

    def __init__(self, vocab_size, embedding_dim, units, batch_size):
        """Create the three layers.

        Args:
            vocab_size: Size of the embedding table and of the Dense output.
            embedding_dim: Width of each embedding vector.
            units: Number of GRU units.
            batch_size: Stored on the instance; not used by the layers built here.
        """
        super().__init__()
        self.units = units
        self.batch_size = batch_size
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # The GRU returns both the per-step outputs and its final state.
        gru_options = dict(
            return_sequences=True,
            return_state=True,
            recurrent_activation='sigmoid',
            recurrent_initializer='glorot_uniform',
        )
        self.gru = tf.keras.layers.GRU(self.units, **gru_options)
        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, hidden):
        """Run a forward pass.

        Args:
            inputs: Token ids fed to the embedding layer.
            hidden: Initial state handed to the GRU.

        Returns:
            ``(scores, state)``: the Dense output computed on the GRU output
            flattened to two dimensions, and the state returned by the GRU.
        """
        embedded = self.embedding(inputs)
        sequence_out, final_state = self.gru(embedded, initial_state=hidden)
        # Merge every leading axis so each row is a single time step.
        feature_dim = sequence_out.shape[2]
        flattened = tf.reshape(sequence_out, (-1, feature_dim))
        return self.fc(flattened), final_state

# Number of examples grouped into each training batch.
BATCH_SIZE = 100
# LSTM_UNITS is what sizes the recurrent layer (the `units` argument).
modelo = Model(
    vocab_size=len(dicionario),
    embedding_dim=EMBEDDING_UNITS,
    units=LSTM_UNITS,
    batch_size=BATCH_SIZE,
)
optimizer = tf.train.AdamOptimizer()
def loss_function(labels, logits):
    """Return the sparse softmax cross-entropy of `logits` against `labels`.

    Thin wrapper around ``tf.losses.sparse_softmax_cross_entropy``; both
    arguments are forwarded unchanged.
    """
    cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    return cross_entropy

# Every pair of adjacent tokens in a sentence becomes one example:
# X holds the earlier token and Y the token that follows it.
X = [anterior for frase in dataset for anterior in frase[:-1]]
Y = [seguinte for frase in dataset for seguinte in frase[1:]]

# Give both arrays a trailing axis of length 1 (one token per example).
X = np.array(X).reshape((-1, 1))
Y = np.array(Y).reshape(-1, 1)
pares = tf.data.Dataset.from_tensor_slices((X, Y))
# drop_remainder=True discards a final batch smaller than BATCH_SIZE.
ds = pares.batch(BATCH_SIZE, drop_remainder=True)


for epoch in range(1):
    # The return value of reset_states() is used as the initial hidden state.
    # NOTE(review): Keras' reset_states() appears to return None, which would
    # make the GRU start from its default initial state — confirm intended.
    hidden = modelo.reset_states()
    # `entrada` rather than `input`, so the builtin is not shadowed.
    for batch, (entrada, target) in enumerate(ds):
        # Only the forward pass and the loss are recorded on the tape.
        with tf.GradientTape() as tape:
            predictions, hidden = modelo(entrada, hidden)
            target = tf.reshape(target, (-1,))
            loss = loss_function(target, predictions)
        # Gradients and the optimizer step run outside the tape context so the
        # backward pass and the weight update are not themselves recorded.
        grads = tape.gradient(loss, modelo.trainable_variables)
        optimizer.apply_gradients(zip(grads, modelo.trainable_variables))
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss{:.4f}'.format(epoch + 1, batch, loss))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1 Batch 0 Loss10.2544
Epoch 1 Batch 100 Loss7.8267
Epoch 1 Batch 200 Loss8.0981
Epoch 1 Batch 300 Loss8.1649
Epoch 1 Batch 400 Loss7.0558
Epoch 1 Batch 500 Loss7.4419
Epoch 1 Batch 600 Loss6.8362
Epoch 1 Batch 700 Loss6.2766
Epoch 1 Batch 800 Loss6.5017
Epoch 1 Batch 900 Loss6.3314
Epoch 1 Batch 1000 Loss5.9891
Epoch 1 Batch 1100 Loss7.2984
Epoch 1 Batch 1200 Loss6.6151
Epoch 1 Batch 1300 Loss7.4415
Epoch 1 Batch 1400 Loss6.7402
Epoch 1 Batch 1500 Loss6.3960
Epoch 1 Batch 1600 Loss7.3402
Epoch 1 Batch 1700 Loss7.1340
Epoch 1 Batch 1800 Loss6.6472
Epoch 1 Batch 1900 Loss6.8760
Epoch 1 Batch 2000 Loss6.3845
Epoch 1 Batch 2100 Loss6.4398
Epoch 1 Batch 2200 Loss6.4507
Epoch 1 Batch 2300 Loss6.2884
Epoch 1 Batch 2400 Loss7.1156
Epoch 1 Batch 2500 Loss6.3470
Epoch 1 Batch 2600 Loss6.1039
Epoch 1 Batch 2700 Loss6.3864
Epoch 1 Batch 2800 Loss6.9723
Epoch 1 Batch 2900 Lo

KeyboardInterrupt: 

In [14]:
start_string = "Eduardo viajou não sei que"

# Map the prompt to token ids. Words missing from the vocabulary fall back to
# the "UNK" id; a bare dicionario.get(word) would yield None for them and the
# list could not be converted to a tensor.
indice_unk = dicionario["UNK"]
input_eval = [dicionario.get(palavra, indice_unk) for palavra in start_string.lower().split()]
# Add a leading axis of size 1 so the prompt is a single-example batch.
input_eval = tf.expand_dims(input_eval, 0)

text_generated = ''

# Zero initial state for a batch of one.
hidden = [tf.zeros((1, LSTM_UNITS))]

predictions, hidden = modelo(input_eval, hidden)

# The model returns one row of scores per time step; the last row belongs to
# the final prompt word, so its argmax is the predicted next token.
predicted_id = tf.argmax(predictions[-1]).numpy()

text_generated += " " + dicionario_indices[predicted_id]

print(start_string + text_generated)

Eduardo viajou não sei que o


In [9]:
for epoch in range(2):
    # The return value of reset_states() is used as the initial hidden state.
    # NOTE(review): Keras' reset_states() appears to return None, which would
    # make the GRU start from its default initial state — confirm intended.
    hidden = modelo.reset_states()
    # `entrada` rather than `input`, so the builtin is not shadowed.
    for batch, (entrada, target) in enumerate(ds):
        # Only the forward pass and the loss are recorded on the tape.
        with tf.GradientTape() as tape:
            predictions, hidden = modelo(entrada, hidden)
            target = tf.reshape(target, (-1,))
            loss = loss_function(target, predictions)
        # Gradients and the optimizer step run outside the tape context so the
        # backward pass and the weight update are not themselves recorded.
        grads = tape.gradient(loss, modelo.trainable_variables)
        optimizer.apply_gradients(zip(grads, modelo.trainable_variables))
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss{:.4f}'.format(epoch + 1, batch, loss))

Epoch 1 Batch 0 Loss6.7043
Epoch 1 Batch 100 Loss5.9692
Epoch 1 Batch 200 Loss6.0786
Epoch 1 Batch 300 Loss6.6230
Epoch 1 Batch 400 Loss5.5928
Epoch 1 Batch 500 Loss6.2438
Epoch 1 Batch 600 Loss5.8269
Epoch 1 Batch 700 Loss5.2772
Epoch 1 Batch 800 Loss5.5680
Epoch 1 Batch 900 Loss5.5921
Epoch 1 Batch 1000 Loss5.0857
Epoch 1 Batch 1100 Loss6.2973
Epoch 1 Batch 1200 Loss5.6218
Epoch 1 Batch 1300 Loss6.6073
Epoch 1 Batch 1400 Loss5.7490
Epoch 1 Batch 1500 Loss5.8344
Epoch 1 Batch 1600 Loss6.6077
Epoch 1 Batch 1700 Loss6.1136
Epoch 1 Batch 1800 Loss5.9632
Epoch 1 Batch 1900 Loss5.8360
Epoch 1 Batch 2000 Loss5.6364
Epoch 1 Batch 2100 Loss5.9290
Epoch 1 Batch 2200 Loss5.8303
Epoch 1 Batch 2300 Loss5.8793
Epoch 1 Batch 2400 Loss6.1046
Epoch 1 Batch 2500 Loss5.6369
Epoch 1 Batch 2600 Loss5.7357
Epoch 1 Batch 2700 Loss5.7923
Epoch 1 Batch 2800 Loss6.3765
Epoch 1 Batch 2900 Loss6.2885
Epoch 1 Batch 3000 Loss6.0760
Epoch 1 Batch 3100 Loss5.7663
Epoch 1 Batch 3200 Loss5.3522
Epoch 1 Batch 3300 Los

Epoch 2 Batch 11200 Loss5.6349
Epoch 2 Batch 11300 Loss5.3836
Epoch 2 Batch 11400 Loss5.1710
Epoch 2 Batch 11500 Loss5.3312
Epoch 2 Batch 11600 Loss5.8600
Epoch 2 Batch 11700 Loss5.8938
Epoch 2 Batch 11800 Loss6.0598
Epoch 2 Batch 11900 Loss5.6992
Epoch 2 Batch 12000 Loss5.6496
Epoch 2 Batch 12100 Loss5.6777
Epoch 2 Batch 12200 Loss5.5619
Epoch 2 Batch 12300 Loss5.3040
Epoch 2 Batch 12400 Loss5.8656
Epoch 2 Batch 12500 Loss5.8522
Epoch 2 Batch 12600 Loss5.8139
Epoch 2 Batch 12700 Loss6.2198
Epoch 2 Batch 12800 Loss6.2507
Epoch 2 Batch 12900 Loss6.1778
Epoch 2 Batch 13000 Loss5.8944
Epoch 2 Batch 13100 Loss6.4996
Epoch 2 Batch 13200 Loss5.9483
Epoch 2 Batch 13300 Loss5.4229
Epoch 2 Batch 13400 Loss5.9396
Epoch 2 Batch 13500 Loss5.0432
Epoch 2 Batch 13600 Loss5.8320
Epoch 2 Batch 13700 Loss5.7045
Epoch 2 Batch 13800 Loss5.7670
Epoch 2 Batch 13900 Loss5.8386
Epoch 2 Batch 14000 Loss5.4210
Epoch 2 Batch 14100 Loss5.6278
Epoch 2 Batch 14200 Loss5.5313
Epoch 2 Batch 14300 Loss5.6588
Epoch 2 