In [221]:
import collections
from typing import List, Set, Dict, Tuple, Generator

import numpy as np
import pandas as pd
import scipy
import tensorflow as tf

# File names for persisted artifacts ("caminho" = path). None of them is
# referenced in the visible cells -- presumably consumed by a save/export step
# elsewhere; TODO confirm they are still needed.
CAMINHO_MODELO = "modelo.json"
CAMINHO_DICIONARIO = "dicionario.json"
CAMINHO_DICIONARIO_INDICES = "dicionario_indices.json"
# Layer widths handed to the model constructors: EMBEDDING_UNITS is used as the
# embedding dimension, LSTM_UNITS as the unit count of the recurrent layer
# (it is passed to both the LSTM-based and the GRU-based model).
EMBEDDING_UNITS = 128
LSTM_UNITS = 128

In [5]:
def criar_dicionario(palavras: List[str], minimo_palavras: int) -> Tuple[Dict[str, int], Dict[int, str], Dict[str, int]]:
    """Build the word <-> index vocabularies from a flat sequence of tokens.

    Only tokens that occur strictly more than ``minimo_palavras`` times receive
    an index of their own. The last index (``len(dicionario) - 1``) is always
    reserved for the ``"UNK"`` placeholder.

    Args:
        palavras: Flat iterable of tokens (any hashable, normally ``str``).
        minimo_palavras: Exclusive frequency threshold; a token is kept only
            when its count is greater than this value.

    Returns:
        A tuple ``(dicionario, dicionario_indices, dicionario_freq)``:
        token -> index, index -> token, and the raw ``collections.Counter``
        with the frequency of every token (including the discarded ones).
    """
    dicionario_freq = collections.Counter(palavras)
    # A literal "UNK" token is excluded here on purpose: if it were enumerated
    # like any other word, the assignment below would move it to a new index,
    # leaving a hole in the index range and an index equal to len(dicionario),
    # which is out of bounds for an embedding sized with len(dicionario).
    palavras_no_dicionario = [
        palavra
        for palavra, qtd in dicionario_freq.items()
        if qtd > minimo_palavras and palavra != "UNK"
    ]
    dicionario = {palavra: indice for indice, palavra in enumerate(palavras_no_dicionario)}
    dicionario["UNK"] = len(dicionario)
    dicionario_indices = {indice: palavra for palavra, indice in dicionario.items()}
    return dicionario, dicionario_indices, dicionario_freq
# Load the corpus and discard the rows whose "categoria" is "tradução".
df_obras = pd.read_csv("./obras_machado_de_assis.csv")
df_obras = df_obras[df_obras["categoria"] != "tradução"]

minimo_palavras_frase = 2
# Collapse runs of newlines into a space and runs of dots into a single dot,
# then split every text into sentences on whitespace that follows "." or "?".
# The patterns are raw strings (no invalid escape sequences) and regex=True is
# explicit because pandas 2.0 changed the default of str.replace to literal
# matching, which would silently turn these clean-up steps into no-ops.
frases_por_obra = (
    df_obras["texto"]
    .str.replace(r"\n+", " ", regex=True)
    .str.replace(r"\.+", ".", regex=True)
    .str.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s")
)
dataset = pd.Series(np.concatenate(frases_por_obra.values))
# Strip everything that is not a word character or whitespace, trim, lowercase.
dataset = dataset.str.replace(r"[^\w\s\d]", "", regex=True).str.strip().str.lower()
# Keep only sentences with more than `minimo_palavras_frase` words.
dataset = dataset[dataset.apply(lambda row: len(row.split())) > minimo_palavras_frase].reset_index(drop=True)
palavras = np.concatenate(dataset.apply(lambda row: row.split()))

minimo_palavras_dicionario = 2
dicionario, dicionario_indices, dicionario_freq = criar_dicionario(palavras, minimo_palavras_dicionario)
# Read the UNK index from the dictionary itself instead of assuming it is the
# last position; out-of-vocabulary words are mapped to it.
indice_unk = dicionario["UNK"]
dataset = dataset.apply(lambda row: [dicionario.get(palavra, indice_unk) for palavra in row.split()])
tamanho_dicionario = len(dicionario)
tamanho_dataset = len(dataset)

In [248]:
def criar_modelo(dim_entrada, dim_embedding, dim_lstm):
    """Assemble and compile the embedding -> LSTM -> softmax network.

    Args:
        dim_entrada: Used both as the embedding's input dimension and as the
            number of units of the softmax output layer.
        dim_embedding: Width of the embedding vectors.
        dim_lstm: Number of units of the LSTM layer.

    Returns:
        The compiled ``tf.keras`` model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    camadas = tf.keras.layers

    # Variable-length sequences of integer ids go in; the LSTM returns only its
    # last output, so the network emits one distribution per input sequence.
    tokens = camadas.Input((None,), name="entrada")
    vetores = camadas.Embedding(dim_entrada, dim_embedding, name='embedding')(tokens)
    resumo = camadas.LSTM(dim_lstm, name="lstm")(vetores)
    distribuicao = camadas.Dense(dim_entrada, name="saida", activation="softmax")(resumo)

    rede = tf.keras.models.Model(inputs=tokens, outputs=distribuicao)
    rede.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=["accuracy"],
    )
    return rede

    
# Instantiate the network with the vocabulary size as both the input and the
# output dimension, then print its layer summary.
modelo = criar_modelo(dim_entrada=tamanho_dicionario, dim_embedding=EMBEDDING_UNITS, dim_lstm=LSTM_UNITS)
modelo.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
entrada (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding (Embedding)        (None, None, 128)         3635968   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
saida (Dense)                (None, 28406)             3664374   
Total params: 7,431,926
Trainable params: 7,431,926
Non-trainable params: 0
_________________________________________________________________


In [259]:
def criar_gerador(dataset, tamanho_dicionario, repetir=True):
    """Yield one ``(entrada, alvo)`` pair per sentence for ``fit_generator``.

    For a sentence ``d`` the pair is ``(d[:-1], d[1:])``: every token aligned
    with the token that follows it.

    Keras expects a training generator to loop over its data indefinitely, so
    by default the data is replayed from the start once it has been consumed.
    This requires ``dataset`` to be re-iterable (list, ``pd.Series``, ...); a
    one-shot iterator simply ends after its single pass.

    Args:
        dataset: Iterable of sentences, each a sequence of integer token ids.
        tamanho_dicionario: Unused; kept so existing call sites keep working.
        repetir: When True (default) cycle over ``dataset`` forever; when False
            stop after a single pass.

    Yields:
        Tuple ``(np.ndarray, np.ndarray)`` with the inputs and the targets.
    """
    while True:
        produziu = False
        for d in dataset:
            produziu = True
            yield np.array(d[:-1]), np.array(d[1:])
        # Stop when a single pass was requested, or when the pass produced
        # nothing (empty or exhausted input) -- otherwise this would spin forever.
        if not (repetir and produziu):
            return
        

# Train for two epochs, feeding the model from the generator. This call asks
# for epochs * steps_per_epoch = 2 * (tamanho_dataset - 1) batches in total.
# NOTE(review): steps_per_epoch is one less than the number of sentences
# (tamanho_dataset is len(dataset)) -- confirm the "-1" is intentional.
gerador = criar_gerador(dataset, tamanho_dicionario)
modelo.fit_generator(gerador, epochs=2, steps_per_epoch=tamanho_dataset-1)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fe650086240>

In [261]:
# Ad-hoc prediction check: encode a sample sentence with the vocabulary, run
# the model and map every predicted row back to a word through argmax.
# NOTE(review): `dicionario[palavra]` raises KeyError for any word that is not
# in the vocabulary. `vetor` also carries two hard-coded index lists of
# different lengths ([1, 2, 3, 4] and [1, 1]) next to the encoded sentence --
# this looks like leftover experimentation; confirm that this ragged nested
# list is really what `modelo.predict` should receive.
frase = "o rei é um pouco"
vetor = [[dicionario[palavra] for palavra in frase.split()], [1, 2, 3, 4], [1, 1]]
a = modelo.predict(vetor)

[dicionario_indices[np.argmax(b)] for b in a]
# dicionario_indices[np.argmax(a)]

['que', 'e', 'que', 'pouco', 'e']

In [160]:
# Decode only the third prediction row (index 2) of `a` back to a word.
dicionario_indices[np.argmax(a[2])]

'UNK'

In [217]:
class Model(tf.keras.Model):
    """Embedding -> GRU -> dense network that also hands back the GRU state.

    The dense layer has no activation, so ``call`` returns raw scores with
    ``vocab_size`` columns.
    """

    def __init__(self, vocab_size, embedding_dim, units, batch_size):
        """Create the three layers.

        Args:
            vocab_size: Input dimension of the embedding and width of the
                dense output layer.
            embedding_dim: Width of the embedding vectors.
            units: Number of GRU units.
            batch_size: Stored on the instance; not read by this class.
        """
        super().__init__()
        self.units = units
        self.batch_size = batch_size
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        opcoes_gru = dict(
            return_sequences=True,
            return_state=True,
            recurrent_activation='sigmoid',
            recurrent_initializer='glorot_uniform',
        )
        self.gru = tf.keras.layers.GRU(self.units, **opcoes_gru)
        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, hidden):
        """Run one forward pass.

        Args:
            inputs: Token ids fed to the embedding layer.
            hidden: Initial state handed to the GRU.

        Returns:
            Tuple ``(scores, state)``: the dense layer applied to the GRU
            outputs flattened to two dimensions, and the state returned by
            the GRU.
        """
        embutidos = self.embedding(inputs)
        sequencia, estado = self.gru(embutidos, initial_state=hidden)
        # Merge all leading axes so that each GRU output vector becomes one row.
        largura = sequencia.shape[2]
        achatado = tf.reshape(sequencia, (-1, largura))
        return self.fc(achatado), estado

# Rebind `modelo` to the subclassed GRU model defined above; LSTM_UNITS is
# reused as the GRU unit count. This replaces whatever `modelo` referred to
# before this cell ran.
BATCH_SIZE = 100
modelo = Model(len(dicionario), EMBEDDING_UNITS, LSTM_UNITS, BATCH_SIZE)
# Disabled leftovers from earlier experiments (kept as found):
# modelo.build((10,))
# modelo.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=["acc"])
# modelo, modelo_embedding = criar_modelo(len(dicionario), EMBEDDINGS)
# modelo_embedding.summary()
# Optimizer used by the manual training loop (TF 1.x API, default settings).
optimizer = tf.train.AdamOptimizer()
def loss_function(labels, logits):
    """Return the sparse softmax cross-entropy of ``logits`` against ``labels``.

    Thin wrapper over ``tf.losses.sparse_softmax_cross_entropy``; both
    arguments are forwarded unchanged.
    """
    perda = tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels)
    return perda

# Flatten every sentence into (token, following token) pairs: the inputs are
# all tokens but the last, the targets are all tokens but the first.
entradas, alvos = [], []
for frase_ids in dataset:
    entradas.extend(frase_ids[:-1])
    alvos.extend(frase_ids[1:])

# One column per array, so each row is a single-token example.
X = np.array(entradas).reshape((-1, 1))
Y = np.array(alvos).reshape(-1, 1)

# Fixed-size batches only: the trailing partial batch is dropped.
ds = tf.data.Dataset.from_tensor_slices((X, Y)).batch(BATCH_SIZE, drop_remainder=True)


# Manual training loop: a single pass over `ds`, feeding the GRU state returned
# by each batch into the next call and logging the loss every 100 batches.
# NOTE(review): iterating `ds` with enumerate() needs eager execution -- the
# output recorded for this cell is "RuntimeError: dataset.__iter__() is only
# supported when eager execution is enabled.", i.e. the loop body never ran.
for epoch in range(1):
    # NOTE(review): `hidden` receives whatever reset_states() returns --
    # presumably None, which would mean "no explicit initial state"; confirm.
    hidden = modelo.reset_states()
    # NOTE(review): the loop variable `input` shadows the Python builtin.
    for (batch, (input, target)) in enumerate(ds):
        with tf.GradientTape() as tape:
            predictions, hidden = modelo(input, hidden)
            # Flatten the targets to one label per prediction row.
            target = tf.reshape(target, (-1,))
            loss = loss_function(target, predictions)
            # NOTE(review): the gradient computation, the optimizer step and
            # the logging below all sit inside the tape context; only the
            # forward pass and the loss need to be recorded.
            grads = tape.gradient(loss, modelo.variables)
            optimizer.apply_gradients(zip(grads, modelo.variables))
            if batch % 100 == 0:
                print('Epoch {} Batch {} Loss{:.4f}'.format(epoch + 1, batch, loss))

RuntimeError: dataset.__iter__() is only supported when eager execution is enabled.

In [8]:
# NOTE(review): the summary recorded for this cell lists a 64-wide embedding,
# an LSTM that also returns its state and a 1-unit "saida" layer. That matches
# neither model definition visible in this notebook, so the output presumably
# comes from a cell that was later edited or removed -- re-run from a fresh
# kernel to confirm which model `modelo` refers to here.
modelo.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
entrada (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding (Embedding)        (None, None, 64)          1817984   
_________________________________________________________________
lstm (LSTM)                  [(None, 128), (None, 128) 98816     
_________________________________________________________________
saida (Dense)                (None, 1)                 129       
Total params: 1,916,929
Trainable params: 1,916,929
Non-trainable params: 0
_________________________________________________________________


In [72]:
def criar_gerador(dataset, tamanho_dicionario):
    """Yield real/scrambled sentence examples, one at a time, in random order.

    Every sentence contributes two examples: the sentence itself with label 1
    and a randomly shuffled copy of it with label 0. All examples are built
    first, then emitted in a random order. Randomness comes from the global
    ``np.random`` state.

    Args:
        dataset: Iterable of sentences, each a sequence of integer token ids.
            Sentences may have different lengths.
        tamanho_dicionario: Unused; kept so existing call sites keep working.

    Yields:
        Tuple ``(x, y)`` where ``x`` is the sentence reshaped to ``(1, n)`` and
        ``y`` is an array of shape ``(1,)`` holding the label.
    """
    exemplos = []
    rotulos = []
    for d in dataset:
        exemplos.append(np.array(d))
        rotulos.append(1)
        embaralhado = np.array(d)
        np.random.shuffle(embaralhado)
        exemplos.append(embaralhado)
        rotulos.append(0)

    ordem = np.arange(len(rotulos))
    np.random.shuffle(ordem)
    rotulos = np.array(rotulos)[ordem].reshape((-1, 1))
    # The examples stay in a plain Python list and are picked by index:
    # wrapping variable-length sentences in np.array() would build a ragged
    # array, which NumPy >= 1.24 rejects with a ValueError.
    for indice, y in zip(ordem, rotulos):
        yield exemplos[indice].reshape(1, -1), y


# Time a short training run (1 epoch of 100 steps) fed by the generator.
# NOTE(review): with -n2 -r2 the statement is executed four times against the
# same generator object and the same model, so each timed run continues from
# where the previous one stopped and keeps training the model -- the runs are
# not independent. Confirm that use_multiprocessing=True is intended with a
# plain Python generator.
gerador = criar_gerador(dataset, tamanho_dicionario)
%timeit -n2 -r2 modelo.fit_generator(gerador, epochs=1, steps_per_epoch=100, use_multiprocessing=True)

7.26 s ± 172 ms per loop (mean ± std. dev. of 2 runs, 2 loops each)


In [63]:
# Scratch cell. The commented lines are disabled experiments, kept as found.
# modelo.fit(np.array([[X[:1]]]), Y[:1])
# X, Y.reshape((-1,))
# X, Y = np.array(X).reshape((-1, 1)), np.array(Y).reshape(-1, 1)

# ds = tf.data.Dataset.from_tensor_slices((X, Y))
# ds = ds.batch(BATCH_SIZE, drop_remainder=True)
# X.shape
# Y.shape
# Try to fit on two random integer sequences of different lengths (10 and 5),
# each wrapped in its own list, with labels 0 and 1.
# NOTE(review): the output recorded for this cell is "ValueError: Input arrays
# should have the same number of samples as target arrays. Found 1 input
# samples and 2 target samples." -- so this nested list is not interpreted as
# two samples. Sequences of different lengths presumably need padding to a
# common length (or one fit call per sequence); confirm the intended shape.
bX = [[np.random.randint(0, 10, 10)], [np.random.randint(0, 10, 5)]]
bY = [0, 1]
modelo.fit(bX, bY)


ValueError: Input arrays should have the same number of samples as target arrays. Found 1 input samples and 2 target samples.