# LSTM com Embeddings (Feat. Machado de Assis)

In [1]:
import collections
from typing import List, Set, Dict, Tuple, Generator

import numpy as np
import pandas as pd
import scipy
import tensorflow as tf

# File names for the model and the two vocabulary mappings.
# NOTE(review): none of these three paths is read or written in the visible
# part of the file — presumably used for (de)serialization elsewhere; confirm.
CAMINHO_MODELO = "modelo.json"
CAMINHO_DICIONARIO = "dicionario.json"
CAMINHO_DICIONARIO_INDICES = "dicionario_indices.json"
# Layer sizes; LSTM_UNITS is passed as the recurrent-layer width further down.
EMBEDDING_UNITS = 64
LSTM_UNITS = 128

In [79]:
def criar_dicionario(palavras: List[str], minimo_palavras: int) -> Tuple[Dict[str, int], Dict[int, str], Dict[str, int]]:
    """Build the vocabulary mappings for a tokenized corpus.

    Args:
        palavras: Every token of the corpus, in order.
        minimo_palavras: Frequency threshold; a word enters the vocabulary
            only when it occurs strictly more than this many times.

    Returns:
        A tuple ``(dicionario, dicionario_indices, dicionario_freq)``:
        ``dicionario`` maps word -> index (indices follow first-occurrence
        order, and the extra ``"UNK"`` entry always takes the last index),
        ``dicionario_indices`` is the inverse index -> word mapping, and
        ``dicionario_freq`` is the raw frequency counter of all tokens.
    """
    dicionario_freq = collections.Counter(palavras)
    palavras_no_dicionario = [palavra for palavra, qtd in dicionario_freq.items() if qtd > minimo_palavras]
    dicionario = {palavra: indice for indice, palavra in enumerate(palavras_no_dicionario)}
    # Reserve the last index for out-of-vocabulary words.
    dicionario["UNK"] = len(dicionario)
    dicionario_indices = {indice: palavra for palavra, indice in dicionario.items()}
    return dicionario, dicionario_indices, dicionario_freq
# Load the corpus and keep only rows whose "categoria" is not "tradução".
df_obras = pd.read_csv("./obras_machado_de_assis.csv")
df_obras = df_obras[df_obras["categoria"] != "tradução"]

minimo_palavras_frase = 2
# Collapse newline runs into one space and dot runs into one dot, then split
# every text into sentences after a "." or "?" followed by whitespace.
# regex=True is explicit because newer pandas treats the pattern passed to
# str.replace as a literal string by default; raw strings avoid the invalid
# escape sequences ("\.", "\w") the plain literals contained.
dataset = pd.Series(
    np.concatenate(
        df_obras["texto"]
        .str.replace(r"\n+", " ", regex=True)
        .str.replace(r"\.+", ".", regex=True)
        .str.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s")
        .values
    )
)
# Strip everything that is not a word character, whitespace or digit, then
# trim and lowercase each sentence.
dataset = dataset.str.replace(r"[^\w\s\d]", "", regex=True).str.strip().str.lower()
# Keep only sentences with more than `minimo_palavras_frase` words.
dataset = dataset[dataset.apply(lambda row: len(row.split())) > minimo_palavras_frase].reset_index(drop=True)
palavras = np.concatenate(dataset.apply(lambda row: row.split()))

minimo_palavras_dicionario = 2
dicionario, dicionario_indices, dicionario_freq = criar_dicionario(palavras, minimo_palavras_dicionario)
# Encode each sentence as a list of vocabulary indices; words outside the
# vocabulary fall back to the last index, which criar_dicionario gives "UNK".
dataset = dataset.apply(lambda row: [dicionario.get(palavra, len(dicionario)-1) for palavra in row.split()])
tamanho_dicionario = len(dicionario)
tamanho_dataset = len(dataset)

In [80]:
# Corpus summary: sentence count, token count, vocabulary size and the mean
# number of words per sentence (rounded to one decimal).
print(
    "Frases no dataset: {0}\nTotal de palavras: {1}\nPalavras no dicionário: {2}\nMédia de palavras por frase: {3}".format(
        len(dataset),
        len(palavras),
        len(dicionario),
        round(len(palavras) / len(dataset), 1),
    )
)

Frases no dataset: 98366
Total de palavras: 1698151
Palavras no dicionário: 28406
Média de palavras por frase: 17.3


In [81]:
# Preview the first encoded sentences (each row is a list of vocabulary indices).
dataset.head()

0                                            [0, 1, 2]
1                                         [3, 4, 5, 6]
2              [7, 8, 9, 10, 2, 11, 12, 13, 12, 14, 2]
3    [3, 15, 16, 17, 15, 18, 19, 20, 2, 21, 22, 23,...
4    [49, 50, 51, 52, 53, 54, 35, 29, 22, 55, 2, 56...
dtype: object

In [69]:
def criar_modelo(dim_entrada, dim_embedding, dim_lstm):
    """Build and compile a binary sequence classifier (Embedding -> LSTM -> sigmoid).

    Args:
        dim_entrada: Input dimension of the embedding layer (vocabulary size).
        dim_embedding: Output dimension of the embedding layer.
        dim_lstm: Number of LSTM units.

    Returns:
        The compiled Keras model mapping a variable-length sequence of word
        indices to a single sigmoid output, trained with binary cross-entropy.
    """
    entrada = tf.keras.layers.Input((None,), name="entrada")
    camada_embedding = tf.keras.layers.Embedding(dim_entrada, dim_embedding, name='embedding')
    camada_contexto = camada_embedding(entrada)
    # The Keras keyword is `return_state` (singular). With it the layer returns
    # the last output plus the final hidden and cell states.
    lstm, estado_h, estado_c = tf.keras.layers.LSTM(dim_lstm, name="lstm", return_state=True)(camada_contexto)
    camada_saida = tf.keras.layers.Dense(1,name="saida", activation="sigmoid")(lstm)
    
    modelo = tf.keras.models.Model(inputs=entrada, outputs=camada_saida)
    modelo.compile(loss='binary_crossentropy', optimizer='adam', metrics=["acc"])
    
    # NOTE(review): this state-exposing model shares the layers above but is
    # not returned, so callers cannot reach it yet.
    modelo_estado = tf.keras.models.Model(inputs=entrada, outputs=[estado_h, estado_c])
    return modelo

def criar_modelo_preditivo(dim_entrada, modelo_estado):
    """Unfinished stub: creates a one-element input layer and returns None.

    NOTE(review): neither `dim_entrada` nor `modelo_estado` is used yet, and
    the created input layer is not connected to anything.
    """
    entrada_preditivo = tf.keras.layers.Input((1,), name="entrada_preditivo")

# modelo = criar_modelo(dim_entrada=tamanho_dicionario, dim_embedding=EMBEDDING_UNITS, dim_lstm=LSTM_UNITS)
# modelo.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
entrada (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding (Embedding)        (None, None, 64)          1817984   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               98816     
_________________________________________________________________
saida (Dense)                (None, 1)                 129       
Total params: 1,916,929
Trainable params: 1,916,929
Non-trainable params: 0
_________________________________________________________________


In [None]:
def criar_gerador(dataset, tamanho_dicionario):
    """Yield shuffled (sequence, label) samples for the real-vs-shuffled task.

    Every sentence contributes two samples: the original word order with
    label 1 and a randomly permuted copy with label 0. All samples are then
    yielded once, in random order, each as a batch of one.

    Args:
        dataset: Iterable of sentences, each a sequence of word indices.
            Sentences may have different lengths.
        tamanho_dicionario: Unused; kept so existing callers keep working.

    Yields:
        Tuples ``(x, y)`` where ``x`` has shape ``(1, sentence_length)`` and
        ``y`` has shape ``(1,)``.

    NOTE(review): the generator ends after a single pass over the data, and a
    permuted copy can happen to equal the original order (it is still
    labelled 0).
    """
    amostras = []
    rotulos = []
    for frase in dataset:
        positivo = np.array(frase)
        negativo = np.array(frase)
        np.random.shuffle(negativo)
        amostras.extend((positivo, negativo))
        rotulos.extend((1, 0))
    ordem = np.arange(len(rotulos))
    np.random.shuffle(ordem)
    # Index the Python lists directly instead of stacking the samples into one
    # ndarray: sentences differ in length, and NumPy refuses to build an array
    # from ragged rows.
    for posicao in ordem:
        yield (amostras[posicao].reshape(1, -1), np.array([rotulos[posicao]]))


# The generator yields two single-sample batches per sentence (the original
# order and a shuffled copy), hence 2 * tamanho_dataset steps for one epoch.
gerador = criar_gerador(dataset, tamanho_dicionario)
modelo.fit_generator(gerador, epochs=1, steps_per_epoch=2*tamanho_dataset)



In [86]:
# `gerador` yields exactly 2 * tamanho_dataset items and the training call
# consumes that many steps, so it is exhausted by now; evaluate on a fresh one.
# NOTE(review): this still evaluates on the training sentences.
modelo.evaluate_generator(criar_gerador(dataset, tamanho_dicionario), steps=1000)

[0.014203823552459654, 0.997]

In [94]:
frase = "Eu gostaria de comprar um".lower()
# Encode the fixed prefix once; out-of-vocabulary words map to the last index.
prefixo = [dicionario.get(palavra, tamanho_dicionario-1) for palavra in frase.split()]
candidatas = list(dicionario.items())
# One row per candidate word: the prefix followed by the candidate's own index.
vetores = np.array([prefixo + [indice] for _, indice in candidatas])
# All rows have the same length, so score them in one batched call instead of
# one predict() per vocabulary word.
pontuacoes = modelo.predict(vetores)
# Keep each score as a (1, 1) array, the shape a single-sample predict returns.
probs = [(palavra, pontuacoes[i:i + 1]) for i, (palavra, _) in enumerate(candidatas)]
print(probs[:10])

[('poesias', array([[0.9998054]], dtype=float32)), ('de', array([[0.9995316]], dtype=float32)), ('a', array([[0.99953747]], dtype=float32)), ('zaluar', array([[0.99983716]], dtype=float32)), ('garnier', array([[0.9998442]], dtype=float32)), ('editor', array([[0.9997458]], dtype=float32)), ('1863', array([[0.9999052]], dtype=float32)), ('dois', array([[0.99989104]], dtype=float32)), ('motivos', array([[0.99992335]], dtype=float32)), ('me', array([[0.9995493]], dtype=float32))]


In [98]:
# Ten highest-scoring candidate words, best first.
sorted(probs, key=lambda par: par[1][0], reverse=True)[:10]

[('d', array([[0.9999726]], dtype=float32)),
 ('mundo', array([[0.9999697]], dtype=float32)),
 ('terra', array([[0.99996877]], dtype=float32)),
 ('coisa', array([[0.99996865]], dtype=float32)),
 ('casa', array([[0.9999685]], dtype=float32)),
 ('mãe', array([[0.99996805]], dtype=float32)),
 ('futuro', array([[0.9999678]], dtype=float32)),
 ('quê', array([[0.9999677]], dtype=float32)),
 ('tarde', array([[0.999967]], dtype=float32)),
 ('homens', array([[0.99996686]], dtype=float32))]

In [None]:
# Manual training loop: forward pass under a gradient tape, then one optimizer
# step per batch; the recurrent state returned by the model is fed into the
# next batch.
for epoch in range(10):
    # NOTE(review): reset_states() is used for its side effect; whatever it
    # returns becomes the initial state of the first batch — confirm it is
    # None so the layer falls back to its default initial state.
    hidden = modelo.reset_states()
    for (batch, (entradas, target)) in enumerate(ds):
        # Only the forward pass and the loss need to be recorded.
        with tf.GradientTape() as tape:
            predictions, hidden = modelo(entradas, hidden)
            target = tf.reshape(target, (-1,))
            loss = loss_function(target, predictions)
        # Compute and apply gradients outside the tape context so these
        # operations are not themselves recorded.
        grads = tape.gradient(loss, modelo.variables)
        optimizer.apply_gradients(zip(grads, modelo.variables))
        if batch % 1000 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, loss))

In [None]:
# Predict the single word that follows `start_string`.
start_string = "Eduardo viajou para o que"

# Words missing from the vocabulary fall back to the last index ("UNK");
# without a default, .get() returns None and the tensor cannot be built.
input_eval = [dicionario.get(string, len(dicionario)-1) for string in start_string.lower().split()]
# Add the batch dimension: (words,) -> (1, words).
input_eval = tf.expand_dims(input_eval, 0)

text_generated = ''

# Zero initial state for a batch of one.
hidden = [tf.zeros((1, LSTM_UNITS))]

predictions, hidden = modelo(input_eval, hidden)

# The model flattens its output to one row per time step, so the last row
# holds the scores for the word after the final input word.
predicted_id = tf.argmax(predictions[-1]).numpy()

text_generated += " " + dicionario_indices[predicted_id]

print(start_string + text_generated)

In [None]:
def criar_gerador(dataset, dicionario):
    """Yield one (context, target) pair per sentence for next-word prediction.

    Args:
        dataset: Iterable of non-empty sentences, each a sequence of word
            indices.
        dicionario: Unused; kept so existing callers keep working.

    Yields:
        Tuples ``(x, y)`` where ``x`` holds every word but the last with shape
        ``(1, sentence_length - 1)`` and ``y`` holds the last word with shape
        ``(1,)``.
    """
    for frase in dataset:
        # The loop previously had no body, which is a syntax error; this is
        # the yield that had been commented out.
        yield (np.array([frase[:-1]]), np.array([frase[-1]]))
        
# Smoke test: pull the first item from a freshly created generator.
next(criar_gerador(dataset, dicionario))

# gerador = criar_gerador(dataset, dicionario, modelo_embedding)
# modelo.fit_generator(gerador, epochs=1, steps_per_epoch=len(dataset)-1)

In [None]:
# Runs modelo_embedding.predict over every vocabulary index
# (0 .. len(dicionario) - 1) and displays the result.
# NOTE(review): `modelo_embedding` is not defined in the visible part of the
# file (only mentioned in commented-out code) — confirm where it comes from.
embeddings = modelo_embedding.predict(np.arange(len(dicionario)))
embeddings

In [None]:
frase = "a carne mais barata do mercado é a carne tendo".lower()
# Encode the sentence; out-of-vocabulary words map to the last index ("UNK").
entrada = [dicionario.get(palavra, len(dicionario)-1) for palavra in frase.split()]
# Pass an ndarray: Keras can interpret a plain Python list as a list of
# separate model inputs rather than as one array of samples.
vec = modelo.predict(np.array(entrada))
# For every predicted vector, look up the word whose embedding minimizes
# distancia_coseno.
[dicionario_indices[indice] for indice in distancia_coseno(embeddings, vec).argmin(axis=0)]

In [None]:
# Scratch variant of the nearest-word lookup: dot products of `vec` with every
# embedding row, divided by the plain row sums of `vec` and of `embeddings`
# (not their L2 norms), then the argmin per `vec` row.
[dicionario_indices[indice] for indice in (np.dot(vec, embeddings.T)/vec.sum(axis=1).reshape((-1, 1))/embeddings.sum(axis=1)).argmin(axis=1)]

In [None]:
def distancia_coseno(embeddings, vetores):
    """Return the cosine distance between every embedding and every vector.

    Args:
        embeddings: Array of shape ``(n_embeddings, dim)``.
        vetores: Array of shape ``(n_vetores, dim)``.

    Returns:
        Array of shape ``(n_embeddings, n_vetores)`` whose entry ``[i, j]`` is
        ``1 - cos(embeddings[i], vetores[j])``: 0 for vectors pointing the
        same way, 2 for opposite ones. ``argmin(axis=0)`` therefore gives,
        for each vector, the index of its closest embedding. Rows with zero
        norm produce ``nan``.
    """
    embeddings = np.asarray(embeddings)
    # Use the `vetores` argument; the function previously read the global
    # `vec` and ignored its own parameter.
    vetores = np.asarray(vetores)
    normas_embeddings = np.sqrt((embeddings ** 2).sum(axis=1)).reshape((-1, 1))
    normas_vetores = np.sqrt((vetores ** 2).sum(axis=1))
    similaridade = np.dot(embeddings, vetores.T) / normas_vetores / normas_embeddings
    # Distance, not similarity, so that smaller means closer as the name says.
    return 1.0 - similaridade
# np.dot(vec, embeddings.T)/vec.sum(axis=1).reshape((-1, 1))/embeddings.sum(axis=1)
# NOTE(review): `x` is never assigned at module level in the visible file; this
# bare expression looks like leftover scratch — confirm before running the cell.
x
#  (vec**2).sum(axis=1)/embeddings.sum(axis=1).reshape((-1, 1))) 

In [None]:
# np.argmax((embeddings - vec).sum(axis=0))
# For each row `v` of `vec`: subtract it from every embedding, sum the signed
# differences over the last axis and take the index of the largest sum.
# NOTE(review): this sums signed differences rather than a norm — confirm that
# is the intended measure.
np.array([embeddings - v for v in vec]).sum(axis=2).argmax(axis=1)

In [None]:
# Word stored at vocabulary index 0.
dicionario_indices[0]

In [None]:
class Model(tf.keras.Model):
    """Embedding -> GRU -> Dense network for next-word prediction.

    The dense layer has no activation, so the model emits one row of raw
    scores over the vocabulary per time step, together with the final GRU
    state.
    """

    def __init__(self, vocab_size, embedding_dim, units, batch_size):
        """Create the three layers.

        Args:
            vocab_size: Input size of the embedding and output size of the
                dense layer.
            embedding_dim: Output size of the embedding layer.
            units: Number of GRU units.
            batch_size: Stored on the instance; not used by the layers.
        """
        super().__init__()
        self.units = units
        self.batch_size = batch_size
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        gru_options = dict(
            return_sequences=True,
            return_state=True,
            recurrent_activation='sigmoid',
            recurrent_initializer='glorot_uniform',
        )
        self.gru = tf.keras.layers.GRU(self.units, **gru_options)
        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, hidden):
        """Run a forward pass.

        Args:
            inputs: Word indices fed to the embedding layer.
            hidden: Initial state handed to the GRU.

        Returns:
            A pair ``(scores, states)``: the dense-layer output for the GRU
            sequence flattened to two dimensions, and the final GRU state.
        """
        embedded = self.embedding(inputs)
        sequence, states = self.gru(embedded, initial_state=hidden)
        # Merge the leading axes so the dense layer sees one row per step.
        flattened = tf.reshape(sequence, (-1, sequence.shape[2]))
        return self.fc(flattened), states

BATCH_SIZE = 100
# The embedding-size constant defined at the top of the file is
# EMBEDDING_UNITS; the name EMBEDDINGS does not exist and raised NameError.
modelo = Model(len(dicionario), EMBEDDING_UNITS, LSTM_UNITS, BATCH_SIZE)
# modelo.build((10,))
# modelo.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=["acc"])
# modelo, modelo_embedding = criar_modelo(len(dicionario), EMBEDDINGS)
# modelo_embedding.summary()
optimizer = tf.train.AdamOptimizer()
def loss_function(labels, logits):
    """Return the sparse softmax cross-entropy of `logits` against integer `labels`."""
    perda = tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels)
    return perda

# Build (word, next word) training pairs: within each sentence, every word
# except the last is an input and the word that follows it is the target.
X = [anterior for frase in dataset for anterior in frase[:-1]]
Y = [seguinte for frase in dataset for seguinte in frase[1:]]

# One column per array: each sample is a single word index.
X, Y = np.array(X).reshape((-1, 1)), np.array(Y).reshape((-1, 1))
ds = tf.data.Dataset.from_tensor_slices((X, Y))
# Drop the final partial batch so every batch has exactly BATCH_SIZE rows.
ds = ds.batch(BATCH_SIZE, drop_remainder=True)


# Manual training loop: forward pass under a gradient tape, then one optimizer
# step per batch; the recurrent state returned by the model is fed into the
# next batch.
for epoch in range(1):
    # NOTE(review): reset_states() is used for its side effect; whatever it
    # returns becomes the initial state of the first batch — confirm it is
    # None so the layer falls back to its default initial state.
    hidden = modelo.reset_states()
    for (batch, (entradas, target)) in enumerate(ds):
        # Only the forward pass and the loss need to be recorded.
        with tf.GradientTape() as tape:
            predictions, hidden = modelo(entradas, hidden)
            target = tf.reshape(target, (-1,))
            loss = loss_function(target, predictions)
        # Compute and apply gradients outside the tape context so these
        # operations are not themselves recorded.
        grads = tape.gradient(loss, modelo.variables)
        optimizer.apply_gradients(zip(grads, modelo.variables))
        if batch % 100 == 0:
            # A space now separates "Loss" from its value in the log line.
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, loss))

In [None]:
# Sort the items returned by get_preds("não") by their second element, ascending.
# NOTE(review): `get_preds` is not defined in the visible part of the file.
sorted(get_preds("não"), key=lambda x: x[1])

In [None]:
# NOTE(review): `get_preds` is not defined in the visible part of the file.
get_preds("quatro")