In [1]:
from model import Chatbot, Encoder, Decoder
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from pathlib import Path
from unidecode import unidecode
import numpy as np
import pickle

In [2]:
# Directory holding the parallel corpus: line i of the questions file is
# paired with line i of the answers file further down the notebook.
data_dir = Path('data')
questions_path = data_dir / 'preguntas.txt'
answers_path = data_dir / 'respuestas.txt'

# Iterating a text file yields its lines with the trailing newline kept,
# exactly like readlines().
with questions_path.open('r', encoding='utf-8') as f:
    questions = list(f)

with answers_path.open('r', encoding='utf-8') as f:
    answers = list(f)

print(f"El archivo 'preguntas.txt' contiene {len(questions)} preguntas")

El archivo 'preguntas.txt' contiene 273 preguntas


In [3]:
# Markers wrapped around every answer so the decoder has an explicit first
# input token and an explicit stopping token.
START_TOKEN = '<START> '
END_TOKEN = ' <END>'

# Questions: lower-cased and transliterated to ASCII (accents removed).
processed_questions = [unidecode(line.lower()) for line in questions]

# Answers: same normalisation, plus surrounding whitespace stripped before the
# start/end markers are attached.
processed_answers = [
    START_TOKEN + unidecode(line.lower().strip()) + END_TOKEN
    for line in answers
]

print(processed_answers[0])

<START> "rosebud" <END>


In [4]:
# A single tokenizer is fitted on questions and answers together, so encoder
# and decoder share one vocabulary (and one set of word indices).
corpus = processed_questions + processed_answers

tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Number of distinct words seen in the corpus.
vocab_size = len(tokenizer.word_index)

print(f"There are {vocab_size} unique words")

There are 1725 unique words


In [5]:
# Preparing Encoder Input
# Questions are converted to word indices and right-padded with 0 up to the
# length of the longest question.
tokenized_questions = tokenizer.texts_to_sequences(processed_questions)
max_input_length = max(map(len, tokenized_questions))
encoder_input = pad_sequences(tokenized_questions, maxlen=max_input_length, padding='post')
print(f"Encoder input data shape: {encoder_input.shape}")

# Preparing Decoder Input
# Answers get their own maximum length. Padding them to the question length
# would truncate any answer longer than the longest question (pad_sequences
# truncates from the front by default, which would drop the start token).
tokenized_answers = tokenizer.texts_to_sequences(processed_answers)
max_output_length = max(map(len, tokenized_answers))
decoder_input = pad_sequences(tokenized_answers, maxlen=max_output_length, padding='post')
print(f"Decoder input data shape: {decoder_input.shape}")

# Preparing Target Output
# The target is the decoder input shifted one step to the left (start token
# removed), so that at each position the model learns to predict the next word.
tokenized_output = [sequence[1:] for sequence in tokenized_answers]
padded_target = pad_sequences(tokenized_output, maxlen=max_output_length, padding='post')
# num_classes is pinned to the size the model is built with (vocab_size + 1,
# index 0 being the padding value) instead of being inferred from whichever
# indices happen to occur in the answers.
decoder_target = to_categorical(padded_target, num_classes=vocab_size + 1)
print(f"Decoder target data shape: {decoder_target.shape}")

# Persist everything needed to rebuild the preprocessing later. The
# one-sample 'model_bootstrap' slice matches the inputs of the commented-out
# train_on_batch call in the weight-reloading cell further down.
with open(data_dir/'tokenizer.pkl', 'wb') as f:
    pickle.dump({
        'tokenizer': tokenizer,
        'vocab_size': vocab_size,
        'max_input_length': max_input_length,
        'max_output_length': max_output_length,
        'model_bootstrap': (encoder_input[:1], decoder_input[:1], decoder_target[:1])
    }, f, protocol=pickle.DEFAULT_PROTOCOL)

Encoder input data shape: (273, 31)
Decoder input data shape: (273, 31)
Decoder target data shape: (273, 31, 1726)


In [6]:
# Training hyper-parameters.
BATCH_SIZE = 10
EPOCHS = 100

# The model is sized for vocab_size + 1 classes, which matches the depth of
# the one-hot decoder target prepared above.
model = Chatbot(vocab_size + 1)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Left as the last expression so the returned History object is displayed.
model.fit(
    [encoder_input, decoder_input],
    decoder_target,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f604e7bae10>

In [7]:
# Persist the trained weights. The target directory is created first so a
# missing folder cannot make the save fail after the long training run, and
# the path is passed as a plain string for save_weights implementations that
# do not accept pathlib objects.
weights_path = Path('model')/'model.hdf5'
weights_path.parent.mkdir(parents=True, exist_ok=True)
model.save_weights(str(weights_path))

In [8]:
def preprocess_input(text: str):
    """Turn a raw question into a padded index sequence for the encoder.

    The text goes through the same normalisation as the training questions
    (lower-casing, then ASCII transliteration), is mapped to word indices
    with the fitted module-level ``tokenizer`` and is right-padded with zeros
    to ``max_input_length``.

    Args:
        text: The user's question.

    Returns:
        The padded index sequences for a one-element batch, as returned by
        ``pad_sequences``.
    """
    normalized = unidecode(text.lower())
    # texts_to_sequences works on a list of texts, hence the one-item list.
    sequences = tokenizer.texts_to_sequences([normalized])
    return pad_sequences(sequences, maxlen=max_input_length, padding='post')

In [9]:
# Disabled alternative to retraining: rebuild the model and restore the
# weights saved by the previous save_weights cell.
# NOTE(review): the single train_on_batch call on one sample presumably only
# serves to make the model create its variables before load_weights runs;
# confirm against the Chatbot implementation.
#model = Chatbot(vocab_size + 1)
#model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
#model.train_on_batch([encoder_input[:1], decoder_input[:1]], decoder_target[:1])
#model.load_weights(Path('model')/'model.hdf5')

In [19]:
def generate_answer(question, max_words=None):
    """Greedily decode an answer to ``question`` with the trained model.

    The question is encoded once; the decoder is then run one token at a
    time, always picking the most probable next word, until it emits the end
    marker, emits an index with no vocabulary entry (padding), or
    ``max_words`` words have been produced.

    All intermediate tensors are local to this function, so the training
    arrays ``encoder_input`` and ``decoder_input`` are not overwritten and the
    training cell can still be re-run afterwards.

    Args:
        question: Raw question text.
        max_words: Upper bound on generated words; defaults to
            ``max_output_length``.

    Returns:
        The generated words joined by single spaces.
    """
    if max_words is None:
        max_words = max_output_length

    # Encode the question once; the encoder's final states seed the decoder.
    question_embedded = model.embedding(preprocess_input(question))
    h, c = model.encoder.predict([question_embedded])

    # The decoder starts from the start marker. The tokenizer stores it as
    # 'start' (that is the key looked up here), not as '<START>'.
    target_seq = np.array([[tokenizer.word_index['start']]])
    words = []

    for _ in range(max_words):
        token_embedded = model.embedding(target_seq)
        decoder_output, h, c = model.decoder.predict([token_embedded, h, c])
        sampled_word_index = int(np.argmax(decoder_output[0, -1, :]))
        sampled_word = tokenizer.index_word.get(sampled_word_index)

        # Index 0 is the padding value and has no vocabulary entry; treat it
        # like the end marker instead of emitting the text "None".
        if sampled_word is None or sampled_word == 'end':
            break
        words.append(sampled_word)

        # Feed only the newest token: h and c already carry every earlier
        # token, so re-feeding the whole prefix would apply those tokens to
        # the recurrent state a second time.
        target_seq = np.array([[sampled_word_index]])

    return ' '.join(words)


question = 'Qué pelicula de Alfred Hitchcock se rodo como si fuera una toma continua'
# Not used by the decoding itself; kept in case other cells rely on the name.
stop_words = ['adios', 'gracias']

answer = generate_answer(question)
print(answer)


la soga 
