In [1]:
# Instalación de dependencias necesarias
!pip install --upgrade --no-cache-dir gdown --quiet

# Importar librerías necesarias
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.preprocessing.text import one_hot
from tensorflow.keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM, SimpleRNN
from keras.models import Model
from tensorflow.keras.layers import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
import os
import gdown
import json

# Descargar la carpeta de dataset
if os.access('data_volunteers.json', os.F_OK) is False:
    url = 'https://drive.google.com/uc?id=1awUxYwImF84MIT5-jCaYAPe2QwSgS1hN&export=download'
    output = 'data_volunteers.json'
    gdown.download(url, output, quiet=False)
else:
    print("El dataset ya se encuentra descargado")

# Cargar el dataset
text_file = "data_volunteers.json"
with open(text_file) as f:
    data = json.load(f) # la variable data será un diccionario

# Observar los campos disponibles en cada línea del dataset
data[0].keys()

chat_in = []
chat_out = []

input_sentences = []
output_sentences = []
output_sentences_inputs = []
max_len = 30

def clean_text(txt):
    txt = txt.lower()
    txt.replace("\'d", " had")
    txt.replace("\'s", " is")
    txt.replace("\'m", " am")
    txt.replace("don't", "do not")
    txt = re.sub(r'\W+', ' ', txt)

    return txt

for line in data:
    for i in range(len(line['dialog'])-1):
        # Vamos separando el texto en "preguntas" (chat_in)
        # y "respuestas" (chat_out)
        chat_in = clean_text(line['dialog'][i]['text'])
        chat_out = clean_text(line['dialog'][i+1]['text'])

        if len(chat_in) >= max_len or len(chat_out) >= max_len:
            continue

        input_sentence, output = chat_in, chat_out

        # output sentence (decoder_output) tiene
        output_sentence = output + ' '
        # output sentence input (decoder_input) tiene
        output_sentence_input = ' ' + output

        input_sentences.append(input_sentence)
        output_sentences.append(output_sentence)
        output_sentences_inputs.append(output_sentence_input)

print("Cantidad de rows utilizadas:", len(input_sentences))

# Tokenizar las oraciones de entrada
tokenizer_inputs = Tokenizer()
tokenizer_inputs.fit_on_texts(input_sentences)
input_sequences = tokenizer_inputs.texts_to_sequences(input_sentences)

# Encontrar la longitud máxima de las oraciones de entrada
max_input_len = max(len(seq) for seq in input_sequences)

# Crear el diccionario de palabras para las oraciones de entrada
word2idx_inputs = tokenizer_inputs.word_index
num_words_input = len(word2idx_inputs) + 1

# Tokenizar las oraciones de salida
tokenizer_outputs = Tokenizer()
tokenizer_outputs.fit_on_texts(output_sentences + output_sentences_inputs)
output_sequences = tokenizer_outputs.texts_to_sequences(output_sentences)
output_sequences_inputs = tokenizer_outputs.texts_to_sequences(output_sentences_inputs)

# Encontrar la longitud máxima de las oraciones de salida
max_output_len = max(len(seq) for seq in output_sequences)

# Crear el diccionario de palabras para las oraciones de salida
word2idx_outputs = tokenizer_outputs.word_index
num_words_output = len(word2idx_outputs) + 1

# Padding de las secuencias para tener una longitud fija
encoder_input_sequences = pad_sequences(input_sequences, maxlen=max_input_len)
decoder_input_sequences = pad_sequences(output_sequences_inputs, maxlen=max_output_len)
decoder_output_sequences = pad_sequences(output_sequences, maxlen=max_output_len)

print(f'Max Input Length: {max_input_len}')
print(f'Number of Input Words: {num_words_input}')
print(f'Max Output Length: {max_output_len}')
print(f'Number of Output Words: {num_words_output}')

# Descargar los embeddings de GloVe
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

# Cargar los embeddings de GloVe en un diccionario
embeddings_index = {}
with open('glove.6B.100d.txt', 'r', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coeffs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coeffs

# Crear la matriz de embeddings para las palabras en el diccionario de entrada
embedding_dim = 100
embedding_matrix = np.zeros((num_words_input, embedding_dim))
for word, idx in word2idx_inputs.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[idx] = embedding_vector

# Definir el modelo
latent_dim = 256

# Encoder
encoder_inputs = Input(shape=(max_input_len,))
embedding_layer = Embedding(num_words_input, embedding_dim, weights=[embedding_matrix], input_length=max_input_len, trainable=False)
encoder_embeddings = embedding_layer(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embeddings)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_output_len,))
decoder_embedding_layer = Embedding(num_words_output, embedding_dim)
decoder_embeddings = decoder_embedding_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embeddings, initial_state=encoder_states)
decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Modelo
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Convertir las secuencias de salida a un formato adecuado
decoder_targets = np.expand_dims(decoder_output_sequences, -1)

# Entrenar el modelo
model.fit([encoder_input_sequences, decoder_input_sequences], decoder_targets, batch_size=64, epochs=100, validation_split=0.2)

# Modelo de Inferencia del Encoder
encoder_model = Model(encoder_inputs, encoder_states)

# Modelo de Inferencia del Decoder
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_embeddings2 = decoder_embedding_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(decoder_embeddings2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs2] + decoder_states2)

# Función para generar respuestas
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word2idx_outputs['<start>']

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = tokenizer_outputs.index_word[sampled_token_index]
        decoded_sentence += ' ' + sampled_token

        if sampled_token == '<end>' or len(decoded_sentence) > max_output_len:
            stop_condition = True

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence

# Probar el modelo con una secuencia de entrada
test_input_seq = encoder_input_sequences[0:1]
decoded_sentence = decode_sequence(test_input_seq)
print('Input:', input_sentences[0])
print('Decoded:', decoded_sentence)


Downloading...
From: https://drive.google.com/uc?id=1awUxYwImF84MIT5-jCaYAPe2QwSgS1hN&export=download
To: /content/data_volunteers.json
100%|██████████| 2.58M/2.58M [00:00<00:00, 17.2MB/s]


Cantidad de rows utilizadas: 6033
Max Input Length: 9
Number of Input Words: 1800
Max Output Length: 9
Number of Output Words: 1805
--2024-06-30 22:43:13--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-06-30 22:43:13--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-06-30 22:43:13--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... 

KeyError: '<start>'

In [2]:
# Función para generar respuestas
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word2idx_outputs['start']

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = tokenizer_outputs.index_word[sampled_token_index]
        decoded_sentence += ' ' + sampled_token

        if sampled_token == 'end' or len(decoded_sentence) > max_output_len:
            stop_condition = True

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence

# Probar el modelo con una secuencia de entrada
test_input_seq = encoder_input_sequences[0:1]
decoded_sentence = decode_sequence(test_input_seq)
print('Input:', input_sentences[0])
print('Decoded:', decoded_sentence)

Input: hello 
Decoded:  by by by by


In [3]:
# Instalación de dependencias necesarias
!pip install --upgrade --no-cache-dir gdown --quiet
!pip install wget --quiet

# Importar librerías necesarias
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.preprocessing.text import one_hot
from tensorflow.keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM, SimpleRNN
from keras.models import Model
from tensorflow.keras.layers import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.layers import Input
import os
import gdown
import json
import wget

# Descargar la carpeta de dataset
if os.access('data_volunteers.json', os.F_OK) is False:
    url = 'https://drive.google.com/uc?id=1awUxYwImF84MIT5-jCaYAPe2QwSgS1hN&export=download'
    output = 'data_volunteers.json'
    gdown.download(url, output, quiet=False)
else:
    print("El dataset ya se encuentra descargado")

# Cargar el dataset
text_file = "data_volunteers.json"
with open(text_file) as f:
    data = json.load(f) # la variable data será un diccionario

# Observar los campos disponibles en cada línea del dataset
data[0].keys()

chat_in = []
chat_out = []

input_sentences = []
output_sentences = []
output_sentences_inputs = []
max_len = 10  # Cambiamos max_len a 10 según las recomendaciones

def clean_text(txt):
    txt = txt.lower()
    txt = txt.replace("\'d", " had")
    txt = txt.replace("\'s", " is")
    txt = txt.replace("\'m", " am")
    txt = txt.replace("don't", "do not")
    txt = re.sub(r'\W+', ' ', txt)

    return txt

for line in data:
    for i in range(len(line['dialog'])-1):
        # Vamos separando el texto en "preguntas" (chat_in)
        # y "respuestas" (chat_out)
        chat_in = clean_text(line['dialog'][i]['text'])
        chat_out = clean_text(line['dialog'][i+1]['text'])

        if len(chat_in) >= max_len or len(chat_out) >= max_len:
            continue

        input_sentence, output = chat_in, chat_out

        # output sentence (decoder_output) tiene
        output_sentence = '<start> ' + output + ' <end>'
        # output sentence input (decoder_input) tiene
        output_sentence_input = '<start> ' + output

        input_sentences.append(input_sentence)
        output_sentences.append(output_sentence)
        output_sentences_inputs.append(output_sentence_input)

print("Cantidad de rows utilizadas:", len(input_sentences))

# Tokenizar las oraciones de entrada
tokenizer_inputs = Tokenizer(num_words=8000)  # MAX_VOCAB_SIZE = 8000
tokenizer_inputs.fit_on_texts(input_sentences)
input_sequences = tokenizer_inputs.texts_to_sequences(input_sentences)

# Encontrar la longitud máxima de las oraciones de entrada
max_input_len = max(len(seq) for seq in input_sequences)

# Crear el diccionario de palabras para las oraciones de entrada
word2idx_inputs = tokenizer_inputs.word_index
num_words_input = min(8000, len(word2idx_inputs) + 1)

# Tokenizar las oraciones de salida
tokenizer_outputs = Tokenizer(num_words=8000, filters='')  # MAX_VOCAB_SIZE = 8000
tokenizer_outputs.fit_on_texts(output_sentences + output_sentences_inputs)
output_sequences = tokenizer_outputs.texts_to_sequences(output_sentences)
output_sequences_inputs = tokenizer_outputs.texts_to_sequences(output_sentences_inputs)

# Encontrar la longitud máxima de las oraciones de salida
max_output_len = max(len(seq) for seq in output_sequences)

# Crear el diccionario de palabras para las oraciones de salida
word2idx_outputs = tokenizer_outputs.word_index
num_words_output = min(8000, len(word2idx_outputs) + 1)

# Padding de las secuencias para tener una longitud fija
encoder_input_sequences = pad_sequences(input_sequences, maxlen=max_input_len)
decoder_input_sequences = pad_sequences(output_sequences_inputs, maxlen=max_output_len)
decoder_output_sequences = pad_sequences(output_sequences, maxlen=max_output_len)

print(f'Max Input Length: {max_input_len}')
print(f'Number of Input Words: {num_words_input}')
print(f'Max Output Length: {max_output_len}')
print(f'Number of Output Words: {num_words_output}')

# Descargar los embeddings de FastText
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
!gunzip cc.en.300.vec.gz

# Cargar los embeddings de FastText en un diccionario
embeddings_index = {}
with open('cc.en.300.vec', 'r', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coeffs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coeffs

# Crear la matriz de embeddings para las palabras en el diccionario de entrada
embedding_dim = 300  # Embeddings 300 Fasttext
embedding_matrix = np.zeros((num_words_input, embedding_dim))
for word, idx in word2idx_inputs.items():
    if idx < num_words_input:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector

# Definir el modelo
latent_dim = 128  # n_units = 128

# Encoder
encoder_inputs = Input(shape=(max_input_len,))
embedding_layer = Embedding(num_words_input, embedding_dim, weights=[embedding_matrix], input_length=max_input_len, trainable=False)
encoder_embeddings = embedding_layer(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True, dropout=0.2)  # LSTM Dropout 0.2
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embeddings)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_output_len,))
decoder_embedding_layer = Embedding(num_words_output, embedding_dim)
decoder_embeddings = decoder_embedding_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.2)  # LSTM Dropout 0.2
decoder_outputs, _, _ = decoder_lstm(decoder_embeddings, initial_state=encoder_states)
decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Modelo
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Convertir las secuencias de salida a un formato adecuado
decoder_targets = np.expand_dims(decoder_output_sequences, -1)

# Entrenar el modelo
model.fit([encoder_input_sequences, decoder_input_sequences], decoder_targets, batch_size=64, epochs=30, validation_split=0.2)  # Epochs 30~50

# Modelo de Inferencia del Encoder
encoder_model = Model(encoder_inputs, encoder_states)

# Modelo de Inferencia del Decoder
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_embeddings2 = decoder_embedding_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(decoder_embeddings2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs2] + decoder_states2)

# Función para generar respuestas
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = word2idx_outputs['start']

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = tokenizer_outputs.index_word[sampled_token_index]
        decoded_sentence += ' ' + sampled_token

        if sampled_token == 'end' or len(decoded_sentence) > max_output_len:
            stop_condition = True

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence

# Preguntas de prueba
test_questions = [
    "Do you read?",
    "Do you have any pet?",
    "Where are you from?"
]

# Generar respuestas para las preguntas de prueba
for question in test_questions:
    input_seq = tokenizer_inputs.texts_to_sequences([question])
    input_seq = pad_sequences(input_seq, maxlen=max_input_len)
    decoded_sentence = decode_sequence(input_seq)
    print(f'Question: {question}')
    print(f'Bot Response: {decoded_sentence}')


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for wget (setup.py) ... [?25l[?25hdone
El dataset ya se encuentra descargado
Cantidad de rows utilizadas: 388
Max Input Length: 4
Number of Input Words: 142
Max Output Length: 5
Number of Output Words: 158
--2024-06-30 23:25:54--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.226.210.25, 13.226.210.78, 13.226.210.111, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.226.210.25|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1325960915 (1.2G) [binary/octet-stream]
Saving to: ‘cc.en.300.vec.gz’


2024-06-30 23:26:11 (73.5 MB/s) - ‘cc.en.300.vec.gz’ saved [1325960915/1325960915]

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Ep

KeyError: 0