<img src="https://github.com/hernancontigiani/ceia_memorias_especializacion/raw/master/Figures/logoFIUBA.jpg" width="500" align="center">


# Procesamiento de lenguaje natural I

# LSTM Bot QA

### Datos

### El objecto es utilizar datos disponibles del challenge ConvAI2 (Conversational Intelligence Challenge 2) de conversaciones en inglés. Se construirá un BOT para responder a preguntas del usuario (QA).

In [63]:
!pip install --upgrade --no-cache-dir gdown --quiet


[notice] A new release of pip available: 22.3 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


#### 2 - Preprocesamiento
Realizar el preprocesamiento necesario para obtener:

word2idx_inputs, max_input_len
word2idx_outputs, max_out_len, num_words_output
encoder_input_sequences, decoder_output_sequences, decoder_targets

### IMPORTS

In [64]:
import numpy as np
import pandas as pd
import os, io, json, gzip, zipfile, gdown, heapq, re, unicodedata
import torch
import torch.nn as nn
import tensorflow as tf
import itertools
from collections import Counter
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.utils import pad_sequences, register_keras_serializable
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.layers import Activation, Dropout, Dense, LSTM, Input, Concatenate, Embedding, Attention, Bidirectional, LayerNormalization
from tensorflow.keras.preprocessing.sequence import pad_sequences

### DATASET

In [65]:
# Descargar la carpeta de dataset
if os.access('data_volunteers.json', os.F_OK) is False:
    url = 'https://drive.google.com/uc?id=1awUxYwImF84MIT5-jCaYAPe2QwSgS1hN&export=download'
    output = 'data_volunteers.json'
    gdown.download(url, output, quiet=False)
else:
    print("El dataset ya se encuentra descargado")

El dataset ya se encuentra descargado


In [None]:
text_file = "data_volunteers.json"
with open(text_file) as f:
    data = json.load(f)

In [67]:
# Observar los campos disponibles en cada linea del dataset
data[0].keys()

dict_keys(['dialog', 'start_time', 'end_time', 'bot_profile', 'user_profile', 'eval_score', 'profile_match', 'participant1_id', 'participant2_id'])

### 2 - Preprocesamiento
#### Realizar el preprocesamiento necesario para obtener:

#### word2idx_inputs, max_input_len
#### word2idx_outputs, max_out_len, num_words_output
#### encoder_input_sequences, decoder_output_sequences, decoder_targets

In [68]:
chat_in = []
chat_out = []

input_sentences = []
output_sentences = []
output_sentences_inputs = []
max_len = 30

def clean_text(txt):
    txt = txt.lower()    
    txt.replace("\'d", " had")
    txt.replace("\'s", " is")
    txt.replace("\'m", " am")
    txt.replace("don't", "do not")
    txt = re.sub(r'\W+', ' ', txt)
    
    return txt

for line in data:
    for i in range(len(line['dialog'])-1):
        # vamos separando el texto en "preguntas" (chat_in)
        # y "respuestas" (chat_out)
        chat_in = clean_text(line['dialog'][i]['text'])
        chat_out = clean_text(line['dialog'][i+1]['text'])

        if len(chat_in) >= max_len or len(chat_out) >= max_len:
            continue

        input_sentence, output = chat_in, chat_out
        
        # output sentence (decoder_output) tiene 
        output_sentence = output + ' '
        # output sentence input (decoder_input) tiene 
        output_sentence_input = ' ' + output

        input_sentences.append(input_sentence)
        output_sentences.append(output_sentence)
        output_sentences_inputs.append(output_sentence_input)

print("Cantidad de rows utilizadas:", len(input_sentences))

Cantidad de rows utilizadas: 6033


In [69]:
input_sentences[1], output_sentences[1], output_sentences_inputs[1]

('hi how are you ', 'not bad and you  ', ' not bad and you ')

In [None]:
# normalización básica
def normalize_text(s: str) -> str:
    s = unicodedata.normalize("NFKC", s).lower()
    # espacios alrededor de puntuación común
    s = re.sub(r"([?.!,;:()\"'])", r" \1 ", s)
    s = re.sub(r"\s{2,}", " ", s).strip()
    return s

# tokenización simple por espacios (tras normalizar)
def tokenize(s: str):
    return normalize_text(s).split()

# Extraer pares Human -> Bot (pregunta/respuesta)
pairs = []
for conv in data:
    dialog = conv.get("dialog", [])
    # Recorremos turnos consecutivos Human -> Bot
    for i in range(len(dialog)-1):
        a, b = dialog[i], dialog[i+1]
        if a.get("sender_class","").lower() == "human" and b.get("sender_class","").lower() == "bot":
            inp = a.get("text","").strip()
            out = b.get("text","").strip()
            if inp and out:
                pairs.append((inp, out))

len(pairs)

6398

In [71]:
# Vista rápida de algunos pares
pairs[:3]

[('Hello!', 'Hi! How are you?'),
 ('Not bad! And You?',
  "I'm doing well. Just got engaged to my high school sweetheart."),
 ('Wowowowow! Congratulations! Is she pretty?',
  "She 's pretty cute. She invited me to dinner tonight. 🙂")]

In [None]:
# Construir vocabularios independientes (inputs/outputs)
SPECIAL_TOKENS = ["<pad>", "<unk>", "<sos>", "<eos>"]
PAD, UNK, SOS, EOS = SPECIAL_TOKENS

tok_inputs = [tokenize(x) for x,_ in pairs]
tok_outputs = [tokenize(y) for _,y in pairs]

# Vocabularios por frecuencia
cnt_in = Counter(itertools.chain.from_iterable(tok_inputs))
cnt_out = Counter(itertools.chain.from_iterable(tok_outputs))

# Orden por frecuencia y luego alfabético para estabilidad
def build_word2idx(counter):
    words = [w for w,_ in sorted(counter.items(), key=lambda kv: (-kv[1], kv[0]))]
    word2idx = {PAD:0, UNK:1, SOS:2, EOS:3}
    for i,w in enumerate(words, start=len(SPECIAL_TOKENS)):
        word2idx[w] = i
    return word2idx

word2idx_inputs = build_word2idx(cnt_in)
word2idx_outputs = build_word2idx(cnt_out)

max_input_len = max(len(t) for t in tok_inputs) if tok_inputs else 0
max_out_len_base = max(len(t) for t in tok_outputs) if tok_outputs else 0
# Para el decoder: añadiremos <sos> y <eos>, por lo que el largo máximo cambia:
max_out_len = max_out_len_base + 2  # incluye <sos> y <eos>

num_words_output = len(word2idx_outputs)

(len(word2idx_inputs), len(word2idx_outputs), max_input_len, max_out_len, num_words_output)

(3295, 1825, 298, 102, 1825)

In [None]:
def encode(tokens, w2i):
    return [w2i.get(tok, w2i["<unk>"]) for tok in tokens]

# --- Secuencias del encoder
encoder_input_sequences = [encode(toks, word2idx_inputs) for toks in tok_inputs]

decoder_output_sequences = [encode(["<sos>"] + toks, word2idx_outputs) for toks in tok_outputs]
decoder_targets = [encode(toks + ["<eos>"], word2idx_outputs) for toks in tok_outputs]

# --- Guardado en disco ---
out_dir = "./desafio4/preproc_convai2"
os.makedirs(out_dir, exist_ok=True)

# Diccionarios
with open(os.path.join(out_dir, "word2idx_inputs.json"), "w", encoding="utf-8") as f:
    json.dump(word2idx_inputs, f, ensure_ascii=False, indent=2)

with open(os.path.join(out_dir, "word2idx_outputs.json"), "w", encoding="utf-8") as f:
    json.dump(word2idx_outputs, f, ensure_ascii=False, indent=2)

# Secuencias
np.savez_compressed(
    os.path.join(out_dir, "sequences.npz"),
    encoder_input_sequences=np.array(encoder_input_sequences, dtype=object),
    decoder_output_sequences=np.array(decoder_output_sequences, dtype=object),
    decoder_targets=np.array(decoder_targets, dtype=object),
    allow_pickle=True
)

# --- Vista previa
preview_rows = 10
df_preview = pd.DataFrame({
    "input_text": [" ".join(t) for t in tok_inputs[:preview_rows]],
    "output_text": [" ".join(t) for t in tok_outputs[:preview_rows]],
    "enc_seq": encoder_input_sequences[:preview_rows],
    "dec_out": decoder_output_sequences[:preview_rows],
    "dec_tgt": decoder_targets[:preview_rows],
})

# Mostrar por pantalla
print(df_preview.head(10))

# Guardar a CSV
df_preview.to_csv(os.path.join(out_dir, "preview_first10.csv"), index=False, encoding="utf-8")

# Pequeño resumen
print("Guardado en:", out_dir)
print("encoder_input_sequences:", len(encoder_input_sequences))
print("decoder_output_sequences:", len(decoder_output_sequences))
print("decoder_targets:", len(decoder_targets))

                                          input_text  \
0                                            hello !   
1                                not bad ! and you ?   
2      wowowowow ! congratulations ! is she pretty ?   
3  cool ! have a good time you both ! and what is...   
4                 me too . and what about iggy pop ?   
5                              hey ? where are you ?   
6                         i ' m playing pipe organ .   
7                                               hi !   
8  cool ! i ' m going to finish with my homework ...   
9                                              bro ?   

                                         output_text  \
0                                 hi ! how are you ?   
1  i ' m doing well . just got engaged to my high...   
2  she ' s pretty cute . she invited me to dinner...   
3             i love music ! i love taylor swift . 😉   
4  i love ziggy ! he is my favorite . are you and...   
5  i am sorry to hear that . what do you do for

In [74]:
# Resumen estadístico
stats = {
    "num_pairs": len(pairs),
    "vocab_inputs": len(word2idx_inputs),
    "vocab_outputs": len(word2idx_outputs),
    "max_input_len_tokens": max_input_len,
    "max_output_len_tokens_including_sos_eos": max_out_len,
    "avg_input_len": float(np.mean([len(t) for t in tok_inputs])),
    "avg_output_len": float(np.mean([len(t) for t in tok_outputs])),
}
stats

{'num_pairs': 6398,
 'vocab_inputs': 3295,
 'vocab_outputs': 1825,
 'max_input_len_tokens': 298,
 'max_output_len_tokens_including_sos_eos': 102,
 'avg_input_len': 5.225382932166302,
 'avg_output_len': 10.611284776492655}

#### 3 - Preparar los embeddings
Utilizar los embeddings de Glove o FastText para transformar los tokens de entrada en vectores

In [None]:
# ========= Config =========
EMB_TYPE = "glove"
EMB_DIM  = 100
EMB_DIR  = "./embeddings"
VOCAB_PATH = "./desafio4/preproc_convai2/word2idx_inputs.json"
SAVE_DIR  = "./desafio4/embeddings"

os.makedirs(EMB_DIR, exist_ok=True)
os.makedirs(SAVE_DIR, exist_ok=True)

GLOVE_URL   = "https://nlp.stanford.edu/data/glove.6B.zip"
FASTTEXT_URL = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip"

# ========= Utilidades =========
def download(url, dest):
    print(f"Descargando: {url}")
    urllib.request.urlretrieve(url, dest)
    print(f"Guardado en: {dest}")

def ensure_glove(dim=100):
    zip_path = os.path.join(EMB_DIR, "glove.6B.zip")
    target_txt = os.path.join(EMB_DIR, f"glove.6B.{dim}d.txt")
    if not os.path.exists(target_txt):
        if not os.path.exists(zip_path):
            download(GLOVE_URL, zip_path)
        print("Descomprimiendo GloVe...")
        with zipfile.ZipFile(zip_path, "r") as zf:
            zf.extract(f"glove.6B.{dim}d.txt", EMB_DIR)
    return target_txt

def ensure_fasttext_vec():
    zip_path = os.path.join(EMB_DIR, "wiki-news-300d-1M.vec.zip")
    target_vec = os.path.join(EMB_DIR, "wiki-news-300d-1M.vec")
    if not os.path.exists(target_vec):
        if not os.path.exists(zip_path):
            download(FASTTEXT_URL, zip_path)
        print("Descomprimiendo FastText...")
        with zipfile.ZipFile(zip_path, "r") as zf:
            zf.extract("wiki-news-300d-1M.vec", EMB_DIR)
    return target_vec

def open_text_maybe_gzip(path):
    if path.endswith(".gz"):
        return io.TextIOWrapper(gzip.open(path, "rb"), encoding="utf-8", errors="ignore")
    return open(path, "r", encoding="utf-8", errors="ignore")

def maybe_skip_header(line):
    parts = line.rstrip().split()
    if len(parts) == 2:
        try:
            int(parts[0]); int(parts[1])
            return True
        except:
            return False
    return False

# ========= Asegurar archivo de embeddings =========
if EMB_TYPE == "glove":
    emb_path = ensure_glove(EMB_DIM)
elif EMB_TYPE == "fasttext":
    EMB_DIM = 300
    emb_path = ensure_fasttext_vec()
else:
    raise ValueError("EMB_TYPE debe ser 'glove' o 'fasttext'.")

print("Usando embeddings en:", emb_path)

# ========= Cargar vocabulario del encoder =========
with open(VOCAB_PATH, "r", encoding="utf-8") as f:
    word2idx_inputs = json.load(f)

PAD = "<pad>"; UNK = "<unk>"
vocab_size = len(word2idx_inputs)
print(f"Vocab encoder: {vocab_size} palabras")

# ========= Leer embeddings sólo para palabras del vocab =========
embeddings = {}
loaded = 0
with open_text_maybe_gzip(emb_path) as f:
    first = True
    for line in f:
        if first and EMB_TYPE == "fasttext" and maybe_skip_header(line):
            first = False
            continue
        first = False

        parts = line.rstrip().split(" ")
        if len(parts) < EMB_DIM + 1:
            continue
        word = parts[0]
        if word in word2idx_inputs:
            try:
                vec = np.asarray(parts[1:1+EMB_DIM], dtype=np.float32)
                if vec.size == EMB_DIM:
                    embeddings[word] = vec
                    loaded += 1
            except:
                pass

print(f"Embeddings encontrados para {loaded}/{vocab_size} palabras ({loaded/vocab_size:.2%} cobertura)")

# ========= Construir matriz =========
rng = np.random.default_rng(123)
embedding_matrix = rng.normal(0.0, 0.01, size=(vocab_size, EMB_DIM)).astype(np.float32)

# pad -> ceros
if PAD in word2idx_inputs:
    embedding_matrix[word2idx_inputs[PAD]] = np.zeros((EMB_DIM,), dtype=np.float32)

# unk -> media
mean_vec = np.mean(np.stack(list(embeddings.values())), axis=0) if embeddings else np.zeros((EMB_DIM,), dtype=np.float32)
if UNK in word2idx_inputs:
    embedding_matrix[word2idx_inputs[UNK]] = mean_vec

# asignar conocidas
hit = 0
for w, idx in word2idx_inputs.items():
    if w in (PAD, UNK): 
        continue
    vec = embeddings.get(w)
    if vec is not None:
        embedding_matrix[idx] = vec
        hit += 1

print(f"Filas con vector preentrenado: {hit}/{vocab_size} ({hit/vocab_size:.2%})")

# ========= Guardar =========
fname = f"encoder_embedding_matrix_{EMB_TYPE}{EMB_DIM}.npy"
save_path = os.path.join(SAVE_DIR, fname)
np.save(save_path, embedding_matrix)

meta = {
    "type": EMB_TYPE,
    "dim": EMB_DIM,
    "vocab_size": vocab_size,
    "pad_index": word2idx_inputs.get(PAD, None),
    "unk_index": word2idx_inputs.get(UNK, None),
    "coverage_known_tokens": int(hit),
    "coverage_ratio": float(hit / vocab_size)
}
with open(os.path.join(SAVE_DIR, fname.replace(".npy", "_meta.json")), "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

print("Guardado:", save_path)
print("Metadatos:", fname.replace(".npy", "_meta.json"))

Usando embeddings en: ./embeddings\glove.6B.100d.txt
Vocab encoder: 3295 palabras
Embeddings encontrados para 2884/3295 palabras (87.53% cobertura)
Filas con vector preentrenado: 2884/3295 (87.53%)
Guardado: ./desafio4/embeddings\encoder_embedding_matrix_glove100.npy
Metadatos: encoder_embedding_matrix_glove100_meta.json


In [None]:
W = np.load("./desafio4/embeddings/encoder_embedding_matrix_glove100.npy")
emb_layer = Embedding(
    input_dim=W.shape[0],
    output_dim=W.shape[1],
    weights=[W],
    trainable=False,
    mask_zero=True  # asume <pad>=0
)

In [None]:
W = torch.tensor(np.load("./desafio4/embeddings/encoder_embedding_matrix_glove100.npy"), dtype=torch.float32)
emb_layer = nn.Embedding.from_pretrained(W, freeze=True, padding_idx=0)

#### 4 - Entrenar el modelo
Entrenar un modelo basado en el esquema encoder-decoder utilizando los datos generados en los puntos anteriores. Utilce como referencias los ejemplos vistos en clase.

In [78]:
# ---------- Reproducibilidad ----------
K.clear_session()
np.random.seed(42); tf.random.set_seed(42)

# ---------- Rutas ----------
PREPROC_DIR = "./desafio4/preproc_convai2"
EMB_DIR     = "./desafio4/embeddings"
MODEL_DIR   = "./desafio4/models_tied_p99_clean"
os.makedirs(MODEL_DIR, exist_ok=True)

# ---------- Cargar vocab y secuencias ----------
with open(os.path.join(PREPROC_DIR, "word2idx_inputs.json"), "r", encoding="utf-8") as f:
    word2idx_inputs = json.load(f)
with open(os.path.join(PREPROC_DIR, "word2idx_outputs.json"), "r", encoding="utf-8") as f:
    word2idx_outputs = json.load(f)

npz = np.load(os.path.join(PREPROC_DIR, "sequences.npz"), allow_pickle=True)
encoder_input_sequences = list(npz["encoder_input_sequences"])
decoder_input_sequences = list(npz["decoder_output_sequences"])  # <sos> + y
decoder_target_sequences = list(npz["decoder_targets"])          # y + <eos>

# ---------- Longitudes (p99) ----------
lens_in  = [len(s) for s in encoder_input_sequences]
lens_out = [len(s) for s in decoder_input_sequences]
p_in  = max(8, int(np.percentile(lens_in, 99)))
p_out = max(8, int(np.percentile(lens_out, 99)))
print(f"p99 longitudes -> input: {p_in} | output: {p_out}")

# ---------- Padding/Truncado ----------
Xenc = pad_sequences(encoder_input_sequences, maxlen=p_in,  padding="post", truncating="post", value=0)
Xdec = pad_sequences(decoder_input_sequences, maxlen=p_out, padding="post", truncating="post", value=0)
Yidx = pad_sequences(decoder_target_sequences, maxlen=p_out, padding="post", truncating="post", value=0)
Y    = np.expand_dims(Yidx, -1).astype("int32")   # (N,T,1)

# sample_weight: ignora pads
sw = (Yidx != 0).astype("float32")  # (N,T)

# ---------- Vocabularios ----------
num_words_input  = len(word2idx_inputs)
num_words_output = len(word2idx_outputs)

# ---------- Hiperparámetros ----------
EMBED_DIM_ENCODER = 100
LATENT_DIM        = 256
EMBED_DIM_DECODER = LATENT_DIM     # tying cómodo: D = L
DROPOUT_P         = 0.08
BATCH_SIZE        = 64
EPOCHS_PHASE1     = 30
EPOCHS_PHASE2     = 8
EPOCHS_PHASE3     = 6

# ---------- Embeddings encoder ----------
enc_emb_weights_path = os.path.join(EMB_DIR, "encoder_embedding_matrix_glove100.npy")
use_pretrained_enc = os.path.exists(enc_emb_weights_path)
if use_pretrained_enc:
    enc_embedding_matrix = np.load(enc_emb_weights_path)
    EMBED_DIM_ENCODER = enc_embedding_matrix.shape[1]
    print("Usando embeddings preentrenados del encoder:", enc_emb_weights_path)
else:
    enc_embedding_matrix = None
    print("Sin matriz preentrenada del encoder (encoder embeddings aleatorios).")

# ---------- Loss: Sparse CE + smoothing ----------
def smoothed_sparse_cce(num_classes, label_smoothing=0.02, from_logits=True):
    base = keras.losses.CategoricalCrossentropy(
        from_logits=from_logits, label_smoothing=label_smoothing
    )
    def loss_fn(y_true, y_pred):
        y_true = tf.cast(tf.squeeze(y_true, axis=-1), tf.int32)  # (N,T)
        y_true_oh = tf.one_hot(y_true, depth=num_classes)        # (N,T,C)
        return base(y_true_oh, y_pred)
    return loss_fn

# ---------- Capa de proyección ----------
@register_keras_serializable(package="Custom")
class TiedOutputProjection(keras.layers.Layer):
    def __init__(self, embedding_layer, use_bias=True, temperature=1.0, **kwargs):
        super().__init__(**kwargs)
        # Guardamos referencia al objeto capa de Embedding
        self.embedding_layer = embedding_layer
        self.use_bias = use_bias
        self.temperature = temperature
        self.bias = None
        self.E = None

    def build(self, input_shape):
        # Tomamos la variable de embeddings del decoder
        self.E = self.embedding_layer.embeddings  # (V, D)
        if self.use_bias:
            vocab_size = int(self.E.shape[0])
            self.bias = self.add_weight(
                name="bias", shape=(vocab_size,), initializer="zeros", trainable=True
            )
        super().build(input_shape)

    def call(self, x):
        logits = tf.linalg.matmul(x, tf.transpose(self.E)) / self.temperature
        if self.use_bias:
            logits = logits + self.bias
        return logits

    def get_config(self):
        cfg = super().get_config()
        cfg.update({
            "use_bias": self.use_bias,
            "temperature": self.temperature,
        })
        return cfg

# ---------- Construcción del modelo ----------
# Inputs
encoder_inputs = Input(shape=(p_in,),  name="encoder_inputs")
decoder_inputs = Input(shape=(p_out,), name="decoder_inputs")

# Encoder embedding
if use_pretrained_enc:
    enc_embedding_layer = Embedding(num_words_input, EMBED_DIM_ENCODER,
                                    weights=[enc_embedding_matrix],
                                    trainable=False, mask_zero=True, name="enc_embedding")
else:
    enc_embedding_layer = Embedding(num_words_input, EMBED_DIM_ENCODER,
                                    mask_zero=True, name="enc_embedding")
enc_emb = enc_embedding_layer(encoder_inputs)

# Encoder BiLSTM
encoder_bilstm = Bidirectional(
    LSTM(LATENT_DIM, return_sequences=True, return_state=True,
         dropout=DROPOUT_P, recurrent_dropout=DROPOUT_P),
    merge_mode="concat", name="encoder_bilstm"
)
enc_seq_bi, f_h, f_c, b_h, b_c = encoder_bilstm(enc_emb)     # (None,Tin, 2*L)

# Bridge estados -> L
bridge_h = Dense(LATENT_DIM, activation="tanh", name="bridge_h")(Concatenate()([f_h, b_h]))
bridge_c = Dense(LATENT_DIM, activation="tanh", name="bridge_c")(Concatenate()([f_c, b_c]))

# Proyección enc_seq a L para dot attention
enc_proj = Dense(LATENT_DIM, name="enc_proj")
enc_seq_proj = enc_proj(enc_seq_bi)                            # (None,Tin,L)

# Decoder embedding (trainable para tying)
decoder_embedding_layer = Embedding(num_words_output, EMBED_DIM_DECODER,
                                    mask_zero=True, name="dec_embedding", trainable=True)
dec_emb = decoder_embedding_layer(decoder_inputs)              # (None,Tout,L)

# Decoder LSTM
decoder_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True,
                    dropout=DROPOUT_P, recurrent_dropout=DROPOUT_P, name="decoder_lstm")
dec_seq, _, _ = decoder_lstm(dec_emb, initial_state=[bridge_h, bridge_c])  # (None,Tout,L)

# Atención (Luong/dot): query=dec_seq, key/value=enc_seq_proj
attn_out = Attention(name="attention")([dec_seq, enc_seq_proj])            # (None,Tout,L)

# Fusion + normalización
context = Concatenate(name="concat_context")([dec_seq, attn_out])          # (None,Tout,2L)
context = Dense(LATENT_DIM, activation="tanh", name="proj_to_emb")(context)  # (None,Tout,L)
context = LayerNormalization(name="ln_out")(context)
context = Dropout(DROPOUT_P)(context)

tied_proj = TiedOutputProjection(
    embedding_layer=decoder_embedding_layer,
    use_bias=True,
    temperature=0.95,
    name="tied_output"
)
logits = tied_proj(context)

model = keras.Model([encoder_inputs, decoder_inputs], logits, name="seq2seq_tied_clean")

# Métrica ponderada
weighted_acc = keras.metrics.SparseCategoricalAccuracy(name="accuracy")

# ---------- FASE 1: encoder embeddings CONGELADOS ----------
opt1 = keras.optimizers.Adam(learning_rate=2e-3, clipnorm=1.0)
model.compile(
    optimizer=opt1,
    loss=smoothed_sparse_cce(num_words_output, label_smoothing=0.02, from_logits=True),
    metrics=[],
    weighted_metrics=[weighted_acc]
)
ckpt1 = os.path.join(MODEL_DIR, "best_phase1.keras")
cb1 = [
    ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, verbose=1, min_lr=1e-5),
    EarlyStopping(monitor="val_loss", patience=4, restore_best_weights=True, verbose=1),
    ModelCheckpoint(ckpt1, monitor="val_loss", save_best_only=True, verbose=1),
]
history1 = model.fit(
    [Xenc, Xdec], Y,
    sample_weight=sw,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS_PHASE1,
    validation_split=0.1,
    callbacks=cb1,
    verbose=1
)

# ---------- FASE 2: descongelar embeddings del encoder ----------
if use_pretrained_enc:
    enc_embedding_layer.trainable = True
    opt2 = keras.optimizers.Adam(learning_rate=5e-4, clipnorm=1.0)
    model.compile(
        optimizer=opt2,
        loss=smoothed_sparse_cce(num_words_output, label_smoothing=0.02, from_logits=True),
        metrics=[],
        weighted_metrics=[weighted_acc]
    )
    ckpt2 = os.path.join(MODEL_DIR, "best_phase2.keras")
    cb2 = [
        ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, verbose=1, min_lr=1e-5),
        EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True, verbose=1),
        ModelCheckpoint(ckpt2, monitor="val_loss", save_best_only=True, verbose=1),
    ]
    history2 = model.fit(
        [Xenc, Xdec], Y,
        sample_weight=sw,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS_PHASE2,
        validation_split=0.1,
        callbacks=cb2,
        verbose=1
    )

# ---------- FASE 3: descongelar encoder BiLSTM + enc_proj + bridges ----------
for layer_name in ["encoder_bilstm", "enc_proj", "bridge_h", "bridge_c"]:
    model.get_layer(layer_name).trainable = True

opt3 = keras.optimizers.Adam(learning_rate=2e-4, clipnorm=1.0)
model.compile(
    optimizer=opt3,
    loss=smoothed_sparse_cce(num_words_output, label_smoothing=0.02, from_logits=True),
    metrics=[],
    weighted_metrics=[weighted_acc]
)
ckpt3 = os.path.join(MODEL_DIR, "best_phase3.keras")
cb3 = [
    ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=1, verbose=1, min_lr=1e-5),
    EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True, verbose=1),
    ModelCheckpoint(ckpt3, monitor="val_loss", save_best_only=True, verbose=1),
]
history3 = model.fit(
    [Xenc, Xdec], Y,
    sample_weight=sw,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS_PHASE3,
    validation_split=0.1,
    callbacks=cb3,
    verbose=1
)

# ---------- Guardado final ----------
model.save(os.path.join(MODEL_DIR, "final.keras"))
with open(os.path.join(MODEL_DIR, "config.json"), "w", encoding="utf-8") as f:
    json.dump({
        "max_input_len": int(p_in),
        "max_out_len": int(p_out),
        "num_words_input": int(num_words_input),
        "num_words_output": int(num_words_output),
        "latent_dim": int(LATENT_DIM),
        "embed_dim_encoder": int(EMBED_DIM_ENCODER),
        "embed_dim_decoder": int(EMBED_DIM_DECODER),
        "dropout": float(DROPOUT_P),
        "label_smoothing": 0.02,
        "temperature_tying": 0.95,
        "weight_tying": True,
        "use_pretrained_encoder_embeddings": bool(use_pretrained_enc),
        "phases": [EPOCHS_PHASE1, EPOCHS_PHASE2, EPOCHS_PHASE3]
    }, f, indent=2)
print("Modelos guardados en:", MODEL_DIR)


p99 longitudes -> input: 24 | output: 60
Usando embeddings preentrenados del encoder: ./desafio4/embeddings\encoder_embedding_matrix_glove100.npy
Epoch 1/30
Epoch 1: val_loss improved from inf to 0.18105, saving model to ./desafio4/models_tied_p99_clean\best_phase1.keras
Epoch 2/30
Epoch 2: val_loss improved from 0.18105 to 0.16669, saving model to ./desafio4/models_tied_p99_clean\best_phase1.keras
Epoch 3/30
Epoch 3: val_loss improved from 0.16669 to 0.14428, saving model to ./desafio4/models_tied_p99_clean\best_phase1.keras
Epoch 4/30
Epoch 4: val_loss improved from 0.14428 to 0.13145, saving model to ./desafio4/models_tied_p99_clean\best_phase1.keras
Epoch 5/30
Epoch 5: val_loss improved from 0.13145 to 0.12501, saving model to ./desafio4/models_tied_p99_clean\best_phase1.keras
Epoch 6/30
Epoch 6: val_loss improved from 0.12501 to 0.11999, saving model to ./desafio4/models_tied_p99_clean\best_phase1.keras
Epoch 7/30
Epoch 7: val_loss improved from 0.11999 to 0.11710, saving model to

#### 5 - Inferencia
Experimentar el funcionamiento de su modelo. Recuerde que debe realizar la inferencia de los modelos por separado de encoder y decoder.

In [81]:
# ---- Diccionarios y tokens especiales
SOS, EOS, UNK = "<sos>", "<eos>", "<unk>"
PAD_IDX_IN  = 0
PAD_IDX_OUT = 0

idx2word_outputs = {idx: w for w, idx in word2idx_outputs.items()}
idx2word_inputs  = {idx: w for w, idx in word2idx_inputs.items()}
sos_id = word2idx_outputs.get(SOS, 1)
eos_id = word2idx_outputs.get(EOS, 2)
unk_in = word2idx_inputs.get(UNK, 1)

# ====================================
# 1) Modelo de INFERENCIA del ENCODER
# ====================================
enc_embedding_layer = model.get_layer("enc_embedding")
encoder_bilstm      = model.get_layer("encoder_bilstm")
enc_proj_layer      = model.get_layer("enc_proj")
bridge_h_layer      = model.get_layer("bridge_h")
bridge_c_layer      = model.get_layer("bridge_c")

encoder_inputs_inf = Input(shape=(p_in,), name="encoder_inputs_inf")
x = enc_embedding_layer(encoder_inputs_inf)
enc_seq_bi, f_h, f_c, b_h, b_c = encoder_bilstm(x)

# Puentes de estado
h_cat = Concatenate(name="concat_h_inf")([f_h, b_h])
c_cat = Concatenate(name="concat_c_inf")([f_c, b_c])
h0 = bridge_h_layer(h_cat)
c0 = bridge_c_layer(c_cat)

# Secuencia proyectada para atención
enc_seq_proj = enc_proj_layer(enc_seq_bi)

# Máscara booleana del encoder
enc_mask = tf.cast(encoder_inputs_inf != 0, tf.bool)

encoder_infer = keras.Model(
    encoder_inputs_inf, [enc_seq_proj, h0, c0, enc_mask],
    name="encoder_infer"
)
encoder_infer.summary()

# ================================================
# 2) Modelo de INFERENCIA del DECODER con máscara
# ================================================
dec_embedding_layer = model.get_layer("dec_embedding")
decoder_lstm        = model.get_layer("decoder_lstm")
attention_layer     = model.get_layer("attention")
proj_to_emb_layer   = model.get_layer("proj_to_emb")
ln_layer            = model.get_layer("ln_out")
tied_output_layer   = model.get_layer("tied_output")

# Entradas
dec_token_in    = Input(shape=(1,), name="dec_token_in")
state_h_in      = Input(shape=(decoder_lstm.units,), name="state_h_in")
state_c_in      = Input(shape=(decoder_lstm.units,), name="state_c_in")
enc_seq_proj_in = Input(shape=(p_in, decoder_lstm.units), name="enc_seq_proj_in")
enc_mask_in     = Input(shape=(p_in,), dtype="bool", name="enc_mask_in")

# Paso unario
dec_emb_step = dec_embedding_layer(dec_token_in)
dec_seq_step, h_out, c_out = decoder_lstm(dec_emb_step, initial_state=[state_h_in, state_c_in])

attn_step = attention_layer([dec_seq_step, enc_seq_proj_in], mask=[None, enc_mask_in])

# Contexto + proyección a espacio de embedding
ctx_step  = Concatenate(name="concat_ctx_step")([dec_seq_step, attn_step])
ctx_step  = proj_to_emb_layer(ctx_step)
ctx_step  = ln_layer(ctx_step)

# Logits atados + softmax
logits_step = tied_output_layer(ctx_step)
probs_step  = Activation("softmax", name="softmax_step")(logits_step)

decoder_infer = keras.Model(
    [dec_token_in, state_h_in, state_c_in, enc_seq_proj_in, enc_mask_in],
    [probs_step, h_out, c_out],
    name="decoder_infer"
)
decoder_infer.summary()

# ===============================
# 3) Helpers de preprocesamiento
# ===============================
def normalize_text(s: str) -> str:
    s = s.lower().strip()
    s = re.sub(r"([,;:\.\?!])", r" \1 ", s)
    s = re.sub(r"\s+", " ", s)
    return s

def text_to_input_ids(text: str):
    toks = normalize_text(text).split()
    ids  = [word2idx_inputs.get(t, unk_in) for t in toks]
    return pad_sequences([ids], maxlen=p_in, padding="post", truncating="post", value=PAD_IDX_IN)[0]

def ids_to_text(ids):
    words = []
    for tid in ids:
        if tid in (PAD_IDX_OUT, eos_id):
            break
        words.append(idx2word_outputs.get(int(tid), "<?>"))
    return " ".join(words)

# =======================
# 4) Copy / lexical bias
# =======================
def compute_copy_bias_ids(enc_ids):
    """Mapea palabras del input al vocab de salida y devuelve los ids a potenciar."""
    toks = [idx2word_inputs.get(int(t), None) for t in enc_ids if t != PAD_IDX_IN]
    toks = {t for t in toks if t is not None}
    out_ids = []
    for w in toks:
        oid = word2idx_outputs.get(w, None)
        if oid is not None:
            out_ids.append(int(oid))
    return list(set(out_ids))

def apply_copy_bias(probs, bias_ids, strength=0.20):
    """Multiplica por (1+strength) las probs de tokens que aparecen en el input."""
    if not bias_ids:
        return probs
    p = probs.copy()
    p[bias_ids] *= (1.0 + float(strength))
    s = p.sum()
    return p / (s + 1e-12)

# ====================================
# 5) Decoders: Greedy, Sampling, Beam
# ====================================
def _softmax(x):
    x = x - np.max(x)
    e = np.exp(x)
    return e / (e.sum() + 1e-12)

def _apply_min_len_block(probs, ban_ids, t, min_len):
    if t < min_len:
        probs = probs.copy()
        probs[ban_ids] = 0.0
        s = probs.sum()
        probs = probs if s == 0 else probs / s
    return probs

def _violates_no_repeat(next_id, generated, n):
    if n <= 0 or len(generated) < n-1:
        return False
    ngram = tuple(generated[-(n-1):] + [next_id])
    hist = tuple(generated)
    for i in range(len(hist) - (n-1)):
        if tuple(hist[i:i+n]) == ngram:
            return True
    return False

def _filter_top_k(probs, k):
    if k is None or k <= 0 or k >= len(probs): return probs
    idx = np.argpartition(probs, -k)[-k:]
    mask = np.zeros_like(probs, dtype=bool); mask[idx] = True
    out = np.where(mask, probs, 0.0); s = out.sum()
    return out / (s + 1e-12)

def _filter_top_p(probs, p):
    if p is None or p <= 0.0 or p >= 1.0: return probs
    order = np.argsort(-probs)
    cum = np.cumsum(probs[order])
    cutoff = order[cum <= p]
    if cutoff.size == 0: cutoff = order[:1]
    mask = np.zeros_like(probs, dtype=bool); mask[cutoff] = True
    out = np.where(mask, probs, 0.0); s = out.sum()
    return out / (s + 1e-12)

# ----- Greedy (con copy bias) -----
def decode_greedy(input_text: str, max_len=None, copy_bias=0.20, verbose=False):
    if max_len is None: max_len = p_out
    enc_ids = text_to_input_ids(input_text)
    enc_seq_proj, h, c, enc_mask = encoder_infer.predict(enc_ids[None, :], verbose=0)
    bias_ids = compute_copy_bias_ids(enc_ids)

    cur = np.array([[sos_id]], dtype="int32")
    out_ids = []
    for t in range(max_len):
        probs, h, c = decoder_infer.predict([cur, h, c, enc_seq_proj, enc_mask], verbose=0)
        probs = probs[0, 0]
        probs = apply_copy_bias(probs, bias_ids, strength=copy_bias)
        next_id = int(np.argmax(probs))
        if verbose:
            topk = probs.argsort()[-5:][::-1]
            print(f"t={t} -> {idx2word_outputs.get(next_id)}",
                  [(idx2word_outputs.get(int(k)), float(probs[k])) for k in topk])
        if next_id in (eos_id, PAD_IDX_OUT): break
        out_ids.append(next_id)
        cur[0,0] = next_id
    return ids_to_text(out_ids)

# ----- Sampling (temp + top-k/top-p + min_len + no_repeat_ngram + copy bias) -----
def decode_sampling(input_text: str, max_len=None, temperature=1.2, top_k=40, top_p=0.92,
                    min_len=6, no_repeat_ngram=3, copy_bias=0.20, verbose=False):
    if max_len is None: max_len = p_out
    enc_ids = text_to_input_ids(input_text)
    enc_seq_proj, h, c, enc_mask = encoder_infer.predict(enc_ids[None, :], verbose=0)
    bias_ids = compute_copy_bias_ids(enc_ids)

    cur = np.array([[sos_id]], dtype="int32")
    gen = []

    for t in range(max_len):
        probs, h, c = decoder_infer.predict([cur, h, c, enc_seq_proj, enc_mask], verbose=0)
        probs = probs[0, 0]
        probs = apply_copy_bias(probs, bias_ids, strength=copy_bias)

        # Temperatura
        if temperature != 1.0:
            logits = np.log(probs + 1e-12) / float(temperature)
            probs = _softmax(logits)

        # Bloquear EOS/PAD hasta min_len
        probs = _apply_min_len_block(probs, ban_ids=[PAD_IDX_OUT, eos_id], t=t, min_len=min_len)

        # Filtros top-k/top-p
        probs = _filter_top_k(probs, top_k)
        probs = _filter_top_p(probs, top_p)

        # Evitar n-gramas repetidos
        if no_repeat_ngram and len(gen) >= (no_repeat_ngram - 1):
            cand_order = np.argsort(-probs)
            chosen = None
            for k_id in cand_order[:50]:
                if not _violates_no_repeat(int(k_id), gen, no_repeat_ngram):
                    chosen = int(k_id); break
            if chosen is None: chosen = int(cand_order[0])
            next_id = chosen
        else:
            next_id = int(np.random.choice(len(probs), p=probs))

        if verbose:
            top5 = probs.argsort()[-5:][::-1]
            print(f"t={t} -> {idx2word_outputs.get(next_id)}",
                  [(idx2word_outputs.get(int(k)), float(probs[k])) for k in top5])

        if next_id in (eos_id, PAD_IDX_OUT): break
        gen.append(next_id)
        cur[0, 0] = next_id

    return ids_to_text(gen)

# ----- Beam Search (length penalty + min_len + no_repeat_ngram + copy bias) -----
def decode_beam(input_text: str, beam_size=5, max_len=None, lp_alpha=0.8,
                min_len=6, no_repeat_ngram=3, copy_bias=0.20):
    if max_len is None: max_len = p_out
    enc_ids = text_to_input_ids(input_text)
    enc_seq_proj, h0, c0, enc_mask = encoder_infer.predict(enc_ids[None, :], verbose=0)
    bias_ids = compute_copy_bias_ids(enc_ids)

    beams = [(0.0, [sos_id], h0, c0)]
    finished = []

    for t in range(max_len):
        new_beams = []
        for score, seq, h, c in beams:
            last = np.array([[seq[-1]]], dtype="int32")
            probs, h2, c2 = decoder_infer.predict([last, h, c, enc_seq_proj, enc_mask], verbose=0)
            probs = probs[0, 0].copy()
            probs = apply_copy_bias(probs, bias_ids, strength=copy_bias)

            # min_len: prohibir EOS/PAD
            if t < min_len:
                probs[PAD_IDX_OUT] = 0.0
                probs[eos_id] = 0.0
                s = probs.sum(); probs = probs / (s + 1e-12)

            # limitar top-k global si beam grande
            if beam_size > 5:
                probs = _filter_top_k(probs, k=50)

            topk = np.argsort(probs)[-beam_size:][::-1]
            for k_id in topk:
                k_id = int(k_id)
                if no_repeat_ngram and _violates_no_repeat(k_id, seq[1:], no_repeat_ngram):
                    continue
                p = float(probs[k_id] + 1e-12)
                new_seq = seq + [k_id]
                new_score = score - np.log(p)

                if k_id in (eos_id, PAD_IDX_OUT) or t == max_len-1:
                    lp = ((5 + len(new_seq)) / 6) ** lp_alpha
                    heapq.heappush(finished, (new_score / lp, new_seq))
                else:
                    new_beams.append((new_score, new_seq, h2, c2))

        if not new_beams and finished:
            break
        beams = sorted(new_beams, key=lambda x: x[0])[:beam_size]

    if finished:
        best = min(finished, key=lambda x: x[0])[1]
    else:
        best = beams[0][1]

    out = []
    for tid in best[1:]:
        if tid in (PAD_IDX_OUT, eos_id): break
        out.append(tid)
    return ids_to_text(out)

# ===================
# 6) Ejemplos de uso
# ===================
tests = [
    "hello how are you?",
    "Do you like going to the beach?",
    "Do you like to read?"
]
for s in tests:
    print("IN:", s)
    print("OUT greedy:",   decode_greedy(s, copy_bias=0.20))
    print("OUT beam=5:",   decode_beam(s, beam_size=5, lp_alpha=0.9, min_len=8, no_repeat_ngram=3, copy_bias=0.20))
    print("OUT sample:",   decode_sampling(s, temperature=1.2, top_k=40, top_p=0.92, min_len=8, no_repeat_ngram=3, copy_bias=0.20))
    print("---")

# ================================
# 7) Guardar modelos de inferencia
# ================================
OUT_DIR = "./desafio4/models_infer"
os.makedirs(OUT_DIR, exist_ok=True)
encoder_infer.save(f"{OUT_DIR}/encoder_infer.keras")
decoder_infer.save(f"{OUT_DIR}/decoder_infer.keras")
print("Modelos de inferencia guardados en:", OUT_DIR)


Model: "encoder_infer"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_inputs_inf (InputL  [(None, 24)]                 0         []                            
 ayer)                                                                                            
                                                                                                  
 enc_embedding (Embedding)   (None, 24, 100)              329500    ['encoder_inputs_inf[0][0]']  
                                                                                                  
 encoder_bilstm (Bidirectio  [(None, 24, 512),            731136    ['enc_embedding[3][0]']       
 nal)                         (None, 256),                                                        
                              (None, 256),                                            