# Importaciones 

In [7]:

import os, json, math, time, hashlib, re
from collections import Counter, defaultdict
from typing import List, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.nn import functional as F


# Funciones

In [None]:


def load_texts(path: str) -> List[str]:
    df = pd.read_csv(path)
    return df["Texto"].astype(str).tolist()

def split_train_val(texts: List[str], train_size=TRAIN_SIZE, val_size=VAL_SIZE, seed=SEED):
    rng = np.random.default_rng(seed)
    idx = np.arange(len(texts))
    rng.shuffle(idx)
    train_idx = idx[:train_size] if train_size > 0 else []
    val_idx   = idx[train_size:train_size+val_size] if val_size > 0 else []
    train = [texts[i] for i in train_idx]
    val   = [texts[i] for i in val_idx]
    return train, val

def tokenize_words(text: str):
    return re.findall(r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ0-9]+", text.lower())

def build_vocab(texts: List[str], min_freq=1, max_size=30000):
    counter = Counter()
    for t in texts:
        counter.update(tokenize_words(t))
    stoi = {"<PAD>":0, "<UNK>":1, "<BOS>":2}
    for w, c in counter.most_common():
        if c < min_freq: break
        if len(stoi) >= max_size: break
        if w not in stoi:
            stoi[w] = len(stoi)
    itos = {i:s for s,i in stoi.items()}
    return stoi, itos

def numericalize(texts: List[str], stoi: dict) -> List[List[int]]:
    ids = []
    for t in texts:
        toks = ["<BOS>"] + tokenize_words(t)
        ids.append([stoi.get(tok, stoi["<UNK>"]) for tok in toks])
    return ids

def make_batches(seqs: List[List[int]], batch_size=BSZ, pad_id=0):
    for i in range(0, len(seqs), batch_size):
        chunk = seqs[i:i+batch_size]
        if not chunk:
            return
        mlen = max(len(x) for x in chunk)
        X = np.full((len(chunk), mlen-1), pad_id, dtype=np.int64)
        Y = np.full((len(chunk), mlen-1), pad_id, dtype=np.int64)
        for r, s in enumerate(chunk):
            X[r, :len(s)-1] = s[:-1]
            Y[r, :len(s)-1] = s[1:]
        yield X, Y, (X != pad_id)  



# funciones para KN


def train_kn_trigram(corpus: List[str]):
    
    un, bi, tri = Counter(), Counter(), Counter()
    cont_prev = defaultdict(set)
    for line in corpus:
        toks = ["<BOS>"] + tokenize_words(line)
        un.update(toks)
        for i in range(1, len(toks)):
            bi[(toks[i-1], toks[i])] += 1
        for i in range(2, len(toks)):
            tri[(toks[i-2], toks[i-1], toks[i])] += 1
            cont_prev[(toks[i-2], toks[i-1])].add(toks[i])
    return un, bi, tri, cont_prev

def prob_kn(w2, w1, w, un, bi, tri, cont_prev):
    
    c_tri = tri[(w2, w1, w)]
    c_bi  = bi[(w2, w1)]
    lambda_ = (D / max(1, c_bi)) * len(cont_prev[(w2, w1)]) if c_bi > 0 else 1.0
    cont_w  = sum(1 for (a,b,c) in tri if c == w)
    tot_cont = len(tri) if tri else 1
    p_cont = cont_w / tot_cont
    p_ml   = max(0, c_tri - D) / c_bi if c_bi > 0 else 0.0
    return p_ml + lambda_ * p_cont

def perplexity_kn(corpus: List[str], un, bi, tri, cont_prev) -> float:
    
    N, logp = 0, 0.0
    for line in corpus:
        toks = ["<BOS>"] + tokenize_words(line)
        for i in range(2, len(toks)):
            p = prob_kn(toks[i-2], toks[i-1], toks[i], un, bi, tri, cont_prev)
            logp -= math.log(max(p, 1e-12))
            N += 1
    return math.exp(logp / max(N, 1))

def run_kn(train_texts: List[str], val_texts: List[str], outdir: str = OUT_DIR) -> float:
    
    os.makedirs(outdir, exist_ok=True)
    un, bi, tri, cont_prev = train_kn_trigram(train_texts)
    perp_val = perplexity_kn(val_texts, un, bi, tri, cont_prev)
    with open(os.path.join(outdir, "kn_results.json"), "w", encoding="utf-8") as f:
        json.dump({"perplexity_val": float(perp_val)}, f, indent=2)
    print(f"[KN] Perplejidad (val): {perp_val:.3f}")
    return perp_val



#funciones para red neruronal

def make_causal_mask(T: int):
    return torch.triu(torch.ones((T, T), dtype=torch.bool), diagonal=1)

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=2048):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer("pe", pe.unsqueeze(0))
    def forward(self, x):
        return x + self.pe[:, : x.size(1), :]
# transformer con 2 capas y 4 cabeceras
class TransformerLM(nn.Module):

    def __init__(self, vocab_size, d_model=256, nhead=4, num_layers=2, dim_ff=512, dropout=0.2, tie_weights=True):
        super().__init__()
        self.emb  = nn.Embedding(vocab_size, d_model)
        self.pos  = PositionalEncoding(d_model)
        layer     = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_ff,
                                               dropout=dropout, batch_first=True)
        self.enc  = nn.TransformerEncoder(layer, num_layers=num_layers)
        self.ln   = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size, bias=False)
        if tie_weights:
            self.head.weight = self.emb.weight  # weight tying

    def forward(self, x, src_key_padding_mask=None, attn_mask=None):
        h = self.emb(x)
        h = self.pos(h)
        h = self.enc(h, mask=attn_mask, src_key_padding_mask=src_key_padding_mask)
        h = self.ln(h)
        return self.head(h)

def train_transformer(train_ids: List[List[int]], val_ids: List[List[int]],
                      pad_id: int, vocab_size: int,
                      epochs=EPOCHS, bsz=BSZ, lr=LR, outdir: str = OUT_DIR,
                      device: str = None):
    os.makedirs(outdir, exist_ok=True)
    

    model = TransformerLM(vocab_size=vocab_size, d_model=256, nhead=4, num_layers=2,
                          dim_ff=512, dropout=0.2, tie_weights=True).to(device)
    opt  = torch.optim.AdamW(model.parameters(), lr=lr)
    crit = nn.CrossEntropyLoss(ignore_index=pad_id)

    hist = {"train_loss": [], "val_perplexity": [], "grad_norm": [], "tokens_per_sec": []}
    def tokens_per_sec(n, dt): return n / max(dt, 1e-9)

    for ep in range(1, epochs + 1):
        model.train()
        t0 = time.time()
        tot_loss, tot_tok, tot_grad = 0.0, 0, 0.0

        for X, Y, _ in make_batches(train_ids, batch_size=bsz, pad_id=pad_id):
            X = torch.from_numpy(X).to(device)
            Y = torch.from_numpy(Y).to(device)
            pad_mask = (X == pad_id)                 # (B, T)
            T = X.size(1)
            attn_mask = make_causal_mask(T).to(device)   # (T, T)

            logits = model(X, src_key_padding_mask=pad_mask, attn_mask=attn_mask)
            loss = crit(logits.reshape(-1, logits.size(-1)), Y.reshape(-1))

            opt.zero_grad()
            loss.backward()
            gnorm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0).item()
            tot_grad += gnorm
            opt.step()

            n_tok = int((~pad_mask).sum().item())
            tot_tok += n_tok
            tot_loss += loss.item()

        dt = time.time() - t0
        hist["tokens_per_sec"].append(tokens_per_sec(tot_tok, dt))
        hist["train_loss"].append(tot_loss)
        hist["grad_norm"].append(tot_grad)

       #Vvalidacion    
        model.eval()
        N, logp = 0, 0.0
        with torch.no_grad():
            for X, Y, _ in make_batches(val_ids, batch_size=bsz, pad_id=pad_id):
                X = torch.from_numpy(X).to(device)
                Y = torch.from_numpy(Y).to(device)
                pad_mask = (X == pad_id)
                T = X.size(1)
                attn_mask = make_causal_mask(T).to(device)

                logits = model(X, src_key_padding_mask=pad_mask, attn_mask=attn_mask)
                lp = F.log_softmax(logits, dim=-1)
                y_flat = Y.reshape(-1)
                idx = torch.arange(y_flat.numel(), device=device)
                pick = lp.reshape(-1, lp.size(-1))[idx, y_flat]
                valid = (y_flat != pad_id)
                logp -= pick[valid].sum().item()
                N += valid.sum().item()

        perp = float(math.exp(logp / max(N, 1)))
        hist["val_perplexity"].append(perp)
        print(f"[Transformer] Epoch {ep} | val_perp={perp:.2f} | tokens/s={hist['tokens_per_sec'][-1]:.1f}")
        

    #guardado de resultados
    with open(os.path.join(outdir, "transformer_results.json"), "w", encoding="utf-8") as f:
        json.dump({
            "perplexity_val": hist["val_perplexity"][-1],
            "tokens_per_sec_last": hist["tokens_per_sec"][-1]
        }, f, indent=2)

    plt.figure(); plt.plot(hist["val_perplexity"]); plt.title("Perplejidad validación"); plt.xlabel("Época"); plt.ylabel("Perplejidad"); plt.savefig(os.path.join(outdir, "val_perplexity.png"), dpi=140); plt.close()
    plt.figure(); plt.plot(hist["grad_norm"]);      plt.title("Norma gradiente (sum)");  plt.xlabel("Época"); plt.ylabel("Norma (sum)");   plt.savefig(os.path.join(outdir, "grad_norm.png"), dpi=140); plt.close()
    plt.figure(); plt.plot(hist["tokens_per_sec"]); plt.title("Tokens por segundo");     plt.xlabel("Época"); plt.ylabel("tokens/s");      plt.savefig(os.path.join(outdir, "tokens_per_sec.png"), dpi=140); plt.close()

    return hist

def compare_results(outdir: str = OUT_DIR):
    kn = json.load(open(os.path.join(outdir, "kn_results.json")))
    tr = json.load(open(os.path.join(outdir, "transformer_results.json")))
    print(f"[compare] Perplejidad KN: {kn['perplexity_val']}")
    print(f"[compare] Perplejidad Transformer: {tr['perplexity_val']}")
    return kn["perplexity_val"], tr["perplexity_val"]


# Definicioin de variables globales

In [None]:
DATA_DIR   = "../data"                     
OUT_DIR    = "../out"                     
CSV_PATH   = f"{DATA_DIR}/nlp_prueba_cc0c2.csv" 


SEED       = 42        
TRAIN_SIZE = 4000     
VAL_SIZE   = 1000     
EPOCHS     = 6        
BSZ        = 32       
LR         = 3e-4     
D          = 0.75   # descuento de Kneser–Ney

# Eejcucion de KN

In [9]:
texts = load_texts(CSV_PATH)
train_texts, _ = split_train_val(texts, train_size=TRAIN_SIZE, val_size=0, seed=SEED)
_, val_texts   = split_train_val(texts, train_size=0, val_size=VAL_SIZE, seed=SEED)

un, bi, tri, cont_prev = train_kn_trigram(train_texts)
perp_kn = perplexity_kn(val_texts, un, bi, tri, cont_prev)
print(f"[KN] Perplejidad (val): {perp_kn:.4f}")

with open(os.path.join(OUT_DIR, "kn_results.json"), "w", encoding="utf-8") as f:
    json.dump({"perplexity_val": float(perp_kn)}, f, indent=2)

[KN] Perplejidad (val): 1.5115


# Ejecucion del transformer

In [10]:

texts = load_texts(CSV_PATH)
train_texts, _ = split_train_val(texts, train_size=TRAIN_SIZE, val_size=0, seed=SEED)
_, val_texts   = split_train_val(texts, train_size=0, val_size=VAL_SIZE, seed=SEED)

stoi, _  = build_vocab(train_texts)
train_ids = numericalize(train_texts, stoi)
val_ids   = numericalize(val_texts,   stoi)
pad_id     = stoi["<PAD>"]
vocab_size = len(stoi)


hist = train_transformer(train_ids, val_ids, pad_id, vocab_size,
                         epochs=EPOCHS, bsz=BSZ, lr=LR, outdir=OUT_DIR)


[Transformer] Epoch 1 | val_perp=2.49 | tokens/s=13396.3
[Transformer] Epoch 2 | val_perp=2.63 | tokens/s=13942.2
[Transformer] Epoch 3 | val_perp=1.84 | tokens/s=14258.3
[Transformer] Epoch 4 | val_perp=1.79 | tokens/s=13923.4
[Transformer] Epoch 5 | val_perp=1.73 | tokens/s=14365.1
[Transformer] Epoch 6 | val_perp=1.69 | tokens/s=14368.1


# Comparacion 

In [11]:

if os.path.exists(os.path.join(OUT_DIR, "kn_results.json")) and os.path.exists(os.path.join(OUT_DIR, "transformer_results.json")):
    kn = json.load(open(os.path.join(OUT_DIR, "kn_results.json")))
    tr = json.load(open(os.path.join(OUT_DIR, "transformer_results.json")))
    print(f"Perplejidad KN: {kn['perplexity_val']}")
    print(f"Perplejidad Transformer: {tr['perplexity_val']}")

    plt.figure()
    plt.bar(["KN trigram", "Transformer"], [kn["perplexity_val"], tr["perplexity_val"]])
    for i, v in enumerate([kn["perplexity_val"], tr["perplexity_val"]]):
        plt.text(i, v, f"{v:.3f}", ha="center", va="bottom")
    plt.title("Perplejidad final en validación"); plt.ylabel("Perplejidad")
    plt.savefig(os.path.join(OUT_DIR, "compare_perplexity.png"), dpi=140); plt.close()
else:
    print("Corre primero 'Ejecución KN' y/o 'Ejecución Transformer' para comparar.")


Perplejidad KN: 1.5115120047949142
Perplejidad Transformer: 1.6940707992331128
