In [3]:
# =============================================================================
# NOTEBOOK 2: ARQUITECTURA Y ENTRENAMIENTO (VERSIÓN COMPLETA Y CORREGIDA)
# =============================================================================

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import pickle
import os

# -----------------------------------------------------------------------------
# CONFIGURACIÓN GENERAL
# -----------------------------------------------------------------------------
# Prefer the GPU when one is visible to PyTorch; otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# Input directory for the preprocessed data and output directory for checkpoints.
DATA_DIR = "data_processed"
MODELS_DIR = "models"

# Create the checkpoint directory up front; a pre-existing one is fine.
os.makedirs(MODELS_DIR, exist_ok=True)

# --- Hyperparameters ---------------------------------------------------------
# Sequence handling (MAX_LEN will be changed in Experiment 2).
MAX_LEN = 40

# Network size.
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
N_LAYERS = 2
DROPOUT = 0.3

# Optimisation.
BATCH_SIZE = 64
LR = 0.001
# Slightly higher epoch count to let the BiLSTM stabilise.
EPOCHS = 12
# Maximum gradient norm used for gradient clipping.
CLIP = 1.0
# Number of non-improving epochs tolerated before early stopping.
PATIENCE = 3

print(f"Usando dispositivo: {DEVICE}")

# -----------------------------------------------------------------------------
# 1. CARGA DE DATOS Y VOCABULARIO
# -----------------------------------------------------------------------------
print("Cargando datos procesados...")

# Load the token -> index mapping. The file is opened in a ``with`` block so
# the handle is closed deterministically instead of being left to the GC.
# NOTE(review): pickle can execute arbitrary code while loading; only load
# vocabulary files produced by your own preprocessing step.
with open(f"{DATA_DIR}/vocab.pkl", "rb") as vocab_file:
    vocab = pickle.load(vocab_file)

train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
val_df = pd.read_csv(f"{DATA_DIR}/val.csv")

# Indices of the special tokens; a KeyError here means the vocabulary was
# built without "<PAD>" / "<UNK>" entries.
PAD_IDX = vocab["<PAD>"]
UNK_IDX = vocab["<UNK>"]

# -----------------------------------------------------------------------------
# 2. DATASET Y ENCODING
# -----------------------------------------------------------------------------
def basic_tokenize(text):
    """Lower-case ``text`` (coerced to ``str``) and split it on whitespace."""
    lowered = str(text).lower()
    tokens = lowered.split()
    return tokens

def encode_text(text, vocab):
    """Convert ``text`` into a 1-D ``torch.long`` tensor of vocabulary indices.

    The text is tokenised with ``basic_tokenize``; tokens missing from
    ``vocab`` are mapped to the module-level ``UNK_IDX``.
    """
    token_ids = []
    for token in basic_tokenize(text):
        token_ids.append(vocab.get(token, UNK_IDX))
    return torch.tensor(token_ids, dtype=torch.long)

class FinancialTweetsDataset(Dataset):
    """Map-style dataset over a DataFrame with ``"text"`` and ``"label"`` columns.

    Each item is a pair ``(encoded_text, label)`` where ``encoded_text`` is the
    tensor produced by ``encode_text`` and ``label`` is a Python ``int``.
    """

    def __init__(self, df, vocab):
        self.df = df
        self.vocab = vocab

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the encoded text and integer label of the row at position ``idx``."""
        # Fetch the row once and read both fields from it.
        row = self.df.iloc[idx]
        token_ids = encode_text(row["text"], self.vocab)
        target = int(row["label"])
        return token_ids, target

def collate_fn(batch):
    """Collate ``(token_ids, label)`` pairs into a dynamically padded batch.

    Sequences are truncated to ``MAX_LEN`` and right-padded with ``PAD_IDX``
    up to the longest (truncated) sequence in the batch.

    Args:
        batch: iterable of ``(token_ids, label)`` pairs, where ``token_ids``
            is a 1-D ``torch.long`` tensor (as produced by ``encode_text``).

    Returns:
        Tuple ``(padded, labels, lengths)``:
        ``padded`` is a ``(batch, max_len)`` long tensor, ``labels`` a
        ``(batch,)`` long tensor and ``lengths`` a ``(batch,)`` long tensor
        with the post-truncation length of every sample.

    Note:
        A sample with no tokens is reported with length 1 (a single
        ``PAD_IDX`` position) and the batch is always at least one column
        wide. ``pack_padded_sequence`` rejects non-positive lengths, so an
        empty text would otherwise abort the whole batch.
    """
    texts, labels = zip(*batch)

    # Truncate first so the padded width and the reported lengths agree.
    truncated = [seq[:MAX_LEN] for seq in texts]
    max_len = max(1, max(seq.size(0) for seq in truncated))

    # Allocate the padded batch once and copy each sequence into its row.
    padded = torch.full((len(truncated), max_len), PAD_IDX, dtype=torch.long)
    for row, seq in enumerate(truncated):
        padded[row, :seq.size(0)] = seq

    labels = torch.tensor(labels, dtype=torch.long)
    lengths = torch.tensor(
        [max(1, seq.size(0)) for seq in truncated], dtype=torch.long
    )

    return padded, labels, lengths

# Wrap both splits; the same vocabulary is used for training and validation.
train_ds = FinancialTweetsDataset(df=train_df, vocab=vocab)
val_ds = FinancialTweetsDataset(df=val_df, vocab=vocab)

# Only the training loader shuffles; both pad per batch through collate_fn.
train_loader = DataLoader(
    dataset=train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    dataset=val_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

print("Dataloaders listos.")

# -----------------------------------------------------------------------------
# 3. ATENCIÓN + MODELOS RNN (LSTM, GRU, BiLSTM)
# -----------------------------------------------------------------------------
class AttentionPooling(nn.Module):
    """Pool a sequence of RNN states into one vector with learned attention.

    A single linear layer scores every timestep; masked positions are pushed
    to a very negative score so they receive (near-)zero softmax weight.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim, 1)

    def forward(self, rnn_output, mask):
        """Return the attention-weighted sum of ``rnn_output`` over the sequence axis.

        Args:
            rnn_output: tensor of shape ``(batch, seq, hidden)``.
            mask: tensor of shape ``(batch, seq)``; entries equal to 0 mark
                positions to ignore.

        Returns:
            Tensor of shape ``(batch, hidden)``.
        """
        raw_scores = self.attn(rnn_output).squeeze(-1)            # (batch, seq)
        ignored = mask == 0
        masked_scores = raw_scores.masked_fill(ignored, -1e9)     # padding -> ~ -inf
        weights = torch.softmax(masked_scores, dim=1).unsqueeze(-1)
        return (rnn_output * weights).sum(dim=1)

class RecurrentClassifier(nn.Module):
    """Embedding -> (Bi)LSTM/GRU -> attention pooling -> linear classifier.

    Args:
        model_type: ``"lstm"`` or ``"bilstm"`` select an LSTM (bidirectional
            for ``"bilstm"``); any other value selects a unidirectional GRU.
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding dimension.
        hidden_dim: hidden size of the recurrent layer (per direction).
        out_dim: number of output classes.
        n_layers: number of stacked recurrent layers.
        dropout: dropout probability, applied between recurrent layers (only
            when ``n_layers > 1``) and before the final linear layer.
        pad_idx: vocabulary index of the padding token.
    """

    def __init__(self, model_type, vocab_size, embed_dim, hidden_dim, out_dim, n_layers, dropout, pad_idx):
        super().__init__()
        self.model_type = model_type

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)

        is_bi = (model_type == "bilstm")
        num_directions = 2 if is_bi else 1

        # Both recurrent layers take the same arguments; only the class differs.
        rnn_cls = nn.LSTM if model_type in ("lstm", "bilstm") else nn.GRU
        self.rnn = rnn_cls(embed_dim, hidden_dim, n_layers,
                           batch_first=True,
                           # Inter-layer dropout is only used with stacked layers.
                           dropout=dropout if n_layers > 1 else 0,
                           bidirectional=is_bi)

        self.attention = AttentionPooling(hidden_dim * num_directions)
        self.fc = nn.Linear(hidden_dim * num_directions, out_dim)
        self.dropout = nn.Dropout(dropout)
        self.pad_idx = pad_idx

    def forward(self, x, lengths):
        """Compute class logits for a padded batch of token ids.

        Args:
            x: ``(batch, seq)`` tensor of token indices, padded with ``pad_idx``.
            lengths: ``(batch,)`` tensor with the true length of every row;
                every entry must be positive.

        Returns:
            ``(batch, out_dim)`` tensor of unnormalised logits.
        """
        mask = (x != self.pad_idx).float()  # (batch, seq)

        emb = self.embedding(x)             # (batch, seq, embed)

        # Pack so the RNN skips padded timesteps; lengths are moved to the CPU
        # for the packing utility.
        packed = nn.utils.rnn.pack_padded_sequence(
            emb, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.rnn(packed)
        # Unpack to the same width as ``x``. Without ``total_length`` the output
        # is only ``max(lengths)`` wide, which mismatches ``mask`` whenever the
        # batch is padded beyond its longest sequence.
        rnn_out, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=True, total_length=x.size(1)
        )

        # Attention pooling over the non-padding positions, then classify.
        context = self.attention(rnn_out, mask)
        logits = self.fc(self.dropout(context))
        return logits

# -----------------------------------------------------------------------------
# 4. FUNCIÓN DE ENTRENAMIENTO Y EVALUACIÓN
# -----------------------------------------------------------------------------
def train_epoch(model, loader, optimizer, criterion):
    """Train ``model`` for one pass over ``loader``.

    Every batch is moved to ``DEVICE``; gradients are clipped to a maximum
    norm of ``CLIP`` before each optimizer step.

    Returns:
        Tuple ``(mean_loss, accuracy)`` averaged over all samples seen.
    """
    model.train()
    running_loss = 0
    hits = 0
    seen = 0

    for batch in loader:
        xb, yb, lengths = (tensor.to(DEVICE) for tensor in batch)
        batch_size = xb.size(0)

        optimizer.zero_grad()
        logits = model(xb, lengths)
        loss = criterion(logits, yb)
        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), CLIP)
        optimizer.step()

        # The criterion returns a batch mean; re-weight it by the batch size
        # so the epoch figure is a per-sample average.
        running_loss += loss.item() * batch_size
        predictions = logits.argmax(1)
        hits += (predictions == yb).sum().item()
        seen += batch_size

    mean_loss = running_loss / seen
    accuracy = hits / seen
    return mean_loss, accuracy

def evaluate(model, loader, criterion):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Returns:
        Tuple ``(mean_loss, accuracy)`` averaged over all samples seen.
    """
    model.eval()
    running_loss = 0
    hits = 0
    seen = 0

    with torch.no_grad():
        for batch in loader:
            xb, yb, lengths = (tensor.to(DEVICE) for tensor in batch)
            batch_size = xb.size(0)

            logits = model(xb, lengths)
            batch_loss = criterion(logits, yb).item()

            # Re-weight the batch-mean loss by the batch size.
            running_loss += batch_loss * batch_size
            predictions = logits.argmax(1)
            hits += (predictions == yb).sum().item()
            seen += batch_size

    mean_loss = running_loss / seen
    accuracy = hits / seen
    return mean_loss, accuracy

# -----------------------------------------------------------------------------
# 5. PÉRDIDA PONDERADA (DESBALANCE REAL)
# -----------------------------------------------------------------------------
# Inverse-frequency class weights, normalised to sum to 1. Counts are sorted
# by label value so the weights follow ascending label order
# (assumes labels are the class indices expected by the loss; confirm).
counts = train_df["label"].value_counts().sort_index()
inverse_freq = 1.0 / counts
weights = inverse_freq / inverse_freq.sum()
class_weights = torch.tensor(weights.values, dtype=torch.float, device=DEVICE)

# Weighted cross-entropy so rarer classes contribute more to the loss.
criterion = nn.CrossEntropyLoss(weight=class_weights)

print("Pesos por clase:", weights.values)

# -----------------------------------------------------------------------------
# 6. ENTRENAMIENTO DE MODELOS (LSTM, GRU, BiLSTM)
# -----------------------------------------------------------------------------
# Per-architecture training curves, keyed by model type.
histories = {}

for m_type in ["lstm", "gru", "bilstm"]:
    print(f"\nEntrenando modelo: {m_type.upper()}")
    print("=" * 50)

    # A fresh model, optimizer and scheduler for every architecture.
    model = RecurrentClassifier(
        model_type=m_type,
        vocab_size=len(vocab),
        embed_dim=EMBEDDING_DIM,
        hidden_dim=HIDDEN_DIM,
        out_dim=3,
        n_layers=N_LAYERS,
        dropout=DROPOUT,
        pad_idx=PAD_IDX
    ).to(DEVICE)

    optimizer = optim.AdamW(model.parameters(), lr=LR)
    # Halve the learning rate every 3 epochs.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.5)

    # Single source of truth for the checkpoint location, so the log message
    # below always reports the path that was actually written.
    checkpoint_path = f"{MODELS_DIR}/{m_type}_best_model.pth"

    best_loss = float("inf")
    patience_counter = 0

    history = {"train_loss": [], "val_loss": [], "train_acc": [], "val_acc": []}

    for ep in range(EPOCHS):
        tl, ta = train_epoch(model, train_loader, optimizer, criterion)
        vl, va = evaluate(model, val_loader, criterion)

        scheduler.step()

        history["train_loss"].append(tl)
        history["val_loss"].append(vl)
        history["train_acc"].append(ta)
        history["val_acc"].append(va)

        print(f"Epoch {ep+1}/{EPOCHS} | "
              f"Train Loss: {tl:.4f}, Acc: {ta:.4f} | "
              f"Val Loss: {vl:.4f}, Acc: {va:.4f}")

        # Early stopping on validation loss: checkpoint on every improvement,
        # stop after PATIENCE consecutive epochs without one.
        if vl < best_loss:
            best_loss = vl
            patience_counter = 0

            # Store the weights together with the vocabulary and the
            # architecture settings used to build this model.
            torch.save({
                "model_state": model.state_dict(),
                "vocab": vocab,
                "config": {
                    "model_type": m_type,
                    "embed_dim": EMBEDDING_DIM,
                    "hidden_dim": HIDDEN_DIM,
                    "n_layers": N_LAYERS,
                    "dropout": DROPOUT
                }
            }, checkpoint_path)

            print(f" --> Guardado mejor modelo en {checkpoint_path}")

        else:
            patience_counter += 1
            if patience_counter >= PATIENCE:
                print(" --> Early stopping activado")
                break

    histories[m_type] = history

# Persist the full training history of all three models.
with open(f"{MODELS_DIR}/history.pkl", "wb") as f:
    pickle.dump(histories, f)

print("\n¡Entrenamiento finalizado para LSTM, GRU y BiLSTM!")


Usando dispositivo: cuda
Cargando datos procesados...
Dataloaders listos.
Pesos por clase: [0.50567778 0.37744553 0.11687668]

Entrenando modelo: LSTM
Epoch 1/12 | Train Loss: 1.0434, Acc: 0.5193 | Val Loss: 0.9537, Acc: 0.6119
 --> Guardado mejor modelo en models/lstm_best_model.pth
Epoch 2/12 | Train Loss: 0.8496, Acc: 0.6748 | Val Loss: 0.8434, Acc: 0.6650
 --> Guardado mejor modelo en models/lstm_best_model.pth
Epoch 3/12 | Train Loss: 0.6771, Acc: 0.7481 | Val Loss: 0.8103, Acc: 0.6884
 --> Guardado mejor modelo en models/lstm_best_model.pth
Epoch 4/12 | Train Loss: 0.5027, Acc: 0.8130 | Val Loss: 0.8373, Acc: 0.7252
Epoch 5/12 | Train Loss: 0.4157, Acc: 0.8527 | Val Loss: 0.8468, Acc: 0.7160
Epoch 6/12 | Train Loss: 0.3510, Acc: 0.8746 | Val Loss: 0.8767, Acc: 0.7337
 --> Early stopping activado

Entrenando modelo: GRU
Epoch 1/12 | Train Loss: 1.0058, Acc: 0.5239 | Val Loss: 0.9505, Acc: 0.6551
 --> Guardado mejor modelo en models/gru_best_model.pth
Epoch 2/12 | Train Loss: 0.815