# Hoja de Trabajo 2
# Deep Learning

Autores:

- Nelson García 22434
- Joaquín Puente 22296

## Ejercicio 1 - Experimentación Práctica

### Task 1: Preparación del conjunto de datos.

In [12]:
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [13]:
# Reproducibilidad
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

<torch._C.Generator at 0x7ae72a74bcd0>

In [14]:
# Dispositivo
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda


In [15]:
# 1) Cargar dataset Iris
iris = load_iris()
X = iris.data.astype(np.float32)     # sepal length/width, petal length/width
y = iris.target.astype(np.int64)     # 3 clases: setosa, versicolor, virginica
class_names = iris.target_names

In [16]:
# 2) Train/Validation split estratificado (80/20)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

In [17]:
# 3) Normalización (fit SOLO en train, aplicar en val)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train).astype(np.float32)
X_val   = scaler.transform(X_val).astype(np.float32)


In [18]:
# 4) Convertir a tensores
X_train_t = torch.from_numpy(X_train)
y_train_t = torch.from_numpy(y_train)
X_val_t   = torch.from_numpy(X_val)
y_val_t   = torch.from_numpy(y_val)

In [19]:
# 5) Crear TensorDatasets y DataLoaders
batch_size = 16
train_ds = TensorDataset(X_train_t, y_train_t)
val_ds   = TensorDataset(X_val_t, y_val_t)

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

In [20]:
# 6) Info útil para el modelado posterior
input_dim  = X_train_t.shape[1]
num_classes = len(np.unique(y))

print(f"Train size: {len(train_ds)} | Val size: {len(val_ds)}")
print(f"Input dim: {input_dim} | Num classes: {num_classes}")
print("Clases:", class_names)

Train size: 120 | Val size: 30
Input dim: 4 | Num classes: 3
Clases: ['setosa' 'versicolor' 'virginica']


## Task 2 - Arquitectura modelo

In [21]:
import torch.nn as nn

In [22]:
def get_activation(name: str) -> nn.Module:
    name = name.lower()
    if name == "relu":
        return nn.ReLU(inplace=True)
    if name == "leaky_relu":
        return nn.LeakyReLU(0.1, inplace=True)
    if name == "tanh":
        return nn.Tanh()
    if name == "gelu":
        return nn.GELU()
    if name == "elu":
        return nn.ELU(inplace=True)
    return nn.ReLU(inplace=True)

In [23]:
class MLP(nn.Module):
    """
    MLP para clasificación multiclase.
    - Capa de entrada: input_dim
    - Capas ocultas: hidden_dims (tupla/lista)
    - Activación: configurable (relu, tanh, leaky_relu, gelu, elu)
    - Capa de salida: num_classes logits (sin softmax; usarás CrossEntropyLoss)
    - Opcional: BatchNorm y Dropout
    """
    def __init__(
        self,
        input_dim: int,
        hidden_dims=(16, 8),
        num_classes: int = 3,
        activation: str = "relu",
        dropout: float = 0.0,
        batchnorm: bool = False,
    ):
        super().__init__()
        layers = []
        prev = input_dim
        act_layer = get_activation(activation)

        for h in hidden_dims:
            layers.append(nn.Linear(prev, h))
            if batchnorm:
                layers.append(nn.BatchNorm1d(h))
            # activación
            layers.append(get_activation(activation))
            # dropout opcional
            if dropout and dropout > 0:
                layers.append(nn.Dropout(p=dropout))
            prev = h

        # Capa de salida (logits)
        layers.append(nn.Linear(prev, num_classes))
        self.net = nn.Sequential(*layers)

        # Inicialización de pesos acorde a la activación
        self._init_weights(activation)

    def _init_weights(self, activation: str):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                if activation.lower() in ["relu", "leaky_relu", "elu"]:
                    nn.init.kaiming_normal_(m.weight, nonlinearity="relu")
                elif activation.lower() in ["tanh"]:
                    nn.init.xavier_normal_(m.weight, gain=nn.init.calculate_gain("tanh"))
                else:  # gelu u otras
                    nn.init.xavier_normal_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def forward(self, x):
        return self.net(x)

In [24]:
# --- Elecciones baselines (puedes cambiarlas para experimentar) ---
hidden_dims = (16, 8)       # Nº neuronas por capa oculta
activation  = "relu"        # "relu" | "leaky_relu" | "tanh" | "gelu" | "elu"
dropout     = 0.0           # 0.0 por ahora; lo usaremos en regularización luego
batchnorm   = False         # False por ahora

model = MLP(
    input_dim=input_dim,
    hidden_dims=hidden_dims,
    num_classes=num_classes,
    activation=activation,
    dropout=dropout,
    batchnorm=batchnorm
).to(device)

print(model)

MLP(
  (net): Sequential(
    (0): Linear(in_features=4, out_features=16, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=16, out_features=8, bias=True)
    (3): ReLU(inplace=True)
    (4): Linear(in_features=8, out_features=3, bias=True)
  )
)


In [25]:
# Conteo de parámetros entrenables
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Parámetros entrenables: {num_params:,}")


Parámetros entrenables: 243


In [26]:
# Comprobación rápida de shapes con un batch
xb, yb = next(iter(train_loader))
xb = xb.to(device)
with torch.no_grad():
    logits = model(xb)
print("Logits shape:", logits.shape)

Logits shape: torch.Size([16, 3])


## Task 3 - Funciones de Pérdida

In [27]:
from copy import deepcopy

In [28]:
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

<torch._C.Generator at 0x7ae72a74bcd0>

In [29]:
def get_loss_and_transform(name: str):
    """
    Devuelve (criterion, transform) donde:
    - criterion: función de pérdida de PyTorch
    - transform(logits, y): adapta (logits, target) a lo que espera la pérdida
      y retorna (pred_for_loss, target_for_loss)
    """
    name = name.lower()
    if name in ("cross_entropy", "ce", "cross-entropy"):
        criterion = nn.CrossEntropyLoss()
        def transform(logits, y):
            # CrossEntropyLoss espera (logits, target_idx)
            return logits, y
        return criterion, transform

    if name in ("mse", "mse_loss"):
        criterion = nn.MSELoss()
        def transform(logits, y):
            # Para MSE, comparamos probabilidades softmax vs. one-hot del target
            probs = torch.softmax(logits, dim=1)
            y_onehot = torch.zeros_like(probs)
            y_onehot.scatter_(1, y.unsqueeze(1), 1.0)
            return probs, y_onehot
        return criterion, transform

    if name in ("multi_margin", "hinge", "svm"):
        # Hinge multiclass (SVM-style) incluida en PyTorch
        criterion = nn.MultiMarginLoss()  # margin=1.0 por defecto
        def transform(logits, y):
            # MultiMarginLoss espera (scores, target_idx)
            return logits, y
        return criterion, transform

    raise ValueError(f"Pérdida no soportada: {name}")



In [30]:
# Entrenamiento genérico para una pérdida dada
def train_with_loss(
    loss_name: str,
    epochs: int = 100,
    lr: float = 1e-2,
    weight_decay: float = 0.0,
):
    # Reinstanciar el modelo para cada pérdida (misma arquitectura)
    try:
        model_kwargs = dict(
            input_dim=input_dim,
            hidden_dims=hidden_dims,
            num_classes=num_classes,
            activation=activation,
            dropout=dropout,
            batchnorm=batchnorm
        )
    except NameError:
        model_kwargs = dict(
            input_dim=input_dim,
            hidden_dims=(16, 8),
            num_classes=num_classes,
            activation="relu",
            dropout=0.0,
            batchnorm=False
        )

    torch.manual_seed(RANDOM_STATE)
    np.random.seed(RANDOM_STATE)
    model = MLP(**model_kwargs).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion, transform = get_loss_and_transform(loss_name)

    history = {
        "train_loss": [], "val_loss": [],
        "train_acc":  [], "val_acc":  []
    }

    for epoch in range(epochs):
        # ---- TRAIN ----
        model.train()
        total_loss, total_correct, total_n = 0.0, 0, 0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            logits = model(xb)

            pred_for_loss, target_for_loss = transform(logits, yb)
            loss = criterion(pred_for_loss, target_for_loss)

            loss.backward()
            optimizer.step()

            total_loss += loss.item() * xb.size(0)
            total_correct += (logits.argmax(dim=1) == yb).sum().item()
            total_n += xb.size(0)

        train_loss = total_loss / total_n
        train_acc  = total_correct / total_n

        # ---- EVAL ----
        model.eval()
        val_total_loss, val_correct, val_n = 0.0, 0, 0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                logits = model(xb)
                pred_for_loss, target_for_loss = transform(logits, yb)
                vloss = criterion(pred_for_loss, target_for_loss)
                val_total_loss += vloss.item() * xb.size(0)
                val_correct += (logits.argmax(dim=1) == yb).sum().item()
                val_n += xb.size(0)

        val_loss = val_total_loss / val_n
        val_acc  = val_correct / val_n

        history["train_loss"].append(train_loss)
        history["val_loss"].append(val_loss)
        history["train_acc"].append(train_acc)
        history["val_acc"].append(val_acc)

        if (epoch + 1) % 10 == 0 or epoch == 1:
            print(f"[{loss_name}] Epoch {epoch+1:03d} | "
                  f"train_loss={train_loss:.4f}, val_loss={val_loss:.4f} | "
                  f"train_acc={train_acc:.3f}, val_acc={val_acc:.3f}")

    return model, history

In [31]:
# ----- Ejecutar experimentos con distintas pérdidas -----
loss_names = ["cross_entropy", "mse", "multi_margin"]
epochs = 100
lr = 1e-2
weight_decay = 0.0

histories = {}
final_summary = []

for name in loss_names:
    print("\n" + "="*70)
    print(f"Entrenando con pérdida: {name}")
    print("="*70)
    model_trained, hist = train_with_loss(
        loss_name=name,
        epochs=epochs,
        lr=lr,
        weight_decay=weight_decay
    )
    histories[name] = hist
    final_summary.append({
        "loss": name,
        "final_train_loss": hist["train_loss"][-1],
        "final_val_loss":   hist["val_loss"][-1],
        "final_train_acc":  hist["train_acc"][-1],
        "final_val_acc":    hist["val_acc"][-1],
    })

print("\n--- Resumen final ---")
for row in final_summary:
    print(f"{row['loss']:>13} | "
          f"train_loss={row['final_train_loss']:.4f}, val_loss={row['final_val_loss']:.4f} | "
          f"train_acc={row['final_train_acc']:.3f}, val_acc={row['final_val_acc']:.3f}")



Entrenando con pérdida: cross_entropy
[cross_entropy] Epoch 002 | train_loss=0.7919, val_loss=0.6962 | train_acc=0.758, val_acc=0.800
[cross_entropy] Epoch 010 | train_loss=0.3986, val_loss=0.4194 | train_acc=0.900, val_acc=0.900
[cross_entropy] Epoch 020 | train_loss=0.1847, val_loss=0.2216 | train_acc=0.958, val_acc=0.933
[cross_entropy] Epoch 030 | train_loss=0.1034, val_loss=0.1541 | train_acc=0.975, val_acc=0.933
[cross_entropy] Epoch 040 | train_loss=0.0788, val_loss=0.1405 | train_acc=0.992, val_acc=0.933
[cross_entropy] Epoch 050 | train_loss=0.0584, val_loss=0.1151 | train_acc=0.983, val_acc=0.967
[cross_entropy] Epoch 060 | train_loss=0.0464, val_loss=0.1157 | train_acc=0.983, val_acc=0.967
[cross_entropy] Epoch 070 | train_loss=0.0444, val_loss=0.1085 | train_acc=0.975, val_acc=0.967
[cross_entropy] Epoch 080 | train_loss=0.0363, val_loss=0.1527 | train_acc=0.975, val_acc=0.967
[cross_entropy] Epoch 090 | train_loss=0.0253, val_loss=0.1423 | train_acc=0.992, val_acc=0.967
[