In [1]:
"""%pip install --quiet --upgrade pip
!pip uninstall -y numpy && pip install "numpy<2.0"
%pip install scikit-learn xgboost shap optuna
%pip install skorch optuna scikit-learn xgboost
%pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126"""
%pip install catboost pytorch-tabnet

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score

from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from pytorch_tabnet.tab_model import TabNetClassifier  # :contentReference[oaicite:2]{index=2}

from skorch import NeuralNetBinaryClassifier
from skorch.callbacks import EarlyStopping

import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Carga tus CSV ya procesados
train = pd.read_csv("../data/train_final.csv")
test  = pd.read_csv("../data/test_final.csv")


ID_COLS  = ["PassengerId", "Name"]          # no entran al modelo
TARGET   = "Transported"                    # 0 / 1

In [4]:
# 🔍 1.1 – Columnas numéricas “continuas”
num_cont = [
    "Age", "RoomService", "FoodCourt", "ShoppingMall",
    "Spa", "VRDeck", "CabinNum", "CabinNum_scaled",
    "TotalSpent"
]

# 🔍 1.2 – Enteros / ordinales a imputar con moda
num_cat_int = ["CabinDeck", "CabinSide", "Group"]

# 🔍 1.3 – Booleanos (0/1) – también modo
num_bool = [
    "HomePlanet_Europa", "HomePlanet_Mars",
    "CryoSleep_True",
    "Destination_PSO J318.5-22", "Destination_TRAPPIST-1e",
    "VIP", "HasSpent"
]

In [5]:
# 2.2) Imputadores y escalador robusto
iter_imp = IterativeImputer(
    estimator=ExtraTreesRegressor(n_estimators=40, max_depth=8, random_state=42),
    max_iter=10,
    initial_strategy='median',
    random_state=42
)  # Imputación iterativa para continuas :contentReference[oaicite:5]{index=5}

mode_imp = SimpleImputer(strategy='most_frequent')  # moda para categóricas/booleanas

# 2.3) ColumnTransformer con imputación y escalado
preprocess = ColumnTransformer([
    ("num_cont",   Pipeline([("imp", iter_imp), ("sc", RobustScaler())]), num_cont),
    ("num_cat_int", mode_imp, num_cat_int),
    ("num_bool",    mode_imp, num_bool)
], remainder="drop")


In [6]:
y = train[TARGET].astype("float32")
X = train.drop(columns=ID_COLS + [TARGET])

# Si por alguna razón se han quedado booleanos, pásalos a int
bool_cols = X.select_dtypes("bool").columns
X[bool_cols] = X[bool_cols].astype(int)

In [7]:
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)

In [8]:
class DenseNet(nn.Module):
    """
    Red totalmente densa (feedforward) con:
      - n_layers ocultas
      - hidden_units por capa
      - dropout opcional
    """
    def __init__(self, input_dim, n_layers=2, hidden_units=128, dropout=0.0):
        super().__init__()
        layers = []
        in_dim = input_dim
        for _ in range(n_layers):
            layers += [
                nn.Linear(in_dim, hidden_units),
                nn.ReLU(),
                nn.Dropout(dropout)
            ]
            in_dim = hidden_units
        layers.append(nn.Linear(in_dim, 1))  # salida de logits
        self.net = nn.Sequential(*layers)

    def forward(self, X):
        return self.net(X.float()).squeeze(1)  # logits sin sigmoid

def make_dnn(trial):
    """
    Construye un NeuralNetBinaryClassifier con:
      - BCEWithLogitsLoss
      - EarlyStopping de Skorch
      - Optimizer Adam
    """
    n_layers = trial.suggest_int("pt_layers", 1, 10)
    hid_units = trial.suggest_int("pt_units", 64, 2048, step=32)
    dropout = trial.suggest_float("pt_dropout", 0.0, 0.5)
    lr = trial.suggest_float("pt_lr", 1e-8, 1e-2, log=True)
    batch = trial.suggest_categorical("pt_batch", [16, 32, 64, 128, 256, 512])

    return NeuralNetBinaryClassifier(
        module=DenseNet,
        module__input_dim=len(num_cont) + len(num_cat_int) + len(num_bool),
        module__n_layers=n_layers,
        module__hidden_units=hid_units,
        module__dropout=dropout,
        lr=lr,
        batch_size=batch,
        max_epochs=800,  # tope razonable por fold
        optimizer=torch.optim.Adam,
        criterion=nn.BCEWithLogitsLoss,  # más estable que BCELoss + sigmoid 
        callbacks=[EarlyStopping(patience=15, monitor="valid_loss", lower_is_better=True)],
        device="cuda" if torch.cuda.is_available() else "cpu",
        verbose=0
    )

In [9]:
def make_tabnet(trial):
    """
    Construye un TabNetClassifier. 
    NOTA: los parámetros de early stopping (max_epochs, patience) se pasan al fit(), no al constructor.
    """
    return TabNetClassifier(
        # ───────────────────────────── Arquitectura ────────────────────────────
        n_d=trial.suggest_int("tn_nd", 8, 32, step=8),
        n_a=trial.suggest_int("tn_na", 8, 32, step=8),
        n_steps=trial.suggest_int("tn_steps", 3, 8),
        gamma=trial.suggest_float("tn_gamma", 1.0, 2.0),
        lambda_sparse=trial.suggest_float("tn_sparse", 1e-5, 1e-3, log=True),

        # ─────────────────────── Optimizer y Scheduler ───────────────────────
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=trial.suggest_float("tn_lr", 1e-8, 1e-2, log=True)),
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        scheduler_params=dict(gamma=0.95, step_size=10),

        # ───────────────────────────── Otros ─────────────────────────────
        seed=42,
        verbose=0,
        device_name="cuda" if torch.cuda.is_available() else "cpu",
    )


In [10]:
def make_catboost(trial):
    return CatBoostClassifier(
        iterations=trial.suggest_int("cb_iter", 200, 1000),
        learning_rate=trial.suggest_float("cb_lr", 1e-6, 0.1, log=True),
        depth=trial.suggest_int("cb_depth", 4, 16),
        l2_leaf_reg=trial.suggest_float("cb_l2", 1.0, 10.0, log=True),
        eval_metric="Accuracy",  # CatBoost maneja la métrica internamente
        random_seed=42,
        verbose=False,
    )  # 

In [11]:
def objective(trial):
    # 4.1) Sugerir el modelo a usar (espacio de búsqueda fijo)
    model_name = trial.suggest_categorical(
        "model", ["GradientBoosting", "RandomForest", "XGBoost", "CatBoost", "TabNet", "DNN_PT"]
    )  # :contentReference[oaicite:16]{index=16}

    # 4.2) Sugerir hiperparámetros del modelo ANTES del bucle de CV (espacio estático)
    # ────────────────────────────────────────────────────────────────────────────
    if model_name == "GradientBoosting":
        gb_n_estimators = trial.suggest_int("gb_n_estimators", 100, 800)
        gb_lr = trial.suggest_float("gb_lr", 0.01, 0.2, log=True)
        gb_max_depth = trial.suggest_int("gb_max_depth", 2, 5)
        gb_subsample = trial.suggest_float("gb_subsample", 0.6, 1.0)
        model = GradientBoostingClassifier(
            random_state=42,
            n_estimators=gb_n_estimators,
            learning_rate=gb_lr,
            max_depth=gb_max_depth,
            subsample=gb_subsample
        )

    elif model_name == "RandomForest":
        rf_n_estimators = trial.suggest_int("rf_n_estimators", 200, 1000)
        rf_max_depth = trial.suggest_int("rf_max_depth", 4, 20)
        rf_split = trial.suggest_int("rf_split", 2, 10)
        rf_leaf = trial.suggest_int("rf_leaf", 1, 5)
        rf_feats = trial.suggest_categorical("rf_feats", ["sqrt", "log2", None])
        model = RandomForestClassifier(
            random_state=42, n_jobs=-1,
            n_estimators=rf_n_estimators,
            max_depth=rf_max_depth,
            min_samples_split=rf_split,
            min_samples_leaf=rf_leaf,
            max_features=rf_feats
        )

    elif model_name == "XGBoost":
        xgb_n_estimators = trial.suggest_int("xgb_n_estimators", 300, 1200)
        xgb_lr = trial.suggest_float("xgb_lr", 1e-8, 0.2, log=True)
        xgb_max_depth = trial.suggest_int("xgb_max_depth", 3, 20)
        xgb_subsample = trial.suggest_float("xgb_subsample", 0.6, 1.0)
        xgb_colsample = trial.suggest_float("xgb_colsample", 0.5, 1.0)
        xgb_gamma = trial.suggest_float("xgb_gamma", 0, 10)
        xgb_l2 = trial.suggest_float("xgb_l2", 1e-6, 10, log=True)
        model = XGBClassifier(
            objective="binary:logistic",
            eval_metric="logloss",
            n_jobs=1,
            random_state=42,
            n_estimators=xgb_n_estimators,
            learning_rate=xgb_lr,
            max_depth=xgb_max_depth,
            subsample=xgb_subsample,
            colsample_bytree=xgb_colsample,
            gamma=xgb_gamma,
            reg_lambda=xgb_l2,
            tree_method="hist",
            device="cpu"  # Cambia a "cuda" si tienes GPU
        )

    elif model_name == "CatBoost":
        model = make_catboost(trial)  # CatBoost sugiere internamente sus hiperparámetros

    elif model_name == "TabNet":
        tn_max_epochs = trial.suggest_int("tn_max_epochs", 50, 1000)
        tn_patience = trial.suggest_int("tn_patience", 10, 50)
        model = make_tabnet(trial)

    else:  # DNN_PT
        model = make_dnn(trial)

    # 4.3) Validación cruzada manual (5 folds estratificados)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []

    for train_idx, val_idx in skf.split(X, y):
        X_train_fold = X.iloc[train_idx]
        y_train_fold = y.iloc[train_idx]
        X_val_fold = X.iloc[val_idx]
        y_val_fold = y.iloc[val_idx]

        # Definir pipeline con preprocesamiento + modelo
        pipe = Pipeline([("prep", preprocess), ("clf", model)])

        if model_name == "TabNet":
            # 4.4) Early stopping y evaluación en cada fold para TabNet
            X_val_trans = pipe.named_steps["prep"].transform(X_val_fold)
            pipe.fit(
                X_train_fold, y_train_fold,
                **{
                    "clf__eval_set": [(X_val_trans, y_val_fold.to_numpy())],
                    "clf__eval_name": ["val"],
                    "clf__eval_metric": ["accuracy"],
                    "clf__max_epochs": tn_max_epochs,
                    "clf__patience": tn_patience,
                }
            )
            preds = pipe.predict(X_val_fold)
            score = accuracy_score(y_val_fold, preds)

        elif model_name == "DNN_PT":
            # 4.5) Skorch ya tiene EarlyStopping configurado en make_dnn
            pipe.fit(X_train_fold, y_train_fold)
            preds = pipe.predict(X_val_fold)
            score = accuracy_score(y_val_fold, preds)

        else:
            # 4.6) Modelos de árboles y boosting sin early stopping interno
            pipe.fit(X_train_fold, y_train_fold)
            preds = pipe.predict(X_val_fold)
            score = accuracy_score(y_val_fold, preds)

        fold_scores.append(score)

    return np.mean(fold_scores)


In [None]:
# 7.1) Configurar Optuna para guardar en SQLite (opcional)
# El parámetro "storage" permite recuperar resultados entre sesiones.
study = optuna.create_study(
    direction="maximize", 
    storage="sqlite:///optuna.db", 
    load_if_exists=True, 
    study_name="titanic_optimization_v2"
)

# 7.2) Lanzar la optimización (p. ej. 200 trials para demo; aumenta a 1000+ si tienes tiempo)
study.optimize(objective, n_trials=2000, show_progress_bar=True)

# 7.3) Resultados
print(f"Best trial: {study.best_trial.number}")
print(f"Best value: {study.best_trial.value:.5f}")
print("Best params:")
for key, value in study.best_trial.params.items():
    print(f"  {key}: {value}")

[I 2025-06-05 00:43:20,805] Using an existing study with name 'titanic_optimization_v2' instead of creating a new one.
  0%|          | 0/2000 [00:00<?, ?it/s]

In [None]:
print("Mejor accuracy:", study.best_value)
print("Mejores params:", study.best_params)

# Tabla comparativa por modelo
df_hist = study.trials_dataframe(attrs=("value","params"))
df_hist["model"] = df_hist["params"].apply(lambda d: d["model"])
print(df_hist.groupby("model")["value"].agg(["count","max","mean"]))

Mejor accuracy: 0.8164036180734268
Mejores params: {'model': 'CatBoost', 'cb_iter': 631, 'cb_lr': 0.023321158701556228, 'cb_depth': 7, 'cb_l2': 3.4047591260661614}


KeyError: 'params'

In [None]:
best = study.best_params
model_name = best.pop("model")

if model_name == "GradientBoosting":
    best_model = GradientBoostingClassifier(random_state=42, **{
        k.split("gb_")[1]: v for k, v in best.items()
    })
elif model_name == "RandomForest":
    best_model = RandomForestClassifier(random_state=42, n_jobs=-1, **{
        k.split("rf_")[1]: v for k, v in best.items()
    })
elif model_name == "DNN_PT":
    best_model = NeuralNetClassifier(
        module=DenseNet,
        module__input_dim=X.shape[1],
        module__n_layers=best["pt_layers"],
        module__hidden_units=best["pt_units"],
        module__dropout=best["pt_dropout"],
        max_epochs=50,
        lr=best["pt_lr"],
        batch_size=best["pt_batch"],
        optimizer=torch.optim.Adam,
        criterion=nn.BCELoss,
        callbacks=[EarlyStopping(patience=5, monitor="valid_loss",
                                 lower_is_better=True)],
        device="cuda",
        verbose=0
    )
else:
    best_model = XGBClassifier(
        objective="binary:logistic",
        eval_metric="logloss",
        n_jobs=-1,
        random_state=42,
        **{k.split("xgb_")[1]: v for k, v in best.items()}
    )

final_pipe = Pipeline([("prep", preprocess),
                       ("clf",  best_model)])
final_pipe.fit(X, y)

In [None]:
import matplotlib.pyplot as plt

names_cont = num_cont
names_bool = num_bool
feat_names = names_cont + names_bool

importances = final_pipe["clf"].feature_importances_
pd.Series(importances, index=feat_names).sort_values().tail(15).plot.barh()
plt.title(f"Top features – {model_name}"); plt.show()

In [None]:
X_test = test.drop(columns=ID_COLS)
preds   = (final_pipe.predict_proba(X_test)[:,1] >= 0.5).astype(bool)
pd.DataFrame({"PassengerId": test["PassengerId"],
              "Transported": preds}).to_csv("submission.csv", index=False)