<a href="https://colab.research.google.com/github/noahfavreau/nasa-space-apps-2025/blob/main/model_architecture.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pickle
import random

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import torch
import xgboost as xgb
from catboost import CatBoostClassifier, cv, Pool
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from xgboost import plot_importance, XGBClassifier

RANDOM_SEED = 67

In [None]:
def train_catboost(X_train, X_val, y_train, y_val, params, fold):
  """Fit a GPU CatBoost multiclass model on one fold and score it on the validation split.

  Args:
    X_train, y_train: training features and labels for this fold.
    X_val, y_val: validation features and labels; also used as the
      early-stopping eval set.
    params: CatBoost hyperparameters (e.g. Optuna's ``best_params``).
      The dict is copied, so the caller's object is left untouched.
    fold: fold index; added to ``RANDOM_SEED`` so each fold gets its own seed.

  Returns:
    (model, predictions, accuracy) where ``predictions`` is a 1-D array of
    predicted class labels for ``X_val``.
  """
  # Work on a copy: updating the caller's dict in place would leak the
  # fold-specific settings into whatever object the caller passed in.
  fold_params = dict(params)
  fold_params.update({
      'task_type': "GPU",
      'devices' : '0',
      'loss_function' : 'MultiClass',
      'eval_metric' : 'MultiClass',
      'random_state' : RANDOM_SEED + fold,
      'verbose' : 0
  })

  model = CatBoostClassifier(**fold_params)

  train_pool = Pool(X_train, y_train)
  val_pool = Pool(X_val, y_val)

  model.fit(train_pool, eval_set=val_pool,
            early_stopping_rounds=20,
            use_best_model=True)

  # CatBoost can return multiclass labels as an (n_samples, 1) column;
  # flatten so the result can be written into a 1-D out-of-fold buffer.
  predictions = np.asarray(model.predict(X_val)).ravel()
  accuracy = accuracy_score(y_val, predictions)

  return model, predictions, accuracy

In [None]:
def train_lgb(X_train, X_val, y_train, y_val, params, fold):
  """Train a 3-class LightGBM booster on one fold and score it on the validation split.

  Args:
    X_train, y_train: training features and labels for this fold.
    X_val, y_val: validation features and labels; also used for early stopping.
    params: LightGBM hyperparameters (e.g. Optuna's ``best_params``).
      The dict is copied, so the caller's object is left untouched.
    fold: fold index; added to ``RANDOM_SEED`` so each fold gets its own seed.

  Returns:
    (model, predictions, accuracy) where ``predictions`` is the argmax class
    index per validation row.
  """
  # Copy instead of mutating the caller's dict.
  fold_params = dict(params)
  fold_params.update({
    'objective' : 'multiclass',
    'num_class' : 3,
    'metric' : "multi_logloss",
    'verbose' : -1,
    'seed' : RANDOM_SEED + fold
  })

  train_data = lgb.Dataset(X_train, label=y_train)
  val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

  model = lgb.train(fold_params,
                    train_data,
                    num_boost_round=200,
                    callbacks=[lgb.early_stopping(stopping_rounds=20)],
                    valid_sets=[val_data])

  # Use THIS booster's best iteration. The previous code read
  # `lgb_model.best_iteration`, a name that is not defined in this function
  # (NameError, or a stale model from an earlier fold if a global existed).
  preds_proba = model.predict(X_val,
                              num_iteration=model.best_iteration)

  predictions = np.argmax(preds_proba, axis=1)
  accuracy = accuracy_score(y_val, predictions)

  return model, predictions, accuracy

In [None]:
def train_xgb(X_train, X_val, y_train, y_val, params, fold):
  """Train a 3-class XGBoost classifier on one fold and score it on the validation split.

  Args:
    X_train, y_train: training features and labels for this fold.
    X_val, y_val: validation features and labels; also used for early stopping.
    params: XGBoost hyperparameters (e.g. Optuna's ``best_params``).
      The dict is copied, so the caller's object is left untouched.
    fold: fold index; added to ``RANDOM_SEED`` so each fold gets its own seed.

  Returns:
    (xgb_model, predictions, accuracy) for the validation split.
  """
  # Copy instead of mutating the caller's dict.
  fold_params = dict(params)
  fold_params.update({
      'objective' : 'multi:softmax',
      'num_class': 3,
      'eval_metric' : 'mlogloss',
      # Early stopping is configured on the estimator: passing
      # `early_stopping_rounds` to fit() was deprecated in XGBoost 1.6 and
      # removed in 2.0. The obsolete `use_label_encoder` flag is dropped for
      # the same reason (it is no longer a recognised parameter).
      'early_stopping_rounds' : 20,
      'random_state' : RANDOM_SEED + fold,
      'verbosity' : 0
  })

  xgb_model = XGBClassifier(**fold_params)

  xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

  predictions = xgb_model.predict(X_val)
  accuracy = accuracy_score(y_val, predictions)

  return xgb_model, predictions, accuracy

In [None]:
def train_tabnet(X_train, X_val, y_train, y_val, params, fold):
  """Train a TabNet classifier on one fold and score it on the validation split.

  Args:
    X_train, y_train: training features and labels; ``.values`` is read from
      each, so they must expose that attribute (e.g. pandas objects).
    X_val, y_val: validation features and labels, same requirement.
    params: TabNet hyperparameters. Accepts either ready-to-use TabNet kwargs
      or the flat form Optuna's ``best_params`` produces, where
      ``optimizer_fn`` is the string ``"adam"``/``"adamw"`` and ``lr`` is a
      top-level key. The dict is copied, so the caller's object is untouched.
    fold: fold index; added to ``RANDOM_SEED`` so each fold gets its own seed.

  Returns:
    (tabnet_model, predictions, accuracy) for the validation split.

  Raises:
    ValueError: if ``optimizer_fn`` is a string other than "adam"/"adamw".
  """
  # Copy instead of mutating the caller's dict.
  fold_params = dict(params)
  fold_params.update({
      'seed' : RANDOM_SEED + fold,
      'verbose' : 0
  })

  # The tuning objective samples the optimizer by name; TabNet needs the
  # torch optimizer class itself.
  optimizer_fn = fold_params.get('optimizer_fn')
  if isinstance(optimizer_fn, str):
    optimizers = {'adam': torch.optim.Adam, 'adamw': torch.optim.AdamW}
    key = optimizer_fn.lower()
    if key not in optimizers:
      raise ValueError(f"Unsupported optimizer_fn: {optimizer_fn!r}")
    fold_params['optimizer_fn'] = optimizers[key]

  # A flat `lr` is not a TabNetClassifier argument; fold it into
  # `optimizer_params` (an explicitly nested `lr` takes precedence).
  if 'lr' in fold_params:
    optimizer_params = dict(fold_params.get('optimizer_params') or {})
    optimizer_params.setdefault('lr', fold_params.pop('lr'))
    fold_params['optimizer_params'] = optimizer_params

  tabnet_model = TabNetClassifier(**fold_params)

  tabnet_model.fit(X_train.values, y_train.values,
                   eval_set=[(X_val.values, y_val.values)],
                   eval_metric=["accuracy"],
                   max_epochs=200,
                   patience=20,
                   batch_size=1024,
                   virtual_batch_size=128
                   )

  predictions = tabnet_model.predict(X_val.values)
  accuracy = accuracy_score(y_val, predictions)

  return tabnet_model, predictions, accuracy


In [None]:
def objective_catboost(trial):
    """Optuna objective: mean 5-fold stratified CV accuracy of a GPU CatBoost model.

    Reads the module-level ``X`` and ``y`` (indexed with ``.iloc``) and
    ``RANDOM_SEED``. Samples iterations, depth, learning rate and L2 leaf
    regularisation from ``trial``.

    Returns:
        The mean validation accuracy across the five folds.
    """
    search_space = {
        "iterations": trial.suggest_int("iterations", 100, 500),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "task_type": "GPU",
        "devices": "0",
        "loss_function": "MultiClass",
        "random_seed": RANDOM_SEED
    }

    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

    def score_fold(fit_rows, eval_rows):
        # Fit on one fold's training rows, return accuracy on its held-out rows.
        features_fit, features_eval = X.iloc[fit_rows], X.iloc[eval_rows]
        labels_fit, labels_eval = y.iloc[fit_rows], y.iloc[eval_rows]

        classifier = CatBoostClassifier(**search_space, verbose=0)
        classifier.fit(
            features_fit,
            labels_fit,
            eval_set=(features_eval, labels_eval),
            early_stopping_rounds=20,
            verbose=0,
        )
        return accuracy_score(labels_eval, classifier.predict(features_eval))

    fold_scores = [
        score_fold(fit_rows, eval_rows)
        for fit_rows, eval_rows in splitter.split(X, y)
    ]
    return np.mean(fold_scores)


In [None]:
def objective_lgb(trial):
    """Optuna objective: mean 5-fold stratified CV accuracy of a 3-class LightGBM model.

    Reads the module-level ``X`` and ``y`` (indexed with ``.iloc``) and
    ``RANDOM_SEED``.

    Returns:
        The mean validation accuracy across the five folds.
    """
    params = {
        "objective": "multiclass",
        "num_class": 3,
        "metric": "multi_logloss",
        # Fixed (non-tuned) settings: silence training logs and seed the
        # booster so trials are comparable and reproducible.
        "verbose": -1,
        "seed": RANDOM_SEED,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 16, 128),
        "max_depth": trial.suggest_int("max_depth", -1, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
        # NOTE(review): per the LightGBM docs, row subsampling only takes
        # effect when a bagging frequency is also set - confirm whether
        # `subsample_freq` should be added alongside this.
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0)
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    accuracies = []

    for train_idx, valid_idx in skf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[valid_idx]

        train_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

        # Early stopping goes through the callback API: the
        # `early_stopping_rounds=` / `verbose_eval=` keyword arguments of
        # lgb.train were removed in LightGBM 4.0.
        model = lgb.train(
            params,
            train_data,
            valid_sets=[val_data],
            callbacks=[lgb.early_stopping(stopping_rounds=20, verbose=False)],
        )

        preds = np.argmax(model.predict(X_val), axis=1)
        acc = accuracy_score(y_val, preds)
        accuracies.append(acc)

    return np.mean(accuracies)


In [None]:
def objective_xgb(trial):
    """Optuna objective: mean 5-fold stratified CV accuracy of a 3-class XGBoost model.

    Reads the module-level ``X`` and ``y`` (indexed with ``.iloc``) and
    ``RANDOM_SEED``. No early stopping is used here; the number of trees is
    itself a tuned parameter.

    Returns:
        The mean validation accuracy across the five folds.
    """
    params = {
        "objective": "multi:softmax",
        "num_class": 3,
        "eval_metric": "mlogloss",
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "random_state": RANDOM_SEED
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    accuracies = []

    for train_idx, valid_idx in skf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[valid_idx]

        # `use_label_encoder` is intentionally not passed: the flag was
        # deprecated and then removed from XGBoost, where it is only
        # reported as an unused parameter.
        model = XGBClassifier(**params)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        acc = accuracy_score(y_val, preds)
        accuracies.append(acc)

    return np.mean(accuracies)


In [None]:
def objective_tabnet(trial):
    """Optuna objective: mean 5-fold stratified CV accuracy of a TabNet classifier.

    Reads the module-level ``X`` and ``y`` (indexed with ``.iloc``, then
    converted with ``.values``) and ``RANDOM_SEED``.

    The optimizer is sampled by name so that ``study.best_params`` stays a
    plain, serialisable dict; the name is mapped to the torch optimizer class
    before being handed to TabNet.

    Returns:
        The mean validation accuracy across the five folds.
    """
    # TabNet calls `optimizer_fn` as a torch optimizer class; passing the
    # sampled string straight through fails when the optimizer is built.
    optimizers = {"adam": torch.optim.Adam, "adamw": torch.optim.AdamW}

    params = {
        "n_d": trial.suggest_int("n_d", 8, 64, step=8),
        "n_a": trial.suggest_int("n_a", 8, 64, step=8),
        "n_steps": trial.suggest_int("n_steps", 3, 10),
        "gamma": trial.suggest_float("gamma", 1.0, 2.0, step=0.1),
        "lambda_sparse": trial.suggest_float("lambda_sparse", 1e-6, 1e-3, log=True),
        "optimizer_fn": optimizers[
            trial.suggest_categorical("optimizer_fn", ["adam", "adamw"])
        ],
        "optimizer_params": dict(
            lr=trial.suggest_float("lr", 1e-4, 1e-2, log=True)
        ),
        "momentum": trial.suggest_float("momentum", 0.01, 0.4),
        "seed": RANDOM_SEED,
        "verbose": 0
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    accuracies = []

    for train_idx, valid_idx in skf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[valid_idx]

        model = TabNetClassifier(**params)

        model.fit(
            X_train.values, y_train.values,
            eval_set=[(X_val.values, y_val.values)],
            eval_metric=["accuracy"],
            max_epochs=200,
            patience=20,
            batch_size=1024,
            virtual_batch_size=128
        )

        preds = model.predict(X_val.values)
        acc = accuracy_score(y_val, preds)
        accuracies.append(acc)

    return np.mean(accuracies)

In [None]:
# TODO: replace these placeholders with the real data before running.
# The objectives and the loop below index with `.iloc`, so X is expected to be
# a pandas DataFrame of features and y a pandas Series of class labels.
X = None
y = None

# Fail early with a clear message instead of an opaque TypeError from len(None).
if X is None or y is None:
    raise ValueError(
        "X and y are still placeholders; load the feature matrix and labels "
        "before running hyperparameter tuning and cross-validation."
    )

catboost_fold_accuracies = []
xgb_fold_accuracies = []
lgb_fold_accuracies = []
tabnet_fold_accuracies = []

# Out-of-fold predicted labels, one slot per row of X.
catboost_oof_preds = np.zeros(len(X))
xgb_oof_preds = np.zeros(len(X))
lgb_oof_preds = np.zeros(len(X))
tabnet_oof_preds = np.zeros(len(X))

# --- Hyperparameter search -------------------------------------------------
# Each study gets a seeded sampler so the search is reproducible.
study_cat = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
)
study_cat.optimize(objective_catboost, n_trials=30)
print("Best CatBoost params:", study_cat.best_params)

study_lgb = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
)
study_lgb.optimize(objective_lgb, n_trials=30)
print("Best LightGBM params:", study_lgb.best_params)

study_xgb = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
)
study_xgb.optimize(objective_xgb, n_trials=30)
print("Best XGBoost params:", study_xgb.best_params)

study_tabnet = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
)
study_tabnet.optimize(objective_tabnet, n_trials=30)
print("Best Tabnet params:", study_tabnet.best_params)

# --- Final 10-fold evaluation ----------------------------------------------
# Stratified folds keep class proportions stable per fold, matching the
# splitter used during tuning (plain KFold ignored y entirely).
# NOTE(review): the hyperparameters were tuned on this same X/y, so the
# accuracies below are optimistic; a held-out test set would give an
# unbiased estimate.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)

for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Train CatBoost with best params
    cb_model, cb_preds, cb_acc = train_catboost(
        X_train, X_val, y_train, y_val,
        study_cat.best_params,  # From Optuna
        fold_idx
    )
    catboost_fold_accuracies.append(cb_acc)
    # Flatten: predictions may arrive as an (n, 1) column, which cannot be
    # assigned into the 1-D out-of-fold buffer.
    catboost_oof_preds[val_idx] = np.asarray(cb_preds).ravel()
    print(f"CatBoost Fold {fold_idx}: {cb_acc:.4f}")

    lgb_model, lgb_preds, lgb_acc = train_lgb(
        X_train, X_val, y_train, y_val,
        study_lgb.best_params,
        fold_idx
    )
    lgb_fold_accuracies.append(lgb_acc)
    lgb_oof_preds[val_idx] = np.asarray(lgb_preds).ravel()
    print(f"LightGBM Fold {fold_idx}: {lgb_acc:.4f}")

    xgb_model, xgb_preds, xgb_acc = train_xgb(
        X_train, X_val, y_train, y_val,
        study_xgb.best_params,
        fold_idx
    )
    xgb_fold_accuracies.append(xgb_acc)
    xgb_oof_preds[val_idx] = np.asarray(xgb_preds).ravel()
    print(f"XGBoost Fold {fold_idx}: {xgb_acc:.4f}")

    tabnet_model, tabnet_preds, tabnet_acc = train_tabnet(
        X_train, X_val, y_train, y_val,
        study_tabnet.best_params,
        fold_idx
    )
    tabnet_fold_accuracies.append(tabnet_acc)
    tabnet_oof_preds[val_idx] = np.asarray(tabnet_preds).ravel()
    print(f"TabNet Fold {fold_idx}: {tabnet_acc:.4f}")


# --- Summary: mean and standard deviation of fold accuracies per model ------
catboost_mean_accuracy = np.mean(catboost_fold_accuracies)
catboost_std_accuracy = np.std(catboost_fold_accuracies)
print(f"CatBoost : {catboost_mean_accuracy:.4f} (+/- {catboost_std_accuracy:.4f})")

lgb_mean_accuracy = np.mean(lgb_fold_accuracies)
lgb_std_accuracy = np.std(lgb_fold_accuracies)
print(f"LGBoost : {lgb_mean_accuracy:.4f} (+/- {lgb_std_accuracy:.4f})")

xgb_mean_accuracy = np.mean(xgb_fold_accuracies)
xgb_std_accuracy = np.std(xgb_fold_accuracies)
print(f"XGBoost : {xgb_mean_accuracy:.4f} (+/- {xgb_std_accuracy:.4f})")

tabnet_mean_accuracy = np.mean(tabnet_fold_accuracies)
tabnet_std_accuracy = np.std(tabnet_fold_accuracies)
print(f"Tabnet : {tabnet_mean_accuracy:.4f} (+/- {tabnet_std_accuracy:.4f})")