<a href="https://colab.research.google.com/github/noahfavreau/nasa-space-apps-2025/blob/main/model_architecture.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import matplotlib.pyplot as plt

from catboost import CatBoostClassifier, cv, Pool

from pytorch_tabnet.tab_model import TabNetClassifier

import lightgbm as lgb

from sklearn.linear_model import LogisticRegressionCV

import xgboost as xgb

from xgboost import plot_importance, XGBClassifier

from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

import random

import numpy as np

import pickle

import optuna

RANDOM_SEED = 67

In [None]:
def train_catboost(X_train, X_val, y_train, y_val, Fold):
  """Fit a CatBoost multiclass model on one CV fold and score it on the validation split.

  Args:
    X_train, y_train: training features and labels for this fold.
    X_val, y_val: held-out features and labels; used both as the
      early-stopping eval set and to compute the returned accuracy.
    Fold: integer fold index, added to RANDOM_SEED so each fold has its own
      fixed seed.

  Returns:
    Tuple (model, val_preds, accuracy): the fitted CatBoostClassifier, a 1-D
    array of predicted class labels for X_val, and the validation accuracy.

  The function has no side effects: recording the accuracy is left to the
  caller so a fold is never counted twice.
  """
  catboost_model = CatBoostClassifier(
                           iterations=200,
                           task_type="GPU",
                           devices='0',
                           depth=6,
                           loss_function='MultiClass',
                           verbose=0,
                           eval_metric='MultiClass',
                           random_state=RANDOM_SEED + Fold
                          )

  catboost_train_pool = Pool(X_train, y_train)
  catboost_val_pool = Pool(X_val, y_val)

  catboost_model.fit(catboost_train_pool,
                     eval_set=catboost_val_pool,
                     early_stopping_rounds=20,
                     use_best_model=True)

  # For MultiClass, predict() can return the labels as an (n_samples, 1)
  # column; flatten so the result fits a 1-D out-of-fold array.
  catboost_val_preds = np.asarray(catboost_model.predict(X_val)).ravel()

  catboost_accuracy = accuracy_score(y_val, catboost_val_preds)

  return catboost_model, catboost_val_preds, catboost_accuracy

In [None]:
from re import VERBOSE
def train_lgb(X_train, X_val, y_train, y_val, Fold):
  """Fit a LightGBM multiclass booster on one CV fold and score it on the validation split.

  Args:
    X_train, y_train: training features and labels for this fold.
    X_val, y_val: held-out features and labels; used both for early
      stopping and to compute the returned accuracy.
    Fold: integer fold index, added to RANDOM_SEED so each fold has its own
      fixed seed (mirrors the CatBoost and XGBoost trainers).

  Returns:
    Tuple (model, val_preds, accuracy): the trained Booster, a 1-D array of
    predicted class indices for X_val, and the validation accuracy.

  The function has no side effects: recording the accuracy is left to the
  caller so a fold is never counted twice.
  """
  params = {
    'objective' : 'multiclass',
    'num_class' : 3,
    'metric' : "multi_logloss",
    'verbose' : -1,
    'seed' : RANDOM_SEED + Fold
  }

  lgb_train_data = lgb.Dataset(X_train, label=y_train)
  lgb_val_data = lgb.Dataset(X_val, label=y_val, reference=lgb_train_data)

  # Up to 200 boosting rounds, stopped early after 20 rounds without
  # improvement on the validation set.
  lgb_model = lgb.train(params,
                      lgb_train_data,
                      200,
                      callbacks=[lgb.early_stopping(stopping_rounds=20)],
                      valid_sets=[lgb_val_data]
                        )

  # predict() yields one probability column per class; take the most likely.
  lgb_val_preds_proba = lgb_model.predict(X_val,
                           num_iteration=lgb_model.best_iteration)
  lgb_val_preds = np.argmax(lgb_val_preds_proba, axis=1)

  lgb_accuracy = accuracy_score(y_val, lgb_val_preds)

  return lgb_model, lgb_val_preds, lgb_accuracy

In [None]:
def train_xgb(X_train, X_val, y_train, y_val, Fold):
  """Fit an XGBoost multiclass classifier on one CV fold and score it on the validation split.

  Args:
    X_train, y_train: training features and labels for this fold.
    X_val, y_val: held-out features and labels used to compute accuracy.
    Fold: integer fold index, added to RANDOM_SEED so each fold has its own
      fixed seed.

  Returns:
    Tuple (model, val_preds, accuracy): the fitted XGBClassifier, the
    predicted class labels for X_val, and the validation accuracy.

  The function has no side effects: recording the accuracy is left to the
  caller so a fold is never counted twice.
  """
  # The class count is not passed explicitly: `num_classes` is not an XGBoost
  # parameter, and the sklearn wrapper derives the number of classes from
  # the labels seen in fit().
  xgb_model = XGBClassifier(objective='multi:softmax',
                          eval_metric='mlogloss',
                          use_label_encoder=False,
                          max_depth=4,
                          n_estimators=100,
                          random_state=RANDOM_SEED + Fold)

  xgb_model.fit(X_train, y_train)

  xgb_val_preds = xgb_model.predict(X_val)

  xgb_accuracy = accuracy_score(y_val, xgb_val_preds)

  return xgb_model, xgb_val_preds, xgb_accuracy

In [None]:
# Placeholders for the dataset: assign the real feature matrix and label
# vector here. Both are indexed with .iloc below, so they are expected to be
# pandas objects (DataFrame / Series).
X = None
y = None

if X is None or y is None:
  raise ValueError(
      "X and y are still None; load the feature matrix and labels before running cross-validation."
  )

# Per-fold validation accuracies for each model family.
catboost_fold_accuracies = []
xgb_fold_accuracies = []
lgb_fold_accuracies = []

# Out-of-fold predicted labels, one slot per row of X.
catboost_oof_preds = np.zeros(len(X))
xgb_oof_preds = np.zeros(len(X))
lgb_oof_preds = np.zeros(len(X))

# Fitted model of every fold, kept for later inspection or ensembling.
catboost_models = []
lgb_models = []
xgb_models = []

n_splits = 10
# Stratify so each fold keeps the class proportions of y, matching the
# splitter used by the hyperparameter-tuning objectives.
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)

for Fold, (train_index, test_index) in enumerate(kf.split(X, y)):
  X_train, X_val = X.iloc[train_index], X.iloc[test_index]
  y_train, y_val = y.iloc[train_index], y.iloc[test_index]

  cb_model, cb_preds, cb_acc = train_catboost(X_train, X_val, y_train, y_val, Fold)
  # Flatten first: the predictions may arrive as an (n, 1) column, which
  # cannot be assigned into the 1-D out-of-fold array.
  catboost_oof_preds[test_index] = np.ravel(cb_preds)
  catboost_fold_accuracies.append(cb_acc)
  catboost_models.append(cb_model)
  print(f"CatBoost Accuracy: {cb_acc:.4f}")

  xgb_model, xgb_preds, xgb_acc = train_xgb(X_train, X_val, y_train, y_val, Fold)
  xgb_oof_preds[test_index] = xgb_preds
  xgb_fold_accuracies.append(xgb_acc)
  xgb_models.append(xgb_model)
  print(f"XGBoost Accuracy: {xgb_acc:.4f}")

  lgb_model, lgb_preds, lgb_acc = train_lgb(X_train, X_val, y_train, y_val, Fold)
  lgb_oof_preds[test_index] = lgb_preds
  lgb_fold_accuracies.append(lgb_acc)
  lgb_models.append(lgb_model)
  print(f"LGBoost Accuracy: {lgb_acc:.4f}")

# Summary: mean and population standard deviation of the fold accuracies.
catboost_mean_accuracy = np.mean(catboost_fold_accuracies)
catboost_std_accuracy = np.std(catboost_fold_accuracies)
print(f"CatBoost : {catboost_mean_accuracy:.4f} (+/- {catboost_std_accuracy:.4f})")

lgb_mean_accuracy = np.mean(lgb_fold_accuracies)
lgb_std_accuracy = np.std(lgb_fold_accuracies)
print(f"LGBoost : {lgb_mean_accuracy:.4f} (+/- {lgb_std_accuracy:.4f})")

xgb_mean_accuracy = np.mean(xgb_fold_accuracies)
xgb_std_accuracy = np.std(xgb_fold_accuracies)
print(f"XGBoost : {xgb_mean_accuracy:.4f} (+/- {xgb_std_accuracy:.4f})")

In [None]:
def objective_catboost(trial):
    """Optuna objective: mean 5-fold stratified CV accuracy of a CatBoost model.

    Samples iterations, depth, learning_rate and l2_leaf_reg from `trial`,
    trains one GPU CatBoost classifier per fold of the module-level X / y
    (with early stopping on the validation fold) and returns the average
    validation accuracy.
    """
    search_space = {
        "iterations": trial.suggest_int("iterations", 100, 500),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "task_type": "GPU",
        "devices": "0",
        "loss_function": "MultiClass",
        "random_seed": RANDOM_SEED
    }

    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    fold_scores = []

    for fit_rows, holdout_rows in splitter.split(X, y):
        features_fit = X.iloc[fit_rows]
        features_holdout = X.iloc[holdout_rows]
        target_fit = y.iloc[fit_rows]
        target_holdout = y.iloc[holdout_rows]

        classifier = CatBoostClassifier(**search_space, verbose=0)
        classifier.fit(
            features_fit,
            target_fit,
            eval_set=(features_holdout, target_holdout),
            early_stopping_rounds=20,
            verbose=0,
        )

        holdout_labels = classifier.predict(features_holdout)
        fold_scores.append(accuracy_score(target_holdout, holdout_labels))

    return np.mean(fold_scores)

# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# between runs; TPE is Optuna's default sampler, so only the seed is new.
study_cat = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
)
study_cat.optimize(objective_catboost, n_trials=30)
print("Best CatBoost params:", study_cat.best_params)


In [None]:
def objective_lgb(trial):
    """Optuna objective: mean 5-fold stratified CV accuracy of a LightGBM model.

    Samples learning_rate, num_leaves, max_depth, min_child_samples,
    subsample and colsample_bytree from `trial`, trains one booster per fold
    of the module-level X / y with early stopping on the validation fold,
    and returns the average validation accuracy.
    """
    params = {
        "objective": "multiclass",
        "num_class": 3,
        "metric": "multi_logloss",
        "verbose": -1,
        "seed": RANDOM_SEED,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 16, 128),
        "max_depth": trial.suggest_int("max_depth", -1, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        # Row subsampling is only applied when a bagging frequency is set;
        # without it the sampled `subsample` value would have no effect.
        "subsample_freq": 1,
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0)
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    accuracies = []

    for train_idx, valid_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[valid_idx]

        train_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

        # Early stopping goes through the callback API, as in train_lgb; the
        # `early_stopping_rounds` / `verbose_eval` keyword arguments are not
        # accepted by recent LightGBM releases.
        model = lgb.train(
            params,
            train_data,
            valid_sets=[val_data],
            callbacks=[lgb.early_stopping(stopping_rounds=20, verbose=False)],
        )

        # One probability column per class; pick the most likely class.
        proba = model.predict(X_val, num_iteration=model.best_iteration)
        preds = np.argmax(proba, axis=1)
        accuracies.append(accuracy_score(y_val, preds))

    return np.mean(accuracies)

# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# between runs; TPE is Optuna's default sampler, so only the seed is new.
study_lgb = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
)
study_lgb.optimize(objective_lgb, n_trials=30)
print("Best LightGBM params:", study_lgb.best_params)


In [None]:
def objective_xgb(trial):
    """Optuna objective: mean 5-fold stratified CV accuracy of an XGBoost model.

    Samples learning_rate, max_depth, subsample, colsample_bytree and
    n_estimators from `trial`, fits one XGBClassifier per fold of the
    module-level X / y and returns the average validation accuracy.
    """
    xgb_params = dict(
        objective="multi:softmax",
        num_class=3,
        eval_metric="mlogloss",
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        max_depth=trial.suggest_int("max_depth", 3, 10),
        subsample=trial.suggest_float("subsample", 0.5, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
        n_estimators=trial.suggest_int("n_estimators", 50, 300),
        random_state=RANDOM_SEED,
    )

    def score_fold(fit_rows, holdout_rows):
        # Train on one fold's training rows and return accuracy on its holdout rows.
        classifier = XGBClassifier(**xgb_params, use_label_encoder=False)
        classifier.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        holdout_labels = classifier.predict(X.iloc[holdout_rows])
        return accuracy_score(y.iloc[holdout_rows], holdout_labels)

    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
    fold_scores = [
        score_fold(fit_rows, holdout_rows)
        for fit_rows, holdout_rows in splitter.split(X, y)
    ]

    return np.mean(fold_scores)

# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# between runs; TPE is Optuna's default sampler, so only the seed is new.
study_xgb = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
)
study_xgb.optimize(objective_xgb, n_trials=30)
print("Best XGBoost params:", study_xgb.best_params)
