In [None]:
# Installing packages and setting up environment

!pip install -q --upgrade pip
!pip install -q scikit-learn==1.6.1
!pip install -q xgboost lightgbm catboost
!pip install -q mlxtend

In [None]:
# Importing libraries

import warnings
warnings.filterwarnings('ignore')

import os
import pandas as pd
import numpy as np
from pathlib import Path


from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.calibration import CalibratedClassifierCV
from joblib import Parallel, delayed

# Boosted libraries

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [None]:
# Designating pathnames

# Input CSVs (labelled training set, unlabelled test set)
TRAIN_PATH, TEST_PATH = (
    "/content/aluminum_coldRoll_train.csv",
    "/content/aluminum_coldRoll_testNoY.csv",
)

# Destination of the submission CSV written by the last cell
OUTPUT_PATH = "/content/Team17_Submission.csv"

In [None]:
# Loading .csv files
# Read through the TRAIN_PATH / TEST_PATH constants so each location is
# defined in exactly one place.

df_train = pd.read_csv(TRAIN_PATH)
df_test = pd.read_csv(TEST_PATH)

In [None]:
# Identify categorical and numeric columns/predictors

TARGET = "y_passXtremeDurability"
ID_col = 'ID'

# Features are every training column except the target. Copies are taken so
# df_train / df_test themselves are never modified by later cells.
X = df_train.drop(columns=[TARGET]).copy()
y = df_train[TARGET].copy()
X_test = df_test.copy()

cat_cols = X.select_dtypes(include=[object, 'category']).columns.tolist()

# Keep the identifier column out of the numeric predictors: it is presumably a
# row key rather than a process measurement (TODO confirm), and as a feature it
# only invites overfitting. X / X_test still carry the column because
# X_test[ID_col] is needed when the submission is built.
# NOTE(review): CatBoost is fitted on X directly and therefore still sees ID_col.
num_cols = [
    col for col in X.select_dtypes(include=[np.number]).columns
    if col != ID_col
]

In [None]:
# Setting up OneHotEncoder for tree boosters

# Imputers used by the ColumnTransformer below. They are not defined anywhere
# else in the notebook, so without these two lines a fresh kernel fails with
# NameError.
# NOTE(review): median / most-frequent are assumed strategies -- confirm they
# match what the original runs used.
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# handle_unknown='ignore' encodes categories unseen during fit as all-zero rows;
# sparse_output=False yields a dense array.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Numeric columns are imputed only; categorical columns are imputed and then
# one-hot encoded. Any column in neither list is dropped.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_imputer, num_cols),
    ('cat', Pipeline([('imp', cat_imputer), ('ohe', ohe)]), cat_cols)
], remainder='drop')

preprocessor.fit(X)

# Transforming datasets for models that need OneHotEncoder for categorical variables

X_ohe = preprocessor.transform(X)
X_test_ohe = preprocessor.transform(X_test)

In [None]:
# Getting out of fold train predictions and test predictions averaged over folds
def get_oof_predictions(clf, X, y, X_test, n_splits=10, random_state=42, use_proba=True, fit_params=None):
    """Compute out-of-fold train predictions and fold-averaged test predictions.

    The data is split with a shuffled ``StratifiedKFold``. For every fold the
    same ``clf`` object is refitted on the training part, scores the held-out
    part (written into the OOF vector) and scores the full ``X_test``.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and at least one of ``predict_proba``,
        ``decision_function`` or ``predict``. It is fitted in place.
    X, y : training features / labels. NumPy arrays or pandas objects; pandas
        inputs are indexed positionally.
    X_test : features to score with every fold's model.
    n_splits : number of folds.
    random_state : seed for the fold shuffling.
    use_proba : if True and ``clf`` has ``predict_proba``, use the second
        probability column; otherwise fall back to ``decision_function`` and
        finally to ``predict``.
    fit_params : optional dict of extra keyword arguments for ``clf.fit``.

    Returns
    -------
    (oof_train, oof_test_mean) : arrays of length ``X.shape[0]`` and
        ``X_test.shape[0]``; the latter is the mean over the fold models.
    """

    def _take_rows(data, idx):
        # pandas objects need positional .iloc; everything else is indexed
        # directly with the integer index array.
        return data.iloc[idx] if hasattr(data, 'iloc') else data[idx]

    def _score(data):
        # Same preference order for validation and test data: probabilities,
        # then decision scores, then hard predictions.
        if use_proba and hasattr(clf, 'predict_proba'):
            return clf.predict_proba(data)[:, 1]
        if hasattr(clf, 'decision_function'):
            return clf.decision_function(data)
        return clf.predict(data)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof_train = np.zeros(X.shape[0])
    oof_test = np.zeros((X_test.shape[0], n_splits))  # one column per fold model
    fit_kwargs = fit_params or {}

    for i, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        # Fitting
        clf.fit(_take_rows(X, train_idx), _take_rows(y, train_idx), **fit_kwargs)

        # Predicting
        oof_train[val_idx] = _score(_take_rows(X, val_idx))
        oof_test[:, i] = _score(X_test)

        print(f"Fold {i+1}/{n_splits} done.")

    # Averaging test predictions
    oof_test_mean = oof_test.mean(axis=1)

    return oof_train, oof_test_mean

In [None]:
# Designating parameters for models

# Positional indices of the categorical columns within X.
# NOTE(review): not referenced by the cells shown in this notebook (CatBoost is
# given the column names instead); kept in case something else relies on it.
cat_features_idx = [X.columns.get_loc(c) for c in cat_cols] if len(cat_cols) > 0 else []

cat_params = dict(
    iterations=1200,
    learning_rate=0.03,
    depth=8,
    eval_metric='AUC',
    random_seed=42,
    verbose=0,
    early_stopping_rounds=100,
)

lgbm_params = dict(
    n_estimators=1200,
    learning_rate=0.03,
    num_leaves=64,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight='balanced',
    random_state=42,
)

# `use_label_encoder` is intentionally not passed: the option is deprecated in
# XGBoost and has no effect in current releases, where it only produces an
# "unused parameter" warning.
xgb_params = dict(
    n_estimators=1200,
    learning_rate=0.03,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    random_state=42,
    tree_method='hist',
)

rf_params = dict(
    n_estimators=500,
    max_depth=12,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=42,
)


# Initializing models

cat_model = CatBoostClassifier(**cat_params)
lgbm_model = LGBMClassifier(**lgbm_params)
xgb_model = XGBClassifier(**xgb_params)
rf_model = RandomForestClassifier(**rf_params)

In [None]:
# Implementation

def fit(self, X, y, **kwargs):
  """Fit the wrapped ``self.model`` on ``(X, y)``, forwarding extra keyword args.

  Returns ``self`` so the call can be chained, following the scikit-learn
  ``fit`` convention.

  NOTE(review): this takes ``self`` but sits at module level with no enclosing
  class in view; it looks like a leftover from a wrapper class. Confirm whether
  it is still needed.
  """
  self.model.fit(X, y, **kwargs)
  return self

def predict_proba(self, X):
  """Return ``self.model.predict_proba(X)`` unchanged."""
  wrapped = self.model
  probabilities = wrapped.predict_proba(X)
  return probabilities

def get_oof_catboost(cat_model, X_df, y, X_test_df, n_splits=10, random_state=42, cat_features=None):
    """Out-of-fold predictions for a CatBoost model fitted on raw DataFrames.

    Parameters
    ----------
    cat_model : CatBoost classifier; refitted in place on every fold.
    X_df, y : training features (DataFrame) and labels (Series), indexed
        positionally with ``.iloc``.
    X_test_df : DataFrame scored by every fold's model.
    n_splits : number of stratified folds.
    random_state : seed for the fold shuffling.
    cat_features : column names to treat as categorical. Defaults to the
        object / category dtype columns of ``X_df``, so the function no longer
        depends on a module-level ``cat_cols`` variable.

    Returns
    -------
    (oof, oof_test_mean) : positive-class probabilities for every training row
        (each predicted by the fold model that did not train on it) and the
        test probabilities averaged over the fold models.

    NOTE(review): each fold's ``eval_set`` is the same validation fold whose
    predictions are stored in ``oof``. If the model has early stopping
    configured, the stopping point is tuned on those rows, so the OOF AUC can be
    optimistic. Consider an inner validation split if that matters.
    """
    if cat_features is None:
        cat_features = X_df.select_dtypes(include=[object, 'category']).columns.tolist()

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof = np.zeros(X_df.shape[0])
    oof_test = np.zeros((X_test_df.shape[0], n_splits))  # one column per fold model

    for i, (train_idx, val_idx) in enumerate(skf.split(X_df, y)):
        X_tr, y_tr = X_df.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X_df.iloc[val_idx], y.iloc[val_idx]

        cat_model.fit(X_tr, y_tr, cat_features=cat_features, eval_set=(X_val, y_val))
        oof[val_idx] = cat_model.predict_proba(X_val)[:, 1]
        oof_test[:, i] = cat_model.predict_proba(X_test_df)[:, 1]

    return oof, oof_test.mean(axis=1)

In [None]:
# Setting up predictions from base models
# The encoded feature matrices and the target are copied into plain NumPy
# arrays for the fold helper.

X_ohe_arr, X_test_ohe_arr, y_arr = (
    np.array(source) for source in (X_ohe, X_test_ohe, y)
)

In [None]:
# Run LGBM, XGB and RandomForest OOF (in that order) on the one-hot encoded data

_ohe_oof = {}
for _name, _model in (("lgbm", lgbm_model), ("xgb", xgb_model), ("rf", rf_model)):
    _ohe_oof[_name] = get_oof_predictions(
        _model, X_ohe_arr, y_arr, X_test_ohe_arr, n_splits=10
    )
    # OOF ROC AUC of this base model
    print(roc_auc_score(y_arr, _ohe_oof[_name][0]))

# Expose the per-model results under the names used by the stacking cell
lgbm_oof_train, lgbm_oof_test = _ohe_oof["lgbm"]
xgb_oof_train, xgb_oof_test = _ohe_oof["xgb"]
rf_oof_train, rf_oof_test = _ohe_oof["rf"]


In [None]:
# Run CatBoost OOF
# CatBoost is given the raw DataFrames rather than the one-hot encoded arrays.

cat_oof_train, cat_oof_test = get_oof_catboost(
    cat_model, X, y, X_test, n_splits=10
)
cat_oof_auc = roc_auc_score(y, cat_oof_train)
print(cat_oof_auc)

In [None]:
# Stacking dataset: one column per base model, ordered cat, lgbm, xgb, rf

_base_oof_train = (cat_oof_train, lgbm_oof_train, xgb_oof_train, rf_oof_train)
_base_oof_test = (cat_oof_test, lgbm_oof_test, xgb_oof_test, rf_oof_test)

# vstack gives (n_models, n_rows); transpose to (n_rows, n_models)
stack_train = np.vstack(_base_oof_train).T
stack_test = np.vstack(_base_oof_test).T

In [None]:
# Calibrating meta learner

# Plain logistic-regression meta learner on the stacked base-model predictions.
# Its AUC below is measured on the same rows it was fitted on.
meta_clf = LogisticRegression(max_iter=2000)
meta_clf.fit(stack_train, y)
meta_train_pred = meta_clf.predict_proba(stack_train)[:, 1]
meta_train_auc = roc_auc_score(y, meta_train_pred)
print('Meta-train AUC:', meta_train_auc)

# Isotonic-calibrated variant of the same meta learner (3-fold internal CV)
calibrator = CalibratedClassifierCV(
    estimator=LogisticRegression(max_iter=2000),
    method='isotonic',
    cv=3,
)
calibrator.fit(stack_train, y)

meta_calibrated_pred = calibrator.predict_proba(stack_train)[:, 1]
meta_calibrated_auc = roc_auc_score(y, meta_calibrated_pred)
print('Meta (calibrated) train AUC:', meta_calibrated_auc)

# Use calibrated meta predictions on test, kept strictly inside (0, 1)

_PROBA_EPS = 1e-6
meta_test_pred = np.clip(
    calibrator.predict_proba(stack_test)[:, 1],
    _PROBA_EPS,
    1 - _PROBA_EPS,
)

In [None]:
# Build the submission: one row per test ID with the calibrated probability.
# Column names come from ID_col / TARGET so they cannot drift from the
# constants used for the rest of the notebook.
submission = pd.DataFrame({
    ID_col: X_test[ID_col],
    TARGET: meta_test_pred,
})

submission.to_csv(OUTPUT_PATH, index=False)
print(f"Saved submission to {OUTPUT_PATH}")