In [1]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import polars as pl
import pandas as pd
import numpy as np

import optuna
from optuna.integration import CatBoostPruningCallback

# Widen polars' table rendering: show up to 128 columns and 50 rows.
# The setters are invoked on the Config class itself; no instance is needed.
pl.Config.set_tbl_cols(128)
pl.Config.set_tbl_rows(50)

polars.config.Config

In [2]:
# Read the competition train and test CSV files as polars DataFrames.
data, submission_test = (
    pl.read_csv("/kaggle/input/playground-series-s4e3/train.csv"),
    pl.read_csv("/kaggle/input/playground-series-s4e3/test.csv"),
)

In [3]:
# Read both CSV files with polars, then convert to pandas for scikit-learn / CatBoost.
data = pl.read_csv(
    "/kaggle/input/playground-series-s4e3/train.csv"
).to_pandas()
submission_test = pl.read_csv(
    "/kaggle/input/playground-series-s4e3/test.csv"
).to_pandas()

# Labels: the seven binary target columns, one per fault type.
y = data[['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']]

# Features: every remaining column except the row identifier.
X = data.drop(columns=['id', *y.columns])

# Hold out 20% of the rows; the other 80% is used for hyperparameter search.
X_optuna, X_test, y_optuna, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
def objective(trial):
    """Optuna objective: mean validation ROC AUC over per-target CatBoost models.

    One ``CatBoostClassifier`` is trained per column of the label frame using
    hyperparameters sampled from ``trial``; each model is scored on a fixed
    validation split and the per-column ROC AUC values are averaged.

    Reads the module-level ``X_optuna`` and ``y_optuna`` frames.

    Parameters
    ----------
    trial
        Object exposing ``suggest_categorical``, ``suggest_float`` and
        ``suggest_int`` (an Optuna trial).

    Returns
    -------
    float
        Mean ROC AUC across the target columns that could be scored.

    Raises
    ------
    ValueError
        If no target column could be scored (every validation column
        contained a single class).
    """
    # Fixed seed: every trial is evaluated on the same train/validation split.
    X_opt, X_val, y_opt, y_val = train_test_split(
        X_optuna, y_optuna, test_size=0.2, random_state=42
    )

    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
    }

    # These parameters are only sampled for the bootstrap type they belong to.
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    scores = {}
    for column in y_opt.columns:
        model = CatBoostClassifier(**param)
        model.fit(X_opt, y_opt[column], verbose=0)

        # ROC AUC ranks examples by score, so use the positive-class
        # probability rather than hard 0/1 predictions.
        positive_proba = model.predict_proba(X_val)[:, 1]
        try:
            # Argument order is (y_true, y_score).
            scores[column] = roc_auc_score(y_val[column], positive_proba)
        except ValueError:
            print(f"ROC AUC score cannot be computed for column {column} due to single class presence. Skipping this column.")

    if not scores:
        # Avoid an opaque ZeroDivisionError when every column was skipped.
        raise ValueError(
            "ROC AUC could not be computed for any target column; "
            "the validation split has no column with both classes present."
        )

    metric = sum(scores.values()) / len(scores)
    print(f"scores:{scores}\nAverage Score:{metric}")

    return metric
    


In [5]:
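# Hyperparameter search (currently disabled). Uncomment this cell to run it;
# the search stops after 100 trials or 600 seconds, whichever comes first.
#study = optuna.create_study(direction="maximize")
#study.optimize(objective, n_trials=100, timeout=600)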

#print("Number of finished trials: {}".format(len(study.trials)))

#print("Best trial:")
#trial = study.best_trial

#print("  Value: {}".format(trial.value))

#print("  Params: ")

#for key, value in trial.params.items():
#    print("    {}: {}".format(key, value))