In [1]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import polars as pl
import pandas as pd
import numpy as np

import optuna
from optuna.integration import CatBoostPruningCallback

pl.Config().set_tbl_cols(128)
pl.Config().set_tbl_rows(50)

polars.config.Config

In [20]:
base = "/home/michael/Datasets/playground-series-s4e3"
# base = "/kaggle/input/playground-series-s4e3"

In [18]:
data = pd.read_csv(f"{base}/train.csv")
data.sample(10)

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
10429,10429,1057,1084,1466087,1466141,979,118,79,118187,107,...,-0.3333,-0.1059,1.0,0,0,0,0,0,1,0
11543,11543,0,9,424082,424112,109,12,29,14471,103,...,0.7188,-0.0955,0.2712,0,0,0,0,0,0,1
1033,1033,559,590,468317,468520,145,23,27,16908,91,...,0.7308,-0.1263,0.5096,0,0,0,0,0,0,1
12666,12666,39,58,776120,776424,117,44,27,13961,49,...,-0.6471,-0.1885,0.9999,0,0,1,0,0,0,0
10789,10789,899,907,1292292,1292296,11,3,3,1398,112,...,-0.5,0.0007,0.1307,0,0,0,0,0,0,0
13456,13456,619,630,2046094,2046183,699,43,67,71112,88,...,0.8065,-0.2081,0.9964,0,0,0,0,0,0,1
9848,9848,983,1005,3283148,3283163,60,25,12,6761,80,...,-0.5667,-0.1832,0.2288,0,0,0,0,0,1,0
15343,15343,1110,1118,909876,909876,120,23,19,12235,79,...,0.7222,-0.2424,0.3509,0,0,0,0,0,0,1
3210,3210,756,780,679386,679391,267,21,26,26356,79,...,0.2857,-0.2745,0.5461,0,0,0,0,0,0,1
120,120,19,24,143523,143544,76,9,12,12593,107,...,0.6,-0.0683,0.2432,0,0,0,0,0,0,1


In [22]:
# Separate features and labels
X = data.drop(["id", "Pastry", "Z_Scratch", "K_Scatch", "Stains", "Dirtiness", "Bumps", "Other_Faults"], axis=1)
y = data[["Pastry", "Z_Scratch", "K_Scatch", "Stains", "Dirtiness", "Bumps", "Other_Faults"]]

# Split the data into train and test sets
X_optuna, X_test, y_optuna, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [26]:
def objective(trial):

    X_opt, X_val, y_opt, y_val = train_test_split(X_optuna, y_optuna, test_size=0.1, random_state=42)

    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        "task_type": "CPU",
    }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    models = {}
    for column in y_opt.columns:
        model = CatBoostClassifier(**param)
        model.fit(X_opt, y_opt[column], verbose=0)
        models[column] = model

    scores = {}
    for column, model in models.items():
        prediction = model.predict(X_val)
        try:
            scores[column] = roc_auc_score(prediction, y_val[column])
        except ValueError:
            print(f"ROC AUC score cannot be computed for column {column} due to single class presence. Skipping this column.")

    metric = sum(scores.values()) / len(scores)
    print(f"scores:{scores}\nAverage Score:{metric}")

    return metric

In [27]:
study = optuna.create_study(direction="maximize")
study.optimize(
    objective,
    n_trials=100,
    timeout=600,
)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")

for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-03-08 16:25:55,226] A new study created in memory with name: no-name-f8277637-097c-48cc-ae25-930ea53cb3ed
[I 2024-03-08 16:26:37,726] Trial 0 finished with value: 0.7729290474671845 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.04221436803286595, 'depth': 10, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.7729290474671845.


scores:{'Pastry': 0.5657886241849054, 'Z_Scratch': 0.8534778368515431, 'K_Scatch': 0.9336279754981778, 'Stains': 0.8933748784584108, 'Dirtiness': 0.7872093023255814, 'Bumps': 0.7125398897592109, 'Other_Faults': 0.6644848251924624}
Average Score:0.7729290474671845


[I 2024-03-08 16:27:08,782] Trial 1 finished with value: 0.7684955791607022 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.03666960479462609, 'depth': 5, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.5189766072148841}. Best is trial 0 with value: 0.7729290474671845.


scores:{'Pastry': 0.537799690641918, 'Z_Scratch': 0.8668928086838535, 'K_Scatch': 0.9315955554615708, 'Stains': 0.8861329715061058, 'Dirtiness': 0.8205500677900445, 'Bumps': 0.691993792253364, 'Other_Faults': 0.6445041677880587}
Average Score:0.7684955791607022


[I 2024-03-08 16:31:33,590] Trial 2 finished with value: 0.7742402670569761 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.09906204598249962, 'depth': 10, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.1763219723026431}. Best is trial 2 with value: 0.7742402670569761.


scores:{'Pastry': 0.609000492809462, 'Z_Scratch': 0.8320859441479075, 'K_Scatch': 0.9336279754981778, 'Stains': 0.8947144204728722, 'Dirtiness': 0.8056745465122428, 'Bumps': 0.7015228065181812, 'Other_Faults': 0.643055683439989}
Average Score:0.7742402670569761


[I 2024-03-08 16:31:46,408] Trial 3 finished with value: 0.7796419705535185 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.08060451307949953, 'depth': 4, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 3 with value: 0.7796419705535185.


scores:{'Pastry': 0.6400842843520256, 'Z_Scratch': 0.8348092286304644, 'K_Scatch': 0.9291608391608391, 'Stains': 0.8731546326945555, 'Dirtiness': 0.8205500677900445, 'Bumps': 0.7060346450762569, 'Other_Faults': 0.6537000961704437}
Average Score:0.7796419705535185


[I 2024-03-08 16:32:21,748] Trial 4 finished with value: 0.7706579236574687 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.03193003111531745, 'depth': 2, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 3 with value: 0.7796419705535185.


scores:{'Pastry': 0.537799690641918, 'Z_Scratch': 0.8411778374470723, 'K_Scatch': 0.9321289977878595, 'Stains': 0.9086939580086183, 'Dirtiness': 0.8205500677900445, 'Bumps': 0.6896817716157613, 'Other_Faults': 0.6645731423110061}
Average Score:0.7706579236574687


[I 2024-03-08 16:32:53,972] Trial 5 finished with value: 0.7706861420078003 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.051971343718410314, 'depth': 3, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.6643909595861944}. Best is trial 3 with value: 0.7796419705535185.


scores:{'Pastry': 0.5544927536231884, 'Z_Scratch': 0.8521225008719524, 'K_Scatch': 0.9336279754981778, 'Stains': 0.890552741686853, 'Dirtiness': 0.8056745465122428, 'Bumps': 0.6992822966507177, 'Other_Faults': 0.6590501792114695}
Average Score:0.7706861420078003


[I 2024-03-08 16:33:40,268] Trial 6 finished with value: 0.7626759601811648 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.08811667735303423, 'depth': 5, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 3 with value: 0.7796419705535185.


scores:{'Pastry': 0.5548104956268222, 'Z_Scratch': 0.8421284541353646, 'K_Scatch': 0.9321289977878595, 'Stains': 0.8633403592663668, 'Dirtiness': 0.7872093023255814, 'Bumps': 0.7100407278832845, 'Other_Faults': 0.6490733842428758}
Average Score:0.7626759601811648


[I 2024-03-08 16:33:50,667] Trial 7 finished with value: 0.7720765166352558 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.04535469699657453, 'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.7334860166997565}. Best is trial 3 with value: 0.7796419705535185.


scores:{'Pastry': 0.6049418604651163, 'Z_Scratch': 0.8571787731105438, 'K_Scatch': 0.9361830265081925, 'Stains': 0.881481947546691, 'Dirtiness': 0.7872093023255814, 'Bumps': 0.6915038407040047, 'Other_Faults': 0.6460368657866608}
Average Score:0.7720765166352558


[I 2024-03-08 16:35:09,807] Trial 8 finished with value: 0.7773216562278753 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.05980820250101397, 'depth': 10, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.9428026808507658}. Best is trial 3 with value: 0.7796419705535185.


scores:{'Pastry': 0.6218471090415211, 'Z_Scratch': 0.8374568577430972, 'K_Scatch': 0.9346128860804346, 'Stains': 0.8947144204728722, 'Dirtiness': 0.7954616728641191, 'Bumps': 0.7072038009150567, 'Other_Faults': 0.6499548464780253}
Average Score:0.7773216562278753


[I 2024-03-08 16:36:17,909] Trial 9 finished with value: 0.7729251120223106 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.04998823199869843, 'depth': 11, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.7529759619847901}. Best is trial 3 with value: 0.7796419705535185.


scores:{'Pastry': 0.609000492809462, 'Z_Scratch': 0.844979851195017, 'K_Scatch': 0.9321289977878595, 'Stains': 0.8670708594414791, 'Dirtiness': 0.8205500677900445, 'Bumps': 0.6951565693330612, 'Other_Faults': 0.6415889457992507}
Average Score:0.7729251120223106
Number of finished trials: 10
Best trial:
  Value: 0.7796419705535185
  Params: 
    objective: Logloss
    colsample_bylevel: 0.08060451307949953
    depth: 4
    boosting_type: Plain
    bootstrap_type: MVS
