In [17]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

import polars as pl
import pandas as pd
import numpy as np

import optuna
from optuna.integration import CatBoostPruningCallback

pl.Config().set_tbl_cols(128)
pl.Config().set_tbl_rows(50)

polars.config.Config

In [3]:
base = "/home/michael/Datasets/playground-series-s4e3"
# base = "/kaggle/input/playground-series-s4e3"

In [4]:
data = pd.read_csv(f"{base}/train.csv")

In [5]:
labels = ["Pastry", "Z_Scratch", "K_Scatch", "Stains", "Dirtiness", "Bumps", "Other_Faults"]
df = pl.DataFrame(data)

print(
    df.group_by(labels)
    .agg(
        [
            pl.len().alias("Len"),
        ]
    )
    .with_columns([pl.sum_horizontal(labels).alias("True_Label"), pl.concat_str(labels).alias("Concated_Label")])
    .sort(by=["Len"])
)

shape: (11, 10)
┌────────┬──────────┬──────────┬────────┬──────────┬───────┬──────────┬──────┬──────────┬──────────┐
│ Pastry ┆ Z_Scratc ┆ K_Scatch ┆ Stains ┆ Dirtines ┆ Bumps ┆ Other_Fa ┆ Len  ┆ True_Lab ┆ Concated │
│ ---    ┆ h        ┆ ---      ┆ ---    ┆ s        ┆ ---   ┆ ults     ┆ ---  ┆ el       ┆ _Label   │
│ i64    ┆ ---      ┆ i64      ┆ i64    ┆ ---      ┆ i64   ┆ ---      ┆ u32  ┆ ---      ┆ ---      │
│        ┆ i64      ┆          ┆        ┆ i64      ┆       ┆ i64      ┆      ┆ i64      ┆ str      │
╞════════╪══════════╪══════════╪════════╪══════════╪═══════╪══════════╪══════╪══════════╪══════════╡
│ 1      ┆ 0        ┆ 1        ┆ 0      ┆ 0        ┆ 0     ┆ 0        ┆ 1    ┆ 2        ┆ 1010000  │
│ 0      ┆ 0        ┆ 1        ┆ 0      ┆ 0        ┆ 1     ┆ 0        ┆ 2    ┆ 2        ┆ 0010010  │
│ 0      ┆ 0        ┆ 1        ┆ 0      ┆ 0        ┆ 0     ┆ 1        ┆ 18   ┆ 2        ┆ 0010001  │
│ 0      ┆ 0        ┆ 0        ┆ 0      ┆ 1        ┆ 0     ┆ 0        ┆ 485

In [6]:
df = df.with_columns([pl.concat_str(labels).alias("Concated_Label")]).filter(
    [(pl.col("Concated_Label") != "1010000") & (pl.col("Concated_Label") != "0010010")]
)
data = df.to_pandas()

In [21]:
# Separate features and labels
X = data.drop(["id", "Pastry", "Z_Scratch", "K_Scatch", "Stains", "Dirtiness", "Bumps", "Other_Faults", "Concated_Label"], axis=1)
y = data[["Concated_Label"]]

# Split the data into train and test sets
X_optuna, X_test, y_optuna, y_test = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)

In [24]:
def objective(trial):

    X_opt, X_val, y_opt, y_val = train_test_split(X_optuna, y_optuna, test_size=0.1, stratify=y_optuna, random_state=42)

    encoder = LabelEncoder()
    y_opt = encoder.fit_transform(y_opt.values.ravel())
    y_val = encoder.transform(y_val.values.ravel())

    param = {
        "objective": "MultiClass",
        "learning_rate": trial.suggest_categorical(
            "learning_rate",
            [0.1, 0.01, 0.125, 0.005],
        ),
        "iterations": trial.suggest_categorical("iterations", [200, 500, 1000, 2000]),
        "max_depth": trial.suggest_categorical("max_depth", [8, 10, 12, 16]),
        "task_type": "GPU",
    }

    model = CatBoostClassifier(**param)
    model.fit(X_opt, y_opt, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=100)

    prediction_proba = model.predict_proba(X_val)
    print(prediction_proba)
    return roc_auc_score(y_val, prediction_proba, multi_class="ovr", average="weighted")

In [25]:
study = optuna.create_study(direction="maximize")
study.optimize(
    objective,
    n_trials=30,
    timeout=600,
)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")

for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-03-09 14:40:53,189] A new study created in memory with name: no-name-ecb346a3-ea74-46a6-b315-b5774abbc9f4


0:	learn: 1.9945551	total: 127ms	remaining: 4m 14s
100:	learn: 0.6418501	total: 12.4s	remaining: 3m 53s
200:	learn: 0.4201045	total: 25s	remaining: 3m 43s
300:	learn: 0.2979044	total: 37.2s	remaining: 3m 29s
400:	learn: 0.2245398	total: 49.3s	remaining: 3m 16s
500:	learn: 0.1791184	total: 1m 1s	remaining: 3m 2s
600:	learn: 0.1452307	total: 1m 13s	remaining: 2m 50s
700:	learn: 0.1214090	total: 1m 25s	remaining: 2m 38s
800:	learn: 0.1026538	total: 1m 37s	remaining: 2m 25s
900:	learn: 0.0877505	total: 1m 49s	remaining: 2m 13s
1000:	learn: 0.0766869	total: 2m 1s	remaining: 2m 1s
1100:	learn: 0.0675605	total: 2m 13s	remaining: 1m 49s
1200:	learn: 0.0601712	total: 2m 25s	remaining: 1m 36s
1300:	learn: 0.0541783	total: 2m 37s	remaining: 1m 24s
1400:	learn: 0.0492516	total: 2m 49s	remaining: 1m 12s
1500:	learn: 0.0448442	total: 3m 1s	remaining: 1m
1600:	learn: 0.0413596	total: 3m 13s	remaining: 48.2s
1700:	learn: 0.0381796	total: 3m 25s	remaining: 36.1s
1800:	learn: 0.0353035	total: 3m 37s	rem

[I 2024-03-09 14:44:57,113] Trial 0 finished with value: 0.807631960703552 and parameters: {'learning_rate': 0.1, 'iterations': 2000, 'max_depth': 12}. Best is trial 0 with value: 0.807631960703552.


[[3.02015068e-03 2.39178507e-02 4.48476703e-04 ... 8.43228137e-05
  2.89163913e-04 1.23394998e-04]
 [4.00098073e-04 2.33524932e-03 3.74923528e-06 ... 5.56968351e-07
  2.70944539e-06 6.62141811e-07]
 [1.84155433e-03 6.95947859e-01 2.89058435e-01 ... 2.26364876e-05
  1.76930812e-03 1.56251415e-04]
 ...
 [3.58387819e-03 6.39793773e-01 3.53858427e-01 ... 6.16202678e-05
  8.03232731e-04 4.52593616e-04]
 [3.82331502e-04 4.57234999e-01 2.55940390e-01 ... 7.02079781e-06
  1.88139378e-04 2.86040749e-01]
 [5.94610003e-02 5.33103171e-01 1.62614682e-01 ... 2.83166465e-04
  8.82961857e-03 3.63576738e-02]]


: 