In [1]:
import numpy
import sys, os
from catboost import CatBoostRegressor, cv, Pool, CatBoostClassifier
import polars as pl
import pandas as pd

# import seaborn as sns
import matplotlib.pyplot as plt
import gc
import numpy as np
import joblib as jl
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    MinMaxScaler,
    OrdinalEncoder,
    LabelBinarizer,
    OneHotEncoder,
    StandardScaler,
)
from sklearn.metrics import matthews_corrcoef

from sklearn.impute import SimpleImputer
import xgboost as xgb
from sklearn.model_selection import train_test_split
import random
from datetime import datetime

# helpers
sys.path.append("..")
from helpers.loss_functions import (
    update_submission,
    finalize_submission,
    handle_categorical_columns,
)

# --- Run configuration -------------------------------------------------------
# Integer Unix timestamp of this session; used to tag the saved model file name.
now = int(datetime.now().timestamp())
SEED = 108
N_FOLDS = 10  # number of cross-validation folds iterated over during training
early_stop = 50  # passed as early_stopping_rounds when fitting
iterations = 200  # boosting iterations placed in the model params

# Seed both the stdlib and the numpy global RNGs so any random draws are repeatable.
random.seed(SEED)
np.random.seed(SEED)

# Truthy when the NB_USER environment variable is set to a non-empty value.
is_dshub = os.getenv("NB_USER", False)


def resolve_data_dir(environ=os.environ):
    """Return the directory that holds ``train.csv`` and ``test.csv``.

    Resolution order:
      1. ``MUSHROOMS_DATA_DIR`` if set and non-empty (explicit override, so the
         notebook can run on machines other than the author's);
      2. ``../../data`` when ``NB_USER`` is set and non-empty;
      3. the original local default directory otherwise.

    Args:
        environ: mapping to read the variables from; defaults to ``os.environ``.

    Returns:
        The data directory as a string, without the file name.
    """
    override = environ.get("MUSHROOMS_DATA_DIR")
    if override:
        return override
    if environ.get("NB_USER"):
        return "../../data"
    return "/home/manpm/Developers/kaggle/data/mushrooms"


_data_dir = resolve_data_dir()
train_path = f"{_data_dir}/train.csv"
test_path = f"{_data_dir}/test.csv"

In [2]:
# Prepare data

# Lazy scan of the pre-folded training set; rows are only read on .collect().
train = pl.scan_parquet("../train_fold.parquet")

# Every string-typed column is treated as a categorical feature, except the
# target. The target is excluded by name instead of by position (pop(0)) so a
# change in column order cannot silently drop a real feature or keep the label.
# NOTE(review): assumes the target "class" is stored as a string column — confirm.
categorical_cols = [
    name for name in train.select(pl.col(pl.Utf8)).columns if name != "class"
]
gc.collect()
categorical_cols

['cap-shape',
 'cap-surface',
 'cap-color',
 'does-bruise-or-bleed',
 'gill-attachment',
 'gill-spacing',
 'gill-color',
 'stem-root',
 'stem-surface',
 'stem-color',
 'veil-type',
 'veil-color',
 'has-ring',
 'ring-type',
 'spore-print-color',
 'habitat',
 'season']

## Train with cross-validation and predict

In [3]:
# CatBoost hyper-parameters shared by every model fitted in this notebook.
# (A commented-out single train/validation fit used to live in this cell; the
# fold loop in the next cell builds the pools and fits the model instead.)
params = dict(
    loss_function="Logloss",
    iterations=iterations,
    min_data_in_leaf=12,
    task_type="GPU",
    depth=14,
    l2_leaf_reg=5.923260155898824,
    learning_rate=0.09535157906155059,
    bagging_temperature=0.270792508445483,
    random_strength=0.8745583257807371,
)

In [4]:
def _to_features_and_labels(fold_frame):
    """Split one collected fold partition into model inputs.

    Args:
        fold_frame: eager polars DataFrame that still contains the "class"
            and "fold" columns.

    Returns:
        (features, labels): a pandas DataFrame without "class"/"fold" whose
        null strings are replaced by "other", and a 1-D numpy array holding
        the "class" column.
    """
    features = (
        fold_frame.drop(["class"])
        .fill_null("other")
        .drop(["fold"])
        .to_pandas()
    )
    labels = fold_frame.get_column("class").to_numpy()
    return features, labels


y_preds = []
y_trues = []
for idx in range(N_FOLDS):
    # Collect each partition once and derive features and labels from the same
    # frame, instead of re-scanning the parquet file separately for X and y.
    # "id" is dropped lazily so it is never materialised.
    X_train, y_train = _to_features_and_labels(
        train.filter(pl.col("fold") != idx).drop(["id"]).collect()
    )
    X_test, y_test = _to_features_and_labels(
        train.filter(pl.col("fold") == idx).drop(["id"]).collect()
    )

    train_pool = Pool(
        X_train,
        label=y_train,
        cat_features=categorical_cols,
    )
    val_pool = Pool(
        X_test,
        label=y_test,
        cat_features=categorical_cols,
    )

    model = CatBoostClassifier(**params)
    # Train on the other folds; the held-out fold drives early stopping and
    # best-iteration selection.
    model.fit(
        train_pool,
        use_best_model=True,
        eval_set=val_pool,
        metric_period=50,
        early_stopping_rounds=early_stop,
        plot=True,
    )
    # Flatten so out-of-fold predictions and labels are both 1-D.
    y_pred = np.ravel(model.predict(val_pool))
    y_preds.append(y_pred)
    y_trues.append(y_test)
    gc.collect()
# Concatenate the out-of-fold predictions and true labels
y_preds_concat = np.concatenate(y_preds)
y_trues_concat = np.concatenate(y_trues)
# Only the model fitted in the last fold is persisted, tagged with this run's timestamp.
jl.dump(model, f"catboost_clf_{now}.pkl")

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.5139629	test: 0.5134956	best: 0.5134956 (0)	total: 2.48s	remaining: 8m 13s
50:	learn: 0.0373059	test: 0.0390976	best: 0.0390976 (50)	total: 2m 13s	remaining: 6m 29s
100:	learn: 0.0351725	test: 0.0376941	best: 0.0376941 (100)	total: 4m 7s	remaining: 4m 2s
150:	learn: 0.0344083	test: 0.0373642	best: 0.0373642 (150)	total: 5m 58s	remaining: 1m 56s


In [None]:
# Matthews correlation coefficient over the pooled out-of-fold predictions.
mcc = matthews_corrcoef(
    y_true=y_trues_concat,
    y_pred=y_preds_concat,
)
print(f"Validation mcc score: {mcc}")

In [None]:
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load model files produced by a trusted run.
# NOTE(review): this file name is a fixed snapshot and is not derived from this
# session's `now`, so it will not pick up the model saved by the training cell
# above — confirm that this is intended.
model = jl.load("catboost_clf_1724155392.pkl")
X_test = (
    pl.scan_csv(test_path)
    # Keep every column except the identifier and (if present) the label.
    # exclude() tolerates names that are absent, whereas drop() can raise on a
    # missing column depending on the polars version.
    # NOTE(review): the test file presumably has no "class" column — confirm.
    .select(pl.exclude(["id", "class"]))
    .fill_null("other")
    .collect()
    .to_pandas()
)
update_submission(
    model=model,
    X_test=X_test,
    unique_target_values=["e", "p"],
    categorical_cols=categorical_cols,
)
finalize_submission(strategy="mean")