In [1]:
import numpy
import sys, os
from catboost import CatBoostRegressor, cv, Pool, CatBoostClassifier
import polars as pl
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import gc
import numpy as np
import joblib as jl
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    MinMaxScaler,
    OrdinalEncoder,
    LabelBinarizer,
    OneHotEncoder,
    StandardScaler,
)
from sklearn.metrics import matthews_corrcoef

from sklearn.impute import SimpleImputer
import xgboost as xgb
from sklearn.model_selection import train_test_split
import random
from datetime import datetime

# helpers
sys.path.append("..")
from helpers.loss_functions import (
    update_submission,
    finalize_submission
)

# Run identifier (Unix timestamp in whole seconds); later used to tag the
# model file written by this notebook.
now = int(datetime.now().timestamp())
SEED = 108
N_FOLDS = 5
# Seed both the stdlib and the NumPy global generators so that any code drawing
# from either of them is repeatable across kernel restarts.
random.seed(SEED)
np.random.seed(SEED)
# Data location. Set the MUSHROOMS_DATA_DIR environment variable to run the
# notebook on another machine; the default keeps the original location.
DATA_DIR = os.environ.get(
    "MUSHROOMS_DATA_DIR", "/home/manpm/Developers/kaggle/data/mushrooms"
)
train_path = os.path.join(DATA_DIR, "train.csv")
test_path = os.path.join(DATA_DIR, "test.csv")

In [2]:
def handle_categorical_columns(df: pd.DataFrame, n_common_values=15, keep_values=None):
    """Collapse rare levels of every object column into "other" and cast to category.

    For each column of dtype ``object`` the values to keep are either taken from
    ``keep_values`` (when it has an entry for that column) or are the
    ``n_common_values`` most frequent values of the column itself. Every other
    entry is overwritten with the string "other"; this includes missing values,
    because ``value_counts`` does not count them. The column is then converted
    to pandas ``category`` dtype.

    Note: ``df`` is modified in place; the same object is returned.

    Args:
        df: Frame whose object columns should be cleaned.
        n_common_values: How many of the most frequent values to keep per
            column when no explicit vocabulary is supplied.
        keep_values: Optional mapping ``column name -> iterable of values``.
            Use it to apply a vocabulary learned on one frame (e.g. the
            training set) to another frame (e.g. the test set), so both are
            bucketed identically. Columns missing from the mapping fall back
            to the frame's own most frequent values.

    Returns:
        A tuple ``(df, categorical_cols)`` where ``categorical_cols`` is the
        list of column names that were processed.
    """
    categorical_cols = df.select_dtypes(include="object").columns.to_list()
    for c in categorical_cols:
        if keep_values is not None and c in keep_values:
            kept = list(keep_values[c])
        else:
            # value_counts() is sorted by descending frequency, so the first
            # n index entries are the n most common values.
            kept = df[c].value_counts().index[:n_common_values].tolist()
        df.loc[~df[c].isin(kept), c] = "other"
        df[c] = df[c].astype("category")
    # One collection after the loop is enough to release the replaced columns.
    gc.collect()
    return df, categorical_cols

In [3]:
# Prepare data
train = pd.read_csv(train_path)
print(f"train size: {train.shape}")
X_test = pd.read_csv(test_path)
print(f"test size: {X_test.shape}")
X_test = X_test.drop(columns=["id"])

# prepare columns
target = "class"

X_train = train.drop(columns=[target, "id"])
# Binarize the target labels. For a two-class target LabelBinarizer returns a
# single (n_samples, 1) column, so flatten it to the 1-D vector that the
# splitter and the metric expect.
lb = LabelBinarizer()
y_train = lb.fit_transform(train[target]).ravel()

# Category handling
# The vocabulary (which values are kept, which become "other") is learned on the
# training frame only and then applied unchanged to the test frame. Deriving it
# separately from the test frame would let the same raw value be kept in one
# set and collapsed into "other" in the other.
assert list(X_test.columns) == list(X_train.columns), "train/test feature mismatch"
X_train, categorical_training_cols = handle_categorical_columns(X_train)
for col in categorical_training_cols:
    train_vocab = X_train[col].cat.categories
    X_test[col] = (
        X_test[col].where(X_test[col].isin(train_vocab), "other").astype("category")
    )
categorical_test_cols = list(categorical_training_cols)
gc.collect()

train size: (3116945, 22)
test size: (2077964, 21)


0

In [4]:
# Display the class labels learned by the binarizer, in the order they are
# stored in `classes_`.
lb.classes_.tolist()

['e', 'p']

In [5]:
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold

gc.collect()
skf = StratifiedKFold(n_splits=N_FOLDS)

# The hyper-parameters are the same for every fold, so build them once.
params = {
    "loss_function": "Logloss",
    "iterations": 10000,
    "min_data_in_leaf": 12,
    "task_type": "GPU",
    "depth": 14,
    "l2_leaf_reg": 5.923260155898824,
    "learning_rate": 0.09535157906155059,
    "bagging_temperature": 0.270792508445483,
    "random_strength": 0.8745583257807371,
}

y_preds = []
y_trues = []
# split() only needs the number of rows and the labels, so the DataFrame is
# passed directly instead of materialising a large object array from it.
for train_index, test_index in tqdm(
    skf.split(X_train, y_train), total=skf.get_n_splits()
):
    # split() yields positional row numbers, hence .iloc (label-based .loc is
    # only equivalent while the frame still has its default RangeIndex).
    X_train_splitted = X_train.iloc[train_index]
    X_test_splitted = X_train.iloc[test_index]
    y_train_splitted, y_test_splitted = y_train[train_index], y_train[test_index]

    train_pool = Pool(
        X_train_splitted,
        label=y_train_splitted,
        cat_features=categorical_training_cols,
    )
    val_pool = Pool(
        X_test_splitted,
        label=y_test_splitted,
        cat_features=categorical_training_cols,
    )

    model = CatBoostClassifier(**params)
    # train the model; the held-out fold drives early stopping and best-model
    # selection
    model.fit(
        train_pool,
        use_best_model=True,
        eval_set=val_pool,
        metric_period=100,
        early_stopping_rounds=50,
    )
    y_pred = model.predict(val_pool)
    y_preds.append(y_pred)
    y_trues.append(y_test_splitted)
# Concatenate the out-of-fold predictions and true labels; the labels are
# flattened so they are 1-D even if y_train is an (n, 1) column.
y_preds_concat = np.concatenate(y_preds)
y_trues_concat = np.concatenate(y_trues).ravel()
# Only the classifier from the final fold is still bound to `model` here, so
# that is the one written to disk.
jl.dump(model, f"catboost_clf_{now}.pkl")

0it [00:00, ?it/s]

0:	learn: 0.6817333	test: 0.6817446	best: 0.6817446 (0)	total: 28.6ms	remaining: 2.83s
99:	learn: 0.1136983	test: 0.1126687	best: 0.1126687 (99)	total: 2.67s	remaining: 0us
bestTest = 0.1126687188
bestIteration = 99


1it [00:06,  6.27s/it]

0:	learn: 0.6815326	test: 0.6815224	best: 0.6815224 (0)	total: 26.3ms	remaining: 2.6s
99:	learn: 0.1158247	test: 0.1151328	best: 0.1151328 (99)	total: 2.66s	remaining: 0us
bestTest = 0.1151328032
bestIteration = 99


2it [00:12,  6.08s/it]


['catboost_clf_1723952826.pkl']

In [6]:
# Matthews correlation coefficient over the pooled out-of-fold predictions.
mcc = matthews_corrcoef(
    y_true=y_trues_concat,
    y_pred=y_preds_concat,
)
print(f"Validation mcc score: {mcc}")

Validation mcc score: 0.9609499372475219


In [7]:
# Hand the fitted classifier and the prepared test frame to the project's
# submission helpers.
# NOTE(review): `model` is whatever the training cell left bound after its loop,
# i.e. the classifier from the final fold only — confirm that is intended.
# NOTE(review): update_submission / finalize_submission come from
# helpers.loss_functions and are not visible here; presumably the first records
# this model's test predictions and the second combines recorded predictions
# using the given strategy — verify against the helper module.
update_submission(
    model=model,
    X_test=X_test,
    unique_target_values=lb.classes_.tolist(),
    categorical_cols=categorical_test_cols,
)
finalize_submission(strategy="mean")