In [1]:
import polars as pl
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import gc
import numpy as np
import joblib as jl
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    MinMaxScaler,
    OrdinalEncoder,
    LabelBinarizer,
    OneHotEncoder,
    StandardScaler,
)
from sklearn.impute import SimpleImputer
import xgboost as xgb
from sklearn.model_selection import train_test_split
import random

SEED = 108
random.seed(SEED)
train_path = "/home/manpm/Developers/kaggle/data/mushrooms/train.csv"
test_path = "/home/manpm/Developers/kaggle/data/mushrooms/test.csv"

In [2]:
# Prepare data
train = pd.read_csv(train_path)
print(f"train size: {train.shape}")
X_test = pd.read_csv(test_path)
print(f"test size: {X_test.shape}")
submit_df = pd.DataFrame()
submit_df["id"] = X_test["id"]
y_train = train["class"]
X_train = train.drop(columns=["id", "class"], axis=1)
X_test.drop(columns=["id"], inplace=True, axis=1)

# prepare columns
target = "class"

categorical_cols = (
    train.drop(columns=target).select_dtypes(include="object").columns.to_list()
)


numerical_cols = (
    train.drop(columns="id").select_dtypes(include="number").columns.to_list()
)
gc.collect()

print("Preprocessing...")
# get top 10 most frequent names
n = 15
for c in categorical_cols:
    train_mode_values = X_train[c].value_counts()[:n].index.tolist()
    X_train.loc[~X_train[c].isin(train_mode_values), c] = "other"
    test_mode_values = X_test[c].value_counts()[:n].index.tolist()
    X_test.loc[~X_test[c].isin(test_mode_values), c] = "other"
    X_train[c] = pd.Series(X_train[c], dtype="category")
    X_test[c] = pd.Series(X_test[c], dtype="category")
    gc.collect()

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)
# Encode the target variable
lb = LabelBinarizer()
# Preprocessing for DMatrix
lb.fit(np.concatenate([y_train, y_val], axis=0))
y_train = lb.transform(y_train)
y_val = lb.transform(y_val)
dtrain = xgb.DMatrix(X_train, label=y_train, nthread=-1, enable_categorical=True)
dval = xgb.DMatrix(X_val, label=y_val, nthread=-1, enable_categorical=True)

gc.collect()

train size: (3116945, 22)
test size: (2077964, 21)
Preprocessing...


7

In [3]:
clf = xgb.train(
    params={
        "device": "cuda",
        "verbosity": 0,
        "objective": "binary:logistic",
        "tree_method": "hist",
        "eta": 0.0696294726051571,
        "max_depth": 0,
        "min_child_weight": 1,
        "gamma": 0.044230646284796976,
        "subsample": 0.9405269471473167,
        "colsample_bytree": 0.2999355523666192,
        "lambda": 0.9746051811186938,
        "alpha": 4.210861941737071,
    },
    dtrain=dtrain,
    num_boost_round=4000,
    evals=[(dval, "eval")],
    early_stopping_rounds=100,
)

[0]	eval-logloss:0.66001
[1]	eval-logloss:0.61575
[2]	eval-logloss:0.57570
[3]	eval-logloss:0.55795
[4]	eval-logloss:0.52225
[5]	eval-logloss:0.48525
[6]	eval-logloss:0.47057
[7]	eval-logloss:0.44723
[8]	eval-logloss:0.43535
[9]	eval-logloss:0.41720
[10]	eval-logloss:0.40005
[11]	eval-logloss:0.37674
[12]	eval-logloss:0.36649
[13]	eval-logloss:0.35597
[14]	eval-logloss:0.34308
[15]	eval-logloss:0.33103
[16]	eval-logloss:0.31933
[17]	eval-logloss:0.31250
[18]	eval-logloss:0.29683
[19]	eval-logloss:0.28615
[20]	eval-logloss:0.27188
[21]	eval-logloss:0.26300
[22]	eval-logloss:0.25648
[23]	eval-logloss:0.25098
[24]	eval-logloss:0.23661
[25]	eval-logloss:0.22314
[26]	eval-logloss:0.21487
[27]	eval-logloss:0.20822
[28]	eval-logloss:0.19884
[29]	eval-logloss:0.19320
[30]	eval-logloss:0.18871
[31]	eval-logloss:0.18013
[32]	eval-logloss:0.17587
[33]	eval-logloss:0.17241
[34]	eval-logloss:0.16912
[35]	eval-logloss:0.16222
[36]	eval-logloss:0.15498
[37]	eval-logloss:0.14863
[38]	eval-logloss:0.14

In [4]:
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import matthews_corrcoef


y_pred = clf.predict(dval)
y_pred = (y_pred > 0.5).astype(int)

mcc = matthews_corrcoef(y_val, y_pred)
print(f"Validation mcc score: {mcc}")

Validation mcc score: 0.9848695096534142


In [5]:
import joblib as jl

submit_df = jl.load("../submit_df.pkl")
X_test_pkl = jl.load("../X_test.pkl")
dtest = xgb.DMatrix(X_test, nthread=-1, enable_categorical=True)

y_preds = clf.predict(dtest)
pred_classes = lb.inverse_transform(y_preds)
submit_df["class"] = pred_classes
submit_df.to_csv("submission_xgboost.csv", index=False)