In [1]:
import polars as pl
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import gc
import numpy as np
import joblib as jl
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    MinMaxScaler,
    OrdinalEncoder,
    LabelBinarizer,
    OneHotEncoder,
    StandardScaler,
)
from sklearn.impute import SimpleImputer

# Both competition CSV files live in the same folder; keep the folder in one
# place so relocating the data only requires editing a single line.
DATA_DIR = "/home/manpm/Developers/kaggle/data/mushrooms"
train_path = DATA_DIR + "/train.csv"
test_path = DATA_DIR + "/test.csv"

In [2]:
# Prepare data: read the raw CSV files and split them into features, target
# and the id column needed for the submission file.
train = pd.read_csv(train_path)
print(f"train size: {train.shape}")
X_test = pd.read_csv(test_path)
print(f"test size: {X_test.shape}")

target = "class"

# The test ids are kept aside and exported alongside the feature matrices.
submit_df = pd.DataFrame()
submit_df["id"] = X_test["id"]

y_train = train[target]
X_train = train.drop(columns=["id", target])
X_test = X_test.drop(columns=["id"])

# Column groups are derived from the feature frame itself, which avoids
# building extra throwaway copies of the full training frame.
categorical_cols = X_train.select_dtypes(include="object").columns.to_list()
numerical_cols = X_train.select_dtypes(include="number").columns.to_list()
gc.collect()

print("Preprocessing...")
# Keep only the `n` most frequent values of every categorical column and
# collapse everything else into a single "other" bucket.
n = 15
for c in categorical_cols:
    # The keep-list is learned from the training data ONLY and then applied to
    # both frames, so a given raw value is always mapped the same way in train
    # and test. (Deriving a separate list from the test frame could keep a
    # value in test that had been folded into "other" in train.)
    frequent_values = X_train[c].value_counts().index[:n].tolist()
    # value_counts() skips missing values, so NaN is never in the keep-list and
    # missing entries are folded into "other" here as well; the categorical
    # imputer below therefore only acts as a safety net.
    X_train.loc[~X_train[c].isin(frequent_values), c] = "other"
    X_test.loc[~X_test[c].isin(frequent_values), c] = "other"


# Numerical features: fill gaps with the median, then standardize.
numerical_pipeline = Pipeline(
    steps=[
        ("num_imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode into a dense array (categories unseen during fit encode as all zeros).
categorical_pipeline = Pipeline(
    steps=[
        ("cat_imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Route every column group through its own pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ]
)

# Preprocessing-only pipeline; no estimator is attached in this cell.
data_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
    ]
)

# Fit the preprocessing on train and test together, then transform each part.
# NOTE(review): the imputation/scaling statistics therefore include test rows.
# Fit on X_train alone if strict train/test separation is required.
X_all = pd.concat([X_train, X_test])
data_pipeline.fit(X_all)
del X_all
gc.collect()
X_test_transformed = data_pipeline.transform(X_test)
X_train_transformed = data_pipeline.transform(X_train)

# Binarize the target labels; the fitted binarizer is exported too so that
# predictions can be mapped back to the original class labels.
lb = LabelBinarizer()

y_train_binarized = lb.fit_transform(y_train)

print("Exporting to pickple...")
jl.dump(X_train_transformed, "X_train.pkl")
jl.dump(y_train_binarized, "y_train.pkl")
jl.dump(X_test_transformed, "X_test.pkl")
jl.dump(submit_df, "submit_df.pkl")
jl.dump(lb, "lb.pkl")

gc.collect()

train size: (3116945, 22)
test size: (2077964, 21)
Preprocessing...
Exporting to pickple...


98

: 