In [1]:
import gc
import os

import joblib as jl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns

# Directory that holds train.csv and test.csv. Set the MUSHROOMS_DATA_DIR
# environment variable to run the notebook on another machine; when it is
# unset, the original local path is used so existing runs are unchanged.
DATA_DIR = os.environ.get(
    "MUSHROOMS_DATA_DIR", "/home/manpm/Developers/kaggle/data/mushrooms"
)
train_path = os.path.join(DATA_DIR, "train.csv")
test_path = os.path.join(DATA_DIR, "test.csv")

In [2]:
# Read both CSV files with polars' lazy reader and convert them to pandas
# DataFrames, which is what the rest of the notebook operates on.
X_train, X_test = (
    pl.scan_csv(csv_path).collect().to_pandas() for csv_path in (train_path, test_path)
)

# Pull the test "id" column out of the feature frame and persist it to
# "submit_ids.pkl" before preprocessing.
ids = X_test.pop("id")
jl.dump(ids, "submit_ids.pkl")

# Separate the "class" column (used as y_train) from the training features
# and discard the training "id" column.
y_train = X_train.pop("class")
X_train = X_train.drop(columns=["id"])


In [3]:
# Columns whose training values are restricted to the values that also occur
# in the test set (see the loop below).
weird_columns = [
    "cap-shape",
    "cap-surface",
    "cap-color",
    "gill-attachment",
    "gill-spacing",
    "gill-color",
    "veil-type",
    "veil-color",
    "has-ring",
    "ring-type",
    "spore-print-color",
    "habitat",
    "does-bruise-or-bleed",
    "stem-root",
    "stem-surface",
    "stem-color",
]

# For every listed column, replace each training value that never appears in
# the test set with NaN so it is handled as missing downstream.
# The test frame itself needs no masking: `allowed_vals` is built from
# X_test[col], so every test value is allowed by construction and masking it
# would only cost a full column scan without changing anything.
for col in weird_columns:
    allowed_vals = X_test[col].unique()
    X_train.loc[~X_train[col].isin(allowed_vals), col] = np.nan

In [4]:
# from sklearn.preprocessing import PolynomialFeatures

# # Creating polynomial and interaction features on training set
# poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
# interaction_terms_train = poly.fit_transform(train[["class", "cap-diameter"]])
# interaction_terms_test = poly.transform(test[["class", "cap-diameter"]])

# interaction_df_train = pd.DataFrame(
#     interaction_terms_train, columns=poly.get_feature_names_out(["size", "num_rooms"])
# )
# interaction_df_test = pd.DataFrame(
#     interaction_terms_test, columns=poly.get_feature_names_out(["size", "num_rooms"])
# )

# # Add the interaction terms
# X_train = pd.concat([train, interaction_df_train], axis=1)
# X_test = pd.concat([test, interaction_df_test], axis=1)

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    LabelBinarizer,
    MinMaxScaler,
)
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

target = "class"

# Split the feature columns by dtype: object columns go through the
# categorical branch, numeric columns through the numerical branch.
categorical_cols = list(X_train.select_dtypes(include="object").columns)
numerical_cols = list(X_train.select_dtypes(include="number").columns)

# Numerical branch: fill missing values with the column mean, then standardize.
numerical_steps = [
    ("num_imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
numerical_pipeline = Pipeline(numerical_steps)

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ("cat_imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_pipeline = Pipeline(categorical_steps)

# Route each group of columns to its branch.
preprocessor = ColumnTransformer(
    [
        ("num", numerical_pipeline, numerical_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ]
)

# Single-step wrapper pipeline: it contains only the preprocessor, no model.
preprocessing_pipeline = Pipeline([("preprocessor", preprocessor)])

# Fit the preprocessing on the training features and transform them.
# ColumnTransformer returns a SciPy sparse matrix only when the density of the
# stacked output is below its `sparse_threshold`; otherwise it already returns
# a dense ndarray, which has no `.toarray()`. Densify only when needed so the
# cell works in both cases.
X_train = preprocessing_pipeline.fit_transform(X_train)
if hasattr(X_train, "toarray"):
    X_train = X_train.toarray()
gc.collect()

# Binarize the target labels; `lb` is kept so the encoding can be inverted.
lb = LabelBinarizer()
y_train = lb.fit_transform(y_train)

gc.collect()

0

In [6]:
# Apply the already-fitted preprocessing to the test features.
# The transformer may return either a sparse matrix or a dense ndarray
# (depending on output density), so only call `.toarray()` when it exists.
X_test = preprocessing_pipeline.transform(X_test)
if hasattr(X_test, "toarray"):
    X_test = X_test.toarray()

: 

## Finalize X_train and X_test

In [7]:
import joblib as jl


def finalize_X(X, cols):
    """Wrap a feature matrix in a DataFrame and drop "None"-suffixed columns.

    Parameters
    ----------
    X : array-like or DataFrame
        Data handed to ``pd.DataFrame``.
    cols : sequence of str
        Column labels passed to ``pd.DataFrame(..., columns=cols)``.

    Returns
    -------
    pandas.DataFrame
        The frame restricted to the columns whose name does not end with
        ``"None"``, in their original order.

    NOTE(review): the suffix filter presumably targets one-hot columns that
    were generated for a ``None`` category -- confirm against the encoder's
    feature names.
    """
    frame = pd.DataFrame(X, columns=cols)
    kept = [name for name in frame.columns.to_list() if not name.endswith("None")]
    return frame[kept]


# Feature names produced by the fitted preprocessing pipeline; they become the
# column labels of the finalized frames.
cols = preprocessing_pipeline.get_feature_names_out()

# Finalize one matrix at a time so each raw array is released as soon as its
# DataFrame replaces it.
X_train = finalize_X(X_train, cols)
X_test = finalize_X(X_test, cols)

# Persist the prepared artifacts with joblib.
for artifact, filename in (
    (X_train, "X_train.pkl"),
    (y_train, "y_train.pkl"),
    (X_test, "X_test.pkl"),
):
    jl.dump(artifact, filename)

# Drop the training data from the kernel and reclaim the memory.
del X_train, y_train
gc.collect()

In [None]:
# X_test is already finalized with finalize_X and written to "X_test.pkl" by
# the previous cell. Running finalize_X again on the finalized frame only
# rebuilds an identical DataFrame and rewrites the same pickle, so the
# duplicate work has been removed from this cell.

In [None]:
# plt.figure(figsize=(16, 10))
# sns.heatmap(scaled_X_train, annot=True)
# plt.show()

In [None]:
# from sklearn.preprocessing import PolynomialFeatures

# numerical_cols
# poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
# interaction_terms_train = poly.fit_transform(X_train)

In [None]:
# interaction_df_train = pd.DataFrame(
#     interaction_terms_train, columns=poly.get_feature_names_out()
# )
# X_train = pd.concat([X_train, interaction_df_train], axis=1)
# gc.collect()