In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the three CSV inputs used by the rest of the notebook.
train, test, submit = map(
    pd.read_csv, ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
from sklearn.model_selection import train_test_split

# Name of the label column.
target = "class"

# Hold out 20% of the rows in `train` for validation; the fixed seed keeps
# the split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    train.drop(columns=[target]),
    train[target],
    random_state=42,
    test_size=0.2,
)

# Test features are the test frame as loaded; y_test is taken from the
# target column of the sample submission frame.
X_test = test
y_test = submit[target]

In [None]:
# Derive both column lists from the same feature-only view so that neither
# the target column nor the `id` column can end up in a feature list.
# (Previously the categorical list kept `id` and the numerical list kept the
# target, depending on their dtypes.)
feature_frame = train.drop(columns=[target, "id"])
categorical_columns = feature_frame.select_dtypes(include="object").columns.to_list()
numerical_cols = feature_frame.select_dtypes(include="number").columns.to_list()

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. With handle_unknown="ignore", a category that was not
# seen during fit is encoded as all zeros instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)
# Combine the preprocessing steps into a ColumnTransformer.
# NOTE: only `categorical_columns` are routed through a transformer; with
# ColumnTransformer's default remainder="drop" every other input column
# (numeric ones included) is discarded from the model inputs.
preprocessor = ColumnTransformer(
    transformers=[("cat", categorical_transformer, categorical_columns)]
)

pipeline = Pipeline(steps=[("preprocessor", preprocessor)])
# Fit once on the training split only; the other splits are transformed with
# the categories learned here.
pipeline.fit(X_train)

# Column names produced by the fitted one-hot encoder, looked up once and
# shared by all three output frames.
encoded_feature_names = (
    pipeline.named_steps["preprocessor"]
    .named_transformers_["cat"]
    .named_steps["onehot"]
    .get_feature_names_out()
)


def _encode_dense(raw_features):
    """Transform `raw_features` with the fitted pipeline and return a dense array.

    ColumnTransformer returns a sparse matrix or a dense ndarray depending on
    the density of its output, so `.toarray()` is only called when available.
    """
    encoded = pipeline.transform(raw_features)
    return encoded.toarray() if hasattr(encoded, "toarray") else encoded


# Transform the training, testing and validation data
X_train_transformed = _encode_dense(X_train)
X_test_transformed = _encode_dense(X_test)
X_val_transformed = _encode_dense(X_val)

# Convert transformed features to DataFrames for easier handling.
# NOTE: this rebinds X_train / X_test / X_val to the encoded frames, so the
# cell cannot be re-run without first re-running the cell that creates the
# raw splits.
X_train = pd.DataFrame(X_train_transformed, columns=encoded_feature_names)
X_test = pd.DataFrame(X_test_transformed, columns=encoded_feature_names)
X_val = pd.DataFrame(X_val_transformed, columns=encoded_feature_names)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the class-to-integer mapping from the training labels, then apply the
# same mapping to every split.
label_encoder = LabelEncoder().fit(y_train)
y_train, y_test, y_val = (
    label_encoder.transform(split_labels)
    for split_labels in (y_train, y_test, y_val)
)

In [None]:
# Display the column names of the encoded training frame.
X_train.columns

In [None]:
import gc
import optuna
import lightgbm as lgb
from datetime import datetime, timezone
from sklearn.metrics import mean_absolute_error as mae
import warnings
from sklearn.metrics import r2_score


# Silence Python warnings for the rest of the kernel session so the tuning
# log stays readable.
warnings.filterwarnings("ignore")
# LightGBM objective -> validation metric tracked for that objective.
objective_mapping = {
    "regression": {"metric": "rmse"},
    "binary": {"metric": "binary_logloss"},
}
goal = "binary"
metric = objective_mapping[goal]["metric"]
# Fixed parameters shared by every trial; the tuned values from the best
# trial are merged into this dict once the study has finished.
best_params = {
    "objective": goal,
    "metric": metric,
    "boosting_type": "gbdt",
}
# Create LightGBM datasets. free_raw_data=False keeps the raw frames attached
# so the same Dataset objects can be reused across trials and by later cells.
train_data = lgb.Dataset(
    data=X_train,
    label=y_train,
    free_raw_data=False,
    params={"verbose": -1},
)
valid_data = lgb.Dataset(
    data=X_val,
    label=y_val,
    free_raw_data=False,
    params={"verbose": -1},
)


def objective(trial):
    """Train one LightGBM model with trial-suggested parameters.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        The best value of `metric` reached on the validation set, which the
        study minimizes.
    """
    # `reg_alpha` / `reg_lambda` are LightGBM aliases of `lambda_l1` /
    # `lambda_l2`; sampling both pairs tuned the same two parameters twice
    # with conflicting values, so only the primary names are searched.
    hyper_parameters = {
        **best_params,
        "lambda_l1": trial.suggest_float("lambda_l1", 0.01, 0.6),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.01, 0.6),
        "num_leaves": trial.suggest_int("num_leaves", 100, 256),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 0.9),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 0.9),
        "bagging_freq": trial.suggest_int("bagging_freq", 8, 16),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
        "max_depth": trial.suggest_int("max_depth", 2, 64),
    }

    # Up to 4000 rounds, stopping once the validation metric has not improved
    # for 100 consecutive rounds.
    model = lgb.train(
        params=hyper_parameters,
        train_set=train_data,
        valid_sets=[valid_data],
        num_boost_round=4000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=True),
            lgb.log_evaluation(period=50, show_stdv=True),
        ],
    )
    # Try to minimize the loss metric
    return model.best_score["valid_0"][metric]


# Create or load a study
# (`today` and `curr_timestamp` are not referenced in the visible cells; they
# are kept in case later cells use them.)
today = datetime.now(timezone.utc).strftime("%Y_%m_%d")
curr_timestamp = int(datetime.now(timezone.utc).timestamp())
study_name = "study"
# The study is persisted in a local SQLite file; with load_if_exists=True a
# re-run resumes the stored study and adds trials to it instead of starting over.
study = optuna.create_study(
    study_name=study_name,
    storage=f"sqlite:///{study_name}.db",
    direction="minimize",
    load_if_exists=True,
)
study.optimize(objective, n_trials=100, timeout=None, show_progress_bar=True)
# Print best trial
best_trial = study.best_trial
print("Best trial:")
print(f"  {metric}_error:", best_trial.value)
print("  Params: ")
for key, value in best_trial.params.items():
    print("    {}: {}".format(key, value))
# Merge the tuned values into the fixed parameters for the final training run.
study_best_params = study.best_params
best_params.update(study_best_params)
best_params

In [None]:
print("Training best model...")
# Retrain with the tuned parameters on the same datasets used during tuning.
# `categorical_feature` is intentionally not passed: the datasets hold the
# one-hot encoded columns, so the original names in `categorical_columns` do
# not exist there, and the tuning trials did not pass it either.
final_model = lgb.train(
    params=best_params,
    train_set=train_data,
    valid_sets=[valid_data],
    num_boost_round=4000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=100, verbose=True),
        lgb.log_evaluation(period=50, show_stdv=True),
    ],
)
# Per-feature importance scores from the trained booster.
feature_importances = final_model.feature_importance()
# Raw model output for the encoded test frame (thresholding, if any, is left
# to the consumer of `y_pred`).
y_pred = final_model.predict(X_test)

In [None]:
from sklearn.metrics import matthews_corrcoef

# `y_pred` holds continuous scores from the binary LightGBM model, while
# matthews_corrcoef needs discrete class labels, so threshold at 0.5 to get
# the 0/1 codes produced by the label encoder.
y_pred_labels = (np.asarray(y_pred) > 0.5).astype(int)
# NOTE(review): y_test is read from sample_submission.csv; if that file only
# holds placeholder labels this score is not a real test metric. Confirm, or
# score the validation split instead.
matthews_corrcoef(y_test, y_pred_labels)