In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from dataprep.eda import create_report

# Read the train and test splits from the local data directory.
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")

# Keep the training row count and the label so the stacked frame can be
# sliced back into its training part later on.
n_train = len(train_df)
y = train_df["target"]

# Stack both splits row-wise (original row labels are kept) and remove the
# label column from the combined frame.
all_data = pd.concat([train_df, test_df], axis=0).drop(columns="target")

all_data.head()

In [None]:
# Run dataprep's `create_report` on the training frame; as the last expression
# of the cell, whatever it returns is what the notebook displays.
create_report(train_df)

In [None]:
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(8, 6))

# One box plot per feature, grouped by target class. The 2x3 grid is filled
# row by row, so the feature order below fixes each panel's position.
for panel, feature in zip(ax.ravel(), ["gravity", "ph", "osmo", "cond", "urea", "calc"]):
    sns.boxplot(data=train_df, x="target", y=feature, ax=panel)
plt.tight_layout()

In [None]:
import lightgbm as lgbm
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold


# Fixed LightGBM settings shared by every training run in this notebook; the
# Optuna objective layers its sampled hyper-parameters on top of these.
lgbm_params = dict(
    random_state=42,
    objective="binary",
    metric="auc",
    n_estimators=1000,
    verbosity=-1,
    early_stopping_round=100,
)

# Feature matrix for training: the "id" column is removed and only the first
# n_train rows (the training part of the stacked frame) are kept.
X = all_data.drop(columns=["id"]).iloc[:n_train]


def objective(trial):
    """Optuna objective: mean cross-validated ROC AUC of a LightGBM booster.

    Draws one hyper-parameter set from ``trial``, layers it over the shared
    ``lgbm_params`` defaults and trains one booster per fold of an 8-fold
    stratified split of the notebook-level ``X`` / ``y``.

    Note: each held-out fold is passed to ``lgbm.train`` as a validation set
    (with ``early_stopping_round`` set in the params) and is also the data the
    fold is scored on, so the returned value may be optimistic.

    Args:
        trial: Optuna trial used to sample the hyper-parameter values.

    Returns:
        Mean of the per-fold ROC AUC scores.
    """
    params = {
        **lgbm_params,
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 50.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 50.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 5, 64),
        "max_depth": trial.suggest_int("max_depth", 1, 5),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 50),
    }
    skf = StratifiedKFold(n_splits=8, random_state=42, shuffle=True)
    metrics = []
    for train, test in skf.split(X, y):
        X_train = X.iloc[train]
        X_test = X.iloc[test]
        y_train = y.iloc[train]
        y_test = y.iloc[test]
        lgbm_train = lgbm.Dataset(X_train, y_train)
        lgbm_test = lgbm.Dataset(X_test, y_test)
        booster = lgbm.train(
            params,
            train_set=lgbm_train,
            valid_names=["train", "test"],
            valid_sets=[lgbm_train, lgbm_test],
            # The `verbose_eval` / `evals_result` keywords were removed from
            # lgbm.train in LightGBM 4.0; a log_evaluation callback with
            # period=0 keeps per-iteration logging switched off instead.
            callbacks=[lgbm.log_evaluation(period=0)],
        )
        pred = booster.predict(X_test)
        metrics.append(roc_auc_score(y_test, pred))
    return np.mean(metrics)


# Seed the sampler so that re-running the notebook explores the same sequence
# of trials; an unseeded study samples a different sequence on every run.
# TPESampler is the sampler type Optuna uses by default for this kind of study.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=250)


In [None]:
# Report the best objective value found by the study, rounded to 4 decimals.
print(f"best study ROC AUC: {study.best_value:.4f}")

In [None]:
from sklearn.model_selection import train_test_split


# Hold out a stratified 20% of the training rows; it is passed to LightGBM as
# a validation set and is also used for the ROC AUC printed below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

lgbm_train = lgbm.Dataset(X_train, y_train)
lgbm_test = lgbm.Dataset(X_test, y_test)

# Train with the shared defaults overridden by the best parameters from the study.
model = lgbm.train(
    {**lgbm_params, **study.best_params},
    train_set=lgbm_train,
    valid_names=["train", "test"],
    valid_sets=[lgbm_train, lgbm_test],
    # `verbose_eval` was removed from lgbm.train in LightGBM 4.0; a
    # log_evaluation callback with period=0 keeps iteration logging off.
    callbacks=[lgbm.log_evaluation(period=0)],
)
pred = model.predict(X_test)
roc = roc_auc_score(y_test, pred)
print(f"ROC AUC: {roc:.4f}")

In [None]:
# Score the test rows with the trained booster ("id" is not a feature).
pred = model.predict(test_df.drop(columns=["id"]))

# Fill the sample submission's "target" column with the predictions and save it.
submission = pd.read_csv("./data/sample_submission.csv").assign(target=pred)

submission.to_csv("./data/lgbm_submission.csv", index=False)