In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from dataprep.eda import create_report

# Read the two competition splits from the local data directory.
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")

# Keep the label column and the training row count so the stacked frame can be
# cut back to its training part with a positional slice later on.
n_train = train_df.shape[0]
y = train_df["target"]

# Stack train on top of test and discard the label from the combined frame.
all_data = pd.concat([train_df, test_df], axis=0).drop(columns="target")

all_data.head()

In [None]:
# Build dataprep's EDA report for the training frame; as the last expression
# of the cell, the returned report object is what the notebook displays.
create_report(train_df)

In [None]:
# One box plot per feature, grouped by target class, laid out on a 2x3 grid.
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(8, 6))

# Features are listed in row-major panel order (top row first).
feature_names = ["gravity", "ph", "osmo", "cond", "urea", "calc"]
for panel, feature in zip(ax.ravel(), feature_names):
    sns.boxplot(data=train_df, x="target", y=feature, ax=panel)

plt.tight_layout()

In [None]:
import lightgbm as lgbm
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold


# Fixed LightGBM settings; the Optuna objective layers its sampled
# hyperparameters on top of this dict.
lgbm_params = dict(
    random_state=42,
    objective="binary",
    metric="auc",
    n_estimators=1000,
    verbosity=-1,
    early_stopping_round=100,
)

# First n_train rows of the combined frame, without the "id" column.
X = all_data.iloc[:n_train].drop("id", axis=1)


def lgbm_objective(trial):
    """Optuna objective: mean 4-fold ROC AUC of a LightGBM booster.

    Hyperparameters sampled from ``trial`` are merged over the module-level
    ``lgbm_params``; the booster is trained on each stratified fold of the
    module-level ``X`` / ``y`` and scored on the held-out part.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ROC AUC over the four folds.
    """
    params = {
        **lgbm_params,
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 50.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 50.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 5, 64),
        "max_depth": trial.suggest_int("max_depth", 1, 5),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 50),
    }
    skf = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)
    metrics = []
    for train, test in skf.split(X, y):
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]
        lgbm_train = lgbm.Dataset(X_train, y_train)
        lgbm_test = lgbm.Dataset(X_test, y_test)
        model = lgbm.train(
            params,
            train_set=lgbm_train,
            valid_names=["train", "test"],
            valid_sets=[lgbm_train, lgbm_test],
            # `verbose_eval` / `evals_result` were removed from `train()` in
            # LightGBM 4.0; a log callback with period=0 keeps the
            # per-iteration evaluation output switched off on 3.3+ and 4.x.
            callbacks=[lgbm.log_evaluation(period=0)],
        )
        # NOTE(review): the fold that drives early stopping is also the fold
        # being scored, so this estimate is likely somewhat optimistic.
        pred = model.predict(X_test)
        metrics.append(roc_auc_score(y_test, pred))
    return np.mean(metrics)


# Seed the sampler so a Restart & Run All reproduces the same search; TPE is
# Optuna's default single-objective sampler, so only the seed is new.
lgbm_study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
lgbm_study.optimize(lgbm_objective, n_trials=250)

In [None]:
# Best objective value (mean cross-validated ROC AUC) found by the LightGBM study.
print(f"best LightGBM ROC AUC: {lgbm_study.best_value:.4f}")

In [None]:
import catboost as cb
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold


# Fixed CatBoost settings; the Optuna objective layers its sampled
# hyperparameters on top of this dict.
cb_params = dict(
    random_state=42,
    eval_metric="AUC",
    bagging_temperature=1.6,
    verbose=0,
)

# First n_train rows of the combined frame, without the "id" column.
X = all_data.iloc[:n_train].drop("id", axis=1)


def cb_objective(trial):
    """Optuna objective: mean 4-fold ROC AUC of a CatBoost classifier.

    Hyperparameters sampled from ``trial`` are merged over the module-level
    ``cb_params``; a classifier is fitted on each stratified fold of the
    module-level ``X`` / ``y`` with early stopping on the held-out part.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ROC AUC over the four folds.
    """
    params = {
        **cb_params,
        "iterations": trial.suggest_int("iterations", 100, 2000),
        "max_depth": trial.suggest_int("max_depth", 2, 6),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 0, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.2, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 2, 8),
    }
    skf = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)
    metrics = []
    for train, test in skf.split(X, y):
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]
        # A fresh estimator per fold keeps the folds visibly independent.
        model = cb.CatBoostClassifier(**params)
        model.fit(X_train, y_train, early_stopping_rounds=100, eval_set=[(X_test, y_test)])
        # ROC AUC is a ranking metric: score the positive-class probability.
        # Hard labels from `predict` collapse the ranking to two values.
        pred = model.predict_proba(X_test)[:, 1]
        metrics.append(roc_auc_score(y_test, pred))
    return np.mean(metrics)


# Seed the sampler so a Restart & Run All reproduces the same search; TPE is
# Optuna's default single-objective sampler, so only the seed is new.
cb_study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
cb_study.optimize(cb_objective, n_trials=250)

In [None]:
# Best objective value (mean cross-validated ROC AUC) found by the CatBoost study.
print(f"best Catboost study ROC AUC: {cb_study.best_value:.4f}")

In [None]:
import xgboost as xgb

# Fixed XGBoost settings; the Optuna objective layers its sampled
# hyperparameters on top of this dict.
xgb_params = dict(
    objective="binary:logistic",
    eval_metric="auc",
    random_state=42,
    verbosity=1,
)

# First n_train rows of the combined frame, without the "id" column.
X = all_data.iloc[:n_train].drop("id", axis=1)


def xgb_objective(trial):
    """Optuna objective: mean 4-fold ROC AUC of an XGBoost classifier.

    Hyperparameters sampled from ``trial`` are merged over the module-level
    ``xgb_params``; a classifier is fitted on each stratified fold of the
    module-level ``X`` / ``y`` and scored on the held-out part.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ROC AUC over the four folds.
    """
    params = {
        **xgb_params,
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 1.0),
        "eta": trial.suggest_float("eta", 1e-3, 0.25, log=True),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "max_depth": trial.suggest_int("max_depth", 2, 6),
        "reg_lambda": trial.suggest_float("reg_lambda", 2, 6),
        "reg_alpha": trial.suggest_float("reg_alpha", 2, 6),
        "n_estimators": trial.suggest_int("n_estimators", 200, 2000),
    }
    skf = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)
    metrics = []
    for train, test in skf.split(X, y):
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]
        # A fresh estimator per fold keeps the folds visibly independent.
        model = xgb.XGBClassifier(**params)
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
        # ROC AUC is a ranking metric: score the positive-class probability.
        # Hard labels from `predict` collapse the ranking to two values.
        pred = model.predict_proba(X_test)[:, 1]
        metrics.append(roc_auc_score(y_test, pred))
    return np.mean(metrics)


# Seed the sampler so a Restart & Run All reproduces the same search; TPE is
# Optuna's default single-objective sampler, so only the seed is new.
xgb_study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
xgb_study.optimize(xgb_objective, n_trials=250)

In [None]:
# Best objective value (mean cross-validated ROC AUC) found by the XGBoost study.
print(f"best XGBoost study ROC AUC: {xgb_study.best_value:.4f}")

In [None]:
from sklearn.model_selection import train_test_split


# Stratified 80/20 hold-out split; later cells reuse these four names.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

lgbm_train = lgbm.Dataset(X_train, y_train)
lgbm_test = lgbm.Dataset(X_test, y_test)
model = lgbm.train(
    {**lgbm_params, **lgbm_study.best_params},
    train_set=lgbm_train,
    valid_names=["train", "test"],
    valid_sets=[lgbm_train, lgbm_test],
    # `verbose_eval` was removed from `train()` in LightGBM 4.0; a log
    # callback with period=0 keeps the per-iteration evaluation output
    # switched off on 3.3+ and 4.x.
    callbacks=[lgbm.log_evaluation(period=0)],
)
pred = model.predict(X_test)
roc = roc_auc_score(y_test, pred)
print(f"LightGBM ROC AUC: {roc:.4f}")

In [None]:
# Fill the sample submission's "target" column with the current model's
# predictions on the test features (every test column except "id").
submission = pd.read_csv("./data/sample_submission.csv")
test_features = test_df.drop(columns=["id"])
pred = model.predict(test_features)
submission = submission.assign(target=pred)

submission.to_csv("./data/lgbm_submission.csv", index=False)

In [None]:
# Refit CatBoost with the best hyperparameters from the search on the hold-out split.
model = cb.CatBoostClassifier(**{**cb_params, **cb_study.best_params})
model.fit(X_train, y_train, early_stopping_rounds=100, eval_set=[(X_test, y_test)])
# ROC AUC needs a ranking score: use the positive-class probability, not the
# hard 0/1 labels returned by `predict`.
pred = model.predict_proba(X_test)[:, 1]
roc = roc_auc_score(y_test, pred)
print(f"Catboost ROC AUC: {roc:.4f}")

In [None]:
# The notebook scores every model with ROC AUC, so write the positive-class
# probability rather than hard 0/1 labels. The LightGBM submission already
# contains probabilities, because the booster's `predict` returns them.
pred = model.predict_proba(test_df.drop("id", axis=1))[:, 1]
submission["target"] = pred

submission.to_csv("./data/cb_submission.csv", index=False)

In [None]:
# Refit XGBoost with the best hyperparameters from the search on the hold-out split.
model = xgb.XGBClassifier(**{**xgb_params, **xgb_study.best_params})
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
# ROC AUC needs a ranking score: use the positive-class probability, not the
# hard 0/1 labels returned by `predict`.
pred = model.predict_proba(X_test)[:, 1]
roc = roc_auc_score(y_test, pred)
print(f"XGBoost ROC AUC: {roc:.4f}")

In [None]:
# The notebook scores every model with ROC AUC, so write the positive-class
# probability rather than hard 0/1 labels. The LightGBM submission already
# contains probabilities, because the booster's `predict` returns them.
pred = model.predict_proba(test_df.drop("id", axis=1))[:, 1]
submission["target"] = pred

submission.to_csv("./data/xgb_submission.csv", index=False)

In [None]:
class EnsembleModel:
    """Weighted blend of LightGBM, XGBoost and CatBoost binary classifiers.

    Args:
        lgbm_params: Parameter dict passed to ``lgbm.train``.
        xgb_params: Keyword arguments for ``xgb.XGBClassifier``.
        cb_params: Keyword arguments for ``cb.CatBoostClassifier``.
        weights: Blend weights in the order (LightGBM, XGBoost, CatBoost).
    """

    def __init__(self, lgbm_params, xgb_params, cb_params, weights: tuple[float, float, float]):
        self.lgbm_params = lgbm_params
        self.lgbm_model = None  # set by fit(); lgbm.train returns the booster
        self.xgb_model = xgb.XGBClassifier(**xgb_params)
        self.cb_model = cb.CatBoostClassifier(**cb_params)
        self.weights = weights

    def fit(self, X, y, X_val, y_val):
        """Fit all three models on ``(X, y)`` with ``(X_val, y_val)`` as the evaluation set.

        Returns:
            ``self``, so calls can be chained.
        """
        lgbm_train = lgbm.Dataset(X, y)
        lgbm_test = lgbm.Dataset(X_val, y_val)
        self.lgbm_model = lgbm.train(
            # Use the parameters given to the constructor rather than the
            # notebook-level globals, so the instance is self-contained.
            self.lgbm_params,
            train_set=lgbm_train,
            valid_names=["train", "test"],
            valid_sets=[lgbm_train, lgbm_test],
            # `verbose_eval` was removed from `train()` in LightGBM 4.0; a log
            # callback with period=0 keeps evaluation output switched off.
            callbacks=[lgbm.log_evaluation(period=0)],
        )
        self.xgb_model.fit(X, y, eval_set=[(X_val, y_val)], verbose=False)
        self.cb_model.fit(X, y, early_stopping_rounds=100, eval_set=[(X_val, y_val)])
        return self

    def predict(self, X):
        """Return the weighted sum of the three models' positive-class scores.

        The LightGBM booster's ``predict`` already yields probabilities for
        the binary objective. The two sklearn-style classifiers must use
        ``predict_proba``, because their ``predict`` returns hard labels and
        would put the blend on mixed scales.
        """
        lgbm_pred = self.lgbm_model.predict(X)
        xgb_pred = self.xgb_model.predict_proba(X)[:, 1]
        cb_pred = self.cb_model.predict_proba(X)[:, 1]
        pred = (
            self.weights[0] * lgbm_pred
            + self.weights[1] * xgb_pred
            + self.weights[2] * cb_pred
        )
        return pred

In [None]:
def drichlet_params(params, trial=None):
    """Normalize ``params`` into proportions that sum to one.

    Args:
        params: Sequence of numbers to normalize (iterated twice, so it must
            not be a one-shot iterator).
        trial: Optional object with a ``set_user_attr(key, value)`` method
            (e.g. an Optuna trial). When given, each proportion is recorded
            under the key ``p_<index>``.

    Returns:
        List where element ``i`` is ``params[i] / sum(params)``.
    """
    # The total is loop-invariant; compute it once instead of per element.
    total = sum(params)
    p = [x / total for x in params]
    if trial is not None:
        for i, pp in enumerate(p):
            trial.set_user_attr(f"p_{i}", pp)
    return p

# https://optuna.readthedocs.io/en/latest/faq.html#how-do-i-suggest-variables-which-represent-the-proportion-that-is-are-in-accordance-with-dirichlet-distribution
def ensemble_objective(trial):
    """Optuna objective: mean 8-fold ROC AUC of the weighted three-model blend.

    Three values ``x_0..x_2`` are sampled from [0, 1]; their negative logs are
    normalized by ``drichlet_params`` into the blend weights, which are also
    recorded on the trial as user attributes.

    Args:
        trial: Optuna trial used to sample the raw weight variables.

    Returns:
        Mean ROC AUC over the eight stratified folds of ``X`` / ``y``.
    """
    n_models = 3
    raw = [-np.log(trial.suggest_float(f"x_{idx}", 0, 1)) for idx in range(n_models)]
    weights = drichlet_params(raw, trial)

    folds = StratifiedKFold(n_splits=8, random_state=42, shuffle=True)
    ensemble = EnsembleModel(
        {**lgbm_params, **lgbm_study.best_params},
        {**xgb_params, **xgb_study.best_params},
        {**cb_params, **cb_study.best_params},
        weights=weights,
    )

    scores = []
    for fit_idx, val_idx in folds.split(X, y):
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        ensemble.fit(X.iloc[fit_idx], y.iloc[fit_idx], X_val, y_val)
        scores.append(roc_auc_score(y_val, ensemble.predict(X_val)))
    return np.mean(scores)

# Seed the sampler so a Restart & Run All reproduces the same search; TPE is
# Optuna's default single-objective sampler, so only the seed is new.
ensemble_study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
ensemble_study.optimize(ensemble_objective, n_trials=50)

In [None]:
# The study stores the raw x_i draws, but the objective blended with the
# normalized -log(x_i). Apply the same transform here so the weights match
# the ones that were optimised. Keys are read explicitly to fix the order.
weights = drichlet_params(
    [-np.log(ensemble_study.best_params[f"x_{i}"]) for i in range(3)]
)
ensemble = EnsembleModel(
    {**lgbm_params, **lgbm_study.best_params},
    {**xgb_params, **xgb_study.best_params},
    {**cb_params, **cb_study.best_params},
    weights=weights,
)

ensemble.fit(X_train, y_train, X_test, y_test)
# Score the ensemble itself, not the single model left over from an earlier cell.
pred = ensemble.predict(X_test)
roc = roc_auc_score(y_test, pred)
print(f"Ensemble ROC AUC: {roc:.4f}")

In [None]:
# Write the blended predictions for the test features (every test column
# except "id") into the submission frame and save it.
test_features = test_df.drop(columns=["id"])
pred = ensemble.predict(test_features)
submission = submission.assign(target=pred)

submission.to_csv("./data/ensemble_submission.csv", index=False)