In [1]:
import os
import shutil
import subprocess
from collections import defaultdict
from copy import copy
from itertools import product
from functools import reduce
from functools import partial
from pathlib import Path

# Sub-modules and so on.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import seaborn as sns
import scipy.stats as stats
from colorama import Fore, Style
from IPython.core.display import HTML
from IPython.display import display_html
from IPython.display import clear_output
from matplotlib.colors import Colormap
from plotly.subplots import make_subplots
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import squareform
from scipy.stats import gaussian_kde, probplot

from sklearn.preprocessing import PowerTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.manifold import TSNE
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn import set_config
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import NuSVC
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import learning_curve
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectFromModel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Binarizer, OrdinalEncoder
from sklearn.utils.validation import check_is_fitted
from sklearn.feature_selection import RFE, SelectPercentile, SequentialFeatureSelector
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import AffinityPropagation, KMeans, MeanShift, Birch
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.base import clone
from sklearn.feature_selection import SelectPercentile
from scipy.stats import randint, expon, reciprocal, uniform

# True when the KAGGLE_KERNEL_RUN_TYPE environment variable is set.
ON_KAGGLE = "KAGGLE_KERNEL_RUN_TYPE" in os.environ

# Colorama settings: every console colour is the bright variant; the default
# text colour is black on Kaggle and white elsewhere.
CLR = Style.BRIGHT + (Fore.BLACK if ON_KAGGLE else Fore.WHITE)
RED, BLUE, CYAN = (Style.BRIGHT + tint for tint in (Fore.RED, Fore.BLUE, Fore.CYAN))
RESET = Style.RESET_ALL

# Colors
DF_CMAP: Colormap = sns.light_palette("#8C92AC", as_cmap=True)  # type: ignore
FONT_COLOR = "#010D36"
BACKGROUND_COLOR = "#F6F5F5"

# CSS rules (selector + props) for styled DataFrames.
# For row hover use <tr> instead of <td> in the `cell_hover` selector.
cell_hover = dict(selector="td:hover", props="background-color: #F6F5F5")
text_highlight = dict(selector="td", props="color: #FF2079; font-weight: bold")
index_names = dict(
    selector=".index_name",
    props="font-style: italic; background-color: #010D36; color: #F2F2F0;",
)
headers = dict(
    selector="th:not(.index_name)",
    props="font-style: italic; background-color: #010D36; color: #F2F2F0;",
)
DF_STYLE = (index_names, headers, text_highlight)

# Matplotlib rc overrides handed to seaborn's theme.
MY_RC = {
    # Axes
    "axes.labelcolor": FONT_COLOR,
    "axes.labelsize": 10,
    "axes.labelpad": 15,
    "axes.labelweight": "bold",
    "axes.titlesize": 14,
    "axes.titleweight": "bold",
    "axes.titlepad": 15,
    "axes.facecolor": BACKGROUND_COLOR,
    # Ticks
    "xtick.labelsize": 10,
    "xtick.color": FONT_COLOR,
    "ytick.labelsize": 10,
    "ytick.color": FONT_COLOR,
    # Figure
    "figure.titlesize": 14,
    "figure.titleweight": "bold",
    "figure.facecolor": BACKGROUND_COLOR,
    "figure.edgecolor": BACKGROUND_COLOR,
    "figure.dpi": 72,  # Locally Seaborn uses 72, meanwhile Kaggle 96.
    # Text
    "font.size": 10,
    "font.family": "Serif",
    "text.color": FONT_COLOR,
}
sns.set_theme(rc=MY_RC)


# Utility functions.
def download_dataset_from_kaggle(user, dataset, directory):
    """Download and unpack a Kaggle dataset unless its archive is already present.

    Nothing happens when ``<directory>/<dataset>.zip`` already exists.
    Otherwise the Kaggle CLI is invoked, the archive is unpacked into
    ``directory`` and the archive itself is moved there, so the next call
    finds it and skips the download.

    Parameters
    ----------
    user : str
        Owner of the dataset on Kaggle.
    dataset : str
        Dataset slug; the archive is expected to be named ``<dataset>.zip``.
    directory : str or pathlib.Path
        Target directory for both the unpacked files and the archive.

    Raises
    ------
    subprocess.CalledProcessError
        If the Kaggle CLI exits with a non-zero status.
    """
    directory = Path(directory)
    archive_name = dataset + ".zip"
    filepath = directory / archive_name

    if filepath.is_file():
        return

    # An argument list (rather than splitting a command string) keeps names
    # containing whitespace intact; check=True surfaces CLI failures right
    # here instead of as a confusing unpack error below.
    subprocess.run(
        ["kaggle", "datasets", "download", "-d", f"{user}/{dataset}"],
        check=True,
    )

    # As in the original flow, the archive is assumed to have been written to
    # the current working directory by the CLI.
    directory.mkdir(parents=True, exist_ok=True)
    shutil.unpack_archive(archive_name, str(directory))
    shutil.move(archive_name, str(filepath))


def download_competition_from_kaggle(competition, directory="data"):
    """Download and unpack a Kaggle competition archive unless already present.

    Nothing happens when ``<directory>/<competition>.zip`` already exists.
    Otherwise the Kaggle CLI is invoked, the archive is unpacked into
    ``directory`` and the archive itself is moved there.

    Parameters
    ----------
    competition : str
        Competition slug; the archive is expected to be ``<competition>.zip``.
    directory : str or pathlib.Path, default "data"
        Target directory for both the unpacked files and the archive.

    Raises
    ------
    subprocess.CalledProcessError
        If the Kaggle CLI exits with a non-zero status.
    """
    directory = Path(directory)
    archive_name = competition + ".zip"
    filepath = directory / archive_name

    if filepath.is_file():
        return

    # check=True makes a failed download raise immediately instead of letting
    # the unpack step fail on a missing or partial archive.
    subprocess.run(
        ["kaggle", "competitions", "download", "-c", competition],
        check=True,
    )

    # As in the original flow, the archive is assumed to have been written to
    # the current working directory by the CLI.
    directory.mkdir(parents=True, exist_ok=True)
    shutil.unpack_archive(archive_name, str(directory))
    shutil.move(archive_name, str(filepath))


In [2]:
competition = "icr-identify-age-related-conditions"

# On Kaggle the files live under /kaggle/input; elsewhere they are fetched
# into the local "data" directory first.
if ON_KAGGLE:
    data_dir = f"/kaggle/input/{competition}"
else:
    download_competition_from_kaggle(competition)
    data_dir = "data"

train_path = f"{data_dir}/train.csv"
test_path = f"{data_dir}/test.csv"
greeks_path = f"{data_dir}/greeks.csv"

# Every table is indexed by "Id"; column names are stripped of surrounding
# whitespace.
train, test, greeks = (
    pd.read_csv(csv_path, index_col="Id").rename(columns=str.strip)
    for csv_path in (train_path, test_path, greeks_path)
)


In [3]:
# Summary statistics of every described feature (one row per feature, columns
# title-cased), without the "count" row and without the target column.
numeric_descr = (
    train.drop(columns="Class")
    .describe(percentiles=[0.01, 0.05, 0.25, 0.50, 0.75, 0.95, 0.99])
    .drop(index="count")
    .transpose()
    .rename(columns=str.title)
)

In [4]:
numeric_data = train.select_dtypes("number")
numeric_cols = numeric_data.drop("Class", axis=1).columns.tolist()

# Candidate transformations, keyed by the label used in the result table.
candidate_transforms = {
    "Original": lambda values: values,
    "Log": np.log,
    "Sqrt": np.sqrt,
    "Reciprocal": np.reciprocal,
    "BoxCox": lambda values: stats.boxcox(values)[0],
    "YeoJohnson": lambda values: stats.yeojohnson(values)[0],
}


def _probplot_r_squared(values):
    """Return the squared correlation coefficient reported by ``probplot``."""
    _, (*_, r_value) = probplot(values, rvalue=True)
    return r_value * r_value


# One row per feature, one column per candidate transformation; missing values
# are dropped before each probability-plot fit.
r2_scores = pd.DataFrame(
    {
        feature: tuple(
            _probplot_r_squared(transform(train[feature].dropna()))
            for transform in candidate_transforms.values()
        )
        for feature in numeric_cols
    },
    index=tuple(candidate_transforms),
).T

# The transformation with the highest R^2 for each feature.
r2_scores["Winner"] = r2_scores.idxmax(axis=1)


In [5]:
def _columns_won_by(label):
    """Return the index of features whose winning transformation is ``label``."""
    return r2_scores.index[(r2_scores["Winner"] == label).to_numpy()]


# Group the features by their winning transformation.
no_transform_cols = _columns_won_by("Original")
log_transform_cols = _columns_won_by("Log")
reciprocal_transform_cols = _columns_won_by("Reciprocal")
boxcox_transform_cols = _columns_won_by("BoxCox")
yeojohnson_transform_cols = _columns_won_by("YeoJohnson")


In [6]:
# A feature counts as "semi-constant" when its minimum is numerically equal to
# its median.
feature_minimum, feature_median = numeric_descr["Min"], numeric_descr["50%"]
semi_constant_mask = np.isclose(feature_minimum, feature_median)
semi_constant_descr = numeric_descr.loc[semi_constant_mask]

# Feature name -> median.
semi_const_cols_thresholds = semi_constant_descr["50%"].to_dict()


In [7]:
semi_const_cols = semi_const_cols_thresholds.keys()

# Remove the semi-constant features from every transformation group.
(
    no_transform_cols,
    log_transform_cols,
    reciprocal_transform_cols,
    boxcox_transform_cols,
    yeojohnson_transform_cols,
) = (
    group.drop(semi_const_cols, errors="ignore")
    for group in (
        no_transform_cols,
        log_transform_cols,
        reciprocal_transform_cols,
        boxcox_transform_cols,
        yeojohnson_transform_cols,
    )
)

redundant_variables = (
    "CW GI GL AZ BP BR CF CR CS "
    "CU DA DH GB GF DF DV EP GE"
).split()

In [8]:
# Features whose best probability-plot fit came from the square root. The
# candidate scoring includes "Sqrt", but no other group collects its winners,
# and with remainder="drop" below they would be silently discarded.
sqrt_transform_cols = r2_scores.query("Winner == 'Sqrt'").index.drop(
    list(semi_const_cols_thresholds), errors="ignore"
)

# Preprocessing pipeline:
#   1. per-group transformation + standardisation of the numeric features,
#   2. most-frequent imputation + ordinal encoding of object columns,
#   3. median imputation + binarisation (at the median) of each semi-constant
#      feature,
#   4. KNN imputation of whatever is still missing.
# Output is configured to be a pandas DataFrame with the original column names.
preliminary_transform_preprocess = make_pipeline(
    make_column_transformer(
        (
            StandardScaler(),
            no_transform_cols.to_list(),
        ),
        (
            make_pipeline(
                FunctionTransformer(func=np.log, feature_names_out="one-to-one"),
                StandardScaler(),
            ),
            log_transform_cols.to_list(),
        ),
        (
            make_pipeline(
                FunctionTransformer(func=np.sqrt, feature_names_out="one-to-one"),
                StandardScaler(),
            ),
            sqrt_transform_cols.to_list(),
        ),
        (
            make_pipeline(
                FunctionTransformer(func=np.reciprocal, feature_names_out="one-to-one"),
                StandardScaler(),
            ),
            reciprocal_transform_cols.to_list(),
        ),
        (
            PowerTransformer(method="box-cox", standardize=True),
            boxcox_transform_cols.to_list(),
        ),
        (
            PowerTransformer(method="yeo-johnson", standardize=True),
            yeojohnson_transform_cols.to_list(),
        ),
        (
            make_pipeline(
                SimpleImputer(strategy="most_frequent"),
                OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
            ),
            make_column_selector(dtype_include=object),  # type: ignore
        ),
        # One binariser per semi-constant feature, thresholded at its median.
        *[
            (
                make_pipeline(
                    SimpleImputer(strategy="median"),
                    Binarizer(threshold=thresh),
                ),
                [col],
            )
            for col, thresh in semi_const_cols_thresholds.items()
        ],
        remainder="drop",
        verbose_feature_names_out=False,
        n_jobs=4,
    ),
    KNNImputer(n_neighbors=10, weights="distance"),
).set_output(transform="pandas")


In [9]:
def balanced_log_loss(y_true, y_pred, **kwargs):
    """Balanced logarithmic loss for binary labels (lower is better).

    The mean negative log-likelihood is computed separately for each class
    and the two per-class means are averaged with equal weight, so that each
    class is equally important for the final score regardless of its size.
    The returned value is a non-negative loss.

    Parameters
    ----------
    y_true : array-like of {0, 1}
        Ground-truth labels (integer, boolean or float-encoded).
    y_pred : array-like of float
        Predicted probabilities of the positive class.
    **kwargs
        ``eps`` (default ``1e-15``): probabilities are clipped to
        ``[eps, 1 - eps]`` before taking logarithms.

    Returns
    -------
    float
        The balanced log loss.

    Raises
    ------
    ValueError
        If the inputs differ in length, if labels other than 0/1 are present,
        or if either class has no samples (the score is undefined then).
    """
    labels = np.ravel(np.asarray(y_true))
    probs = np.ravel(np.asarray(y_pred, dtype=np.float64))

    if labels.shape != probs.shape:
        raise ValueError(
            f"y_true and y_pred differ in length: {labels.size} vs {probs.size}"
        )
    if not np.isin(labels, (0, 1)).all():
        raise ValueError("y_true must contain only the labels 0 and 1")

    positive = labels == 1
    n_pos = int(positive.sum())
    n_neg = labels.size - n_pos
    if n_neg == 0 or n_pos == 0:
        raise ValueError(
            "balanced log loss is undefined unless both classes are present "
            f"(got {n_neg} negative and {n_pos} positive samples)"
        )

    eps = kwargs.get("eps", 1e-15)
    probs = np.clip(probs, eps, 1 - eps)

    loss_neg = -np.log(1 - probs[~positive]).sum() / n_neg
    loss_pos = -np.log(probs[positive]).sum() / n_pos
    return 0.5 * (loss_neg + loss_pos)


In [10]:
def get_undersampling_fraction(y_true):
    """Return ``1 - N1 / N0`` where ``N0``/``N1`` count the labels 0 and 1."""
    class_counts = np.bincount(y_true)
    n_neg, n_pos = class_counts
    pos_to_neg_ratio = n_pos / n_neg
    return 1 - pos_to_neg_ratio


def assert_balanced_learning(y_train, n_samples_tol=1):
    """Assert that the two class counts in ``y_train`` are close.

    ``n_samples_tol`` is passed as the absolute tolerance to ``np.isclose``.
    """
    n_neg, n_pos = np.bincount(y_train)
    counts_match = np.isclose(n_neg, n_pos, atol=n_samples_tol)
    assert counts_match


def get_sample_weights(y_true):
    """Return per-sample weights: ``N0 / N1`` for the larger label, 1.0 otherwise.

    ``N0`` and ``N1`` are the counts of the labels 0 and 1; the "larger label"
    is the second of the two unique values found in ``y_true``.
    """
    n_neg, n_pos = np.bincount(y_true)
    _, positive_label = np.unique(y_true)
    is_positive = y_true == positive_label
    return np.where(is_positive, n_neg / n_pos, 1.0)


def perform_postprocessing(
    y_proba,
    rounding=True,
    rounding_prec=4,
    boosting=True,
    boosting_coef=0.8,
    shifting=True,
    shifting_map=None,
):
    """Post-process predicted positive-class probabilities.

    The enabled steps are applied in this order, on a copy of the input:

    1. *rounding* - values above 0.5 are floored and the rest are ceiled to
       ``rounding_prec`` decimal places (i.e. rounded towards 0.5);
    2. *boosting* - the odds ``p / (1 - p)`` are multiplied by
       ``boosting_coef`` and converted back to a probability;
    3. *shifting* - values below ``low[0]`` are set to ``low[1]`` and values
       above ``high[0]`` are set to ``high[1]``.

    Parameters
    ----------
    y_proba : numpy.ndarray or pandas.Series
        Probabilities to process; must provide ``.copy()``.
    rounding, boosting, shifting : bool
        Switches for the individual steps.
    rounding_prec : int
        Number of decimal places kept by the rounding step.
    boosting_coef : float
        Multiplier applied to the odds.
    shifting_map : dict, optional
        ``{"low": (from, to), "high": (from, to)}``; a missing key falls back
        to ``(0.01, 0.02)`` / ``(0.99, 0.98)`` respectively.

    Returns
    -------
    Same container kind as produced by the enabled steps
        The processed probabilities; the input is not modified.
    """
    scale = 10**rounding_prec

    def my_ceil(x):
        return np.true_divide(np.ceil(x * scale), scale)

    def my_floor(x):
        return np.true_divide(np.floor(x * scale), scale)

    proba = y_proba.copy()

    if rounding:
        proba = np.where(proba > 0.5, my_floor(proba), my_ceil(proba))

    if boosting:
        # Algebraically identical to odds / (1 + odds) with
        # odds = c * p / (1 - p), but written without dividing by (1 - p):
        # the original form turned p == 1.0 into inf / inf = NaN, which the
        # shifting step below could not repair (NaN compares False).
        scaled = boosting_coef * proba
        proba = scaled / (scaled + (1 - proba))

    if shifting:
        shifting_map = shifting_map or {}
        low_shift_from, low_shift_to = shifting_map.get("low", (0.01, 0.02))
        high_shift_from, high_shift_to = shifting_map.get("high", (0.99, 0.98))
        proba[proba < low_shift_from] = low_shift_to
        proba[proba > high_shift_from] = high_shift_to

    return proba


# PLAYGROUND

In [11]:
def print_seed_results(seed, results_in, results_out):
    """Print one coloured line with mean and std of inner and outer CV scores."""
    inner_summary = f"{results_in.mean():.5f} \u00b1 {results_in.std():.5f}"
    outer_summary = f"{results_out.mean():.5f} \u00b1 {results_out.std():.5f}"
    separator = CLR + "-"

    pieces = [
        CLR + "Seed:",
        RED + f"{str(seed):3s}",
        separator,
        CLR + "Inner CV Score:",
        RED + inner_summary,
        separator,
        CLR + "Outer CV Score:",
        RED + outer_summary,
    ]
    print(" ".join(pieces))


def print_average_results(results_in_over_seeds, results_out_over_seeds):
    """Print the coloured mean and std of inner and outer scores over all seeds.

    The line is preceded and followed by a blank line.
    """
    inner_mean, inner_std = results_in_over_seeds.mean(), results_in_over_seeds.std()
    outer_mean, outer_std = results_out_over_seeds.mean(), results_out_over_seeds.std()

    pieces = [
        CLR + "\nAverage Inner CV Score:",
        RED + f"{inner_mean:.5f} \u00b1 {inner_std:.5f}",
        CLR + "-",
        CLR + "Average Outer CV Score:",
        RED + f"{outer_mean:.5f} \u00b1 {outer_std:.5f}\n",
    ]
    print(" ".join(pieces))


In [162]:
# Fix the global NumPy RNG so the CV seeds drawn below are reproducible.
np.random.seed(42)

n_folds, n_seeds = 5, 10

X, y = train.drop(columns="Class"), train["Class"]

# One integer seed per repetition of the cross-validation.
seeds = np.random.randint(0, 1000, size=n_seeds)

# Largest per-sample weight, later passed to the boosters as scale_pos_weight.
pos_weight = get_sample_weights(y).max()

In [241]:
# Features kept for modelling.
relevant_variables = (
    "AB AF BQ CC CD CH CR DE DL DN "
    "DU DY EB EE EL EU FI FR GL"
).split()

# XGBoost hyperparameters.
xgb_params = dict(
    max_depth=2,
    n_estimators=200,
    learning_rate=0.15,
    min_child_weight=9.5,
    colsample_bytree=0.8,
    colsample_bylevel=0.8,
    min_split_loss=1.0,
    reg_lambda=3.0,
    scale_pos_weight=pos_weight,
)

# LightGBM hyperparameters.
lgbm_params = dict(
    max_depth=3,
    min_child_samples=20,
    n_estimators=1000,
    learning_rate=9e-3,
    colsample_bytree=0.6,
    reg_lambda=0.3,
    subsample=0.8,
    subsample_freq=1,
    scale_pos_weight=pos_weight,
)


In [240]:
# Nested cross-validation sweep over LightGBM's `colsample_bytree`.
#
# For every candidate value and every seed:
#   * the outer stratified K-fold holds out a validation part,
#   * the inner stratified K-fold trains one model per fold on the remainder;
#     their out-of-fold predictions give the "inner" score,
#   * the inner models' averaged predictions on the held-out part give the
#     "outer" score.
X = train.drop("Class", axis=1)
y = train.Class

hyperparams = np.arange(0.6, 1.0, 0.1)
hp_plot_inn, hp_plot_out = defaultdict(tuple), defaultdict(tuple)

for hp in hyperparams:
    print(CLR + "HP:", RED + f"{hp}\n")
    results_inn_over_seeds = np.zeros((n_seeds, n_folds), dtype=np.float64)
    results_out_over_seeds = np.zeros((n_seeds, n_folds), dtype=np.float64)

    # `lgbm_params` already contains "colsample_bytree"; passing it again as a
    # keyword next to **lgbm_params raises "got multiple values for keyword
    # argument". Merging into a new dict lets the swept value win.
    lgbm_hp_params = {**lgbm_params, "colsample_bytree": hp}

    for n_seed, seed in enumerate(seeds):
        results_inn = np.empty(n_folds, dtype=np.float64)
        results_out = np.empty(n_folds, dtype=np.float64)
        skf_out = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

        # To evaluate XGBoost or a soft-voting ensemble instead, swap the last
        # step and rename the `lgbmclassifier__sample_weight` fit parameter
        # below to match the new step name.
        model_on_seed = make_pipeline(
            preliminary_transform_preprocess,
            FunctionTransformer(lambda X: X[relevant_variables]),
            LGBMClassifier(random_state=seed, bagging_seed=seed, **lgbm_hp_params),
        )

        for k_out, (train_ids_out, valid_ids_out) in enumerate(skf_out.split(X, y)):
            X_train_out, y_train_out = X.iloc[train_ids_out], y.iloc[train_ids_out]
            X_valid_out, y_valid_out = X.iloc[valid_ids_out], y.iloc[valid_ids_out]
            skf_inn = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
            models_inn = np.empty(n_folds, dtype=object)
            y_train_out_oof_proba = np.zeros_like(y_train_out, dtype=np.float64)

            # Distinct index names keep the inner split from shadowing the
            # outer one.
            for k_inn, (train_ids_inn, valid_ids_inn) in enumerate(
                skf_inn.split(X_train_out, y_train_out)
            ):
                X_train_inn = X_train_out.iloc[train_ids_inn]
                y_train_inn = y_train_out.iloc[train_ids_inn]
                X_valid_inn = X_train_out.iloc[valid_ids_inn]

                # NOTE(review): positives are up-weighted both through
                # `scale_pos_weight` in `lgbm_params` and through these sample
                # weights - confirm the double weighting is intended.
                model_inn = clone(model_on_seed).fit(
                    X_train_inn,
                    y_train_inn,
                    lgbmclassifier__sample_weight=get_sample_weights(y_train_inn),
                )

                y_train_out_oof_proba[valid_ids_inn] = model_inn.predict_proba(X_valid_inn)[:, 1]
                models_inn[k_inn] = model_inn

            # Average the inner models' predictions on the outer hold-out part.
            y_valid_out_proba = np.zeros_like(y_valid_out, dtype=np.float64)
            for model_inn in models_inn:
                y_valid_out_proba += model_inn.predict_proba(X_valid_out)[:, 1]

            results_inn[k_out] = balanced_log_loss(y_train_out, y_train_out_oof_proba)
            results_out[k_out] = balanced_log_loss(
                y_valid_out, y_valid_out_proba / len(models_inn)
            )

        results_inn_over_seeds[n_seed] = results_inn
        results_out_over_seeds[n_seed] = results_out
        print_seed_results(seed, results_inn, results_out)

    hp_plot_inn[hp] = (results_inn_over_seeds.mean(), results_inn_over_seeds.std())
    hp_plot_out[hp] = (results_out_over_seeds.mean(), results_out_over_seeds.std())
    print_average_results(results_inn_over_seeds, results_out_over_seeds)


[1m[37mHP: [1m[31m0.6

[1m[37mSeed: [1m[31m102 [1m[37m- [1m[37mInner CV Score: [1m[31m0.22570 ± 0.01919 [1m[37m- [1m[37mOuter CV Score: [1m[31m0.20632 ± 0.04612
[1m[37mSeed: [1m[31m435 [1m[37m- [1m[37mInner CV Score: [1m[31m0.21680 ± 0.01631 [1m[37m- [1m[37mOuter CV Score: [1m[31m0.22997 ± 0.05524
[1m[37mSeed: [1m[31m860 [1m[37m- [1m[37mInner CV Score: [1m[31m0.23185 ± 0.00768 [1m[37m- [1m[37mOuter CV Score: [1m[31m0.21251 ± 0.03239
[1m[37mSeed: [1m[31m270 [1m[37m- [1m[37mInner CV Score: [1m[31m0.22507 ± 0.02056 [1m[37m- [1m[37mOuter CV Score: [1m[31m0.20675 ± 0.05793
[1m[37mSeed: [1m[31m106 [1m[37m- [1m[37mInner CV Score: [1m[31m0.22448 ± 0.01053 [1m[37m- [1m[37mOuter CV Score: [1m[31m0.21539 ± 0.06088
[1m[37mSeed: [1m[31m71  [1m[37m- [1m[37mInner CV Score: [1m[31m0.22401 ± 0.02791 [1m[37m- [1m[37mOuter CV Score: [1m[31m0.21013 ± 0.03705
[1m[37mSeed: [1m[31m700 [1m[37m- [1m[37mI

In [None]:
# Ensemble: Average Inner CV Score: 0.22973 ± 0.01682 - Average Outer CV Score: 0.22068 ± 0.04711
# LGBM:     Average Inner CV Score: 0.22419 ± 0.01809 - Average Outer CV Score: 0.21594 ± 0.05395
# XGB:      Average Inner CV Score: 0.24249 ± 0.02462 - Average Outer CV Score: 0.22299 ± 0.06726


In [150]:
def get_relevant_variables(
    X_processed,
    y,
    seeds,
    xgb_params,
    lgbm_params,
    threshold=1.0,
    n_random_features=7,
):
    """Select features that beat pure-noise features in both boosters.

    Gaussian noise columns named ``RANDOM_1 .. RANDOM_n`` are appended to a
    copy of ``X_processed``. For every seed an ``LGBMClassifier`` and an
    ``XGBClassifier`` are fitted and their ``feature_importances_`` are
    accumulated. A feature is kept when, for *both* models, its summed
    importance is strictly greater than ``threshold`` times the summed
    importance of the best noise column.

    Parameters
    ----------
    X_processed : pandas.DataFrame
        Preprocessed feature matrix; it is not modified.
    y : array-like
        Target labels.
    seeds : iterable of int
        Random states, one pair of models is fitted per seed. Repeated seeds
        are counted once per occurrence.
    xgb_params, lgbm_params : dict
        Keyword arguments for the respective classifiers.
    threshold : float, default 1.0
        Multiplier applied to the best noise column's importance.
    n_random_features : int, default 7
        Number of noise columns to add (at least 1).

    Returns
    -------
    list of str
        Alphabetically sorted names of the selected features; the noise
        columns themselves are never returned.

    Raises
    ------
    ValueError
        If ``n_random_features`` is smaller than 1.
    """
    if n_random_features < 1:
        raise ValueError("n_random_features must be at least 1")

    X = X_processed.copy()

    # Noise columns are drawn from the global NumPy RNG, one draw per column.
    random_names = [f"RANDOM_{i}" for i in range(1, n_random_features + 1)]
    for name in random_names:
        X[name] = np.random.normal(size=len(X))

    feature_names = X.columns.to_list()

    # Accumulate in float64 arrays instead of growing DataFrames column by
    # column: avoids frame fragmentation and keeps duplicate seeds from
    # overwriting each other.
    lgbm_total = np.zeros(len(feature_names), dtype=np.float64)
    xgb_total = np.zeros(len(feature_names), dtype=np.float64)

    for seed in seeds:
        lgbm = LGBMClassifier(random_state=seed, **lgbm_params).fit(X, y)
        xgb = XGBClassifier(random_state=seed, **xgb_params).fit(X, y)
        lgbm_total += lgbm.feature_importances_
        xgb_total += xgb.feature_importances_

    selected_per_model = []
    for totals in (lgbm_total, xgb_total):
        importance = pd.Series(totals, index=feature_names)
        noise_ceiling = importance[random_names].max()
        # A boolean mask compares the numbers directly; formatting the cut-off
        # into a query string can round it and flip the strict comparison.
        winners = importance.index[importance > noise_ceiling * threshold]
        selected_per_model.append(set(winners) - set(random_names))

    lgbm_vars, xgb_vars = selected_per_model
    # Sorted so the result does not depend on set iteration order.
    return sorted(lgbm_vars & xgb_vars)


In [147]:
# Fix the global NumPy RNG before drawing the seeds below.
np.random.seed(23)

X = train.drop(columns="Class")
y = train["Class"]
X_processed = preliminary_transform_preprocess.fit_transform(X)
seeds = np.random.randint(0, 12345, size=100)

get_relevant_variables(
    X_processed,
    y,
    seeds,
    xgb_params=xgb_params,
    lgbm_params=lgbm_params,
)


In [271]:
# Bagging configuration: number of repetitions and folds per repetition.
n_bags, n_folds = 10, 5

# One K-fold seed per bag, drawn reproducibly.
np.random.seed(123)
seeds = np.random.randint(low=0, high=1234, size=n_bags)


In [272]:
# Bagged out-of-fold evaluation of the LGBM + XGB soft-voting ensemble.
# Every bag re-splits the data with its own seed; each sample is predicted
# exactly once per bag, so dividing the accumulated probabilities by `n_bags`
# yields the averaged out-of-fold prediction.
X = train.drop("Class", axis=1)
y = train.Class

y_proba = np.zeros_like(y, dtype=np.float64)
classifiers = defaultdict(object)

for bag, seed in enumerate(seeds):
    skfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

    for fold, (train_ids, valid_ids) in enumerate(skfold.split(X, y)):
        X_train, y_train = X.iloc[train_ids], y.iloc[train_ids]
        X_valid, y_valid = X.iloc[valid_ids], y.iloc[valid_ids]

        current_ensemble = make_pipeline(
            # Clone the preprocessing so every stored pipeline keeps its own
            # fitted state. Passing the shared instance would refit it in
            # place on every fold, leaving all entries of `classifiers` with
            # the preprocessing of the last fit only.
            clone(preliminary_transform_preprocess),
            FunctionTransformer(lambda X: X[relevant_variables]),
            VotingClassifier(
                [
                    ("lgbm", LGBMClassifier(random_state=seed, **lgbm_params)),
                    ("xgb", XGBClassifier(random_state=seed, **xgb_params)),
                ],
                voting="soft",
                weights=(0.5, 0.5),
            ),
        ).fit(
            X_train,
            y_train,
            # NOTE(review): positives are also up-weighted through
            # `scale_pos_weight` in both parameter dicts - confirm the double
            # weighting is intended.
            votingclassifier__sample_weight=get_sample_weights(y_train),
        )

        y_proba[valid_ids] += current_ensemble.predict_proba(X_valid)[:, 1]
        classifiers[f"Voting Bag: {bag} Fold: {fold}"] = current_ensemble

y_proba_rescaled = y_proba / n_bags
print("Balanced Log Loss:", f"{balanced_log_loss(y, y_proba_rescaled):.5f}")


Balanced Log Loss: 0.20011


In [273]:
# Out-of-fold predictions, one row per training sample (indexed like `y`).
oof_columns = {
    "Sample Integer Index": np.arange(len(y)),
    "Positive Class Probability": y_proba_rescaled,
    "Class": y.values.astype(str),
}
y_proba_frame = pd.DataFrame(oof_columns, index=y.index)

# Scatter of predicted probability against sample position, split by class.
scatter_options = dict(
    x="Positive Class Probability",
    y="Sample Integer Index",
    symbol="Class",
    symbol_sequence=["diamond", "circle"],
    color="Class",
    color_discrete_sequence=["#010D36", "#FF2079"],
    category_orders={"Class": ("0", "1")},
    hover_data="Id",
    opacity=0.6,
    height=540,
    width=840,
    title="Training Dataset - Out of Fold Predictions",
)
fig = px.scatter(y_proba_frame.reset_index(), **scatter_options)

# Horizontal legend anchored above the top-right corner of the plot.
legend_layout = {
    "orientation": "h",
    "yanchor": "bottom",
    "xanchor": "right",
    "y": 1.05,
    "x": 1,
    "title": "Class",
    "itemsizing": "constant",
}
fig.update_layout(
    font_color=FONT_COLOR,
    title_font_size=18,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
    legend=legend_layout,
    xaxis_range=[-0.02, 1.02],
)
fig.update_traces(marker_size=6)
fig.show()