In [13]:
import os
import shutil
import subprocess
from collections import defaultdict
from functools import partial
from pathlib import Path

# Sub-modules and so on.
import numpy as np
import pandas as pd
import plotly.express as px
import scipy.stats as stats
from scipy.stats import probplot

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.ensemble import VotingClassifier
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

ON_KAGGLE = os.getenv("KAGGLE_KERNEL_RUN_TYPE") is not None
FONT_COLOR = "#010D36"
BACKGROUND_COLOR = "#F6F5F5"

# Utility functions.
def download_dataset_from_kaggle(user, dataset, directory):
    command = "kaggle datasets download -d "
    filepath = directory / (dataset + ".zip")

    if not filepath.is_file():
        subprocess.run((command + user + "/" + dataset).split())
        filepath.parent.mkdir(parents=True, exist_ok=True)
        shutil.unpack_archive(dataset + ".zip", "data")
        shutil.move(dataset + ".zip", "data")


def download_competition_from_kaggle(competition):
    command = "kaggle competitions download -c "
    filepath = Path("data/" + competition + ".zip")

    if not filepath.is_file():
        subprocess.run((command + competition).split())
        Path("data").mkdir(parents=True, exist_ok=True)
        shutil.unpack_archive(competition + ".zip", "data")
        shutil.move(competition + ".zip", "data")


In [2]:
competition = "icr-identify-age-related-conditions"

if not ON_KAGGLE:
    download_competition_from_kaggle(competition)
    train_path = "data/train.csv"
    test_path = "data/test.csv"
    greeks_path = "data/greeks.csv"
else:
    train_path = f"/kaggle/input/{competition}/train.csv"
    test_path = f"/kaggle/input/{competition}/test.csv"
    greeks_path = f"/kaggle/input/{competition}/greeks.csv"

train = pd.read_csv(train_path, index_col="Id").rename(columns=str.strip)
test = pd.read_csv(test_path, index_col="Id").rename(columns=str.strip)
greeks = pd.read_csv(greeks_path, index_col="Id").rename(columns=str.strip)


In [3]:
numeric_data = train.select_dtypes("number")
numeric_cols = numeric_data.drop("Class", axis=1).columns.tolist()
r2_scores = defaultdict(tuple)

for feature in numeric_cols:
    orig = train[feature].dropna()
    _, (*_, R_orig) = probplot(orig, rvalue=True)
    _, (*_, R_log) = probplot(np.log(orig), rvalue=True)
    _, (*_, R_sqrt) = probplot(np.sqrt(orig), rvalue=True)
    _, (*_, R_reci) = probplot(np.reciprocal(orig), rvalue=True)
    _, (*_, R_boxcox) = probplot(stats.boxcox(orig)[0], rvalue=True)
    _, (*_, R_yeojohn) = probplot(stats.yeojohnson(orig)[0], rvalue=True)
    r2_scores[feature] = (
        R_orig * R_orig,
        R_log * R_log,
        R_sqrt * R_sqrt,
        R_reci * R_reci,
        R_boxcox * R_boxcox,
        R_yeojohn * R_yeojohn,
    )

r2_scores = pd.DataFrame(
    r2_scores, index=("Original", "Log", "Sqrt", "Reciprocal", "BoxCox", "YeoJohnson")
).T

r2_scores["Winner"] = r2_scores.idxmax(axis=1)

no_transform_cols = r2_scores.query("Winner == 'Original'").index
log_transform_cols = r2_scores.query("Winner == 'Log'").index
reciprocal_transform_cols = r2_scores.query("Winner == 'Reciprocal'").index
boxcox_transform_cols = r2_scores.query("Winner == 'BoxCox'").index
yeojohnson_transform_cols = r2_scores.query("Winner == 'YeoJohnson'").index


In [4]:
numeric_descr = (
    train.drop("Class", axis=1)
    .describe(percentiles=[0.01, 0.05, 0.25, 0.50, 0.75, 0.95, 0.99])
    .drop("count")
    .T.rename(columns=str.title)
)

semi_constant_mask = np.isclose(numeric_descr["Min"], numeric_descr["50%"])
semi_constant_descr = numeric_descr[semi_constant_mask]
semi_const_cols_thresholds = semi_constant_descr["50%"].to_dict()


In [5]:
semi_const_cols = semi_const_cols_thresholds.keys()

no_transform_cols = no_transform_cols.drop(semi_const_cols, errors="ignore")
log_transform_cols = log_transform_cols.drop(semi_const_cols, errors="ignore")
reciprocal_transform_cols = reciprocal_transform_cols.drop(semi_const_cols, errors="ignore")
boxcox_transform_cols = boxcox_transform_cols.drop(semi_const_cols, errors="ignore")
yeojohnson_transform_cols = yeojohnson_transform_cols.drop(semi_const_cols, errors="ignore")

preliminary_transform_preprocess = make_pipeline(
    make_column_transformer(
        (
            StandardScaler(),
            no_transform_cols.to_list(),
        ),
        (
            make_pipeline(
                FunctionTransformer(func=np.log, feature_names_out="one-to-one"),
                StandardScaler(),
            ),
            log_transform_cols.to_list(),
        ),
        (
            make_pipeline(
                FunctionTransformer(func=np.reciprocal, feature_names_out="one-to-one"),
                StandardScaler(),
            ),
            reciprocal_transform_cols.to_list(),
        ),
        (
            PowerTransformer(method="box-cox", standardize=True),
            boxcox_transform_cols.to_list(),
        ),
        (
            PowerTransformer(method="yeo-johnson", standardize=True),
            yeojohnson_transform_cols.to_list(),
        ),
        (
            make_pipeline(
                SimpleImputer(strategy="most_frequent"),
                OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
            ),
            make_column_selector(dtype_include=object),  # type: ignore
        ),
        *[
            (
                make_pipeline(
                    SimpleImputer(strategy="median"),
                    Binarizer(threshold=thresh),
                ),
                [col],
            )
            for col, thresh in semi_const_cols_thresholds.items()
        ],
        remainder="drop",
        verbose_feature_names_out=False,
        n_jobs=4,
    ),
    KNNImputer(n_neighbors=10, weights="distance"),
).set_output(transform="pandas")


In [6]:
def balanced_log_loss(y_true, y_pred, **kwargs):
    """Competition evaluation metric - negative balanced logarithmic loss.
    The overall effect is such that each class is roughly equally
    important for the final score."""
    N0, N1 = np.bincount(y_true)

    y0 = np.where(y_true == 0, 1, 0)
    y1 = np.where(y_true == 1, 1, 0)

    eps = kwargs.get("eps", 1e-15)
    y_pred = np.clip(y_pred, eps, 1 - eps)
    p0 = np.log(1 - y_pred)
    p1 = np.log(y_pred)

    return -(1 / N0 * np.sum(y0 * p0) + 1 / N1 * np.sum(y1 * p1)) * 0.5


In [7]:
def get_undersampling_fraction(y_true):
    N0, N1 = np.bincount(y_true)
    return 1 - N1 / N0


def assert_balanced_learning(y_train, n_samples_tol=1):
    N0, N1 = np.bincount(y_train)
    assert np.isclose(N0, N1, atol=n_samples_tol)


def get_sample_weights(y_true):
    N0, N1 = np.bincount(y_true)
    y0, y1 = np.unique(y_true)
    w0 = (N0 + N1) / N0
    w1 = (N0 + N1) / N1
    return np.where(y_true == y1, w1, w0)


In [8]:
def perform_postprocessing(
    y_proba,
    rounding=True,
    rounding_prec=4,
    boosting=True,
    boosting_coef=0.8,
    shifting=True,
    shifting_map=None,
):
    def my_ceil(x, prec=rounding_prec):
        return np.true_divide(np.ceil(x * 10**prec), 10**prec)

    def my_floor(x, prec=rounding_prec):
        return np.true_divide(np.floor(x * 10**prec), 10**prec)

    proba = y_proba.copy()

    if rounding:
        proba = np.where(proba > 0.5, my_floor(proba), my_ceil(proba))

    if boosting:
        odds = boosting_coef * proba / (1 - proba)
        proba = odds / (1 + odds)

    if shifting:
        if not shifting_map:
            shifting_map = {"low": (0.01, 0.02), "high": (0.99, 0.98)}
        low_shift_from, low_shift_to = shifting_map.get("low", (0.01, 0.02))
        high_shift_from, high_shift_to = shifting_map.get("high", (0.99, 0.98))
        proba[proba < low_shift_from] = low_shift_to
        proba[proba > high_shift_from] = high_shift_to

    return proba


In [9]:
n_bags = 20
n_folds = 10

np.random.seed(42)
seeds = np.random.randint(0, 19937, size=n_bags)


In [10]:
X = train.drop("Class", axis=1)
y = train.Class

undersampling_frac = get_undersampling_fraction(y)
y_proba = np.zeros_like(y, dtype=np.float64)
results = defaultdict(np.float64)
classifiers = defaultdict(object)

lgbm_params = {
    "max_depth": 4,
    "num_leaves": 9,
    "min_child_samples": 17,
    "n_estimators": 200,
    "learning_rate": 0.15,
    "colsample_bytree": 0.4,
    "min_split_gain": 1e-4,
    "reg_alpha": 1e-2,
    "reg_lambda": 5e-3,
}

xgb_params = {
    "max_depth": 2,
    "n_estimators": 200,
    "learning_rate": 0.4,
    "subsample": 0.6,
    "min_child_weight": 0.1,
    "max_delta_step": 0.35,
    "colsample_bytree": 0.3,
    "colsample_bylevel": 0.7,
    "min_split_loss": 1e-4,
    "reg_alpha": 2e-3,
    "reg_lambda": 6e-2,
}

svc_params = {
    "probability": True,
    "C": 3,
}


In [11]:
for bag, seed in enumerate(seeds):
    skfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

    for fold, (train_ids, valid_ids) in enumerate(skfold.split(X, y)):
        y_train_full = y.iloc[train_ids]
        to_undersample_ids = (
            y_train_full[y_train_full == 0]
            .sample(frac=undersampling_frac, random_state=seed)
            .index.to_numpy()
        )
        # Skfold returns numbers, but `y` is a series with IDs, so we map them.
        to_undersample_ids = [y.index.get_loc(idx) for idx in to_undersample_ids]
        train_ids = np.setdiff1d(train_ids, to_undersample_ids)

        X_train, y_train = X.iloc[train_ids], y.iloc[train_ids]
        X_valid, y_valid = X.iloc[valid_ids], y.iloc[valid_ids]

        assert_balanced_learning(y_train)

        current_ensemble = make_pipeline(
            preliminary_transform_preprocess,
            VotingClassifier(
                [
                    ("lgbm", LGBMClassifier(random_state=seed, **lgbm_params)),
                    ("xgb", XGBClassifier(random_state=seed, **xgb_params)),
                    ("svc", SVC(random_state=seed, **svc_params)),
                ],
                voting="soft",
                weights=(0.45, 0.45, 0.10),
            ),
        ).fit(X_train, y_train)

        y_proba[valid_ids] += current_ensemble.predict_proba(X_valid)[:, 1]
        classifiers[f"Voting Bag: {bag} Fold: {fold}"] = current_ensemble

y_proba_rescaled = y_proba / n_bags
print("Balanced Log Loss:", f"{balanced_log_loss(y, y_proba_rescaled):.5f}")


Balanced Log Loss: 0.21630


In [14]:
y_proba_frame = pd.DataFrame(
    {
        "Sample Integer Index": np.arange(0, len(y)),
        "Positive Class Probability": y_proba_rescaled,
        "Class": y.values.astype(str),
    },
    index=y.index,
)

fig = px.scatter(
    y_proba_frame.reset_index(),
    x="Positive Class Probability",
    y="Sample Integer Index",
    symbol="Class",
    symbol_sequence=["diamond", "circle"],
    color="Class",
    color_discrete_sequence=["#010D36", "#FF2079"],
    category_orders={"Class": ("0", "1")},
    hover_data="Id",
    opacity=0.6,
    height=540,
    width=840,
    title="Training Dataset - Out of Fold Predictions",
)
fig.update_layout(
    font_color=FONT_COLOR,
    title_font_size=18,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        xanchor="right",
        y=1.05,
        x=1,
        title="Class",
        itemsizing="constant",
    ),
    xaxis_range=[-0.02, 1.02],
)
fig.update_traces(marker_size=6)
fig.show()

In [15]:
if np.all(np.isclose(test.select_dtypes("number").sum(), 0)):
    test_numeric_cols = test.select_dtypes("number").columns
    test[test_numeric_cols] += 1e-9

test_ids = test.index
y_test = np.zeros_like(test_ids)

for classifier in classifiers.values():
    y_test += classifier.predict_proba(test)[:, 1]

y_test_rescaled = y_test / len(classifiers)
y_test_postprocessed = perform_postprocessing(y_test_rescaled)


In [16]:
submission = pd.DataFrame(
    {
        "Id": test_ids,
        "class_0": 1 - y_test_rescaled,
        "class_1": y_test_rescaled,
    }
).set_index("Id")

submission.to_csv("submission.csv")
submission.head()


Unnamed: 0_level_0,class_0,class_1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
00eed32682bb,0.544229,0.455771
010ebe33f668,0.544229,0.455771
02fa521e1838,0.544229,0.455771
040e15f562a2,0.544229,0.455771
046e85c7cc7f,0.544229,0.455771
