In [1]:
# %load ../general_settings.py
import os
import shutil
import warnings
import subprocess
from time import strftime
from collections import defaultdict
from collections import namedtuple
from copy import copy
from functools import partial
from itertools import product
from itertools import combinations
from pathlib import Path

import numpy as np
import pandas as pd
import scipy.stats as stats
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from colorama import Fore
from colorama import Style
from IPython.core.display import HTML

import joblib
import optuna

from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform
from scipy.stats import gaussian_kde
from scipy.stats import probplot
from scipy.stats import yeojohnson
from sklearn.compose import make_column_transformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier

# True when the Kaggle kernel environment variable is present.
ON_KAGGLE = "KAGGLE_KERNEL_RUN_TYPE" in os.environ

# Colorama settings: every colour is combined with the bright style.
CLR = Style.BRIGHT + (Fore.BLACK if ON_KAGGLE else Fore.WHITE)
RED = Style.BRIGHT + Fore.RED
BLUE = Style.BRIGHT + Fore.BLUE
CYAN = Style.BRIGHT + Fore.CYAN
RESET = Style.RESET_ALL

# Shared colours.
FONT_COLOR = "#545454"
BACKGROUND_COLOR = "#F6F5F5"

# CSS rules in the {"selector", "props"} form consumed by `Styler.set_table_styles`.
# For row hover use <tr> instead of <td> in CELL_HOVER.
CELL_HOVER = dict(selector="td:hover", props="background-color: #F6F5F5")
TEXT_HIGHLIGHT = dict(selector="td", props="color: #545454; font-weight: bold")
INDEX_NAMES = dict(
    selector=".index_name",
    props="font-style: italic; background-color: #005D68; color: #F2F2F0;",
)
HEADERS = dict(
    selector="th:not(.index_name)",
    props="font-style: italic; background-color: #005D68; color: #F2F2F0;",
)
DF_STYLE = (INDEX_NAMES, HEADERS, TEXT_HIGHLIGHT)

# Utility functions.
def download_dataset_from_kaggle(user, dataset, directory):
    """Download and unpack a Kaggle dataset into ``directory`` unless it is cached.

    The dataset counts as cached when ``directory/<dataset>.zip`` already exists;
    in that case nothing is downloaded or unpacked.

    Parameters
    ----------
    user : str
        Kaggle user (owner) of the dataset.
    dataset : str
        Dataset slug; also the base name of the downloaded ``.zip`` archive.
    directory : str or pathlib.Path
        Folder that receives both the extracted files and the archive itself.

    Returns
    -------
    None
    """
    directory = Path(directory)
    archive_name = dataset + ".zip"
    filepath = directory / archive_name
    if filepath.is_file():
        return

    command = "kaggle datasets download -d "
    # The kaggle CLI drops the archive into the current working directory.
    subprocess.run((command + user + "/" + dataset).split())
    directory.mkdir(parents=True, exist_ok=True)
    # Unpack and store the archive in `directory` (not a hard-coded folder), so
    # that the cache check above finds it on the next call.
    shutil.unpack_archive(archive_name, str(directory))
    shutil.move(archive_name, str(filepath))


def download_competition_from_kaggle(competition):
    """Fetch a Kaggle competition archive into ``data/`` unless it is already there.

    Does nothing when ``data/<competition>.zip`` exists. Otherwise the kaggle CLI
    is invoked, the ``data`` folder is created if needed, and the archive is
    extracted into it and then moved there.
    """
    archive_name = competition + ".zip"
    if Path("data/" + archive_name).is_file():
        return

    cli_call = "kaggle competitions download -c " + competition
    subprocess.run(cli_call.split())
    Path("data").mkdir(parents=True, exist_ok=True)
    shutil.unpack_archive(archive_name, "data")
    shutil.move(archive_name, "data")


# Html `code` block highlight. The `HTML(...)` expression must remain the last
# statement of this cell (i.e. after all imports and settings) to be rendered.
CODE_HIGHLIGHT_CSS = """
<style>
code {
    background: rgba(42, 53, 125, 0.10) !important;
    border-radius: 4px !important;
}
</style>
"""
HTML(CODE_HIGHLIGHT_CSS)


In [136]:
competition = "playground-series-s3e18"

# On Kaggle the competition files are read from /kaggle/input; elsewhere they
# are downloaded (if not cached) into the local `data` folder.
if ON_KAGGLE:
    train_path = f"/kaggle/input/{competition}/train.csv"
    test_path = f"/kaggle/input/{competition}/test.csv"
else:
    download_competition_from_kaggle(competition)
    train_path = "data/train.csv"
    test_path = "data/test.csv"

# EC3-EC6 are skipped when reading the training file.
cols_to_skip = ["EC3", "EC4", "EC5", "EC6"]
train = pd.read_csv(
    train_path,
    index_col="id",
    usecols=lambda name: name not in cols_to_skip,
)
test = pd.read_csv(test_path, index_col="id")

# The original dataset keeps the six EC labels in one underscore-joined column:
# split it into one int32 column per label, then keep only the columns of `train`.
orig = pd.read_csv("data/mixed_desc.csv")
ec_labels = orig["EC1_EC2_EC3_EC4_EC5_EC6"].str.split("_", expand=True).astype(np.int32)
orig["EC1_EC2_EC3_EC4_EC5_EC6".split("_")] = ec_labels
orig = orig[train.columns]

# Feature groups are derived from the dtypes of the test frame (no target columns there).
continuous_variables = test.select_dtypes("float64").columns
discrete_variables = test.select_dtypes("int64").columns
targets = ["EC1", "EC2"]


In [137]:
# Stack the original dataset under the competition data (index is renumbered).
train = pd.concat([train, orig], ignore_index=True)

# Keep rows whose three fingerprint densities are all at least the test minimum.
density_cols = ["FpDensityMorgan1", "FpDensityMorgan2", "FpDensityMorgan3"]
outlier_mask_train = (train[density_cols] >= test[density_cols].min()).all(axis=1)
train = train.loc[outlier_mask_train]

# Drop rows that repeat the same feature values, then shuffle reproducibly.
feature_cols = [*continuous_variables, *discrete_variables]
train = train.drop_duplicates(subset=feature_cols)
train = train.sample(n=len(train), random_state=42)
train


Unnamed: 0,BertzCT,Chi1,Chi1n,Chi1v,Chi2n,Chi2v,Chi3v,Chi4n,EState_VSA1,EState_VSA2,...,PEOE_VSA7,PEOE_VSA8,SMR_VSA10,SMR_VSA5,SlogP_VSA3,VSA_EState9,fr_COO,fr_COO2,EC1,EC2
4076,633.324985,11.612591,7.945187,7.945187,5.788793,5.788793,3.978973,2.515318,38.969379,0.000000,...,19.386400,0.000000,7.822697,24.415866,13.825658,54.250000,0,0,0,1
6411,378.018438,8.417121,4.473678,5.081423,3.402408,4.402408,2.488006,1.770579,36.992053,0.000000,...,0.000000,0.000000,5.969305,36.809859,4.794537,51.833333,1,1,1,0
12744,150.255712,5.092224,3.392018,3.392018,2.634453,2.634453,1.890522,1.289775,35.898777,11.250838,...,0.000000,6.076020,13.792002,0.000000,13.825658,41.666667,1,1,0,1
10285,602.980459,9.719545,6.451852,6.451852,4.797545,4.797545,3.857360,2.135281,29.929472,6.420822,...,6.420822,12.132734,11.876485,36.809859,14.325937,54.833333,1,1,1,0
4786,231.985252,5.698377,2.934030,2.934030,2.043527,2.043527,1.313518,0.620060,12.011146,12.462662,...,0.000000,0.000000,5.969305,25.304306,4.794537,32.666667,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5192,522.541944,13.288313,8.487080,8.487080,5.297097,5.297097,3.241954,2.086839,23.887631,18.628754,...,0.000000,13.468494,17.845790,6.041841,19.062800,50.166667,1,1,0,1
13424,276.306871,5.036581,3.414884,3.414884,2.272429,2.272429,1.474040,0.895230,5.969305,11.312963,...,12.132734,0.000000,11.656692,0.000000,0.000000,39.166667,1,1,1,1
5391,430.682001,7.194306,4.173829,4.173829,3.117951,3.117951,2.219537,1.186201,11.570356,5.749512,...,24.117007,23.520590,16.872230,6.420822,16.009896,33.287361,0,0,1,1
860,126.490225,4.715214,3.281314,3.281314,2.438992,2.438992,1.634858,0.533757,0.000000,5.907180,...,0.000000,0.000000,6.410095,25.304306,4.794537,30.000000,0,0,0,1


In [139]:
# Summary statistics of the continuous features: one row per feature, one
# title-cased column per statistic (the "count" row is discarded).
percentiles = [0.01, 0.05, 0.25, 0.50, 0.75, 0.95, 0.99]
numeric_descr = (
    train[continuous_variables].describe(percentiles=percentiles).drop(index="count").transpose()
)
numeric_descr = numeric_descr.rename(columns=str.title)

In [140]:
def _probplot_r2(sample):
    """Return the squared correlation coefficient of ``probplot`` for ``sample``."""
    _, (*_, r_value) = probplot(sample, rvalue=True)
    return r_value * r_value


# For every continuous feature, score the raw values and three candidate
# transforms by the R^2 of their probability plot.
r2_scores = {}

for feature in continuous_variables:
    # NOTE: deliberately not named `orig` -- that name holds the original
    # dataset frame loaded earlier and must not be overwritten here.
    values = train[feature].dropna()

    if values.min() >= 0:
        r2_log = _probplot_r2(np.log1p(values))
        r2_sqrt = _probplot_r2(np.sqrt(values))
    else:
        # log1p / sqrt are only evaluated for features without negative values.
        r2_log, r2_sqrt = np.nan, np.nan

    r2_scores[feature] = (
        _probplot_r2(values),
        _probplot_r2(yeojohnson(values)[0]),
        r2_log,
        r2_sqrt,
    )

r2_scores_frame = pd.DataFrame(r2_scores, index=("Original", "YeoJohnson", "Log", "Sqrt")).T

# `Winner` is the best-scoring variant per feature; rows are ordered by their
# mean score (the helper column `m` is dropped again afterwards).
r2_scores_frame = (
    r2_scores_frame.assign(
        Winner=r2_scores_frame.idxmax(axis=1),
        m=r2_scores_frame.mean(axis=1),
    )
    .sort_values(by="m", ascending=False)
    .drop("m", axis=1)
)

yeojohnson_transform_cols = r2_scores_frame.query("Winner == 'YeoJohnson'").index
log_transform_cols = r2_scores_frame.query("Winner == 'Log'").index
sqrt_transform_cols = r2_scores_frame.query("Winner == 'Sqrt'").index

# Features whose minimum equals their median are treated as "semi-constant";
# the median is kept per feature as a threshold.
semi_constant_mask = np.isclose(numeric_descr["Min"], numeric_descr["50%"])
semi_constant_descr = numeric_descr[semi_constant_mask]
semi_const_cols_thresholds = semi_constant_descr["50%"].to_dict()

# Semi-constant features are removed from every transform group.
semi_const_cols = semi_const_cols_thresholds.keys()
yeojohnson_transform_cols = yeojohnson_transform_cols.drop(semi_const_cols, errors="ignore")
log_transform_cols = log_transform_cols.drop(semi_const_cols, errors="ignore")
sqrt_transform_cols = sqrt_transform_cols.drop(semi_const_cols, errors="ignore")


In [141]:
# Per-group preprocessing; columns not listed in any group are dropped.
log_pipeline = make_pipeline(
    FunctionTransformer(func=np.log1p, feature_names_out="one-to-one"),
    StandardScaler(),
)
sqrt_pipeline = make_pipeline(
    FunctionTransformer(func=np.sqrt, feature_names_out="one-to-one"),
    StandardScaler(),
)
yeojohnson_transformer = PowerTransformer(method="yeo-johnson", standardize=True)

# One binarizer per semi-constant column, using that column's stored threshold.
semi_const_binarizers = [
    (Binarizer(threshold=thresh), [col]) for col, thresh in semi_const_cols_thresholds.items()
]

preliminary_preprocess = make_column_transformer(
    (log_pipeline, log_transform_cols.to_list()),
    (sqrt_pipeline, sqrt_transform_cols.to_list()),
    (yeojohnson_transformer, yeojohnson_transform_cols.to_list()),
    (MinMaxScaler(), discrete_variables.to_list()),
    *semi_const_binarizers,
    remainder="drop",
    verbose_feature_names_out=False,
)
# Emit DataFrames so later steps can select columns by name.
preliminary_preprocess = preliminary_preprocess.set_output(transform="pandas")


In [142]:
# Columns replaced by their principal components (enough components to keep
# 95% of the variance); every other column is passed through unchanged.
pca_columns = [
    "BertzCT",
    "ExactMolWt",
    "HeavyAtomMolWt",
    "Chi1",
    "Chi1n",
    "Chi1v",
    "Chi2n",
    "Chi2v",
    "Chi3v",
    "Chi4n",
]

pca_preprocess = make_column_transformer(
    (PCA(n_components=0.95, random_state=42), pca_columns),
    remainder="passthrough",
    verbose_feature_names_out=False,
)
pca_preprocess = pca_preprocess.set_output(transform="pandas")


In [143]:
def logging_callback(study, frozen_trial):
    """Optuna callback that prints the best trial whenever the best value changes.

    The last reported best value is remembered in the study's user attribute
    ``"previous_best_value"``; nothing is printed while it stays the same.

    Parameters
    ----------
    study : optuna.study.Study
        Study being optimised.
    frozen_trial : optuna.trial.FrozenTrial
        Trial that has just finished. Unused: with parallel trials it is not
        necessarily the one that produced the new best value, so the report is
        taken from ``study.best_trial`` instead.
    """
    # Take one snapshot so number, value and params all belong to the same trial.
    best_trial = study.best_trial
    if study.user_attrs.get("previous_best_value", None) == best_trial.value:
        return

    study.set_user_attr("previous_best_value", best_trial.value)
    # Strip the pipeline-step prefix ("step__param" -> "param") for readability.
    params = {key.split("__")[-1]: value for key, value in best_trial.params.items()}
    print(
        f"{CLR}Optuna Trial: {RED}{best_trial.number:03} {CLR}- ",
        f"{CLR}Best Value: {RED}{best_trial.value:.5f}\n",
        f"{CLR}Best Params: {RED}{params}",
        sep="",
    )


In [144]:
def define_model(trial, seed=None):
    """Build the pipeline for one Optuna trial.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``SelectKBest`` and ``GradientBoostingClassifier``
        hyper-parameters. Parameter names carry the pipeline-step prefix so that
        ``study.best_params`` can be fed to ``Pipeline.set_params``.
    seed : int, optional
        Random state of the classifier; ``None`` falls back to 42. A seed of 0
        is honoured.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline: preprocessing, PCA, feature selection, classifier.
    """
    from sklearn.base import clone

    # `seed or 42` would silently replace a legitimate seed of 0.
    seed = 42 if seed is None else seed

    selector_params = {
        "k": trial.suggest_int("selectkbest__k", 8, 18, step=2),
    }

    clf_params = {
        # Single-choice categoricals record fixed settings in the trial params.
        "random_state": trial.suggest_categorical(
            "gradientboostingclassifier__random_state", [seed]
        ),
        "max_features": trial.suggest_categorical(
            "gradientboostingclassifier__max_features", ["sqrt"]
        ),
        "n_iter_no_change": trial.suggest_categorical(
            "gradientboostingclassifier__n_iter_no_change", [20]
        ),
        "min_samples_leaf": trial.suggest_int(
            "gradientboostingclassifier__min_samples_leaf", 32, 192, step=8
        ),
        "n_estimators": trial.suggest_int(
            "gradientboostingclassifier__n_estimators", 200, 500, step=100
        ),
        "learning_rate": trial.suggest_float(
            "gradientboostingclassifier__learning_rate", 1e-2, 2e-1
        ),
        "max_depth": trial.suggest_int("gradientboostingclassifier__max_depth", 3, 5, step=1),
        "subsample": trial.suggest_float(
            "gradientboostingclassifier__subsample", 0.5, 0.9, step=0.1
        ),
    }

    # Pipelines fit their steps in place, so every trial gets its own copies of
    # the shared preprocessors; otherwise trials running in parallel would
    # overwrite each other's fitted state. `set_output` is re-applied in case
    # the installed scikit-learn does not carry that setting through `clone`.
    return make_pipeline(
        clone(preliminary_preprocess).set_output(transform="pandas"),
        clone(pca_preprocess).set_output(transform="pandas"),
        SelectKBest(**selector_params),
        GradientBoostingClassifier(**clf_params),
    )


def objective(trial, X, y, seed=None):
    """Optuna objective: out-of-fold ROC AUC of the trial's pipeline.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial handed to ``define_model`` to sample hyper-parameters.
    X : pandas.DataFrame
        Feature frame (indexed positionally via ``.iloc``).
    y : pandas.Series
        Binary target aligned with ``X``.
    seed : int, optional
        Seed for the fold split and the model; ``None`` falls back to 42.
        A seed of 0 is honoured.

    Returns
    -------
    float
        ROC AUC computed once over the out-of-fold probabilities of all 5 folds.
    """
    # `seed or 42` would silently replace a legitimate seed of 0.
    seed = 42 if seed is None else seed
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    y_oof_proba = np.zeros(len(y), dtype=np.float32)
    model = define_model(trial, seed)

    for train_ids, valid_ids in skf.split(X, y):
        X_train, y_train = X.iloc[train_ids], y.iloc[train_ids]
        X_valid = X.iloc[valid_ids]
        model.fit(X_train, y_train)
        # Column 1 holds the probability of the positive class.
        y_oof_proba[valid_ids] = model.predict_proba(X_valid)[:, 1]
        # Pruning should be here if needed?

    return roc_auc_score(y, y_oof_proba)


def seed_study(seed, X, y, n_trials=100, n_jobs=-1):
    """Run one seeded Optuna study and return its best pipeline, score and trial table.

    Returns a tuple ``(best_model, best_value, study_frame)``: an unfitted
    pipeline configured with the study's best parameters, the best objective
    value rounded to 5 decimals, and the trials frame sorted by value.
    """
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=seed),
        pruner=optuna.pruners.HyperbandPruner(),  # Not used.
    )
    seeded_objective = partial(objective, X=X, y=y, seed=seed)  # type: ignore
    study.optimize(
        seeded_objective,
        n_trials=n_trials,
        callbacks=[logging_callback],
        n_jobs=n_jobs,
    )

    # Best-parameter keys are "<step>__<param>", which is what set_params expects.
    pipeline = make_pipeline(
        preliminary_preprocess, pca_preprocess, SelectKBest(), GradientBoostingClassifier()
    )
    best_model = pipeline.set_params(**study.best_params)

    best_value = np.round(study.best_value, 5)

    reported_attrs = ("number", "value", "params", "state")
    study_frame = study.trials_dataframe(attrs=reported_attrs)
    study_frame = study_frame.sort_values(by="value")

    return best_model, best_value, study_frame


In [145]:
# Seed the global NumPy generator so the study seeds drawn below are reproducible.
np.random.seed(42)

# One output folder per target for the pickled best models.
models_ec1_path = Path("models/ec1/")
models_ec2_path = Path("models/ec2/")
for models_dir in (models_ec1_path, models_ec2_path):
    models_dir.mkdir(parents=True, exist_ok=True)

n_seeds = 5
seeds = np.random.randint(0, 100, size=n_seeds).tolist()

# Features and the two binary targets.
X = train.drop(columns=targets)
y_ec1 = train["EC1"]
y_ec2 = train["EC2"]


In [None]:
# Restrict Optuna's own logger to ERROR-level messages; progress is reported by
# `logging_callback` instead.
optuna.logging.set_verbosity(optuna.logging.ERROR)


def search_for_best_model(X, y, seeds, models_path, n_trials=100, n_jobs=-1):
    """Run one Optuna study per seed and pickle each study's best pipeline.

    Parameters
    ----------
    X, y
        Training features and target, forwarded to ``seed_study``.
    seeds : iterable of int
        Seeds; one study is run for each.
    models_path : str or pathlib.Path
        Folder receiving one ``best_gb_seed_<seed>_run_<timestamp>_value_<score>.pkl``
        file per seed. Created if it does not exist.
    n_trials, n_jobs : int
        Forwarded to ``seed_study``.

    Returns
    -------
    dict
        Maps each seed to a ``Study(best_model, best_value, study_frame)`` tuple.
    """
    model_study = namedtuple("Study", ["best_model", "best_value", "study_frame"])
    # A plain dict: the namedtuple class cannot act as a zero-argument default factory.
    models = {}

    models_path = Path(models_path)
    models_path.mkdir(parents=True, exist_ok=True)

    for seed in seeds:
        print(CLR + "Seed:", seed)
        result = model_study(*seed_study(seed, X, y, n_trials=n_trials, n_jobs=n_jobs))
        models[seed] = result

        timestamp = strftime("run_%Y_%m_%d_%H_%M_%S")
        filename = f"best_gb_seed_{seed:03}_{timestamp}_value_{result.best_value:.5f}.pkl"
        joblib.dump(result.best_model, models_path / filename)
        print()

    return models


# Hyper-parameter search for the first target.
print(CLR + "● EC1 Optimization:\n")
models_ec1 = search_for_best_model(X=X, y=y_ec1, seeds=seeds, models_path=models_ec1_path)

# Hyper-parameter search for the second target.
print(CLR + "● EC2 Optimization:\n")
models_ec2 = search_for_best_model(X=X, y=y_ec2, seeds=seeds, models_path=models_ec2_path)


In [146]:
import glob

# Collect the pickled pipelines saved for each target. Only `*.pkl` files are
# considered, and paths are sorted so the load (and later averaging) order is
# the same on every run.
models_ec1_paths = sorted(glob.glob(str(models_ec1_path / "*.pkl")))
models_ec2_paths = sorted(glob.glob(str(models_ec2_path / "*.pkl")))

# NOTE: joblib.load unpickles its input and can therefore execute arbitrary
# code -- only load model files produced by this notebook / a trusted source.
loaded_models_ec1 = {fname: joblib.load(fname) for fname in models_ec1_paths}
loaded_models_ec2 = {fname: joblib.load(fname) for fname in models_ec2_paths}


In [147]:
def fit_and_predict_proba(X, y, models, X_test):
    """Fit every model on ``(X, y)`` and average their positive-class probabilities.

    Parameters
    ----------
    X, y
        Training features and target passed to each model's ``fit``.
    models : dict
        Mapping of any key to an estimator exposing ``fit`` and ``predict_proba``.
        The estimators are fitted in place.
    X_test : pandas.DataFrame
        Rows to predict; the result has one entry per row of its index.

    Returns
    -------
    numpy.ndarray
        float32 vector with the mean of ``predict_proba(X_test)[:, 1]`` over all models.

    Raises
    ------
    ValueError
        If ``models`` is empty (the average would otherwise be a silent NaN vector).
    """
    if not models:
        raise ValueError("`models` is empty: there is nothing to fit or average.")

    y_test = np.zeros(len(X_test.index), dtype=np.float32)

    for model in models.values():
        model.fit(X, y)
        # Column 1 holds the probability of the positive class.
        y_test += model.predict_proba(X_test)[:, 1]

    return y_test / len(models)


# Averaged test-set probabilities, one vector per target.
y_test_ec1 = fit_and_predict_proba(X=X, y=y_ec1, models=loaded_models_ec1, X_test=test)
y_test_ec2 = fit_and_predict_proba(X=X, y=y_ec2, models=loaded_models_ec2, X_test=test)


In [148]:
# Submission table: predicted probabilities per target, indexed by the test ids
# under the index name "Id".
submission = pd.DataFrame(
    {"EC1": y_test_ec1, "EC2": y_test_ec2},
    index=test.index.rename("Id"),
)

submission.to_csv("submission.csv")
submission.head().style.set_table_styles(DF_STYLE)

Unnamed: 0_level_0,EC1,EC2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
14838,0.416994,0.760645
14839,0.846555,0.829053
14840,0.777841,0.748106
14841,0.720877,0.798277
14842,0.799203,0.815376


In [160]:
from sklearn.ensemble import (
    BaggingClassifier,
    ExtraTreesClassifier,
    HistGradientBoostingClassifier,
    GradientBoostingClassifier,
)
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# LinearSVC lives in sklearn.svm; sklearn.calibration only happens to import it
# internally, which is not a public re-export to rely on.
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier


# Scratch pipeline for quick comparison of alternative classifiers; the
# commented-out lines are the other candidates that were tried.
model = make_pipeline(
    preliminary_preprocess,
    pca_preprocess,
    SelectKBest(k=16),
    # LGBMClassifier(random_state=42, max_depth=5, min_child_samples=64, colsample_bytree=0.2),
    # Nystroem(gamma=1e-2),
    # RandomForestClassifier(random_state=42, min_samples_leaf=64)
    Nystroem(),
    LinearSVC(C=2.1)
    # LogisticRegression(penalty="elasticnet", solver="saga", l1_ratio=0.5, C=2.1)
    # KNeighborsClassifier(1000, weights="distance")
    # GradientBoostingClassifier(
    #     min_samples_leaf=128,
    #     max_features="sqrt",
    #     n_estimators=500,
    #     n_iter_no_change=20,
    #     learning_rate=0.02,
    #     max_depth=5,
    #     random_state=42,
    #     subsample=0.5
    # )
    # XGBClassifier(max_depth=1),
)

# One splitter shared by both targets; the integer random_state makes every
# call to `split` produce the same shuffling.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores_ec1 = cross_val_score(model, X, y_ec1, cv=cv, scoring="roc_auc")
scores_ec2 = cross_val_score(model, X, y_ec2, cv=cv, scoring="roc_auc")

scores_ec1.mean(), scores_ec2.mean()
# (0.7069189117588829, 0.5906135076245935)


(0.7044966372452267, 0.5850340006620565)