<a href="https://www.kaggle.com/code/mmellinger66/s3e10-pulsar-models?scriptVersionId=122296735" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

 <div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Playground Season 3: Episode 10 - Pulsar Models</h1>
</div>

## Problem Type

Binary Classification

## Evaluation Metric

LogLoss

$$
LogLoss = \frac{1}{n} \sum_{i=1}^n [y_i log(\hat{y}_i) + (1 - y_i)log(1-\hat{y}_i)]
$$
```python
```

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Import Libraries</h1>
</div>

In [1]:
from typing import List, Set, Dict, Tuple, Optional

import os
import time
from pathlib import Path
import glob
import gc

import pandas as pd
import numpy as np

from sklearn import impute
from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import cluster
from sklearn import model_selection
from sklearn import ensemble
from sklearn import datasets

import xgboost as xgb
import catboost as cb
import lightgbm as lgb

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Visualization Libraries
import matplotlib as mpl
import matplotlib.pylab as plt
import seaborn as sns
import missingno as msno
from folium import Map
from folium.plugins import HeatMap
from IPython.display import display_html, display_markdown, display_latex
from colorama import Fore, Style

import warnings
warnings.filterwarnings('ignore')

pd.set_option("display.max_rows", 999)
pd.set_option("display.precision", 5)

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Configuration</h1>
</div>

In [2]:
TARGET="Class"
ID="id"

# Optuna
objective_direction = "maximize" # auc , minimize, maximize

In [3]:
class Config:
    path:str = "../input/playground-series-s3e10/"
    load_original_data:bool = True # Some Competitions use synthetic data, based on real data
    original_data_path:str = "../input/pulsar-classification-for-class-prediction/Pulsar.csv"
    ensemble_models:bool = False
    gpu:bool = False
    optimize:bool = True
    n_optuna_trials:int = 30 # 5, 10, 30
    fast_render:bool = False
    calc_probability:bool = False
    debug:bool = False
    seed:int = 42
    N_ESTIMATORS:int = 500  # 100, 300, 1000, 2000, 5000, 15_000, 20_000 GBDT
    GPU_N_ESTIMATORS:int = 2000 # Want models to run fast during dev
    N_FOLDS:int = 5
        

In [4]:
class clr:
    S = Style.BRIGHT + Fore.LIGHTRED_EX
    E = Style.RESET_ALL

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Library</h1>
</div>

In [5]:
def read_data(path: str, analyze:bool=True) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    data_dir = Path(path)

    train = pd.read_csv(data_dir / "train.csv")
    test = pd.read_csv(data_dir / "test.csv")
    submission_df = pd.read_csv(data_dir / "sample_submission.csv")

    if analyze:
        print(clr.S + "=== Shape of Data ==="+clr.E)
        print(f" train data: Rows={train.shape[0]}, Columns={train.shape[1]}")
        print(f" test data : Rows={test.shape[0]}, Columns={test.shape[1]}")

        print(clr.S + "\n=== Train Data: First 5 Rows ===\n"+clr.E)
        display(train.head())
        print(f"\n{clr.S}=== Train Column Names ==={clr.E}\n")
        display(train.columns)
        print(f"\n{clr.S}=== Features/Explanatory Variables ==={clr.E}\n")
        eval_features(train)
        print(f"\n{clr.S}=== Skewness ==={clr.E}\n")
        check_skew(train)
    return train, test, submission_df

def create_submission(model_name: str, target, preds, seed:int=42, nfolds:int=5) -> pd.DataFrame:
    sample_submission[target] = preds #.astype(int)

    if len(model_name) > 0:
        fname = f"submission_{model_name}_k{nfolds}_s{seed}.csv"
    else:
        fname = "submission.csv"

    sample_submission.to_csv(fname, index=False)

    return sample_submission

def show_classification_scores(ground_truth:List[int], yhat:List[int]) -> None:
    accuracy = metrics.accuracy_score(ground_truth, yhat)
    precision = metrics.precision_score(ground_truth, yhat)
    recall = metrics.recall_score(ground_truth, yhat)
    roc = metrics.roc_auc_score(ground_truth, yhat)
    f1 = metrics.f1_score(ground_truth, yhat)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"ROC: {roc:.4f}")
    print(f"f1: {f1:.4f}")
    

def label_encoder(train:pd.DataFrame, test:pd.DataFrame, columns:List[str]) -> (pd.DataFrame, pd.DataFrame) :
    for col in columns:
        train[col] = train[col].astype(str)
        test[col] = test[col].astype(str)
        train[col] = preprocessing.LabelEncoder().fit_transform(train[col])
        test[col] = preprocessing.LabelEncoder().fit_transform(test[col])
    return train, test   

def create_strat_folds(df:pd.DataFrame, TARGET, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    print(f"TARGET={TARGET}, n_folds={n_folds}, seed={seed}")
    df["fold"] = -1

    kf = model_selection.StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    # kf = GroupKFold(n_splits=Config.N_FOLDS)
    for fold, (train_idx, valid_idx) in enumerate(kf.split(df, df[TARGET])):
        df.loc[valid_idx, "fold"] = fold

    # df.to_csv(f"train_fold{num_folds}.csv", index=False)
    return df


def create_folds(df:pd.DataFrame, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    print(f"n_folds={n_folds}, seed={seed}")
    df["fold"] = -1

    kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=seed)

    for fold, (train_idx, valid_idx) in enumerate(kf.split(df)):
        df.loc[valid_idx, "fold"] = fold

    # df.to_csv(f"train_fold{num_folds}.csv", index=False)
    return df

def show_fold_scores(scores: List[float]) -> (float, float):
    cv_score = np.mean(scores)  # Used in filename
    std_dev = np.std(scores)
    print(
        f"Scores -> Adjusted: {np.mean(scores) - np.std(scores):.8f} , mean: {np.mean(scores):.8f}, std: {np.std(scores):.8f}"
    )
    return cv_score, std_dev


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    continuous_features = list(df.select_dtypes(include=['int64', 'float64', 'uint8']).columns)
    categorical_features = list(df.select_dtypes(include=['object', 'bool']).columns)
    if display:
        print(f"{clr.S}Continuous Features={continuous_features}{clr.E}\n")
        print(f"{clr.S}Categorical Features={categorical_features}{clr.E}")
    return continuous_features, categorical_features   

def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    print("=== Cardinality ===")
    print(df[features].nunique())

## === Model Support ===    

from scipy.stats import mode


def merge_test_predictions(final_test_predictions:List[float], calc_probability:bool=True) -> List[float]:

    if calc_probability:
        print("Mean")
        result = np.mean(np.column_stack(final_test_predictions), axis=1)
    else:
        print("Mode")
        mode_result = mode(np.column_stack(final_test_predictions), axis=1)
        result = mode_result[0].ravel()

    return result

def summary_statistics(X:pd.DataFrame, enhanced=True) -> None:
    desc = X.describe()
    if enhanced:
        desc.loc["var"] = X.var(numeric_only=True).tolist()
        desc.loc["skew"] = X.skew(numeric_only=True).tolist()
        desc.loc["kurt"] = X.kurtosis(numeric_only=True).tolist()

    with pd.option_context("display.precision", 2):
        style = desc.transpose().style.background_gradient(
            cmap="coolwarm"
        )  # .set_precision(4)
    display(style)
    
def show_missing_features(df:pd.DataFrame) -> None:
    missing_vals = df.isna().sum().sort_values(ascending=False)
    print(missing_vals[missing_vals > 0])


def show_duplicate_records(df:pd.DataFrame) -> None:
    dups = df.duplicated()
    print(dups.sum())


def eval_features(df:pd.DataFrame) -> (List[str], List[str], List[str]):
    ## Separate Categorical and Numerical Features
    categorical_features = list(
        df.select_dtypes(include=["category", "object"]).columns
    )
    continuous_features = list(df.select_dtypes(include=["number"]).columns)

    print(f"{clr.S}Continuous features:{clr.E} {continuous_features}")
    print(f"{clr.S}Categorical features:{clr.E} {categorical_features}")
    print("\n --- Cardinality of Categorical Features ---\n")

    for feature in categorical_features:
        cardinality = df[feature].nunique()
        if cardinality < 10:
            print(f"{clr.S}{feature}{clr.E}: cardinality={cardinality}, {df[feature].unique()}")
        else:
            print(f"{clr.S}{feature}{clr.E}: cardinality={cardinality}")
    all_features = categorical_features + continuous_features
    return all_features, categorical_features, continuous_features


def show_feature_importance(feature_importance_lst:List[str]) -> None:
    fis_df = pd.concat(feature_importance_lst, axis=1)

    fis_df.sort_values("0_importance", ascending=True).head(40).plot(
        kind="barh", figsize=(12, 12), title="Feature Importance Across Folds"
    )
    plt.show()


def show_feature_target_crosstab(df:pd.DataFrame, feature_lst:List[str], target:str) -> None:
    for feature in feature_lst:
        print(f"\n=== {feature} vs {target} ===\n")
        display(
            pd.crosstab(df[feature], df[target], margins=True)
        )  # display keeps bold formatting


def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    print(f"{clr.S}=== Cardinality ==={clr.E}")
    print(df[features].nunique())


def show_unique_features(df:pd.DataFrame, features:List[str]) -> None:
    for col in features:
        print(col, sorted(df[col].dropna().unique()))


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    continuous_features = list(
        df.select_dtypes(include=["int64", "float64", "uint8"]).columns
    )
    categorical_features = list(df.select_dtypes(include=["object", "bool"]).columns)
    if display:
        print(f"{clr.S}Continuous Features={clr.E}{continuous_features}\n")
        print(f"{clr.S}Categorical Features={clr.E}{categorical_features}")
    return continuous_features, categorical_features


def describe(X:pd.DataFrame) -> None:
    """Deprecated: Use summary_statistics()"""
    desc = X.describe()
    desc.loc['var'] = X.var(numeric_only=True).tolist()
    desc.loc['skew'] = X.skew(numeric_only=True).tolist()
    desc.loc['kurt'] = X.kurtosis(numeric_only=True).tolist()

    with pd.option_context('display.precision', 2):
        style = desc.transpose().style.background_gradient(cmap='coolwarm') #.set_precision(4)
    display(style)
  

def check_skew(df:pd.DataFrame) -> None:
    skew = df.skew(skipna=True,numeric_only=True).sort_values(ascending=False)
    print(skew)
    
def gpu_ify_lgbm(lgbm_dict):
    if Config.gpu:
        lgbm_dict["device"] = "gpu"
        lgbm_dict["boosting_type"] = "gbdt"
        lgbm_dict["gpu_platform_id"] = 0
        lgbm_dict["gpu_device_id"] = 0
    return lgbm_dict

def gpu_ify_cb(params):
    if Config.gpu:
        params["task_type"] = "GPU"
    return params    


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Optuna Hyperparameter Optimization Library</h1>
</div>

In [6]:
def objective_xgb(trial, X_train, X_valid, y_train, y_valid):

    xgb_params = {
        #         "objective": trial.suggest_categorical("objective", ["multi:softmax"]),
        #         "eval_metric": "mlogloss",
        #         "objective": "multi:softmax",
#         "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),

        "eval_metric": "rmse",  # auc, rmse, mae, logloss
        "objective": "reg:squarederror", # Normal Distribution
#         "objective": "reg:gamma", # Gamma Distribution

        #         "enable_categorical": trial.suggest_categorical("use_label_encoder", [True]),
        "use_label_encoder": trial.suggest_categorical("use_label_encoder", [False]),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 5000, 100),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "max_depth": trial.suggest_int("max_depth", 1, 20),  # 10
        "gamma": trial.suggest_float("gamma", 0, 100, step=0.1),
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "tree_method": trial.suggest_categorical(
            "tree_method", ["hist"]
        ),  # hist, gpu_hist
#         "predictor": "gpu_predictor",
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 100),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }

    # Model loading and training
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=5000,
        verbose=0,
    )

    print(f"Number of boosting rounds: {model.best_iteration}")
    #     oof = model.predict_proba(X_valid)[:, 1] # Probability
    oof = model.predict(X_valid)  # Classification: 0,1

    return metrics.mean_squared_error(y_valid, oof, squared=False)

def objective_clf_xgb(trial, X_train, X_valid, y_train, y_valid):

    xgb_params = {
        #         "objective": trial.suggest_categorical("objective", ["multi:softmax"]),
        #         "eval_metric": "mlogloss",
        #         "objective": "multi:softmax",
        "eval_metric": "auc",  # auc, rmse, mae
        "objective": "binary:logistic",
        #         "enable_categorical": trial.suggest_categorical("use_label_encoder", [True]),
        "use_label_encoder": trial.suggest_categorical("use_label_encoder", [False]),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 5000, 100),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "max_depth": trial.suggest_int("max_depth", 1, 20),  # 10
        "gamma": trial.suggest_float("gamma", 0, 100, step=0.1),
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "tree_method": trial.suggest_categorical(
            "tree_method", ["hist"]
        ),  # hist, gpu_hist
        #         "predictor": "gpu_predictor",
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 100),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }

    # Model loading and training
    model = xgb.XGBClassifier(**xgb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=5000,
        verbose=0,
    )

    print(f"Number of boosting rounds: {model.best_iteration}")
    #     oof = model.predict_proba(X_valid)[:, 1] # Probability
    oof = model.predict(X_valid)  # Classification: 0,1
    return metrics.roc_auc_score(y_valid, oof)

#     return metrics.accuracy_score(y_valid, oof)


def objective_lgbm(trial, X_train, X_valid, y_train, y_valid):

    lgbm_params = {
        "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [1_000]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [5000]),
        "n_estimators": trial.suggest_int("n_estimators", 700, 5000),
        "importance_type": "gain",
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 300),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        #         'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }

    # Model loading and training
    model = lgb.LGBMRegressor(**lgbm_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric="mae",
        callbacks=[
            lgb.log_evaluation(500),
            lgb.early_stopping(500, False, True),
        ],
    )

    #     print(f"Number of boosting rounds: {model.best_iteration}")
    oof = model.predict(X_valid)

    return metrics.mean_squared_error(y_valid, oof, squared=False)
#     return metrics.mean_absolute_error(y_valid, oof)


def objective_clf_lgbm(trial, X_train, X_valid, y_train, y_valid):

    params = {
        "boosting_type": "gbdt",
        # "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),
        #         "objective": trial.suggest_categorical("objective", ["multi:softprob"]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [1_000]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [5000]),
        "n_estimators": trial.suggest_int("n_estimators", 700, 1000),
        "importance_type": "gain",
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 300),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        #         'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }
    if Config.gpu:
        params["device_type"] = "gpu"

    # Model loading and training
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="mae",
        callbacks=[
            lgb.log_evaluation(500),
            lgb.early_stopping(500, False, True),
        ],
    )

    #     print(f"Number of boosting rounds: {model.best_iteration}")
    oof = model.predict(X_valid)

    #     return accuracy_score(y_valid, oof)
    return metrics.roc_auc_score(y_valid, oof)


def objective_cb(trial, X_train, X_valid, y_train, y_valid):

    cb_params = {
        "iterations": 100,
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.1, 1.0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 100),
        "bagging_temperature": trial.suggest_loguniform(
            "bagging_temperature", 0.1, 20.0
        ),
        "random_strength": trial.suggest_float("random_strength", 1.0, 2.0),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
          "use_best_model": True,
#         "task_type": "GPU",
        "random_seed": 42,
    }

    # Model loading and training
    #  model = CatBoostClassifier(**cb_params)
    model = cb.CatBoostRegressor(**cb_params)

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="accuracy",
        early_stopping_rounds=500,
        verbose=False,
    )

#     print(f"Number of boosting rounds: {model.best_iteration}")
    # oof = model.predict_proba(X_valid)[:, 1]
    oof = model.predict(X_valid)  # Classification
    return metrics.mean_squared_error(y_valid, oof, squared=False)
#     return metrics.mean_absolute_error(y_valid, oof)
# 
#     return accuracy_score(y_valid, oof)

def objective_clf_cb(trial, X_train, X_valid, y_train, y_valid):

    cb_params = {
        "iterations": 10,  # 1000
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.1, 1.0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 100),
        "bagging_temperature": trial.suggest_loguniform(
            "bagging_temperature", 0.1, 20.0
        ),
        "random_strength": trial.suggest_float("random_strength", 1.0, 2.0),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
        "use_best_model": True,
#             "task_type": "GPU",
        "random_seed": 42,
    }

    # Model loading and training
    model = cb.CatBoostClassifier(**cb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="accuracy",
        early_stopping_rounds=500,
        verbose=False,
    )

    # print(f"Number of boosting rounds: {model.best_iteration}")
    # oof = model.predict_proba(X_valid)[:, 1]
    oof = model.predict(X_valid)  # Classification
    return metrics.roc_auc_score(y_valid, oof)

#     return metrics.accuracy_score(y_valid, oof)

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Load Train/Test Data and Analyze</h1>
</div>

## Load the following files

 - train.csv - Data used to build our machine learning model
 - test.csv - Data used to build our machine learning model. Does not contain the target variable
 - sample_submission.csv - A file in the proper format to submit test predictions

In [7]:
%%time
train, test, sample_submission = read_data(Config.path, analyze=True)                                

[1m[91m=== Shape of Data ===[0m
 train data: Rows=117564, Columns=10
 test data : Rows=78377, Columns=9
[1m[91m
=== Train Data: First 5 Rows ===
[0m


Unnamed: 0,id,Mean_Integrated,SD,EK,Skewness,Mean_DMSNR_Curve,SD_DMSNR_Curve,EK_DMSNR_Curve,Skewness_DMSNR_Curve,Class
0,0,133.17188,59.71608,0.04313,-0.70338,54.91722,70.08444,0.7498,-0.64951,0
1,1,87.09375,36.25797,0.43547,2.26606,3.41722,21.86507,7.03933,52.68625,0
2,2,112.64062,39.81839,0.37964,0.92231,2.73077,15.68969,8.19347,85.64978,0
3,3,120.67969,45.91845,-0.09849,0.01178,2.69649,20.95466,8.18387,70.3329,0
4,4,134.07031,57.72011,-0.10777,-0.57334,1.10786,11.25505,16.10775,308.75377,0



[1m[91m=== Train Column Names ===[0m



Index(['id', 'Mean_Integrated', 'SD', 'EK', 'Skewness', 'Mean_DMSNR_Curve',
       'SD_DMSNR_Curve', 'EK_DMSNR_Curve', 'Skewness_DMSNR_Curve', 'Class'],
      dtype='object')


[1m[91m=== Features/Explanatory Variables ===[0m

[1m[91mContinuous features:[0m ['id', 'Mean_Integrated', 'SD', 'EK', 'Skewness', 'Mean_DMSNR_Curve', 'SD_DMSNR_Curve', 'EK_DMSNR_Curve', 'Skewness_DMSNR_Curve', 'Class']
[1m[91mCategorical features:[0m []

 --- Cardinality of Categorical Features ---


[1m[91m=== Skewness ===[0m

Skewness                4.39758
EK                      3.43500
Mean_DMSNR_Curve        3.42471
Class                   2.79694
Skewness_DMSNR_Curve    2.37403
SD_DMSNR_Curve          2.01034
id                      0.00000
EK_DMSNR_Curve         -0.04169
SD                     -0.52388
Mean_Integrated        -1.84135
dtype: float64
CPU times: user 289 ms, sys: 90.5 ms, total: 379 ms
Wall time: 678 ms


In [8]:
train.head()

Unnamed: 0,id,Mean_Integrated,SD,EK,Skewness,Mean_DMSNR_Curve,SD_DMSNR_Curve,EK_DMSNR_Curve,Skewness_DMSNR_Curve,Class
0,0,133.17188,59.71608,0.04313,-0.70338,54.91722,70.08444,0.7498,-0.64951,0
1,1,87.09375,36.25797,0.43547,2.26606,3.41722,21.86507,7.03933,52.68625,0
2,2,112.64062,39.81839,0.37964,0.92231,2.73077,15.68969,8.19347,85.64978,0
3,3,120.67969,45.91845,-0.09849,0.01178,2.69649,20.95466,8.18387,70.3329,0
4,4,134.07031,57.72011,-0.10777,-0.57334,1.10786,11.25505,16.10775,308.75377,0


In [9]:
def load_original_data(path:str) -> pd.DataFrame:
#     original = pd.read_csv(path, index_col=[0])
    original = pd.read_csv(path)

    original = original.reset_index()
    original['id'] = original['index'] + 100000
    original = original.drop(columns = ['index'])
    original = original.rename(columns = {'CementComponent ':'CementComponent'})
    original.set_index('id', inplace=True)
#     original = original[-original.depth.isna()]
    print(f"Shape={original.shape}")
    return original
#     original.head()

if Config.load_original_data:    
    original = load_original_data(Config.original_data_path)
    display(original.head())

Shape=(17898, 9)


Unnamed: 0_level_0,Mean_Integrated,SD,EK,Skewness,Mean_DMSNR_Curve,SD_DMSNR_Curve,EK_DMSNR_Curve,Skewness_DMSNR_Curve,Class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
100000,140.5625,55.68378,-0.23457,-0.69965,3.19983,19.11043,7.97553,74.24222,0
100001,102.50781,58.88243,0.46532,-0.51509,1.67726,14.86015,10.57649,127.39358,0
100002,103.01562,39.34165,0.32333,1.05116,3.12124,21.74467,7.73582,63.17191,0
100003,136.75,57.17845,-0.06841,-0.63624,3.64298,20.95928,6.8965,53.59366,0
100004,88.72656,40.67223,0.60087,1.12349,1.17893,11.46872,14.26957,252.56731,0


In [10]:
if Config.load_original_data:
    train['is_original']    = 0
    test['is_original']     = 0
    original['is_original'] = 1
#     combined = pd.concat([train, original], ignore_index=True) #.drop_duplicates()
    combined = pd.concat([train, original])

    train = combined
#     combined.head()
    print(f"Shape={combined.shape}")

Shape=(135462, 11)


In [11]:
summary_statistics(train.drop(columns=[ID], axis=1), enhanced=True)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,var,skew,kurt
Mean_Integrated,135462.0,111.23,25.01,5.81,104.03,116.5,126.35,192.62,625.32,-1.78,3.76
SD,135462.0,46.69,6.21,24.77,43.25,47.38,50.87,98.78,38.51,-0.41,0.96
EK,135462.0,0.5,1.12,-1.88,0.05,0.19,0.41,8.07,1.25,3.46,12.08
Skewness,135462.0,1.87,6.47,-1.79,-0.19,0.1,0.72,68.1,41.87,4.49,21.65
Mean_DMSNR_Curve,135462.0,12.05,27.1,0.21,2.07,2.81,4.23,223.39,734.44,3.48,12.6
SD_DMSNR_Curve,135462.0,26.21,19.97,7.37,14.92,18.22,24.99,110.64,398.7,2.0,2.95
EK_DMSNR_Curve,135462.0,8.07,3.94,-3.14,6.65,8.44,10.09,34.54,15.49,0.06,1.49
Skewness_DMSNR_Curve,135462.0,95.33,84.04,-1.98,47.65,83.39,124.13,1191.0,7061.88,2.54,13.06
Class,135462.0,0.09,0.29,0.0,0.0,0.0,0.0,1.0,0.08,2.8,5.85
is_original,135462.0,0.13,0.34,0.0,0.0,0.0,0.0,1.0,0.11,2.17,2.72


## Outlier Detection

In [12]:
# https://www.kaggle.com/code/lyasdemir/best-algorithm-for-prediction-xgboost
    
def iqr(data:pd.DataFrame, var:str):# outliers detecion .
    q1 = np.quantile(data[var], 0.25)
    q3 = np.quantile(data[var], 0.75)
    diff = q3 - q1
    lower_t = q1 - (1.5 * diff)
    upper_t = q3 + (1.5 * diff)
    return data[(data[var] < lower_t) | (data[var] > upper_t)]

# iqr(train, "squareMeters")

In [13]:
# https://www.kaggle.com/code/sujithmandala/playground-s3-e8-ensemble-model-98-accuracy

def detect_outliers(data:pd.DataFrame) -> pd.DataFrame:
    outlier_percents = {}
    for column in data.columns:
        if data[column].dtype != object:
            q1 = np.quantile(data[column], 0.25)
            q3 = np.quantile(data[column], 0.75)
            iqr = q3 - q1
            upper_bound = q3 + (1.5 * iqr)
            lower_bound = q1 - (1.5 * iqr)
            outliers = data[(data[column] > upper_bound) | (data[column] < lower_bound)][column]
            outlier_percentage = len(outliers) / len(data[column]) * 100
            outlier_percents[column] = outlier_percentage
            outlier_dataframe = pd.DataFrame(data = outlier_percents.values() ,index=outlier_percents.keys() ,columns=['Outlier_percentage'])
    
    return outlier_dataframe.sort_values(by = 'Outlier_percentage', ascending = False)

detect_outliers(train)


Unnamed: 0,Outlier_percentage
Mean_DMSNR_Curve,17.71124
SD_DMSNR_Curve,15.85389
is_original,13.21256
EK_DMSNR_Curve,12.13477
Skewness,11.21348
EK,9.44841
Class,9.30593
Mean_Integrated,7.53717
Skewness_DMSNR_Curve,4.96523
SD,2.89602


In [14]:
# https://www.kaggle.com/code/sujithmandala/playground-s3-e8-ensemble-model-98-accuracy
    
def detect_outliers(data:pd.DataFrame) -> pd.DataFrame:
    outlier_percents = {}
    for column in data.columns:
        if data[column].dtype != object:
            q1 = np.quantile(data[column], 0.25)
            q3 = np.quantile(data[column], 0.75)
            iqr = q3 - q1
            upper_bound = q3 + (1.5 * iqr)
            lower_bound = q1 - (1.5 * iqr)
            outliers = data[(data[column] > upper_bound) | (data[column] < lower_bound)][column]
            outlier_percentage = len(outliers) / len(data[column]) * 100
            outlier_percents[column] = outlier_percentage
            outlier_dataframe = pd.DataFrame(data = outlier_percents.values() ,index=outlier_percents.keys() ,columns=['Outlier_percentage'])
    
    return outlier_dataframe.sort_values(by = 'Outlier_percentage', ascending = False)

detect_outliers(test)


Unnamed: 0,Outlier_percentage
Mean_DMSNR_Curve,17.51662
SD_DMSNR_Curve,15.72655
EK_DMSNR_Curve,13.73234
Skewness,11.11168
EK,9.19275
Mean_Integrated,7.62979
Skewness_DMSNR_Curve,4.75395
SD,3.19609
id,0.0
is_original,0.0


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Feature Engineering</h1>
</div>

## Categorical/Numerical Variables

In [15]:
# train.drop(['cityCode'], axis=1, inplace=True)
# test.drop(['cityCode'], axis=1, inplace=True)


## Handle Outliers
- https://www.kaggle.com/code/lyasdemir/best-algorithm-for-prediction-xgboost
- https://www.kaggle.com/code/mnokno/paris-housing-price-prediction-using-xgboost

In [16]:
# features_with_outliers = ['attic', 'garage', 'made', 'basement', 'floors', 'cityCode', 'squareMeters']
# features_with_outliers = ['attic', 'garage', 'made', 'basement', 'floors',  'squareMeters']

In [17]:
# https://www.kaggle.com/code/mnokno/paris-housing-price-prediction-using-xgboost

def remove_outliers(df:pd.DataFrame) -> pd.DataFrame:
    for c in features_with_outliers:
        if c == 'garage':
            first_percentile = df[c].quantile(0.001)
            df = df[df[c] > first_percentile]

        ninety_ninth_percentile = df[c].quantile(0.999)
        df = df[df[c] < ninety_ninth_percentile]
        #df_t = df_t[(df_t[c] > first_percentile) & (df_t[c] < ninety_ninth_percentile)]
    return df


In [18]:
# print(f'Before: {len(train)}')
# train = remove_outliers(train)
# print(f'After: {len(train)}')

In [19]:
train.head(10)

Unnamed: 0,id,Mean_Integrated,SD,EK,Skewness,Mean_DMSNR_Curve,SD_DMSNR_Curve,EK_DMSNR_Curve,Skewness_DMSNR_Curve,Class,is_original
0,0.0,133.17188,59.71608,0.04313,-0.70338,54.91722,70.08444,0.7498,-0.64951,0,0
1,1.0,87.09375,36.25797,0.43547,2.26606,3.41722,21.86507,7.03933,52.68625,0,0
2,2.0,112.64062,39.81839,0.37964,0.92231,2.73077,15.68969,8.19347,85.64978,0,0
3,3.0,120.67969,45.91845,-0.09849,0.01178,2.69649,20.95466,8.18387,70.3329,0,0
4,4.0,134.07031,57.72011,-0.10777,-0.57334,1.10786,11.25505,16.10775,308.75377,0,0
5,5.0,131.63281,52.56321,-0.07525,-0.49583,2.19482,15.53743,9.03344,97.03241,0,0
6,6.0,110.9375,41.55695,0.31284,0.55902,1.96572,17.19147,10.39677,118.72427,0,0
7,7.0,120.20312,49.9279,-0.08999,-0.32137,3.2801,18.37684,8.19056,77.91724,0,0
8,8.0,112.41406,46.93987,0.28255,0.15178,3.33696,21.92953,7.69333,65.18628,0,0
9,9.0,99.85938,48.08919,0.69371,0.28166,3.41472,24.18191,7.95868,65.08458,0,0


In [20]:
train = train.reset_index(drop=True).copy()
train.head(10)

Unnamed: 0,id,Mean_Integrated,SD,EK,Skewness,Mean_DMSNR_Curve,SD_DMSNR_Curve,EK_DMSNR_Curve,Skewness_DMSNR_Curve,Class,is_original
0,0.0,133.17188,59.71608,0.04313,-0.70338,54.91722,70.08444,0.7498,-0.64951,0,0
1,1.0,87.09375,36.25797,0.43547,2.26606,3.41722,21.86507,7.03933,52.68625,0,0
2,2.0,112.64062,39.81839,0.37964,0.92231,2.73077,15.68969,8.19347,85.64978,0,0
3,3.0,120.67969,45.91845,-0.09849,0.01178,2.69649,20.95466,8.18387,70.3329,0,0
4,4.0,134.07031,57.72011,-0.10777,-0.57334,1.10786,11.25505,16.10775,308.75377,0,0
5,5.0,131.63281,52.56321,-0.07525,-0.49583,2.19482,15.53743,9.03344,97.03241,0,0
6,6.0,110.9375,41.55695,0.31284,0.55902,1.96572,17.19147,10.39677,118.72427,0,0
7,7.0,120.20312,49.9279,-0.08999,-0.32137,3.2801,18.37684,8.19056,77.91724,0,0
8,8.0,112.41406,46.93987,0.28255,0.15178,3.33696,21.92953,7.69333,65.18628,0,0
9,9.0,99.85938,48.08919,0.69371,0.28166,3.41472,24.18191,7.95868,65.08458,0,0


In [21]:
excluded_features = [TARGET, ID, "fold", "is_original"]

In [22]:
cont_features, cat_features = feature_distribution_types(train, display=True)
show_cardinality(train, cat_features)

cont_features = [feature for feature in cont_features if feature not in excluded_features]
cat_features = [feature for feature in cat_features if feature not in excluded_features]

FEATURES = cont_features + cat_features
FEATURES

[1m[91mContinuous Features=[0m['id', 'Mean_Integrated', 'SD', 'EK', 'Skewness', 'Mean_DMSNR_Curve', 'SD_DMSNR_Curve', 'EK_DMSNR_Curve', 'Skewness_DMSNR_Curve', 'Class', 'is_original']

[1m[91mCategorical Features=[0m[]
[1m[91m=== Cardinality ===[0m
Series([], dtype: float64)


['Mean_Integrated',
 'SD',
 'EK',
 'Skewness',
 'Mean_DMSNR_Curve',
 'SD_DMSNR_Curve',
 'EK_DMSNR_Curve',
 'Skewness_DMSNR_Curve']

In [23]:
train, test = label_encoder(train, test, cat_features)
# train = pd.get_dummies(train,columns=['cut','color','clarity']) # Will remove original feature names
# test = pd.get_dummies(test,columns=['cut','color','clarity'])

In [24]:
train.head()

Unnamed: 0,id,Mean_Integrated,SD,EK,Skewness,Mean_DMSNR_Curve,SD_DMSNR_Curve,EK_DMSNR_Curve,Skewness_DMSNR_Curve,Class,is_original
0,0.0,133.17188,59.71608,0.04313,-0.70338,54.91722,70.08444,0.7498,-0.64951,0,0
1,1.0,87.09375,36.25797,0.43547,2.26606,3.41722,21.86507,7.03933,52.68625,0,0
2,2.0,112.64062,39.81839,0.37964,0.92231,2.73077,15.68969,8.19347,85.64978,0,0
3,3.0,120.67969,45.91845,-0.09849,0.01178,2.69649,20.95466,8.18387,70.3329,0,0
4,4.0,134.07031,57.72011,-0.10777,-0.57334,1.10786,11.25505,16.10775,308.75377,0,0


In [25]:
# cont_features, cat_features = feature_distribution_types(train, display=True)
# show_cardinality(train, cat_features)

# cont_features = [feature for feature in cont_features if feature not in excluded_features]
# cat_features = [feature for feature in cat_features if feature not in excluded_features]

# FEATURES = cont_features + cat_features
# FEATURES

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Optuna Hyperparameter Optimization</h1>
</div>

In [26]:
%%time

study_name=objective_direction # Need better name

if Config.optimize:
    y = train[TARGET]
    X = train[FEATURES].copy()

    X_test = test[FEATURES].copy()
    X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
        X, y, test_size=0.2, random_state=Config.seed
    )

# === XGB ===

time_limit = 3600 * 3
best_xgb_params = {}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction, study_name=study_name)
    study.optimize(
        lambda trial: objective_clf_xgb(trial, X_train, X_valid, y_train, y_valid),
#         lambda trial: objective_xgb(trial, X_train, X_valid, y_train, y_valid),        
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best XGB trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_xgb_params = study.best_trial.params

## === LGBM ===

time_limit = 3600 * 3
best_lgbm_params = {}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction,study_name=study_name)
    study.optimize(
        lambda trial: objective_clf_lgbm(trial, X_train, X_valid, y_train, y_valid),
#         lambda trial: objective_lgbm(trial, X_train, X_valid, y_train, y_valid),        
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best LGBM trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_lgbm_params = study.best_trial.params

## === CatBoost

time_limit = 3600 * 3
# best_cb_params = {}
best_cb_params = {'learning_rate': 0.45743264601999495,
                  'l2_leaf_reg': 41.338946049390074,
                  'bagging_temperature': 0.3472567739474319,
                  'random_strength': 1.7332249677756242, 
                  'depth': 1,
                  'min_data_in_leaf': 6}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction,study_name=study_name)
    study.optimize(
        lambda trial: objective_clf_cb(trial, X_train, X_valid, y_train, y_valid),
#         lambda trial: objective_cb(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best Cat trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_cb_params = study.best_trial.params

[32m[I 2023-03-16 01:38:27,662][0m A new study created in memory with name: maximize[0m
[32m[I 2023-03-16 01:39:30,845][0m Trial 0 finished with value: 0.9548025861935564 and parameters: {'use_label_encoder': False, 'n_estimators': 3300, 'learning_rate': 0.040949770070104974, 'subsample': 0.7, 'colsample_bytree': 0.14, 'max_depth': 13, 'gamma': 30.700000000000003, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 2.236488162397603e-06, 'reg_alpha': 7.784125054746247e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 19.405041599388756}. Best is trial 0 with value: 0.9548025861935564.[0m


Number of boosting rounds: 3101


[32m[I 2023-03-16 01:40:26,941][0m Trial 1 finished with value: 0.9550898875923726 and parameters: {'use_label_encoder': False, 'n_estimators': 3900, 'learning_rate': 0.08618932272993717, 'subsample': 0.6, 'colsample_bytree': 0.64, 'max_depth': 14, 'gamma': 79.30000000000001, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 3.6069644076874915e-07, 'reg_alpha': 8.494014354443901e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 789.7960983585249}. Best is trial 1 with value: 0.9550898875923726.[0m


Number of boosting rounds: 46


[32m[I 2023-03-16 01:41:50,973][0m Trial 2 finished with value: 0.9557963541797533 and parameters: {'use_label_encoder': False, 'n_estimators': 4900, 'learning_rate': 0.032577126634422214, 'subsample': 0.9, 'colsample_bytree': 0.64, 'max_depth': 9, 'gamma': 29.200000000000003, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 5.101228036161281e-08, 'reg_alpha': 0.0009740356617162968, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 5.03987552226311}. Best is trial 2 with value: 0.9557963541797533.[0m


Number of boosting rounds: 151


[32m[I 2023-03-16 01:42:40,003][0m Trial 3 finished with value: 0.9571671745882367 and parameters: {'use_label_encoder': False, 'n_estimators': 3200, 'learning_rate': 0.24070091387522505, 'subsample': 0.38, 'colsample_bytree': 0.7300000000000001, 'max_depth': 4, 'gamma': 52.0, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 54.79880778878919, 'reg_alpha': 0.0189305996909529, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 2.885101636939554}. Best is trial 3 with value: 0.9571671745882367.[0m


Number of boosting rounds: 422


[32m[I 2023-03-16 01:43:04,789][0m Trial 4 finished with value: 0.9556538576403834 and parameters: {'use_label_encoder': False, 'n_estimators': 1400, 'learning_rate': 0.04821378920731908, 'subsample': 0.87, 'colsample_bytree': 0.47, 'max_depth': 18, 'gamma': 27.400000000000002, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 2.0561927688006582e-06, 'reg_alpha': 0.0009245285269493786, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 12.596277811902382}. Best is trial 3 with value: 0.9571671745882367.[0m


Number of boosting rounds: 116


[32m[I 2023-03-16 01:44:03,947][0m Trial 5 finished with value: 0.9560896716880376 and parameters: {'use_label_encoder': False, 'n_estimators': 3400, 'learning_rate': 0.05373358737061648, 'subsample': 0.77, 'colsample_bytree': 0.5700000000000001, 'max_depth': 10, 'gamma': 26.400000000000002, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 2.156462484861561e-08, 'reg_alpha': 1.5703881496076925e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 37.44450039767298}. Best is trial 3 with value: 0.9571671745882367.[0m


Number of boosting rounds: 104


[32m[I 2023-03-16 01:45:14,978][0m Trial 6 finished with value: 0.9571875312367182 and parameters: {'use_label_encoder': False, 'n_estimators': 4600, 'learning_rate': 0.03843874642370072, 'subsample': 0.21000000000000002, 'colsample_bytree': 0.8, 'max_depth': 8, 'gamma': 67.7, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.0014990715634319595, 'reg_alpha': 2.9000250566058656, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.14004653040877305}. Best is trial 6 with value: 0.9571875312367182.[0m


Number of boosting rounds: 321


[32m[I 2023-03-16 01:46:01,901][0m Trial 7 finished with value: 0.956336259789891 and parameters: {'use_label_encoder': False, 'n_estimators': 3000, 'learning_rate': 0.05130221987466495, 'subsample': 0.67, 'colsample_bytree': 0.7400000000000001, 'max_depth': 2, 'gamma': 91.5, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.01537935186156484, 'reg_alpha': 0.0018768798171275428, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 213.71969451921507}. Best is trial 6 with value: 0.9571875312367182.[0m


Number of boosting rounds: 102


[32m[I 2023-03-16 01:46:34,282][0m Trial 8 finished with value: 0.9578472684176677 and parameters: {'use_label_encoder': False, 'n_estimators': 1900, 'learning_rate': 0.03508388505382522, 'subsample': 0.69, 'colsample_bytree': 0.8300000000000001, 'max_depth': 12, 'gamma': 26.6, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 40.72593921761255, 'reg_alpha': 7.583332102262002e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.5018713287902626}. Best is trial 8 with value: 0.9578472684176677.[0m


Number of boosting rounds: 158


[32m[I 2023-03-16 01:47:39,975][0m Trial 9 finished with value: 0.956220136008471 and parameters: {'use_label_encoder': False, 'n_estimators': 3900, 'learning_rate': 0.151845865961519, 'subsample': 0.29000000000000004, 'colsample_bytree': 0.5800000000000001, 'max_depth': 19, 'gamma': 13.9, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 47.5298727308627, 'reg_alpha': 5.95757205816156, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.5993861972348558}. Best is trial 8 with value: 0.9578472684176677.[0m


Number of boosting rounds: 184


[32m[I 2023-03-16 01:48:16,990][0m Trial 10 finished with value: 0.9599999658979892 and parameters: {'use_label_encoder': False, 'n_estimators': 1500, 'learning_rate': 0.014003897975655858, 'subsample': 0.43000000000000005, 'colsample_bytree': 1.0, 'max_depth': 15, 'gamma': 1.6, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.4693248217023286, 'reg_alpha': 3.3352881402433605e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.8743570823342605}. Best is trial 10 with value: 0.9599999658979892.[0m


Number of boosting rounds: 438
Number of boosting rounds: 493


[32m[I 2023-03-16 01:48:57,547][0m Trial 11 finished with value: 0.9596455784442603 and parameters: {'use_label_encoder': False, 'n_estimators': 1400, 'learning_rate': 0.012105087696027823, 'subsample': 0.45000000000000007, 'colsample_bytree': 0.9800000000000001, 'max_depth': 15, 'gamma': 0.4, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.6210928454715843, 'reg_alpha': 6.697443394486857e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.7505844918999935}. Best is trial 10 with value: 0.9599999658979892.[0m
[32m[I 2023-03-16 01:49:18,072][0m Trial 12 finished with value: 0.9597617022256806 and parameters: {'use_label_encoder': False, 'n_estimators': 1000, 'learning_rate': 0.012312407575473177, 'subsample': 0.44000000000000006, 'colsample_bytree': 1.0, 'max_depth': 16, 'gamma': 4.0, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.4230498043572056, 'reg_alpha': 1.9590226312500565e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 1.6182845393

Number of boosting rounds: 875
Number of boosting rounds: 1243


[32m[I 2023-03-16 01:50:04,610][0m Trial 13 finished with value: 0.9603950666486806 and parameters: {'use_label_encoder': False, 'n_estimators': 2200, 'learning_rate': 0.010547824693046238, 'subsample': 0.5, 'colsample_bytree': 1.0, 'max_depth': 17, 'gamma': 4.2, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.16410061993553737, 'reg_alpha': 1.0867984740957526e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 1.949215958678347}. Best is trial 13 with value: 0.9603950666486806.[0m
[32m[I 2023-03-16 01:50:42,070][0m Trial 14 finished with value: 0.953713051074421 and parameters: {'use_label_encoder': False, 'n_estimators': 2200, 'learning_rate': 0.017598630892559048, 'subsample': 0.14, 'colsample_bytree': 0.39, 'max_depth': 20, 'gamma': 48.2, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.10797778175911647, 'reg_alpha': 2.7055145893075464e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.13403677947184847}. Best is trial 13 with value: 0.96

Number of boosting rounds: 723


[32m[I 2023-03-16 01:51:25,905][0m Trial 15 finished with value: 0.9588757335913591 and parameters: {'use_label_encoder': False, 'n_estimators': 2300, 'learning_rate': 0.018684433538339786, 'subsample': 0.54, 'colsample_bytree': 0.91, 'max_depth': 16, 'gamma': 11.700000000000001, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.00014337120352723163, 'reg_alpha': 1.5872646928654388e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 4.470216116845156}. Best is trial 13 with value: 0.9603950666486806.[0m


Number of boosting rounds: 1917


[32m[I 2023-03-16 01:52:12,983][0m Trial 16 finished with value: 0.9539106014497666 and parameters: {'use_label_encoder': False, 'n_estimators': 2500, 'learning_rate': 0.01104325566388776, 'subsample': 0.33, 'colsample_bytree': 0.26, 'max_depth': 17, 'gamma': 42.900000000000006, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 3.080267494904122, 'reg_alpha': 8.912563369013665e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 1.5281149269969063}. Best is trial 13 with value: 0.9603950666486806.[0m


Number of boosting rounds: 640


[32m[I 2023-03-16 01:52:45,052][0m Trial 17 finished with value: 0.9581669586839018 and parameters: {'use_label_encoder': False, 'n_estimators': 1700, 'learning_rate': 0.02040521855885901, 'subsample': 0.51, 'colsample_bytree': 0.8700000000000001, 'max_depth': 7, 'gamma': 14.0, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.013712439964114231, 'reg_alpha': 1.8977970100838803e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.2944721287228742}. Best is trial 13 with value: 0.9603950666486806.[0m


Number of boosting rounds: 1569
Number of boosting rounds: 884


[32m[I 2023-03-16 01:53:50,777][0m Trial 18 finished with value: 0.959625221795779 and parameters: {'use_label_encoder': False, 'n_estimators': 2500, 'learning_rate': 0.010328293204229885, 'subsample': 0.25, 'colsample_bytree': 1.0, 'max_depth': 12, 'gamma': 0.0, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 2.943665125167577, 'reg_alpha': 1.2840138602819719e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 1.4753517401382412}. Best is trial 13 with value: 0.9603950666486806.[0m
[32m[I 2023-03-16 01:54:09,121][0m Trial 19 finished with value: 0.955212027483261 and parameters: {'use_label_encoder': False, 'n_estimators': 1000, 'learning_rate': 0.024219639709301053, 'subsample': 0.11, 'colsample_bytree': 0.35, 'max_depth': 20, 'gamma': 15.9, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.0001862204218724332, 'reg_alpha': 5.655595919940415e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.3025964901994424}. Best is trial 13 with value: 0.960

Number of boosting rounds: 536


[32m[I 2023-03-16 01:54:42,765][0m Trial 20 finished with value: 0.9518800438603335 and parameters: {'use_label_encoder': False, 'n_estimators': 1800, 'learning_rate': 0.014783803966914345, 'subsample': 0.4, 'colsample_bytree': 0.060000000000000005, 'max_depth': 6, 'gamma': 64.5, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.03658897734439527, 'reg_alpha': 1.439391886142664e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 6.992456348739655}. Best is trial 13 with value: 0.9603950666486806.[0m


Number of boosting rounds: 1795


[32m[I 2023-03-16 01:55:03,592][0m Trial 21 finished with value: 0.9594480280689146 and parameters: {'use_label_encoder': False, 'n_estimators': 1000, 'learning_rate': 0.014051595969504359, 'subsample': 0.44000000000000006, 'colsample_bytree': 0.9500000000000001, 'max_depth': 16, 'gamma': 5.800000000000001, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.35336996993697617, 'reg_alpha': 1.3356968511646906e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 1.7430946288191567}. Best is trial 13 with value: 0.9603950666486806.[0m


Number of boosting rounds: 882


[32m[I 2023-03-16 01:55:31,366][0m Trial 22 finished with value: 0.9579286950115932 and parameters: {'use_label_encoder': False, 'n_estimators': 1400, 'learning_rate': 0.010213980679445463, 'subsample': 0.59, 'colsample_bytree': 0.9, 'max_depth': 18, 'gamma': 7.7, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 2.15520380475113, 'reg_alpha': 1.7587152336580578e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 2.1802239141802158}. Best is trial 13 with value: 0.9603950666486806.[0m


Number of boosting rounds: 1319


[32m[I 2023-03-16 01:56:08,365][0m Trial 23 finished with value: 0.958718896512976 and parameters: {'use_label_encoder': False, 'n_estimators': 2000, 'learning_rate': 0.014332448553100206, 'subsample': 0.49, 'colsample_bytree': 1.0, 'max_depth': 14, 'gamma': 16.5, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.21022438269238014, 'reg_alpha': 1.013762427483726e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.949626502832397}. Best is trial 13 with value: 0.9603950666486806.[0m


Number of boosting rounds: 1937


[32m[I 2023-03-16 01:56:33,960][0m Trial 24 finished with value: 0.958515330028162 and parameters: {'use_label_encoder': False, 'n_estimators': 1500, 'learning_rate': 0.024366115906788666, 'subsample': 0.37, 'colsample_bytree': 0.77, 'max_depth': 16, 'gamma': 38.0, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.006323918199937219, 'reg_alpha': 1.5309173911564513e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 2.701751184816286}. Best is trial 13 with value: 0.9603950666486806.[0m


Number of boosting rounds: 202


[32m[I 2023-03-16 01:56:53,330][0m Trial 25 finished with value: 0.9569816564318274 and parameters: {'use_label_encoder': False, 'n_estimators': 1100, 'learning_rate': 0.013501563988996882, 'subsample': 1.0, 'colsample_bytree': 0.89, 'max_depth': 11, 'gamma': 19.6, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.10696194476203, 'reg_alpha': 8.025202781493642e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 1.0087004013596064}. Best is trial 13 with value: 0.9603950666486806.[0m


Number of boosting rounds: 595
Number of boosting rounds: 1499


[32m[I 2023-03-16 01:57:53,155][0m Trial 26 finished with value: 0.9599245554135318 and parameters: {'use_label_encoder': False, 'n_estimators': 2700, 'learning_rate': 0.01730700695565137, 'subsample': 0.59, 'colsample_bytree': 0.68, 'max_depth': 14, 'gamma': 1.9000000000000001, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 8.430952737365185, 'reg_alpha': 9.819995067409054e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 4.021580246348866}. Best is trial 13 with value: 0.9603950666486806.[0m
[32m[I 2023-03-16 01:58:47,352][0m Trial 27 finished with value: 0.9574521676669763 and parameters: {'use_label_encoder': False, 'n_estimators': 2700, 'learning_rate': 0.016527336058836304, 'subsample': 0.61, 'colsample_bytree': 0.68, 'max_depth': 14, 'gamma': 20.900000000000002, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 9.81034644581183, 'reg_alpha': 7.258722538392847e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 6.618775748715724}. Best is tri

Number of boosting rounds: 381


[32m[I 2023-03-16 01:59:41,751][0m Trial 28 finished with value: 0.9567230361110375 and parameters: {'use_label_encoder': False, 'n_estimators': 2900, 'learning_rate': 0.024646993910074404, 'subsample': 0.76, 'colsample_bytree': 0.8200000000000001, 'max_depth': 18, 'gamma': 35.9, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 9.613481863325777, 'reg_alpha': 2.8499680407301046e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 11.603539035892586}. Best is trial 13 with value: 0.9603950666486806.[0m


Number of boosting rounds: 173


[32m[I 2023-03-16 02:00:22,681][0m Trial 29 finished with value: 0.955586771585471 and parameters: {'use_label_encoder': False, 'n_estimators': 2200, 'learning_rate': 0.019510398630622067, 'subsample': 0.64, 'colsample_bytree': 0.47, 'max_depth': 13, 'gamma': 99.10000000000001, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.6704507887277901, 'reg_alpha': 6.587058623899309e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 21.347440314352376}. Best is trial 13 with value: 0.9603950666486806.[0m
[32m[I 2023-03-16 02:00:22,694][0m A new study created in memory with name: maximize[0m


Number of boosting rounds: 286
Number of finished trials: 30
Best XGB trial parameters: {'use_label_encoder': False, 'n_estimators': 2200, 'learning_rate': 0.010547824693046238, 'subsample': 0.5, 'colsample_bytree': 1.0, 'max_depth': 17, 'gamma': 4.2, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.16410061993553737, 'reg_alpha': 1.0867984740957526e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 1.949215958678347}
Best score: 0.9603950666486806
Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.0337333	valid_1's binary_logloss: 0.037633
Did not meet early stopping. Best iteration is:
[866]	training's binary_logloss: 0.0300901	valid_1's binary_logloss: 0.0370078


[32m[I 2023-03-16 02:00:49,097][0m Trial 0 finished with value: 0.9592708343420505 and parameters: {'n_estimators': 866, 'reg_alpha': 0.08001897398969939, 'reg_lambda': 9.767161277750757, 'colsample_bytree': 0.8800000000000001, 'num_leaves': 618, 'feature_fraction': 0.8177488948816001, 'bagging_fraction': 0.3441836747119266, 'bagging_freq': 1, 'min_child_samples': 50, 'subsample': 0.56, 'learning_rate': 0.011507357796312363, 'max_depth': 36, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 0.9592708343420505.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 8.11804e-05	valid_1's binary_logloss: 0.0827904


[32m[I 2023-03-16 02:01:19,424][0m Trial 1 finished with value: 0.959250477693569 and parameters: {'n_estimators': 1000, 'reg_alpha': 0.00012176911730295674, 'reg_lambda': 3.3856413980416394e-07, 'colsample_bytree': 0.93, 'num_leaves': 879, 'feature_fraction': 0.6185479965714025, 'bagging_fraction': 0.8660080466632275, 'bagging_freq': 2, 'min_child_samples': 196, 'subsample': 0.55, 'learning_rate': 0.1429431494979603, 'max_depth': 21, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 0.9592708343420505.[0m


Early stopping, best iteration is:
[41]	training's binary_logloss: 0.0242549	valid_1's binary_logloss: 0.0377135
Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.0093	valid_1's binary_logloss: 0.0458583
Early stopping, best iteration is:
[155]	training's binary_logloss: 0.0274152	valid_1's binary_logloss: 0.0378191


[32m[I 2023-03-16 02:01:53,886][0m Trial 2 finished with value: 0.9579286950115932 and parameters: {'n_estimators': 835, 'reg_alpha': 0.0040781849486138735, 'reg_lambda': 0.0005570156394166266, 'colsample_bytree': 0.8400000000000001, 'num_leaves': 461, 'feature_fraction': 0.45380912321518585, 'bagging_fraction': 0.6948570838234392, 'bagging_freq': 11, 'min_child_samples': 258, 'subsample': 0.55, 'learning_rate': 0.038765209190646216, 'max_depth': 62, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 0.9592708343420505.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.0125547	valid_1's binary_logloss: 0.0396469
Early stopping, best iteration is:
[308]	training's binary_logloss: 0.0217343	valid_1's binary_logloss: 0.0378053


[32m[I 2023-03-16 02:02:53,970][0m Trial 3 finished with value: 0.9593929742329388 and parameters: {'n_estimators': 862, 'reg_alpha': 0.018713393998597037, 'reg_lambda': 0.13158492795576773, 'colsample_bytree': 0.6100000000000001, 'num_leaves': 938, 'feature_fraction': 0.4881589394654611, 'bagging_fraction': 0.944145444675804, 'bagging_freq': 14, 'min_child_samples': 145, 'subsample': 0.21000000000000002, 'learning_rate': 0.018364336494145383, 'max_depth': 40, 'random_state': 42, 'n_jobs': 4}. Best is trial 3 with value: 0.9593929742329388.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.0275417	valid_1's binary_logloss: 0.0380562


[32m[I 2023-03-16 02:03:01,512][0m Trial 4 finished with value: 0.9596862917412232 and parameters: {'n_estimators': 936, 'reg_alpha': 0.0004521284848264292, 'reg_lambda': 0.00013614277492574872, 'colsample_bytree': 0.27, 'num_leaves': 960, 'feature_fraction': 0.19846334607833005, 'bagging_fraction': 0.5289030111706683, 'bagging_freq': 13, 'min_child_samples': 195, 'subsample': 0.84, 'learning_rate': 0.09588368369528825, 'max_depth': 4, 'random_state': 42, 'n_jobs': 4}. Best is trial 4 with value: 0.9596862917412232.[0m


Early stopping, best iteration is:
[286]	training's binary_logloss: 0.0308882	valid_1's binary_logloss: 0.0375536
Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 8.58515e-06	valid_1's binary_logloss: 0.0887356


[32m[I 2023-03-16 02:03:41,137][0m Trial 5 finished with value: 0.9585560433251249 and parameters: {'n_estimators': 737, 'reg_alpha': 0.001813482909619348, 'reg_lambda': 0.01955346344075912, 'colsample_bytree': 0.8, 'num_leaves': 643, 'feature_fraction': 0.6065142929658067, 'bagging_fraction': 0.9995977205538297, 'bagging_freq': 10, 'min_child_samples': 11, 'subsample': 0.1, 'learning_rate': 0.20824228463239997, 'max_depth': 20, 'random_state': 42, 'n_jobs': 4}. Best is trial 4 with value: 0.9596862917412232.[0m


Early stopping, best iteration is:
[19]	training's binary_logloss: 0.00880141	valid_1's binary_logloss: 0.0422019
Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.00398883	valid_1's binary_logloss: 0.0493675
Early stopping, best iteration is:
[206]	training's binary_logloss: 0.0171569	valid_1's binary_logloss: 0.0383766


[32m[I 2023-03-16 02:04:48,095][0m Trial 6 finished with value: 0.9585824160830745 and parameters: {'n_estimators': 740, 'reg_alpha': 0.0008562878399529781, 'reg_lambda': 1.0561662053430886e-07, 'colsample_bytree': 0.6900000000000001, 'num_leaves': 635, 'feature_fraction': 0.5529901892507911, 'bagging_fraction': 0.6358628801224809, 'bagging_freq': 1, 'min_child_samples': 35, 'subsample': 0.43000000000000005, 'learning_rate': 0.022700043295646396, 'max_depth': 31, 'random_state': 42, 'n_jobs': 4}. Best is trial 4 with value: 0.9596862917412232.[0m


Training until validation scores don't improve for 500 rounds


[32m[I 2023-03-16 02:04:59,878][0m Trial 7 finished with value: 0.9587739503489521 and parameters: {'n_estimators': 793, 'reg_alpha': 3.58943040029365e-08, 'reg_lambda': 2.865838665708772e-05, 'colsample_bytree': 0.9400000000000001, 'num_leaves': 591, 'feature_fraction': 0.6483115419180474, 'bagging_fraction': 0.31153056080972136, 'bagging_freq': 12, 'min_child_samples': 118, 'subsample': 0.97, 'learning_rate': 0.23118531459370428, 'max_depth': 59, 'random_state': 42, 'n_jobs': 4}. Best is trial 4 with value: 0.9596862917412232.[0m


[500]	training's binary_logloss: 0.736811	valid_1's binary_logloss: 0.748637
Early stopping, best iteration is:
[20]	training's binary_logloss: 0.0335669	valid_1's binary_logloss: 0.0401206
Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.00409436	valid_1's binary_logloss: 0.0490433


[32m[I 2023-03-16 02:05:34,927][0m Trial 8 finished with value: 0.9588757335913591 and parameters: {'n_estimators': 837, 'reg_alpha': 0.15112289766072845, 'reg_lambda': 0.0006741059048554918, 'colsample_bytree': 0.8600000000000001, 'num_leaves': 332, 'feature_fraction': 0.5689727806778256, 'bagging_fraction': 0.8311597786617515, 'bagging_freq': 6, 'min_child_samples': 236, 'subsample': 0.13, 'learning_rate': 0.052939945753006847, 'max_depth': 85, 'random_state': 42, 'n_jobs': 4}. Best is trial 4 with value: 0.9596862917412232.[0m


Early stopping, best iteration is:
[114]	training's binary_logloss: 0.0249556	valid_1's binary_logloss: 0.0378286
Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.00380288	valid_1's binary_logloss: 0.0641836
Early stopping, best iteration is:
[89]	training's binary_logloss: 0.0243447	valid_1's binary_logloss: 0.0387735


[32m[I 2023-03-16 02:06:07,873][0m Trial 9 finished with value: 0.9585213461376303 and parameters: {'n_estimators': 805, 'reg_alpha': 0.0009445725979220648, 'reg_lambda': 7.842771042592499e-07, 'colsample_bytree': 0.05, 'num_leaves': 525, 'feature_fraction': 0.2903544053469092, 'bagging_fraction': 0.5385049094526596, 'bagging_freq': 4, 'min_child_samples': 124, 'subsample': 0.8099999999999999, 'learning_rate': 0.08245084240279223, 'max_depth': 33, 'random_state': 42, 'n_jobs': 4}. Best is trial 4 with value: 0.9596862917412232.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.0394204	valid_1's binary_logloss: 0.0403139
Did not meet early stopping. Best iteration is:
[950]	training's binary_logloss: 0.0381731	valid_1's binary_logloss: 0.0394649


[32m[I 2023-03-16 02:06:13,739][0m Trial 10 finished with value: 0.9584602761921861 and parameters: {'n_estimators': 950, 'reg_alpha': 6.313837820638036, 'reg_lambda': 2.0293576086489926e-08, 'colsample_bytree': 0.31, 'num_leaves': 90, 'feature_fraction': 0.12611531929544312, 'bagging_fraction': 0.1500506745732107, 'bagging_freq': 15, 'min_child_samples': 294, 'subsample': 0.78, 'learning_rate': 0.0850763648543857, 'max_depth': 2, 'random_state': 42, 'n_jobs': 4}. Best is trial 4 with value: 0.9596862917412232.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.0356162	valid_1's binary_logloss: 0.0383527
Did not meet early stopping. Best iteration is:
[911]	training's binary_logloss: 0.0321488	valid_1's binary_logloss: 0.0372894


[32m[I 2023-03-16 02:06:27,046][0m Trial 11 finished with value: 0.9588553769428776 and parameters: {'n_estimators': 911, 'reg_alpha': 2.2179763060119098e-05, 'reg_lambda': 0.07396701915934607, 'colsample_bytree': 0.45, 'num_leaves': 994, 'feature_fraction': 0.3304136292504587, 'bagging_fraction': 0.5282318646993382, 'bagging_freq': 15, 'min_child_samples': 175, 'subsample': 0.33, 'learning_rate': 0.011406996485483073, 'max_depth': 5, 'random_state': 42, 'n_jobs': 4}. Best is trial 4 with value: 0.9596862917412232.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.0352418	valid_1's binary_logloss: 0.0446028
Did not meet early stopping. Best iteration is:
[907]	training's binary_logloss: 0.0311163	valid_1's binary_logloss: 0.0468798


[32m[I 2023-03-16 02:07:51,143][0m Trial 12 finished with value: 0.9543403993879526 and parameters: {'n_estimators': 907, 'reg_alpha': 7.462570192069648e-06, 'reg_lambda': 2.0623808801286117e-05, 'colsample_bytree': 0.24, 'num_leaves': 828, 'feature_fraction': 0.10127697199799957, 'bagging_fraction': 0.9934682045494572, 'bagging_freq': 13, 'min_child_samples': 114, 'subsample': 0.74, 'learning_rate': 0.024959688768158593, 'max_depth': 78, 'random_state': 42, 'n_jobs': 4}. Best is trial 4 with value: 0.9596862917412232.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.00183576	valid_1's binary_logloss: 0.0573616


[32m[I 2023-03-16 02:08:20,135][0m Trial 13 finished with value: 0.9589571601852847 and parameters: {'n_estimators': 963, 'reg_alpha': 0.06075810891611342, 'reg_lambda': 0.01779766188462553, 'colsample_bytree': 0.51, 'num_leaves': 974, 'feature_fraction': 0.3575569414394423, 'bagging_fraction': 0.6959033663819423, 'bagging_freq': 8, 'min_child_samples': 199, 'subsample': 0.29000000000000004, 'learning_rate': 0.10169384592331669, 'max_depth': 47, 'random_state': 42, 'n_jobs': 4}. Best is trial 4 with value: 0.9596862917412232.[0m


Early stopping, best iteration is:
[66]	training's binary_logloss: 0.0245671	valid_1's binary_logloss: 0.0374244
Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.00977982	valid_1's binary_logloss: 0.0438605
Early stopping, best iteration is:
[121]	training's binary_logloss: 0.0285473	valid_1's binary_logloss: 0.0385416


[32m[I 2023-03-16 02:08:51,278][0m Trial 14 finished with value: 0.9585417027861118 and parameters: {'n_estimators': 897, 'reg_alpha': 1.0572766325177799e-05, 'reg_lambda': 0.6821873602818678, 'colsample_bytree': 0.65, 'num_leaves': 770, 'feature_fraction': 0.23149888178203926, 'bagging_fraction': 0.4606516774538788, 'bagging_freq': 8, 'min_child_samples': 81, 'subsample': 0.97, 'learning_rate': 0.054106456803980685, 'max_depth': 71, 'random_state': 42, 'n_jobs': 4}. Best is trial 4 with value: 0.9596862917412232.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.012539	valid_1's binary_logloss: 0.0401787
Early stopping, best iteration is:
[284]	training's binary_logloss: 0.0236604	valid_1's binary_logloss: 0.0377849


[32m[I 2023-03-16 02:09:57,508][0m Trial 15 finished with value: 0.9589571601852847 and parameters: {'n_estimators': 944, 'reg_alpha': 0.020569922821540836, 'reg_lambda': 0.00145868484135007, 'colsample_bytree': 0.35, 'num_leaves': 760, 'feature_fraction': 0.45564805978227646, 'bagging_fraction': 0.8148526656655098, 'bagging_freq': 13, 'min_child_samples': 161, 'subsample': 0.6799999999999999, 'learning_rate': 0.019567269038215863, 'max_depth': 98, 'random_state': 42, 'n_jobs': 4}. Best is trial 4 with value: 0.9596862917412232.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.0148456	valid_1's binary_logloss: 0.0398092
Early stopping, best iteration is:
[253]	training's binary_logloss: 0.0240251	valid_1's binary_logloss: 0.0374055


[32m[I 2023-03-16 02:10:26,148][0m Trial 16 finished with value: 0.9602178729218163 and parameters: {'n_estimators': 881, 'reg_alpha': 0.8506882343950526, 'reg_lambda': 8.180930515916033e-05, 'colsample_bytree': 0.13, 'num_leaves': 287, 'feature_fraction': 0.9693856065295061, 'bagging_fraction': 0.6279779711877121, 'bagging_freq': 10, 'min_child_samples': 221, 'subsample': 0.26, 'learning_rate': 0.0355232132712243, 'max_depth': 15, 'random_state': 42, 'n_jobs': 4}. Best is trial 16 with value: 0.9602178729218163.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.0265954	valid_1's binary_logloss: 0.0374498
Early stopping, best iteration is:
[322]	training's binary_logloss: 0.0293495	valid_1's binary_logloss: 0.0372312


[32m[I 2023-03-16 02:10:46,439][0m Trial 17 finished with value: 0.9593869581234705 and parameters: {'n_estimators': 995, 'reg_alpha': 4.345895840035738, 'reg_lambda': 2.4002475571073026e-05, 'colsample_bytree': 0.11, 'num_leaves': 182, 'feature_fraction': 0.9805994528081028, 'bagging_fraction': 0.6396626681765802, 'bagging_freq': 10, 'min_child_samples': 232, 'subsample': 0.41000000000000003, 'learning_rate': 0.0371474983103911, 'max_depth': 14, 'random_state': 42, 'n_jobs': 4}. Best is trial 16 with value: 0.9602178729218163.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.0206745	valid_1's binary_logloss: 0.0390665
Early stopping, best iteration is:
[208]	training's binary_logloss: 0.0279209	valid_1's binary_logloss: 0.0377818


[32m[I 2023-03-16 02:11:02,733][0m Trial 18 finished with value: 0.9588146636459149 and parameters: {'n_estimators': 885, 'reg_alpha': 2.3489220471307584, 'reg_lambda': 0.0001067333170379254, 'colsample_bytree': 0.18, 'num_leaves': 352, 'feature_fraction': 0.725532560821207, 'bagging_fraction': 0.4442188931786403, 'bagging_freq': 9, 'min_child_samples': 299, 'subsample': 0.86, 'learning_rate': 0.06759191028345336, 'max_depth': 12, 'random_state': 42, 'n_jobs': 4}. Best is trial 16 with value: 0.9602178729218163.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.037143	valid_1's binary_logloss: 0.0393695
Did not meet early stopping. Best iteration is:
[929]	training's binary_logloss: 0.0364168	valid_1's binary_logloss: 0.0391767


[32m[I 2023-03-16 02:11:09,383][0m Trial 19 finished with value: 0.958718896512976 and parameters: {'n_estimators': 929, 'reg_alpha': 0.6192683131983596, 'reg_lambda': 5.6080814901946585e-06, 'colsample_bytree': 0.42, 'num_leaves': 11, 'feature_fraction': 0.9826806363206323, 'bagging_fraction': 0.7311114460047887, 'bagging_freq': 6, 'min_child_samples': 218, 'subsample': 0.66, 'learning_rate': 0.12372398681752142, 'max_depth': 1, 'random_state': 42, 'n_jobs': 4}. Best is trial 16 with value: 0.9602178729218163.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.0136927	valid_1's binary_logloss: 0.0411499
Early stopping, best iteration is:
[181]	training's binary_logloss: 0.0275643	valid_1's binary_logloss: 0.037572


[32m[I 2023-03-16 02:11:33,463][0m Trial 20 finished with value: 0.9593462448265077 and parameters: {'n_estimators': 706, 'reg_alpha': 0.685707020654998, 'reg_lambda': 2.5753474382992585e-06, 'colsample_bytree': 0.22000000000000003, 'num_leaves': 299, 'feature_fraction': 0.8520880832049008, 'bagging_fraction': 0.6096726657447472, 'bagging_freq': 12, 'min_child_samples': 262, 'subsample': 0.44000000000000006, 'learning_rate': 0.03818846633452573, 'max_depth': 22, 'random_state': 42, 'n_jobs': 4}. Best is trial 16 with value: 0.9602178729218163.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.0134326	valid_1's binary_logloss: 0.0394033
Early stopping, best iteration is:
[322]	training's binary_logloss: 0.022151	valid_1's binary_logloss: 0.0377563


[32m[I 2023-03-16 02:12:40,918][0m Trial 21 finished with value: 0.9588817497008272 and parameters: {'n_estimators': 864, 'reg_alpha': 0.007150611334538365, 'reg_lambda': 0.002638739975486858, 'colsample_bytree': 0.7200000000000001, 'num_leaves': 923, 'feature_fraction': 0.44605454432384073, 'bagging_fraction': 0.8974849706855643, 'bagging_freq': 14, 'min_child_samples': 141, 'subsample': 0.24000000000000002, 'learning_rate': 0.0167743731810689, 'max_depth': 45, 'random_state': 42, 'n_jobs': 4}. Best is trial 16 with value: 0.9602178729218163.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.0169906	valid_1's binary_logloss: 0.0385467
Early stopping, best iteration is:
[246]	training's binary_logloss: 0.0249597	valid_1's binary_logloss: 0.0374839


[32m[I 2023-03-16 02:13:07,107][0m Trial 22 finished with value: 0.9606944002664333 and parameters: {'n_estimators': 806, 'reg_alpha': 0.023907052180379338, 'reg_lambda': 0.00016067546536308253, 'colsample_bytree': 0.5800000000000001, 'num_leaves': 758, 'feature_fraction': 0.7009197470196303, 'bagging_fraction': 0.7257499940958625, 'bagging_freq': 13, 'min_child_samples': 181, 'subsample': 0.19, 'learning_rate': 0.029190899464656268, 'max_depth': 11, 'random_state': 42, 'n_jobs': 4}. Best is trial 22 with value: 0.9606944002664333.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.0167452	valid_1's binary_logloss: 0.0383206
Did not meet early stopping. Best iteration is:
[782]	training's binary_logloss: 0.0113202	valid_1's binary_logloss: 0.0401378


[32m[I 2023-03-16 02:13:34,146][0m Trial 23 finished with value: 0.9604968498910874 and parameters: {'n_estimators': 782, 'reg_alpha': 0.19740616256941654, 'reg_lambda': 0.00018890916558404183, 'colsample_bytree': 0.53, 'num_leaves': 761, 'feature_fraction': 0.7184264670451591, 'bagging_fraction': 0.7610771291448487, 'bagging_freq': 11, 'min_child_samples': 186, 'subsample': 0.18, 'learning_rate': 0.030094840600423202, 'max_depth': 11, 'random_state': 42, 'n_jobs': 4}. Best is trial 22 with value: 0.9606944002664333.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.010174	valid_1's binary_logloss: 0.0413195
Early stopping, best iteration is:
[201]	training's binary_logloss: 0.0249709	valid_1's binary_logloss: 0.0375496


[32m[I 2023-03-16 02:14:12,001][0m Trial 24 finished with value: 0.958977516833766 and parameters: {'n_estimators': 783, 'reg_alpha': 0.45432577287983816, 'reg_lambda': 0.00012843822020422538, 'colsample_bytree': 0.56, 'num_leaves': 429, 'feature_fraction': 0.6946846983486356, 'bagging_fraction': 0.7835992912133944, 'bagging_freq': 11, 'min_child_samples': 174, 'subsample': 0.17, 'learning_rate': 0.029572870111097185, 'max_depth': 26, 'random_state': 42, 'n_jobs': 4}. Best is trial 22 with value: 0.9606944002664333.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.0156703	valid_1's binary_logloss: 0.0386704
Early stopping, best iteration is:
[302]	training's binary_logloss: 0.0224108	valid_1's binary_logloss: 0.0374187


[32m[I 2023-03-16 02:14:39,938][0m Trial 25 finished with value: 0.9597066483897045 and parameters: {'n_estimators': 814, 'reg_alpha': 0.15544806479662596, 'reg_lambda': 0.0022691794931875555, 'colsample_bytree': 0.41, 'num_leaves': 743, 'feature_fraction': 0.7754606672963512, 'bagging_fraction': 0.762593613217676, 'bagging_freq': 6, 'min_child_samples': 258, 'subsample': 0.32, 'learning_rate': 0.03005916264245858, 'max_depth': 14, 'random_state': 42, 'n_jobs': 4}. Best is trial 22 with value: 0.9606944002664333.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.0316919	valid_1's binary_logloss: 0.0370244
Did not meet early stopping. Best iteration is:
[766]	training's binary_logloss: 0.0306206	valid_1's binary_logloss: 0.0369322


[32m[I 2023-03-16 02:14:54,341][0m Trial 26 finished with value: 0.9602178729218163 and parameters: {'n_estimators': 766, 'reg_alpha': 9.436658876605062, 'reg_lambda': 6.291176198117743e-06, 'colsample_bytree': 0.51, 'num_leaves': 207, 'feature_fraction': 0.8918377981784181, 'bagging_fraction': 0.7475662081791575, 'bagging_freq': 10, 'min_child_samples': 209, 'subsample': 0.24000000000000002, 'learning_rate': 0.045431461909389834, 'max_depth': 12, 'random_state': 42, 'n_jobs': 4}. Best is trial 22 with value: 0.9606944002664333.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.014413	valid_1's binary_logloss: 0.0391188
Early stopping, best iteration is:
[250]	training's binary_logloss: 0.0240904	valid_1's binary_logloss: 0.0376098


[32m[I 2023-03-16 02:15:27,462][0m Trial 27 finished with value: 0.9595498113113217 and parameters: {'n_estimators': 768, 'reg_alpha': 1.2642147921482911, 'reg_lambda': 0.0002594066986743497, 'colsample_bytree': 0.76, 'num_leaves': 680, 'feature_fraction': 0.7459153020505629, 'bagging_fraction': 0.8176882672179887, 'bagging_freq': 9, 'min_child_samples': 174, 'subsample': 0.37, 'learning_rate': 0.02889421835618839, 'max_depth': 28, 'random_state': 42, 'n_jobs': 4}. Best is trial 22 with value: 0.9606944002664333.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.00647013	valid_1's binary_logloss: 0.046266


[32m[I 2023-03-16 02:15:49,978][0m Trial 28 finished with value: 0.9602585862187791 and parameters: {'n_estimators': 821, 'reg_alpha': 0.032720688023965966, 'reg_lambda': 3.546858742086658e-05, 'colsample_bytree': 0.6000000000000001, 'num_leaves': 558, 'feature_fraction': 0.8893551343480259, 'bagging_fraction': 0.675754923702208, 'bagging_freq': 11, 'min_child_samples': 233, 'subsample': 0.17, 'learning_rate': 0.05811059531330798, 'max_depth': 16, 'random_state': 42, 'n_jobs': 4}. Best is trial 22 with value: 0.9606944002664333.[0m


Early stopping, best iteration is:
[116]	training's binary_logloss: 0.0255911	valid_1's binary_logloss: 0.0376484
Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.012353	valid_1's binary_logloss: 0.0395621
Did not meet early stopping. Best iteration is:
[822]	training's binary_logloss: 0.00458747	valid_1's binary_logloss: 0.0452473


[32m[I 2023-03-16 02:17:22,349][0m Trial 29 finished with value: 0.959625221795779 and parameters: {'n_estimators': 822, 'reg_alpha': 0.03753375807750508, 'reg_lambda': 2.0995924425752335e-05, 'colsample_bytree': 0.5900000000000001, 'num_leaves': 557, 'feature_fraction': 0.7997920370144915, 'bagging_fraction': 0.894135179103417, 'bagging_freq': 12, 'min_child_samples': 74, 'subsample': 0.17, 'learning_rate': 0.013074343181170976, 'max_depth': 41, 'random_state': 42, 'n_jobs': 4}. Best is trial 22 with value: 0.9606944002664333.[0m
[32m[I 2023-03-16 02:17:22,362][0m A new study created in memory with name: maximize[0m


Number of finished trials: 30
Best LGBM trial parameters: {'n_estimators': 806, 'reg_alpha': 0.023907052180379338, 'reg_lambda': 0.00016067546536308253, 'colsample_bytree': 0.5800000000000001, 'num_leaves': 758, 'feature_fraction': 0.7009197470196303, 'bagging_fraction': 0.7257499940958625, 'bagging_freq': 13, 'min_child_samples': 181, 'subsample': 0.19, 'learning_rate': 0.029190899464656268, 'max_depth': 11, 'random_state': 42, 'n_jobs': 4}
Best score: 0.9606944002664333


[32m[I 2023-03-16 02:17:23,080][0m Trial 0 finished with value: 0.9572893144791251 and parameters: {'learning_rate': 0.36737134824754447, 'l2_leaf_reg': 8.372769467122085, 'bagging_temperature': 3.8299086098821835, 'random_strength': 1.1531156296609542, 'depth': 10, 'min_data_in_leaf': 297}. Best is trial 0 with value: 0.9572893144791251.[0m
[32m[I 2023-03-16 02:17:23,342][0m Trial 1 finished with value: 0.9556131443434206 and parameters: {'learning_rate': 0.7873576574082136, 'l2_leaf_reg': 15.270275985124844, 'bagging_temperature': 15.651085802486678, 'random_strength': 1.3560568005595162, 'depth': 2, 'min_data_in_leaf': 248}. Best is trial 0 with value: 0.9572893144791251.[0m
[32m[I 2023-03-16 02:17:23,690][0m Trial 2 finished with value: 0.957969408308556 and parameters: {'learning_rate': 0.6933283005454411, 'l2_leaf_reg': 2.3831877375898314, 'bagging_temperature': 6.11972834952258, 'random_strength': 1.511970622206116, 'depth': 5, 'min_data_in_leaf': 130}. Best is trial 2 w

Number of finished trials: 30
Best Cat trial parameters: {'learning_rate': 0.798379131552692, 'l2_leaf_reg': 1.8767133474987714, 'bagging_temperature': 0.22131574030831425, 'random_strength': 1.120955342078158, 'depth': 6, 'min_data_in_leaf': 85}
Best score: 0.9607637946414223
CPU times: user 1h 52min 35s, sys: 6min 19s, total: 1h 58min 55s
Wall time: 39min 5s


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Train Models with Cross Validation</h1>
</div>

In [27]:
# train = create_folds(train, Config.N_FOLDS)
train = create_strat_folds(train, TARGET, Config.N_FOLDS)

TARGET=Class, n_folds=5, seed=42


In [28]:
all_cv_scores = pd.DataFrame(
    {
        "Model": pd.Series(dtype="str"),
        "Score": pd.Series(dtype="float"),
        "StdDev": pd.Series(dtype="float"),
        "RunTime": pd.Series(dtype="float"),
    }
)

oof = train[[ID, TARGET, "fold"]].copy().reset_index(drop=True).copy()
oof.set_index(ID, inplace=True)
oof.head()

Unnamed: 0_level_0,Class,fold
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0,1
1.0,0,3
2.0,0,4
3.0,0,3
4.0,0,2


In [29]:
def show_tree_model_fi(model, features:List[str]) -> None:
    print("\n=== Model Feature Importance ===")
    for i in model.feature_importances_.argsort()[::-1]:
        print(features[i], model.feature_importances_[i]/model.feature_importances_.sum())

def save_oof_predictions(model_name:str, final_valid_predictions, oof:pd.DataFrame) -> pd.DataFrame:
    final_valid_predictions_df = process_valid_predictions(
        final_valid_predictions, ID, model_name
    )
    display(final_valid_predictions_df.head())
    oof[f"pred_{model_name}"] = final_valid_predictions_df[f"pred_{model_name}"]

    return oof

def save_test_predictions(model_name:str, final_test_predictions, submission_df:pd.DataFrame, result_field:str=TARGET) -> None:
    result = merge_test_predictions(final_test_predictions, Config.calc_probability)
    # result[:20]
    submission_df[f"target_{model_name}"] = result #.astype(int)
    #     submission_df.head(10)
    ss = submission_df[[ID, f"target_{model_name}"]].copy().reset_index(drop=True)
    ss.rename(columns={f"target_{model_name}": result_field}, inplace=True)
    ss.to_csv(
        f"submission_{model_name}.csv", index=False
    )  # Can submit the individual model
    print("=== Target Value Counts ===")
#     display(ss[TARGET].value_counts())
    ss.head(10)

def process_valid_predictions(final_valid_predictions, train_id, model_name:str) -> pd.DataFrame:
    model = f"pred_{model_name}"
    final_valid_predictions_df = pd.DataFrame.from_dict(
        final_valid_predictions, orient="index"
    ).reset_index()
    final_valid_predictions_df.columns = [train_id, model]
    final_valid_predictions_df.set_index(train_id, inplace=True)
    final_valid_predictions_df.sort_index(inplace=True)
    final_valid_predictions_df.to_csv(f"train_pred_{model_name}.csv", index=True)

    return final_valid_predictions_df

def add_score(score_df:pd.DataFrame, model_name:str, score:float, std:float):
    dict1 = {"Model": model_name, "Score": cv_score, "StdDev": std_dev}
    score_df = score_df.append(dict1, ignore_index=True)
    return score_df

In [30]:
def train_cv_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid,
    params,
    n_folds:int=5,
    seed:int=42,
):

    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(
            drop=True
        )  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)
        xtest = test.copy()

        valid_ids = xvalid.id.values.tolist()  # Id's of everything in validation fold

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[FEATURES]
        xvalid = xvalid[FEATURES]

        scaler = preprocessing.StandardScaler()
#         scaler = preprocessing.MinMaxScaler()
        xtrain = scaler.fit(xtrain).transform(xtrain)
        xvalid = scaler.transform(xvalid)
        xtest = scaler.transform(xtest)

        model = get_model_fn # ()

        model.fit(
            xtrain,
            ytrain,
        )
        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        preds_valid_class = model.predict(xvalid)
        
        final_test_predictions.append(test_preds)
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

#         fold_score = metrics.accuracy_score(yvalid, preds_valid_class)  # Validation Set Score
        fold_score = metrics.log_loss(yvalid, preds_valid)
#         fold_score = metrics.mean_absolute_error(
#             yvalid, preds_valid
#         )
#         fold_score = metrics.roc_auc_score(yvalid.values, preds_valid)  # Validation Set Score
#         show_classification_scores(yvalid.values, preds_valid_class)

#         fold_score = metrics.roc_auc_score(yvalid, preds_valid)  # Validation Set Score
#         fold_score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        fold_scores.append(fold_score)
        #         importance_list.append(model.coef_.ravel())

        fi = []
        # Feature importance
#         fi = pd.DataFrame(
#             index=FEATURES,
#             data=model.coef_.ravel(),
#             columns=[f"{fold}_importance"],
#         )
        
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )


def train_xgb_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid:str,
    params,
    n_folds:int=5,
    seed:int=42,
):

    print(params)
    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(
            drop=True
        )  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)
        xtest = test.copy()

        valid_ids = xvalid.id.values.tolist()  # Id's of everything in validation fold

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[FEATURES]
        xvalid = xvalid[FEATURES]

        model = get_model_fn # (params)

        model.fit(
            xtrain,
            ytrain,
            eval_set=[(xvalid, yvalid)],
            #             eval_metric="acc",  # auc
            verbose=0,
            #             early_stopping_rounds=3000,
            #             callbacks=[
            #                 xgb.log_evaluation(0),
            #                 xgb.early_stopping(500, False, True),
            #             ],
        )

        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        preds_valid_class = model.predict(xvalid)
        
        final_test_predictions.append(test_preds)
        if Config.debug:
            print(f"GT Type: {type(yvalid.values)}")
            print(f"Preds Type: {type(preds_valid_class)}")
            print(f"         GT:{yvalid.values[:20]}")
            print(f"Preds Class:{preds_valid_class[:20]}")
            print(f"Preds Prob:{preds_valid[:20]}")
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid_class)))

#         fold_score = metrics.cohen_kappa_score(yvalid,  preds_valid_class, weights = "quadratic")
        fold_score = metrics.log_loss(yvalid.values, preds_valid)  # Validation Set Score
#         fold_score = metrics.roc_auc_score(yvalid.values, preds_valid)  # Validation Set Score

#         show_classification_scores(yvalid.values, preds_valid_class)
#         fold_score = metrics.mean_absolute_error(
#             yvalid, preds_valid
#         )  # Validation Set Score
#         fold_score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        fold_scores.append(fold_score)

        # Feature importance
        fi = pd.DataFrame(
            index=FEATURES,
            data=model.feature_importances_,
            columns=[f"{fold}_importance"],
        )
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )        

In [31]:
def run_linear_model(model_dict, model_name:str, features:List[str], oof:pd.DataFrame) -> (float, float, pd.DataFrame):
    (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_cv_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        False, #Config.calc_probability,
        ID,
        {},
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    oof.head()
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof


def run_tree_model(model_dict, model_name:str, features:List[str], params, oof:pd.DataFrame) -> (float, float, pd.DataFrame):
    (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_xgb_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        Config.calc_probability,
        ID,
        params,
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)
    show_tree_model_fi(model, features)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    oof.head()
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof

In [32]:
%%time

def run_models4features(model_dict, model_lst:List[str], target:str, feature_lst:List[str], all_cv_scores:pd.DataFrame, linear_models:bool=True) -> pd.DataFrame:

    oof = train[[ID, target, "fold"]].copy().reset_index(drop=True).copy()
    oof.set_index(ID, inplace=True)

    for idx, m in enumerate(model_lst):
        model = model_lst[idx]
        start_time = time.time()

        print(f"Model={model}")

        params = {}
        if linear_models:
                cv_score, std_dev, oof = run_linear_model(model_dict, model, feature_lst, oof)

        else:
            cv_score, std_dev, oof = run_tree_model(model_dict, model, feature_lst, params, oof)

        run_time = time.time() - start_time

        score_dict = {"Model": model, "Score": cv_score, "StdDev": std_dev, "RunTime": run_time}
        all_cv_scores = all_cv_scores.append(score_dict, ignore_index=True)
        print(f"Model Run Time: {run_time:.2f}")

    return all_cv_scores




CPU times: user 22 µs, sys: 0 ns, total: 22 µs
Wall time: 27.7 µs


In [33]:
lgbm_params = {'n_estimators': Config.N_ESTIMATORS,
                 'objective': 'binary',
                'metric': 'binary_logloss', #'auc',
                 'num_rounds': 404,
                 'learning_rate': 0.19,
                 'num_leaves': 17,
                 'max_depth': 8,
                 'min_data_in_leaf': 36,
                 'lambda_l1': 0.96,
                 'lambda_l2': 0.01,
                 'min_gain_to_split': 11.32,
                 'bagging_fraction': 0.6,
                 'feature_fraction': 0.9}


lgbm_params3 = {
    "n_estimators": Config.N_ESTIMATORS,
      'objective': 'binary',
#     'objective': 'regression',
      'metric': 'binary_logloss', #'auc',
    'max_depth': 9,
    'learning_rate': 0.01,
    'min_data_in_leaf': 36, 
    'num_leaves': 100, 
    'feature_fraction': 0.8, 
    'bagging_fraction': 0.89, 
    'bagging_freq': 5, 
    'lambda_l2': 28,
    
    'seed': Config.seed,

#     'boosting_type': 'gbdt',
#     'device': 'gpu', 
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0,
    'n_jobs': -1,
    'metric': 'rmse',
    'verbose': -1
}
    
lgbm_params = gpu_ify_lgbm(lgbm_params)

In [34]:
xgb_params = {
    "n_estimators": Config.N_ESTIMATORS,  # 10_000,
    "max_depth": 10,  # 10
#     "objective": "reg:squarederror",
       'eval_metric'     : 'logloss', #'auc',
       'objective'       : 'binary:logistic',    
    #     "enable_categorical": True,  # Only works with gpu_hist
    #     "eval_metric": "mae",
    #     "metric": "mae",
    #     "enable_categorical": True,
    "n_jobs": 8,  # 4
    "seed": Config.seed,
    "tree_method": "hist",
    #         "gpu_id": 0,
    "subsample": 0.9,  # 0.7
    "colsample_bytree": 0.7,
    "use_label_encoder": False,
    "learning_rate": 0.05,  # 0.01
}

xgb_params3 = {
    'n_estimators': Config.N_ESTIMATORS,
       'eval_metric'     : 'logloss', #'auc',
       'objective'       : 'binary:logistic',
    'learning_rate': 0.05,
    'max_depth': 10,
    "seed": Config.seed,    
    'subsample': 0.8,
    'colsample_bytree': 0.8,
#     'objective': 'reg:squarederror'
}

xgb_params_logloss = {
    'n_estimators': Config.N_ESTIMATORS,
   'eval_metric'     : 'logloss', #'auc',
   'objective'       : 'binary:logistic',
    "seed": Config.seed,    
    'max_depth': 4,
    'learning_rate': 0.06,
    'colsample_bytree': 0.67,
    'n_jobs': -1,
    'objective': 'binary:logistic',
    'early_stopping_rounds': 150,
    'verbosity': 0,
#     'eval_metric': 'logloss'
}


if Config.gpu:
    xgb_params["tree_method"] = "gpu_hist"
else:
    xgb_params["tree_method"] = "hist"

In [35]:
cb_params = {
    #     "learning_rate": 0.3277295792305584,
    "learning_rate": 0.05,
    "l2_leaf_reg": 3.1572972266001518,
    "bagging_temperature": 0.6799604234141348,
    "random_strength": 1.99590400593318,
    "depth": 10,
    "min_data_in_leaf": 93,
    # "iterations": 100,  # 10000
    "n_estimators": Config.N_ESTIMATORS,  # 10000
    "use_best_model": True,
    #     "task_type": "GPU",
    "random_seed": Config.seed,
}

cb_params = gpu_ify_cb(cb_params)

In [36]:
model_estimator_dict = {
    "xgb2": xgb.XGBRegressor(**xgb_params),
    "xgb_best_params": xgb.XGBRegressor(**best_xgb_params),
    "xgb3": xgb.XGBRegressor(**xgb_params3),
    
    "lgbm1": lgb.LGBMRegressor(**lgbm_params),

    "cat1": cb.CatBoostRegressor(),
    "cat2": cb.CatBoostRegressor(**cb_params),
    "cat_best_params": cb.CatBoostRegressor(**best_cb_params),

    "xgb1": xgb.XGBRegressor(),
    "lgbm0": lgb.LGBMRegressor(),
    "lgbm3": lgb.LGBMRegressor(lgbm_params3),
    "lgbm2": lgb.LGBMRegressor(
        learning_rate=0.05,
        max_depth=15,
        num_leaves=11,
        feature_fraction=0.3,
        subsample=0.1,
        n_jobs=-1,
    ),
    "lgbm3": lgb.LGBMRegressor(**lgbm_params),
    "lgbm_best_params": lgb.LGBMRegressor(**best_lgbm_params),


    "lin_reg": linear_model.LinearRegression(),
    "lasso": linear_model.Lasso(),
    "ridge": linear_model.Ridge(max_iter=7000),
    "ridge_25": linear_model.Ridge(fit_intercept=True, solver='auto', alpha=0.25, max_iter=7000),
    "ridge_50": linear_model.Ridge(fit_intercept=True, solver='auto', alpha=0.5, max_iter=7000),
}

model_estimator_dict = {
    "xgb1": xgb.XGBClassifier(**xgb_params),
    "xgb_best_params": xgb.XGBClassifier(**best_xgb_params),
    "xgb3": xgb.XGBClassifier(**xgb_params3),
    "xgb_params_logloss": xgb.XGBClassifier(**xgb_params_logloss),

    "lgbm1": lgb.LGBMClassifier(**lgbm_params),
    "lgbm_best_params": lgb.LGBMClassifier(**best_lgbm_params),
    "lgbm2": lgb.LGBMClassifier(
        learning_rate=0.05,
        max_depth=15,
        num_leaves=11,
        feature_fraction=0.3,
        subsample=0.1,
        n_jobs=-1,
    ),

    #     "lgbm2": lgb.LGBMClassifier(**lgb_params_best_bsmith),
    #     "lgbm3": lgb.LGBMClassifier(**lgbm_params03),
#     "cat1": cb.CatBoostClassifier(**cb_params),
    "cat1": cb.CatBoostClassifier(),
    "cat2": cb.CatBoostClassifier(**cb_params),
    "cat_best_params": cb.CatBoostClassifier(**best_cb_params),

    #     "cat2": cb.CatBoostClassifier(**cb_params2),
    #     "cat3": cb.CatBoostClassifier(**cb_params3),
}

## Tree Models

In [37]:
%%time

# model_lst = ["xgb3","xgb_best_params", "lgbm_best_params", "cat_best_params", "xgb1", "xgb2", "lgbm1", "lgbm2", "cat1", "cat2"]
# model_lst = ["xgb_params_logloss","xgb_best_params", "lgbm_best_params", "cat_best_params","xgb3", "xgb1", "lgbm1", "lgbm2", "cat1", "cat2"]
model_lst = ["xgb1", "xgb_best_params"]
all_cv_scores = run_models4features(model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=False)    

all_cv_scores.sort_values(by=["Score"], ascending=False)

Model=xgb1
{}
fold: 1, Score: 0.3416550415095285, Run Time: 17.86
fold: 2, Score: 0.40029704173663666, Run Time: 18.71
fold: 3, Score: 0.344217451336274, Run Time: 19.43
fold: 4, Score: 0.41051145554668816, Run Time: 19.75
fold: 5, Score: 0.3569661507290508, Run Time: 22.60
Scores -> Adjusted: 0.34176571 , mean: 0.37072943, std: 0.02896372

=== Model Feature Importance ===
EK 0.5629134
Skewness 0.20382312
Mean_Integrated 0.104443885
SD_DMSNR_Curve 0.03680985
Skewness_DMSNR_Curve 0.024203964
SD 0.024162265
Mean_DMSNR_Curve 0.024045467
EK_DMSNR_Curve 0.019598074


Unnamed: 0_level_0,pred_xgb1
id,Unnamed: 1_level_1
0.0,0
1.0,0
2.0,0
3.0,0
4.0,0


Mode
=== Target Value Counts ===
Model Run Time: 101.21
Model=xgb_best_params
{}
fold: 1, Score: 0.3314563994117856, Run Time: 27.83
fold: 2, Score: 0.39009839963889387, Run Time: 26.25
fold: 3, Score: 0.32764417163981724, Run Time: 26.82
fold: 4, Score: 0.39521304578950905, Run Time: 28.27
fold: 5, Score: 0.35441655842125996, Run Time: 29.18
Scores -> Adjusted: 0.33134597 , mean: 0.35976571, std: 0.02841974

=== Model Feature Importance ===
EK 0.8414009
SD_DMSNR_Curve 0.03167882
Mean_DMSNR_Curve 0.02224032
Skewness 0.021759808
SD 0.021691611
Skewness_DMSNR_Curve 0.021599466
EK_DMSNR_Curve 0.02022381
Mean_Integrated 0.019405171


Unnamed: 0_level_0,pred_xgb_best_params
id,Unnamed: 1_level_1
0.0,0
1.0,0
2.0,0
3.0,0
4.0,0


Mode
=== Target Value Counts ===
Model Run Time: 140.75
CPU times: user 10min 55s, sys: 10.2 s, total: 11min 5s
Wall time: 4min 1s


Unnamed: 0,Model,Score,StdDev,RunTime
0,xgb1,0.37073,0.02896,101.21483
1,xgb_best_params,0.35977,0.02842,140.74902


## Linear Models

In [38]:
# model_lst = ["lin_reg", "lasso", "ridge", "ridge_25", "ridge_50"]
# model_lst = ["lasso", "ridge",  "ridge_50"]
model_lst = []
# all_cv_scores = run_models4features(model_lst, TARGET, FEATURES, all_cv_scores, linear_models=True)    
all_cv_scores = run_models4features(model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=True)    

all_cv_scores.head()

Unnamed: 0,Model,Score,StdDev,RunTime
0,xgb1,0.37073,0.02896,101.21483
1,xgb_best_params,0.35977,0.02842,140.74902


In [39]:
sample_submission.head(20)

Unnamed: 0,id,Class,target_xgb1,target_xgb_best_params
0,117564,0.5,0,0
1,117565,0.5,0,0
2,117566,0.5,0,0
3,117567,0.5,0,0
4,117568,0.5,0,0
5,117569,0.5,1,1
6,117570,0.5,0,0
7,117571,0.5,0,0
8,117572,0.5,0,0
9,117573,0.5,0,0


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Blend Models</h1>
</div>

In [40]:
all_blend_scores = pd.DataFrame(
    {
        "Model": pd.Series(dtype="str"),
        "Score": pd.Series(dtype="float"),
        "StdDev": pd.Series(dtype="float"),
    }
)

In [41]:
len(model_lst)

0

In [42]:
def equal_wt_model(model_lst:List[str], fname:str) -> None:
    target_names = [f"target_{model}" for model in model_lst]
# target_names
    sample_submission[TARGET] = sample_submission[target_names].sum(axis=1) / len(model_lst)
    sample_submission[[ID, TARGET]].to_csv(fname, index=False)
    sample_submission[[ID, TARGET]].tail(8)

In [43]:
def wt_avg_model() -> None:
    sample_submission[TARGET] = (
    #     (sample_submission["target_xgb_bp"] * 2 )
    #     + (sample_submission["target_lgbm_bp"]  )
        (sample_submission["target_xgb1"] * 3 )
        + (sample_submission["target_lgbm1"])
    #     + (sample_submission["target_lgbm2"])    
    #     + (sample_submission["target_lgbm2"])
        + (sample_submission["target_cat1"] )
        + (sample_submission["target_cat2"] )    
    #     + (sample_submission["target_cat_bp"] )
    #     + (sample_submission["target_svc"] )
    #     + (sample_submission["target_log_reg3"] )
    #     + (sample_submission["target_cat2"] )
    )/6

    # sample_submission[TARGET] = sample_submission[TARGET].astype(int)
    sample_submission[[ID, TARGET]].to_csv("submission_wt_avg.csv", index=False)
    sample_submission[[ID, TARGET]].tail(8)



In [44]:
if Config.ensemble_models:
    wt_avg_model()
    model_lst = ["xgb1", "cat1", "lgbm1"]
    equal_wt_model(model_lst, "submission_models_wt_avg.csv")

In [45]:
all_cv_scores.sort_values(by=["Score"], ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime
0,xgb1,0.37073,0.02896,101.21483
1,xgb_best_params,0.35977,0.02842,140.74902


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Level 1 Stack Models</h1>
</div>

In [46]:
## TODO: Generate these dictionaries from model names

train_oof_dict = {
    "train_pred_cat1": "train_pred_cat1.csv",
    "train_pred_cat2": "train_pred_cat2.csv",
    "train_pred_lgbm1": "train_pred_lgbm1.csv",    
    "train_pred_lgbm2": "train_pred_lgbm2.csv",    
    "train_pred_xgb1": "train_pred_xgb1.csv"
}

test_pred_dict = {
    "submission_cat1": "submission_cat1.csv",
    "submission_cat2": "submission_cat2.csv",
    "submission_lgbm1": "submission_lgbm1.csv",
    "submission_lgbm2": "submission_lgbm2.csv",
    "submission_xgb1": "submission_xgb1.csv",
}

In [47]:
def blend_results(train_oof_dict, test_pred_dict):
    oof_df = pd.DataFrame()
    test_preds_df = pd.DataFrame()

    for name, train_oof_fname in train_oof_dict.items():
        fname = "../working/" + train_oof_fname
        print(f"Processing {name}, {train_oof_fname}")
        df = pd.read_csv(fname)
        print(df.head())
#         print(df.iloc[:,1])
        preds = pd.Series(df.iloc[:,1], name=name)
#         print(preds[:5])
        oof_df = pd.concat([oof_df, preds], axis=1)
    #     oof_df = pd.concat([oof_df, pd.Series(np.load(TRAIN_PATH / train_oof), name=name)], axis=1)

    for name, test_pred_fname in test_pred_dict.items():
        fname = "../working/" + test_pred_fname
        print(f"{name}, {test_pred_fname}")
        df = pd.read_csv(fname)
        print(df.head())
        preds = pd.Series(df.iloc[:,1], name=name)
        test_preds_df = pd.concat([test_preds_df, preds], axis=1)

    print("=== oof ===")
    print(oof_df.head())
    print("=== test_preds ===")
    print(test_preds_df.head())
    return oof_df, test_preds_df
    
# (oof_df, preds_df) = blend_results(train_oof_dict, test_pred_dict)    

In [48]:
def load_oof_results(train_oof_dict, test_pred_dict):
    oof_df = pd.DataFrame()
    test_preds_df = pd.DataFrame()

    for name, train_oof_fname in train_oof_dict.items():
        fname = "../working/" + train_oof_fname
        print(f"Processing {name}, {train_oof_fname}")
        df = pd.read_csv(fname)
        print(df.head())
#         print(df.iloc[:,1])
        preds = pd.Series(df.iloc[:,1], name=name)
#         print(preds[:5])
        oof_df = pd.concat([oof_df, preds], axis=1)
    #     oof_df = pd.concat([oof_df, pd.Series(np.load(TRAIN_PATH / train_oof), name=name)], axis=1)

    for name, test_pred_fname in test_pred_dict.items():
        fname = "../working/" + test_pred_fname
        print(f"{name}, {test_pred_fname}")
        df = pd.read_csv(fname)
        print(df.head())
        preds = pd.Series(df.iloc[:,1], name=name)
        test_preds_df = pd.concat([test_preds_df, preds], axis=1)

    print("=== oof ===")
    print(oof_df.head())
    print("=== test_preds ===")
    print(test_preds_df.head())
    return oof_df, test_preds_df
    


In [49]:
if Config.ensemble_models:
    (oof_df, preds_df) = load_oof_results(train_oof_dict, test_pred_dict)
    display(oof_df.head())
    display(preds_df.head())

In [50]:
# type(preds_df)

In [51]:
def run_lr(useful_features:List[str], TARGET:str, train_df:pd.DataFrame, test_df:pd.DataFrame) -> (List[float],List[float]):
    final_predictions = []
    scores = []

    kfold = model_selection.KFold(n_splits=Config.N_FOLDS, shuffle=True, random_state=Config.seed)

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(train_df)):
        xtrain = train_df.iloc[train_idx].reset_index(drop=True)
        xvalid = train_df.iloc[valid_idx].reset_index(drop=True)

        xtest = test_df[useful_features].copy()

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]

#         model = LogisticRegression()
        model = linear_model.LinearRegression()
        # Smaller C means more regularization; default=1.0
        # 2947.0517025518097
#         model = LogisticRegression(max_iter=500, C=2947.0517025518097, penalty='l2',solver='newton-cg')
#         model = LogisticRegression(C = 2947.0517025518097,
#                         max_iter = 500,
#                         penalty = 'l2',
#                         solver = 'liblinear')
        model.fit(xtrain, ytrain)

        preds_valid = model.predict_proba(xvalid)[:,-1]
        test_preds = model.predict_proba(xtest)[:,-1]

        final_predictions.append(test_preds)
#         score = metrics.roc_auc_score(yvalid, preds_valid)
        score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        print(f"Fold={fold}, Score={score}")
        scores.append(score)
    return scores, final_predictions


In [52]:
# useful_features = ["pred_lda", "pred_gbc","pred_gbc2", "pred_cat_bp", "pred_cat1", "pred_lgbm1", "pred_lgbm2", "pred_lgbm_bp", "pred_xgb1", "pred_xgb_bp"]
useful_features = [ "train_pred_cat1", "train_pred_cat2", "train_pred_lgbm1", "train_pred_lgbm2", "train_pred_xgb1"]

In [53]:
# oof_df[useful_features].head()

In [54]:
# preds_df[useful_features].head()

In [55]:
# fold_scores, final_predictions = run_lr(useful_features, TARGET, oof_df, preds_df)
# test_preds = np.mean(np.column_stack(final_predictions), axis=1)
# cv_score, std_dev = show_fold_scores(fold_scores)
# create_submission("level1_lr", TARGET, test_preds)

In [56]:
pd.options.display.max_colwidth = 100
pd.set_option("display.max_rows", 999)
pd.set_option("display.precision", 5)
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.max_colwidth

100

In [57]:
all_cv_scores.sort_values(by=["Score"], ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime
0,xgb1,0.37,0.03,101.21
1,xgb_best_params,0.36,0.03,140.75
