<a href="https://www.kaggle.com/code/mmellinger66/s3e9-concrete-strength-models?scriptVersionId=121568664" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

 <div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Playground Season 3: Episode 9 - Concrete Strength Models</h1>
</div>

## Problem Type

Regression

## Evaluation Metric

$$RMSE = \sqrt{\frac{1}{N} \sum_{i=1}^N (y_i - \hat{y_i})^2}$$

```python
score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
```

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Import Libraries</h1>
</div>

In [1]:
from typing import List, Set, Dict, Tuple, Optional

import os
import time
from pathlib import Path
import glob
import gc

import pandas as pd
import numpy as np

from sklearn import impute
from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import cluster
from sklearn import model_selection
from sklearn import ensemble
from sklearn import datasets

import xgboost as xgb
import catboost as cb
import lightgbm as lgb

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Visualization Libraries
import matplotlib as mpl
import matplotlib.pylab as plt
import seaborn as sns
import missingno as msno
from folium import Map
from folium.plugins import HeatMap
from IPython.display import display_html, display_markdown, display_latex
from colorama import Fore, Style

import warnings
warnings.filterwarnings('ignore')

pd.set_option("display.max_rows", 999)
pd.set_option("display.precision", 5)

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Configuration</h1>
</div>

In [2]:
TARGET="Strength"
ID="id"

# Optuna
objective_direction = "minimize"  # minimize, maximize

In [3]:
class Config:
    path:str = "../input/playground-series-s3e9/"
    load_original_data:bool = False # Some Competitions use synthetic data, based on real data
    original_data_path:str = "../input/predict-concrete-strength/ConcreteStrengthData.csv"
    gpu:bool = False
    optimize:bool = True
    n_optuna_trials:int = 30 # 5, 10, 30
    fast_render:bool = False
    calc_probability:bool = False
    debug:bool = False
    seed:int = 42
    N_ESTIMATORS:int = 100  # 100, 300, 1000, 2000, 5000, 15_000, 20_000 GBDT
    GPU_N_ESTIMATORS:int = 2000 # Want models to run fast during dev
    N_FOLDS:int = 5
        

In [4]:
class clr:
    S = Style.BRIGHT + Fore.LIGHTRED_EX
    E = Style.RESET_ALL

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Library</h1>
</div>

In [5]:
def read_data(path: str, analyze:bool=True) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    data_dir = Path(path)

    train = pd.read_csv(data_dir / "train.csv")
    test = pd.read_csv(data_dir / "test.csv")
    submission_df = pd.read_csv(data_dir / "sample_submission.csv")

    if analyze:
        print(clr.S + "=== Shape of Data ==="+clr.E)
        print(f" train data: Rows={train.shape[0]}, Columns={train.shape[1]}")
        print(f" test data : Rows={test.shape[0]}, Columns={test.shape[1]}")

        print(clr.S + "\n=== Train Data: First 5 Rows ===\n"+clr.E)
        display(train.head())
        print(f"\n{clr.S}=== Train Column Names ==={clr.E}\n")
        display(train.columns)
        print(f"\n{clr.S}=== Features/Explanatory Variables ==={clr.E}\n")
        eval_features(train)
        print(f"\n{clr.S}=== Skewness ==={clr.E}\n")
        check_skew(train)
    return train, test, submission_df

def create_submission(model_name: str, target, preds, seed:int=42, nfolds:int=5) -> pd.DataFrame:
    sample_submission[target] = preds #.astype(int)

    if len(model_name) > 0:
        fname = f"submission_{model_name}_k{nfolds}_s{seed}.csv"
    else:
        fname = "submission.csv"

    sample_submission.to_csv(fname, index=False)

    return sample_submission

def show_classification_scores(ground_truth:List[int], yhat:List[int]) -> None:
    accuracy = metrics.accuracy_score(ground_truth, yhat)
    precision = metrics.precision_score(ground_truth, yhat)
    recall = metrics.recall_score(ground_truth, yhat)
    roc = metrics.roc_auc_score(ground_truth, yhat)
    f1 = metrics.f1_score(ground_truth, yhat)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"ROC: {roc:.4f}")
    print(f"f1: {f1:.4f}")
    

def label_encoder(train:pd.DataFrame, test:pd.DataFrame, columns:List[str]) -> (pd.DataFrame, pd.DataFrame) :
    for col in columns:
        train[col] = train[col].astype(str)
        test[col] = test[col].astype(str)
        train[col] = preprocessing.LabelEncoder().fit_transform(train[col])
        test[col] = preprocessing.LabelEncoder().fit_transform(test[col])
    return train, test   

def create_strat_folds(df:pd.DataFrame, TARGET, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    print(f"TARGET={TARGET}, n_folds={n_folds}, seed={seed}")
    df["fold"] = -1

    kf = model_selection.StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    # kf = GroupKFold(n_splits=Config.N_FOLDS)
    for fold, (train_idx, valid_idx) in enumerate(kf.split(df, df[TARGET])):
        df.loc[valid_idx, "fold"] = fold

    # df.to_csv(f"train_fold{num_folds}.csv", index=False)
    return df


def create_folds(df:pd.DataFrame, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    print(f"n_folds={n_folds}, seed={seed}")
    df["fold"] = -1

    kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=seed)

    for fold, (train_idx, valid_idx) in enumerate(kf.split(df)):
        df.loc[valid_idx, "fold"] = fold

    # df.to_csv(f"train_fold{num_folds}.csv", index=False)
    return df

def show_fold_scores(scores: List[float]) -> (float, float):
    cv_score = np.mean(scores)  # Used in filename
    std_dev = np.std(scores)
    print(
        f"Scores -> Adjusted: {np.mean(scores) - np.std(scores):.8f} , mean: {np.mean(scores):.8f}, std: {np.std(scores):.8f}"
    )
    return cv_score, std_dev


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    continuous_features = list(df.select_dtypes(include=['int64', 'float64', 'uint8']).columns)
    categorical_features = list(df.select_dtypes(include=['object', 'bool']).columns)
    if display:
        print(f"{clr.S}Continuous Features={continuous_features}{clr.E}\n")
        print(f"{clr.S}Categorical Features={categorical_features}{clr.E}")
    return continuous_features, categorical_features   

def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    print("=== Cardinality ===")
    print(df[features].nunique())

## === Model Support ===    

from scipy.stats import mode


def merge_test_predictions(final_test_predictions:List[float], calc_probability:bool=True) -> List[float]:

    if calc_probability:
        print("Mean")
        result = np.mean(np.column_stack(final_test_predictions), axis=1)
    else:
        print("Mode")
        mode_result = mode(np.column_stack(final_test_predictions), axis=1)
        result = mode_result[0].ravel()

    return result

def summary_statistics(X:pd.DataFrame, enhanced=True) -> None:
    desc = X.describe()
    if enhanced:
        desc.loc["var"] = X.var(numeric_only=True).tolist()
        desc.loc["skew"] = X.skew(numeric_only=True).tolist()
        desc.loc["kurt"] = X.kurtosis(numeric_only=True).tolist()

    with pd.option_context("display.precision", 2):
        style = desc.transpose().style.background_gradient(
            cmap="coolwarm"
        )  # .set_precision(4)
    display(style)
    
def show_missing_features(df:pd.DataFrame) -> None:
    missing_vals = df.isna().sum().sort_values(ascending=False)
    print(missing_vals[missing_vals > 0])


def show_duplicate_records(df:pd.DataFrame) -> None:
    dups = df.duplicated()
    print(dups.sum())


def eval_features(df:pd.DataFrame) -> (List[str], List[str], List[str]):
    ## Separate Categorical and Numerical Features
    categorical_features = list(
        df.select_dtypes(include=["category", "object"]).columns
    )
    continuous_features = list(df.select_dtypes(include=["number"]).columns)

    print(f"{clr.S}Continuous features:{clr.E} {continuous_features}")
    print(f"{clr.S}Categorical features:{clr.E} {categorical_features}")
    print("\n --- Cardinality of Categorical Features ---\n")

    for feature in categorical_features:
        cardinality = df[feature].nunique()
        if cardinality < 10:
            print(f"{clr.S}{feature}{clr.E}: cardinality={cardinality}, {df[feature].unique()}")
        else:
            print(f"{clr.S}{feature}{clr.E}: cardinality={cardinality}")
    all_features = categorical_features + continuous_features
    return all_features, categorical_features, continuous_features


def show_feature_importance(feature_importance_lst:List[str]) -> None:
    fis_df = pd.concat(feature_importance_lst, axis=1)

    fis_df.sort_values("0_importance", ascending=True).head(40).plot(
        kind="barh", figsize=(12, 12), title="Feature Importance Across Folds"
    )
    plt.show()


def show_feature_target_crosstab(df:pd.DataFrame, feature_lst:List[str], target:str) -> None:
    for feature in feature_lst:
        print(f"\n=== {feature} vs {target} ===\n")
        display(
            pd.crosstab(df[feature], df[target], margins=True)
        )  # display keeps bold formatting


def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    print(f"{clr.S}=== Cardinality ==={clr.E}")
    print(df[features].nunique())


def show_unique_features(df:pd.DataFrame, features:List[str]) -> None:
    for col in features:
        print(col, sorted(df[col].dropna().unique()))


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    continuous_features = list(
        df.select_dtypes(include=["int64", "float64", "uint8"]).columns
    )
    categorical_features = list(df.select_dtypes(include=["object", "bool"]).columns)
    if display:
        print(f"{clr.S}Continuous Features={clr.E}{continuous_features}\n")
        print(f"{clr.S}Categorical Features={clr.E}{categorical_features}")
    return continuous_features, categorical_features


def describe(X:pd.DataFrame) -> None:
    """Deprecated: Use summary_statistics()"""
    desc = X.describe()
    desc.loc['var'] = X.var(numeric_only=True).tolist()
    desc.loc['skew'] = X.skew(numeric_only=True).tolist()
    desc.loc['kurt'] = X.kurtosis(numeric_only=True).tolist()

    with pd.option_context('display.precision', 2):
        style = desc.transpose().style.background_gradient(cmap='coolwarm') #.set_precision(4)
    display(style)
  

def check_skew(df:pd.DataFrame) -> None:
    skew = df.skew(skipna=True,numeric_only=True).sort_values(ascending=False)
    print(skew)
    
def gpu_ify_lgbm(lgbm_dict):
    if Config.gpu:
        lgbm_dict["device"] = "gpu"
        lgbm_dict["boosting_type"] = "gbdt"
        lgbm_dict["gpu_platform_id"] = 0
        lgbm_dict["gpu_device_id"] = 0
    return lgbm_dict

def gpu_ify_cb(params):
    if Config.gpu:
        params["task_type"] = "GPU"
    return params    


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Optuna Hyperparameter Optimization Library</h1>
</div>

In [6]:
def objective_xgb(trial, X_train, X_valid, y_train, y_valid):

    xgb_params = {
        #         "objective": trial.suggest_categorical("objective", ["multi:softmax"]),
        #         "eval_metric": "mlogloss",
        #         "objective": "multi:softmax",
#         "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),

        "eval_metric": "rmse",  # auc, rmse, mae
        "objective": "reg:squarederror", # Normal Distribution
#         "objective": "reg:gamma", # Gamma Distribution

        #         "enable_categorical": trial.suggest_categorical("use_label_encoder", [True]),
        "use_label_encoder": trial.suggest_categorical("use_label_encoder", [False]),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 5000, 100),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "max_depth": trial.suggest_int("max_depth", 1, 20),  # 10
        "gamma": trial.suggest_float("gamma", 0, 100, step=0.1),
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "tree_method": trial.suggest_categorical(
            "tree_method", ["hist"]
        ),  # hist, gpu_hist
#         "predictor": "gpu_predictor",
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 100),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }

    # Model loading and training
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=5000,
        verbose=0,
    )

    print(f"Number of boosting rounds: {model.best_iteration}")
    #     oof = model.predict_proba(X_valid)[:, 1] # Probability
    oof = model.predict(X_valid)  # Classification: 0,1

    return metrics.mean_squared_error(y_valid, oof, squared=False)


def objective_lgbm(trial, X_train, X_valid, y_train, y_valid):

    lgbm_params = {
        "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [1_000]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [5000]),
        "n_estimators": trial.suggest_int("n_estimators", 700, 5000),
        "importance_type": "gain",
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 300),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        #         'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }

    # Model loading and training
    model = lgb.LGBMRegressor(**lgbm_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric="mae",
        callbacks=[
            lgb.log_evaluation(500),
            lgb.early_stopping(500, False, True),
        ],
    )

    #     print(f"Number of boosting rounds: {model.best_iteration}")
    oof = model.predict(X_valid)

    return metrics.mean_squared_error(y_valid, oof, squared=False)
#     return metrics.mean_absolute_error(y_valid, oof)


def objective_clf_lgbm(trial, X_train, X_valid, y_train, y_valid):

    params = {
        "boosting_type": "gbdt",
        # "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),
        #         "objective": trial.suggest_categorical("objective", ["multi:softprob"]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [1_000]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [5000]),
        "n_estimators": trial.suggest_int("n_estimators", 700, 1000),
        "importance_type": "gain",
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 300),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        #         'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }
    if Config.gpu:
        params["device_type"] = "gpu"

    # Model loading and training
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="mae",
        callbacks=[
            lgb.log_evaluation(500),
            lgb.early_stopping(500, False, True),
        ],
    )

    #     print(f"Number of boosting rounds: {model.best_iteration}")
    oof = model.predict(X_valid)

    #     return accuracy_score(y_valid, oof)
    return metrics.roc_auc_score(y_valid, oof)


def objective_cb(trial, X_train, X_valid, y_train, y_valid):

    cb_params = {
        "iterations": 100,
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.1, 1.0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 100),
        "bagging_temperature": trial.suggest_loguniform(
            "bagging_temperature", 0.1, 20.0
        ),
        "random_strength": trial.suggest_float("random_strength", 1.0, 2.0),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
          "use_best_model": True,
#         "task_type": "GPU",
        "random_seed": 42,
    }

    # Model loading and training
    #  model = CatBoostClassifier(**cb_params)
    model = cb.CatBoostRegressor(**cb_params)

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="accuracy",
        early_stopping_rounds=500,
        verbose=False,
    )

#     print(f"Number of boosting rounds: {model.best_iteration}")
    # oof = model.predict_proba(X_valid)[:, 1]
    oof = model.predict(X_valid)  # Classification
    return metrics.mean_squared_error(y_valid, oof, squared=False)
#     return metrics.mean_absolute_error(y_valid, oof)
# 
#     return accuracy_score(y_valid, oof)

def objective_clf_cb(trial, X_train, X_valid, y_train, y_valid):

    cb_params = {
        "iterations": 10,  # 1000
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.1, 1.0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 100),
        "bagging_temperature": trial.suggest_loguniform(
            "bagging_temperature", 0.1, 20.0
        ),
        "random_strength": trial.suggest_float("random_strength", 1.0, 2.0),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
        "use_best_model": True,
#             "task_type": "GPU",
        "random_seed": 42,
    }

    # Model loading and training
    model = cb.CatBoostClassifier(**cb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="accuracy",
        early_stopping_rounds=500,
        verbose=False,
    )

    # print(f"Number of boosting rounds: {model.best_iteration}")
    # oof = model.predict_proba(X_valid)[:, 1]
    oof = model.predict(X_valid)  # Classification

    return metrics.accuracy_score(y_valid, oof)

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Load Train/Test Data and Analyze</h1>
</div>

## Load the following files

 - train.csv - Data used to build our machine learning model
 - test.csv - Data used to build our machine learning model. Does not contain the target variable
 - sample_submission.csv - A file in the proper format to submit test predictions

In [7]:
%%time
train, test, sample_submission = read_data(Config.path, analyze=True)                                

[1m[91m=== Shape of Data ===[0m
 train data: Rows=5407, Columns=10
 test data : Rows=3605, Columns=9
[1m[91m
=== Train Data: First 5 Rows ===
[0m


Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength
0,0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38
1,1,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52
2,2,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96
3,3,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05
4,4,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19



[1m[91m=== Train Column Names ===[0m



Index(['id', 'CementComponent', 'BlastFurnaceSlag', 'FlyAshComponent',
       'WaterComponent', 'SuperplasticizerComponent',
       'CoarseAggregateComponent', 'FineAggregateComponent', 'AgeInDays',
       'Strength'],
      dtype='object')


[1m[91m=== Features/Explanatory Variables ===[0m

[1m[91mContinuous features:[0m ['id', 'CementComponent', 'BlastFurnaceSlag', 'FlyAshComponent', 'WaterComponent', 'SuperplasticizerComponent', 'CoarseAggregateComponent', 'FineAggregateComponent', 'AgeInDays', 'Strength']
[1m[91mCategorical features:[0m []

 --- Cardinality of Categorical Features ---


[1m[91m=== Skewness ===[0m

AgeInDays                    2.74687
SuperplasticizerComponent    1.41169
FlyAshComponent              1.30469
BlastFurnaceSlag             1.12120
Strength                     0.38073
CementComponent              0.34128
id                           0.00000
CoarseAggregateComponent    -0.08145
WaterComponent              -0.21528
FineAggregateComponent      -0.44738
dtype: float64
CPU times: user 57.3 ms, sys: 6.21 ms, total: 63.5 ms
Wall time: 96.7 ms


In [8]:
train.head()

Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength
0,0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38
1,1,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52
2,2,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96
3,3,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05
4,4,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19


In [9]:
def load_original_data(path:str) -> pd.DataFrame:
    original = pd.read_csv(original_data_path, index_col=[0])
    original = original[-original.depth.isna()]
    print(f"Shape={original.shape}")
    original.head()

if Config.load_original_data:    
    original = load_original_data(Config.original_data_path)

In [10]:
if Config.load_original_data:
    train['is_original']    = 0
    test['is_original']     = 0
    original['is_original'] = 1
    combined = pd.concat([train, original], ignore_index=True).drop_duplicates()
    train = combined
    combined.head()

In [11]:
summary_statistics(train.drop(columns=[ID], axis=1), enhanced=True)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,var,skew,kurt
CementComponent,5407.0,299.17,105.54,102.0,213.7,297.2,375.0,540.0,11138.2,0.34,-0.55
BlastFurnaceSlag,5407.0,58.61,83.42,0.0,0.0,0.0,122.6,359.4,6958.53,1.12,0.0
FlyAshComponent,5407.0,31.87,54.61,0.0,0.0,0.0,79.0,200.1,2981.71,1.3,0.1
WaterComponent,5407.0,185.08,18.52,121.8,175.1,187.4,192.0,247.0,342.9,-0.22,0.84
SuperplasticizerComponent,5407.0,4.11,5.69,0.0,0.0,0.0,8.05,32.2,32.4,1.41,2.2
CoarseAggregateComponent,5407.0,992.0,77.15,801.0,938.2,978.0,1047.0,1145.0,5951.82,-0.08,-0.56
FineAggregateComponent,5407.0,771.22,78.73,594.0,734.3,781.2,821.0,992.6,6197.67,-0.45,-0.01
AgeInDays,5407.0,51.75,70.01,1.0,7.0,28.0,56.0,365.0,4900.98,2.75,8.27
Strength,5407.0,35.45,16.4,2.33,23.64,33.95,45.85,82.6,269.02,0.38,-0.36


## Outlier Detection

In [12]:
# https://www.kaggle.com/code/lyasdemir/best-algorithm-for-prediction-xgboost
    
def iqr(data:pd.DataFrame, var:str):# outliers detecion .
    q1 = np.quantile(data[var], 0.25)
    q3 = np.quantile(data[var], 0.75)
    diff = q3 - q1
    lower_t = q1 - (1.5 * diff)
    upper_t = q3 + (1.5 * diff)
    return data[(data[var] < lower_t) | (data[var] > upper_t)]

# iqr(train, "squareMeters")

In [13]:
# # https://www.kaggle.com/code/sujithmandala/playground-s3-e8-ensemble-model-98-accuracy

# def detect_outliers(data:pd.DataFrame) -> pd.DataFrame:
#     outlier_percents = {}
#     for column in data.columns:
#         if data[column].dtype != object:
#             q1 = np.quantile(data[column], 0.25)
#             q3 = np.quantile(data[column], 0.75)
#             iqr = q3 - q1
#             upper_bound = q3 + (1.5 * iqr)
#             lower_bound = q1 - (1.5 * iqr)
#             outliers = data[(data[column] > upper_bound) | (data[column] < lower_bound)][column]
#             outlier_percentage = len(outliers) / len(data[column]) * 100
#             outlier_percents[column] = outlier_percentage
#             outlier_dataframe = pd.DataFrame(data = outlier_percents.values() ,index=outlier_percents.keys() ,columns=['Outlier_percentage'])
    
#     return outlier_dataframe.sort_values(by = 'Outlier_percentage', ascending = False)

# detect_outliers(train)


In [14]:
# https://www.kaggle.com/code/sujithmandala/playground-s3-e8-ensemble-model-98-accuracy
    
def detect_outliers(data:pd.DataFrame) -> pd.DataFrame:
    outlier_percents = {}
    for column in data.columns:
        if data[column].dtype != object:
            q1 = np.quantile(data[column], 0.25)
            q3 = np.quantile(data[column], 0.75)
            iqr = q3 - q1
            upper_bound = q3 + (1.5 * iqr)
            lower_bound = q1 - (1.5 * iqr)
            outliers = data[(data[column] > upper_bound) | (data[column] < lower_bound)][column]
            outlier_percentage = len(outliers) / len(data[column]) * 100
            outlier_percents[column] = outlier_percentage
            outlier_dataframe = pd.DataFrame(data = outlier_percents.values() ,index=outlier_percents.keys() ,columns=['Outlier_percentage'])
    
    return outlier_dataframe.sort_values(by = 'Outlier_percentage', ascending = False)

detect_outliers(test)

Unnamed: 0,Outlier_percentage
FineAggregateComponent,8.54369
WaterComponent,8.2663
AgeInDays,7.93343
SuperplasticizerComponent,1.47018
BlastFurnaceSlag,0.41609
id,0.0
CementComponent,0.0
FlyAshComponent,0.0
CoarseAggregateComponent,0.0


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Feature Engineering</h1>
</div>

## Categorical/Numerical Variables

## Handle Outliers
- https://www.kaggle.com/code/lyasdemir/best-algorithm-for-prediction-xgboost
- https://www.kaggle.com/code/mnokno/paris-housing-price-prediction-using-xgboost

In [15]:
# features_with_outliers = []

In [16]:
# https://www.kaggle.com/code/mnokno/paris-housing-price-prediction-using-xgboost

def remove_outliers(df:pd.DataFrame) -> pd.DataFrame:
    for c in features_with_outliers:
        if c == 'garage':
            first_percentile = df[c].quantile(0.001)
            df = df[df[c] > first_percentile]

        ninety_ninth_percentile = df[c].quantile(0.999)
        df = df[df[c] < ninety_ninth_percentile]
        #df_t = df_t[(df_t[c] > first_percentile) & (df_t[c] < ninety_ninth_percentile)]
    return df


In [17]:
# print(f'Before: {len(train)}')
# train = remove_outliers(train)
# print(f'After: {len(train)}')

In [18]:
train.head(10)

Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength
0,0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38
1,1,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52
2,2,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96
3,3,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05
4,4,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19
5,5,350.0,0.0,0.0,203.0,0.0,1055.0,775.0,7,37.43
6,6,135.7,203.5,0.0,185.7,0.0,1076.2,759.3,28,35.1
7,7,332.5,142.5,0.0,228.0,0.0,932.0,594.0,28,45.94
8,8,322.0,0.0,0.0,203.0,0.0,974.0,800.0,180,42.14
9,9,133.0,200.0,0.0,192.0,0.0,927.4,839.2,3,6.94


In [19]:
train = train.reset_index(drop=True).copy()
train.head(10)

Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength
0,0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38
1,1,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52
2,2,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96
3,3,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05
4,4,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19
5,5,350.0,0.0,0.0,203.0,0.0,1055.0,775.0,7,37.43
6,6,135.7,203.5,0.0,185.7,0.0,1076.2,759.3,28,35.1
7,7,332.5,142.5,0.0,228.0,0.0,932.0,594.0,28,45.94
8,8,322.0,0.0,0.0,203.0,0.0,974.0,800.0,180,42.14
9,9,133.0,200.0,0.0,192.0,0.0,927.4,839.2,3,6.94


In [20]:
excluded_features = [TARGET, ID, "fold"]

In [21]:
cont_features, cat_features = feature_distribution_types(train, display=True)
show_cardinality(train, cat_features)

cont_features = [feature for feature in cont_features if feature not in excluded_features]
cat_features = [feature for feature in cat_features if feature not in excluded_features]

FEATURES = cont_features + cat_features
FEATURES

[1m[91mContinuous Features=[0m['id', 'CementComponent', 'BlastFurnaceSlag', 'FlyAshComponent', 'WaterComponent', 'SuperplasticizerComponent', 'CoarseAggregateComponent', 'FineAggregateComponent', 'AgeInDays', 'Strength']

[1m[91mCategorical Features=[0m[]
[1m[91m=== Cardinality ===[0m
Series([], dtype: float64)


['CementComponent',
 'BlastFurnaceSlag',
 'FlyAshComponent',
 'WaterComponent',
 'SuperplasticizerComponent',
 'CoarseAggregateComponent',
 'FineAggregateComponent',
 'AgeInDays']

In [22]:
train, test = label_encoder(train, test, cat_features)
# train = pd.get_dummies(train,columns=[]) # Will remove original feature names
# test = pd.get_dummies(test,columns=[])

In [23]:
train.head()

Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength
0,0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38
1,1,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52
2,2,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96
3,3,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05
4,4,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19


In [24]:
# cont_features, cat_features = feature_distribution_types(train, display=True)
# show_cardinality(train, cat_features)

# cont_features = [feature for feature in cont_features if feature not in excluded_features]
# cat_features = [feature for feature in cat_features if feature not in excluded_features]

# FEATURES = cont_features + cat_features
# FEATURES

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Optuna Hyperparameter Optimization</h1>
</div>

In [25]:
%%time

if Config.optimize:
    y = train[TARGET]
    X = train[FEATURES].copy()

    X_test = test[FEATURES].copy()
    X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
        X, y, test_size=0.2, random_state=Config.seed
    )

# === XGB ===

time_limit = 3600 * 3
# best_xgb_params = {}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction)
    study.optimize(
        lambda trial: objective_xgb(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best XGB trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_xgb_params = study.best_trial.params

## === LGBM ===

time_limit = 3600 * 3
best_lgbm_params = {}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction)
    study.optimize(
        lambda trial: objective_lgbm(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best LGBM trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_lgbm_params = study.best_trial.params

## === CatBoost

time_limit = 3600 * 3
# best_cb_params = {}
best_cb_params = {'learning_rate': 0.45743264601999495,
                  'l2_leaf_reg': 41.338946049390074,
                  'bagging_temperature': 0.3472567739474319,
                  'random_strength': 1.7332249677756242, 
                  'depth': 1,
                  'min_data_in_leaf': 6}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction)
    study.optimize(
        lambda trial: objective_cb(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best Cat trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_cb_params = study.best_trial.params

[32m[I 2023-03-09 14:06:02,133][0m A new study created in memory with name: no-name-b0f8f09d-d8b4-4e69-a5c9-393164087337[0m
[32m[I 2023-03-09 14:06:24,569][0m Trial 0 finished with value: 11.851587476700484 and parameters: {'use_label_encoder': False, 'n_estimators': 2800, 'learning_rate': 0.03003418839652684, 'subsample': 0.5, 'colsample_bytree': 0.8300000000000001, 'max_depth': 15, 'gamma': 4.7, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 3.60297299312024e-06, 'reg_alpha': 0.7424561439734308, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 23.415941272339346}. Best is trial 0 with value: 11.851587476700484.[0m


Number of boosting rounds: 141


[32m[I 2023-03-09 14:06:41,506][0m Trial 1 finished with value: 12.0890318545615 and parameters: {'use_label_encoder': False, 'n_estimators': 2600, 'learning_rate': 0.1714134212097408, 'subsample': 0.66, 'colsample_bytree': 0.16999999999999998, 'max_depth': 4, 'gamma': 27.6, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 7.145637552083565e-08, 'reg_alpha': 0.28943329881571833, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 8.524518432425804}. Best is trial 0 with value: 11.851587476700484.[0m


Number of boosting rounds: 238


[32m[I 2023-03-09 14:07:06,857][0m Trial 2 finished with value: 11.83551137687714 and parameters: {'use_label_encoder': False, 'n_estimators': 4200, 'learning_rate': 0.017259290349559805, 'subsample': 0.71, 'colsample_bytree': 0.27, 'max_depth': 2, 'gamma': 78.2, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 4.415361823868192, 'reg_alpha': 21.07969727624985, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.49461487808481913}. Best is trial 2 with value: 11.83551137687714.[0m


Number of boosting rounds: 1963


[32m[I 2023-03-09 14:07:35,171][0m Trial 3 finished with value: 11.835314542403435 and parameters: {'use_label_encoder': False, 'n_estimators': 4100, 'learning_rate': 0.0764339052316334, 'subsample': 0.33999999999999997, 'colsample_bytree': 0.89, 'max_depth': 5, 'gamma': 56.300000000000004, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.0010676150371551423, 'reg_alpha': 16.912553356073264, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.6042091855624282}. Best is trial 3 with value: 11.835314542403435.[0m


Number of boosting rounds: 58


[32m[I 2023-03-09 14:08:08,514][0m Trial 4 finished with value: 12.316746683817414 and parameters: {'use_label_encoder': False, 'n_estimators': 3700, 'learning_rate': 0.022436726580736404, 'subsample': 0.45000000000000007, 'colsample_bytree': 0.5700000000000001, 'max_depth': 19, 'gamma': 12.700000000000001, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.14437866923425566, 'reg_alpha': 3.19843178049245e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 3.0403330321152233}. Best is trial 3 with value: 11.835314542403435.[0m


Number of boosting rounds: 208


[32m[I 2023-03-09 14:08:24,861][0m Trial 5 finished with value: 38.07693772976177 and parameters: {'use_label_encoder': False, 'n_estimators': 2700, 'learning_rate': 0.23098552881780854, 'subsample': 0.17, 'colsample_bytree': 0.42, 'max_depth': 19, 'gamma': 52.300000000000004, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 2.683641308841734, 'reg_alpha': 0.08046889604753274, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 936.0649571268101}. Best is trial 3 with value: 11.835314542403435.[0m


Number of boosting rounds: 0


[32m[I 2023-03-09 14:08:44,767][0m Trial 6 finished with value: 11.979806016229617 and parameters: {'use_label_encoder': False, 'n_estimators': 2900, 'learning_rate': 0.059201007504632606, 'subsample': 0.76, 'colsample_bytree': 0.8300000000000001, 'max_depth': 7, 'gamma': 69.9, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.05036006233264888, 'reg_alpha': 1.9059755063001657e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.1686996975115254}. Best is trial 3 with value: 11.835314542403435.[0m


Number of boosting rounds: 62


[32m[I 2023-03-09 14:09:07,660][0m Trial 7 finished with value: 11.84068355692924 and parameters: {'use_label_encoder': False, 'n_estimators': 3900, 'learning_rate': 0.18975690125281924, 'subsample': 0.98, 'colsample_bytree': 0.51, 'max_depth': 3, 'gamma': 49.300000000000004, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 6.124220645163113e-08, 'reg_alpha': 0.26766907053545463, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 4.313224168511583}. Best is trial 3 with value: 11.835314542403435.[0m


Number of boosting rounds: 62


[32m[I 2023-03-09 14:09:43,361][0m Trial 8 finished with value: 12.433197718785628 and parameters: {'use_label_encoder': False, 'n_estimators': 5000, 'learning_rate': 0.08557204194029286, 'subsample': 0.7, 'colsample_bytree': 0.09, 'max_depth': 16, 'gamma': 20.1, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 2.4135904091181625e-07, 'reg_alpha': 1.7183405901609838, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 6.995967447925987}. Best is trial 3 with value: 11.835314542403435.[0m


Number of boosting rounds: 405


[32m[I 2023-03-09 14:10:01,762][0m Trial 9 finished with value: 11.752559504447502 and parameters: {'use_label_encoder': False, 'n_estimators': 2700, 'learning_rate': 0.09606814397730487, 'subsample': 0.75, 'colsample_bytree': 0.7400000000000001, 'max_depth': 7, 'gamma': 93.2, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 47.91949078792177, 'reg_alpha': 1.5709320929830089e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 137.36053551865243}. Best is trial 9 with value: 11.752559504447502.[0m


Number of boosting rounds: 79


[32m[I 2023-03-09 14:10:10,105][0m Trial 10 finished with value: 11.784067516699913 and parameters: {'use_label_encoder': False, 'n_estimators': 1300, 'learning_rate': 0.036463634827560244, 'subsample': 0.99, 'colsample_bytree': 0.68, 'max_depth': 10, 'gamma': 89.9, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 70.18093917582297, 'reg_alpha': 1.3419142786990587e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 109.55399596061868}. Best is trial 9 with value: 11.752559504447502.[0m


Number of boosting rounds: 209


[32m[I 2023-03-09 14:10:17,676][0m Trial 11 finished with value: 11.791858284064771 and parameters: {'use_label_encoder': False, 'n_estimators': 1200, 'learning_rate': 0.03618414148155403, 'subsample': 0.96, 'colsample_bytree': 0.64, 'max_depth': 9, 'gamma': 98.5, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 30.887437622603017, 'reg_alpha': 2.0116833787652186e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 128.593645933401}. Best is trial 9 with value: 11.752559504447502.[0m


Number of boosting rounds: 159


[32m[I 2023-03-09 14:10:28,646][0m Trial 12 finished with value: 11.781895208463725 and parameters: {'use_label_encoder': False, 'n_estimators': 1400, 'learning_rate': 0.01029185864405986, 'subsample': 0.85, 'colsample_bytree': 1.0, 'max_depth': 12, 'gamma': 98.2, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 74.6552897749594, 'reg_alpha': 1.709090438512561e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 97.01962547753473}. Best is trial 9 with value: 11.752559504447502.[0m


Number of boosting rounds: 589


[32m[I 2023-03-09 14:10:42,342][0m Trial 13 finished with value: 11.781512469180198 and parameters: {'use_label_encoder': False, 'n_estimators': 2000, 'learning_rate': 0.012906091946173004, 'subsample': 0.85, 'colsample_bytree': 0.9800000000000001, 'max_depth': 13, 'gamma': 99.10000000000001, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 45.49110390117457, 'reg_alpha': 5.221744863098077e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 69.29291581887594}. Best is trial 9 with value: 11.752559504447502.[0m


Number of boosting rounds: 447


[32m[I 2023-03-09 14:10:58,154][0m Trial 14 finished with value: 11.852993178444127 and parameters: {'use_label_encoder': False, 'n_estimators': 2000, 'learning_rate': 0.010209769691940374, 'subsample': 0.83, 'colsample_bytree': 1.0, 'max_depth': 13, 'gamma': 78.30000000000001, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.5202810287928051, 'reg_alpha': 9.212013324455923e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 33.50592385812763}. Best is trial 9 with value: 11.752559504447502.[0m


Number of boosting rounds: 372


[32m[I 2023-03-09 14:11:10,394][0m Trial 15 finished with value: 11.96919979637357 and parameters: {'use_label_encoder': False, 'n_estimators': 2000, 'learning_rate': 0.10571241489427102, 'subsample': 0.5700000000000001, 'colsample_bytree': 0.7500000000000001, 'max_depth': 7, 'gamma': 85.2, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.003881426956996623, 'reg_alpha': 0.004590756662838092, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 435.93323072871266}. Best is trial 9 with value: 11.752559504447502.[0m


Number of boosting rounds: 219


[32m[I 2023-03-09 14:11:24,854][0m Trial 16 finished with value: 11.766060915369637 and parameters: {'use_label_encoder': False, 'n_estimators': 2100, 'learning_rate': 0.04874461283885389, 'subsample': 0.86, 'colsample_bytree': 0.9, 'max_depth': 15, 'gamma': 64.60000000000001, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 3.366965043891646, 'reg_alpha': 1.0863702531262558e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 249.14984647819347}. Best is trial 9 with value: 11.752559504447502.[0m


Number of boosting rounds: 104


[32m[I 2023-03-09 14:11:45,930][0m Trial 17 finished with value: 11.930767998776476 and parameters: {'use_label_encoder': False, 'n_estimators': 3400, 'learning_rate': 0.050988155795797546, 'subsample': 0.6, 'colsample_bytree': 0.7500000000000001, 'max_depth': 16, 'gamma': 39.0, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 1.1022544514518395, 'reg_alpha': 0.000370931893629338, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 352.0913500422466}. Best is trial 9 with value: 11.752559504447502.[0m


Number of boosting rounds: 162


[32m[I 2023-03-09 14:12:00,933][0m Trial 18 finished with value: 11.969009375418969 and parameters: {'use_label_encoder': False, 'n_estimators': 2300, 'learning_rate': 0.11882637724975868, 'subsample': 0.35, 'colsample_bytree': 0.42, 'max_depth': 8, 'gamma': 66.5, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.018740764979926664, 'reg_alpha': 1.2201914747927462e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 271.34564475439066}. Best is trial 9 with value: 11.752559504447502.[0m


Number of boosting rounds: 219


[32m[I 2023-03-09 14:12:20,301][0m Trial 19 finished with value: 12.058141846386334 and parameters: {'use_label_encoder': False, 'n_estimators': 3300, 'learning_rate': 0.05589871315315912, 'subsample': 0.78, 'colsample_bytree': 0.7200000000000001, 'max_depth': 1, 'gamma': 63.400000000000006, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 5.482986780047665, 'reg_alpha': 2.1016390150555004e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 912.9074121250435}. Best is trial 9 with value: 11.752559504447502.[0m


Number of boosting rounds: 954


[32m[I 2023-03-09 14:12:32,061][0m Trial 20 finished with value: 11.873034222224897 and parameters: {'use_label_encoder': False, 'n_estimators': 1700, 'learning_rate': 0.12957668413281406, 'subsample': 0.91, 'colsample_bytree': 0.8800000000000001, 'max_depth': 11, 'gamma': 40.2, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.3582085723676739, 'reg_alpha': 2.2867458463410416e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 30.280669609510355}. Best is trial 9 with value: 11.752559504447502.[0m


Number of boosting rounds: 26


[32m[I 2023-03-09 14:12:44,875][0m Trial 21 finished with value: 11.812943288365464 and parameters: {'use_label_encoder': False, 'n_estimators': 2100, 'learning_rate': 0.0716176921277993, 'subsample': 0.87, 'colsample_bytree': 0.92, 'max_depth': 14, 'gamma': 89.5, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 13.157120391580134, 'reg_alpha': 2.541865013922371e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 68.7679171091423}. Best is trial 9 with value: 11.752559504447502.[0m


Number of boosting rounds: 64


[32m[I 2023-03-09 14:13:01,289][0m Trial 22 finished with value: 11.753694301231352 and parameters: {'use_label_encoder': False, 'n_estimators': 2400, 'learning_rate': 0.04084890085638608, 'subsample': 0.76, 'colsample_bytree': 0.9600000000000001, 'max_depth': 18, 'gamma': 78.0, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 16.964756294496723, 'reg_alpha': 4.169569976711264e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 200.41319038437823}. Best is trial 9 with value: 11.752559504447502.[0m


Number of boosting rounds: 166


[32m[I 2023-03-09 14:13:16,815][0m Trial 23 finished with value: 11.783835602279678 and parameters: {'use_label_encoder': False, 'n_estimators': 2400, 'learning_rate': 0.04229868013187092, 'subsample': 0.64, 'colsample_bytree': 0.8200000000000001, 'max_depth': 18, 'gamma': 77.0, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 4.68972813379816, 'reg_alpha': 1.2816690936747007e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 211.2378375909627}. Best is trial 9 with value: 11.752559504447502.[0m


Number of boosting rounds: 190


[32m[I 2023-03-09 14:13:37,501][0m Trial 24 finished with value: 11.96091318661333 and parameters: {'use_label_encoder': False, 'n_estimators': 3200, 'learning_rate': 0.048839910377481574, 'subsample': 0.78, 'colsample_bytree': 0.6200000000000001, 'max_depth': 17, 'gamma': 72.4, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.43833902633341076, 'reg_alpha': 4.488253785145492e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 472.43165038722833}. Best is trial 9 with value: 11.752559504447502.[0m


Number of boosting rounds: 444


[32m[I 2023-03-09 14:13:48,972][0m Trial 25 finished with value: 11.7509805567806 and parameters: {'use_label_encoder': False, 'n_estimators': 1700, 'learning_rate': 0.026583382129172623, 'subsample': 0.72, 'colsample_bytree': 0.93, 'max_depth': 20, 'gamma': 85.0, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 10.764022572340519, 'reg_alpha': 0.00033386385954958987, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 172.0839142630924}. Best is trial 25 with value: 11.7509805567806.[0m


Number of boosting rounds: 200


[32m[I 2023-03-09 14:13:56,108][0m Trial 26 finished with value: 11.787882583491184 and parameters: {'use_label_encoder': False, 'n_estimators': 1000, 'learning_rate': 0.027428056592348454, 'subsample': 0.72, 'colsample_bytree': 0.8, 'max_depth': 19, 'gamma': 87.0, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 14.748375052902537, 'reg_alpha': 9.255121773586997e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 47.35104620940159}. Best is trial 25 with value: 11.7509805567806.[0m


Number of boosting rounds: 194


[32m[I 2023-03-09 14:14:08,155][0m Trial 27 finished with value: 11.7989985882998 and parameters: {'use_label_encoder': False, 'n_estimators': 1700, 'learning_rate': 0.022553456176037774, 'subsample': 0.52, 'colsample_bytree': 0.9400000000000001, 'max_depth': 20, 'gamma': 82.5, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 82.47028298256024, 'reg_alpha': 0.0017926464604304992, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 166.599975876115}. Best is trial 25 with value: 11.7509805567806.[0m


Number of boosting rounds: 388


[32m[I 2023-03-09 14:14:24,350][0m Trial 28 finished with value: 11.789500204098502 and parameters: {'use_label_encoder': False, 'n_estimators': 2500, 'learning_rate': 0.06364229340283035, 'subsample': 0.63, 'colsample_bytree': 0.78, 'max_depth': 20, 'gamma': 91.4, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.07891870554651519, 'reg_alpha': 0.0002083696549433865, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 163.53966130538416}. Best is trial 25 with value: 11.7509805567806.[0m


Number of boosting rounds: 136


[32m[I 2023-03-09 14:14:44,715][0m Trial 29 finished with value: 11.787401468686179 and parameters: {'use_label_encoder': False, 'n_estimators': 3000, 'learning_rate': 0.032096726541319336, 'subsample': 0.44000000000000006, 'colsample_bytree': 0.8600000000000001, 'max_depth': 5, 'gamma': 58.300000000000004, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 7.948874376534874e-05, 'reg_alpha': 4.627100749192583e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 15.332751973120537}. Best is trial 25 with value: 11.7509805567806.[0m
[32m[I 2023-03-09 14:14:44,729][0m A new study created in memory with name: no-name-0610f14c-ab88-44c4-851f-15212a785b32[0m


Number of boosting rounds: 156
Number of finished trials: 30
Best XGB trial parameters: {'use_label_encoder': False, 'n_estimators': 1700, 'learning_rate': 0.026583382129172623, 'subsample': 0.72, 'colsample_bytree': 0.93, 'max_depth': 20, 'gamma': 85.0, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 10.764022572340519, 'reg_alpha': 0.00033386385954958987, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 172.0839142630924}
Best score: 11.7509805567806
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 7.05779	training's rmse: 9.34192	valid_1's l1: 9.72237	valid_1's rmse: 12.4315


[32m[I 2023-03-09 14:14:51,494][0m Trial 0 finished with value: 12.02815578019798 and parameters: {'objective': 'rmse', 'n_estimators': 1798, 'reg_alpha': 1.9085608751790307e-08, 'reg_lambda': 8.926513370487635, 'colsample_bytree': 0.1, 'num_leaves': 714, 'feature_fraction': 0.20878060653622138, 'bagging_fraction': 0.9520645837645765, 'bagging_freq': 0, 'min_child_samples': 49, 'subsample': 0.89, 'learning_rate': 0.16251399117281135, 'max_depth': 96, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 12.02815578019798.[0m


Early stopping, best iteration is:
[77]	training's l1: 8.4688	training's rmse: 10.9556	valid_1's l1: 9.46558	valid_1's rmse: 12.0282
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 6.85259	valid_1's l1: 9.77508


[32m[I 2023-03-09 14:14:56,842][0m Trial 1 finished with value: 12.165006635881902 and parameters: {'objective': 'mae', 'n_estimators': 4703, 'reg_alpha': 1.8551535115822015e-08, 'reg_lambda': 0.01921771727303322, 'colsample_bytree': 0.11, 'num_leaves': 417, 'feature_fraction': 0.38187081919135746, 'bagging_fraction': 0.39846633415972477, 'bagging_freq': 8, 'min_child_samples': 31, 'subsample': 0.77, 'learning_rate': 0.23624859530994705, 'max_depth': 77, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 12.02815578019798.[0m


Early stopping, best iteration is:
[33]	training's l1: 8.59485	valid_1's l1: 9.38923
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 9.35037	valid_1's l1: 9.33536


[32m[I 2023-03-09 14:14:58,107][0m Trial 2 finished with value: 12.032660804372291 and parameters: {'objective': 'mae', 'n_estimators': 2089, 'reg_alpha': 6.267924114706836, 'reg_lambda': 7.577791189556983e-08, 'colsample_bytree': 0.43, 'num_leaves': 315, 'feature_fraction': 0.4891279841565984, 'bagging_fraction': 0.18689673433947718, 'bagging_freq': 6, 'min_child_samples': 231, 'subsample': 0.33, 'learning_rate': 0.12355179369847026, 'max_depth': 74, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 12.02815578019798.[0m


Early stopping, best iteration is:
[250]	training's l1: 9.42369	valid_1's l1: 9.30465
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 9.20403	valid_1's l1: 9.25207


[32m[I 2023-03-09 14:14:59,995][0m Trial 3 finished with value: 11.987782156923503 and parameters: {'objective': 'mae', 'n_estimators': 2706, 'reg_alpha': 5.946839154051634e-07, 'reg_lambda': 1.0916627101450757e-06, 'colsample_bytree': 0.1, 'num_leaves': 662, 'feature_fraction': 0.6629526182915046, 'bagging_fraction': 0.3244253837008844, 'bagging_freq': 2, 'min_child_samples': 236, 'subsample': 0.71, 'learning_rate': 0.025737432269812774, 'max_depth': 39, 'random_state': 42, 'n_jobs': 4}. Best is trial 3 with value: 11.987782156923503.[0m


Early stopping, best iteration is:
[295]	training's l1: 9.28925	valid_1's l1: 9.23424
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.80138	training's rmse: 11.4454	valid_1's l1: 9.3167	valid_1's rmse: 11.8753


[32m[I 2023-03-09 14:15:04,414][0m Trial 4 finished with value: 11.810295273292537 and parameters: {'objective': 'rmse', 'n_estimators': 3906, 'reg_alpha': 0.00033326905954209513, 'reg_lambda': 0.6128142609943305, 'colsample_bytree': 0.42, 'num_leaves': 854, 'feature_fraction': 0.8854701560733099, 'bagging_fraction': 0.611216118161131, 'bagging_freq': 10, 'min_child_samples': 115, 'subsample': 0.98, 'learning_rate': 0.018089780544579295, 'max_depth': 10, 'random_state': 42, 'n_jobs': 4}. Best is trial 4 with value: 11.810295273292537.[0m


Early stopping, best iteration is:
[298]	training's l1: 8.96879	training's rmse: 11.6465	valid_1's l1: 9.24638	valid_1's rmse: 11.8103
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 5.93924	valid_1's l1: 9.63144


[32m[I 2023-03-09 14:15:47,507][0m Trial 5 finished with value: 12.008575654090144 and parameters: {'objective': 'mae', 'n_estimators': 3630, 'reg_alpha': 0.062049403047582455, 'reg_lambda': 0.0015138715353808715, 'colsample_bytree': 0.8600000000000001, 'num_leaves': 278, 'feature_fraction': 0.9832931515771172, 'bagging_fraction': 0.8453936374466992, 'bagging_freq': 9, 'min_child_samples': 9, 'subsample': 0.25, 'learning_rate': 0.01777157087878067, 'max_depth': 42, 'random_state': 42, 'n_jobs': 4}. Best is trial 4 with value: 11.810295273292537.[0m


Early stopping, best iteration is:
[129]	training's l1: 7.31447	valid_1's l1: 9.42849
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.19126	valid_1's l1: 9.29737
[1000]	training's l1: 7.83502	valid_1's l1: 9.30616


[32m[I 2023-03-09 14:16:04,716][0m Trial 6 finished with value: 11.986764470242052 and parameters: {'objective': 'mae', 'n_estimators': 4684, 'reg_alpha': 0.0027369338771402354, 'reg_lambda': 1.0427633818346825e-05, 'colsample_bytree': 0.6200000000000001, 'num_leaves': 428, 'feature_fraction': 0.2693103722490542, 'bagging_fraction': 0.8227286644966991, 'bagging_freq': 3, 'min_child_samples': 50, 'subsample': 0.84, 'learning_rate': 0.033263231504328354, 'max_depth': 45, 'random_state': 42, 'n_jobs': 4}. Best is trial 4 with value: 11.810295273292537.[0m


Early stopping, best iteration is:
[630]	training's l1: 8.04653	valid_1's l1: 9.27941
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 9.26878	valid_1's l1: 9.84227
[1000]	training's l1: 8.65819	valid_1's l1: 9.37907
[1500]	training's l1: 8.41554	valid_1's l1: 9.26316
[2000]	training's l1: 8.29176	valid_1's l1: 9.25257
[2500]	training's l1: 8.21099	valid_1's l1: 9.25014
Early stopping, best iteration is:
[2295]	training's l1: 8.23938	valid_1's l1: 9.24879


[32m[I 2023-03-09 14:16:24,753][0m Trial 7 finished with value: 11.97621960499265 and parameters: {'objective': 'mae', 'n_estimators': 3322, 'reg_alpha': 1.590089950702624e-07, 'reg_lambda': 0.00034629155413754014, 'colsample_bytree': 0.16999999999999998, 'num_leaves': 800, 'feature_fraction': 0.27904843939111074, 'bagging_fraction': 0.9139901923988982, 'bagging_freq': 1, 'min_child_samples': 81, 'subsample': 0.83, 'learning_rate': 0.012452672425502487, 'max_depth': 9, 'random_state': 42, 'n_jobs': 4}. Best is trial 4 with value: 11.810295273292537.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 7.05618	valid_1's l1: 9.68267


[32m[I 2023-03-09 14:16:30,272][0m Trial 8 finished with value: 11.918483072612485 and parameters: {'objective': 'mae', 'n_estimators': 3764, 'reg_alpha': 0.000477347588999238, 'reg_lambda': 0.05720953111281878, 'colsample_bytree': 0.8700000000000001, 'num_leaves': 649, 'feature_fraction': 0.8809872641595673, 'bagging_fraction': 0.3752235341092639, 'bagging_freq': 8, 'min_child_samples': 35, 'subsample': 0.47, 'learning_rate': 0.1448873506825007, 'max_depth': 19, 'random_state': 42, 'n_jobs': 4}. Best is trial 4 with value: 11.810295273292537.[0m


Early stopping, best iteration is:
[31]	training's l1: 8.70706	valid_1's l1: 9.17061
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 5.06782	valid_1's l1: 9.9253


[32m[I 2023-03-09 14:17:06,924][0m Trial 9 finished with value: 12.187396412966558 and parameters: {'objective': 'mae', 'n_estimators': 4525, 'reg_alpha': 2.7113615496519945e-05, 'reg_lambda': 8.404070029466674e-08, 'colsample_bytree': 0.41, 'num_leaves': 471, 'feature_fraction': 0.7085595650568052, 'bagging_fraction': 0.6911976510923408, 'bagging_freq': 10, 'min_child_samples': 8, 'subsample': 0.29000000000000004, 'learning_rate': 0.07193872944810122, 'max_depth': 45, 'random_state': 42, 'n_jobs': 4}. Best is trial 4 with value: 11.810295273292537.[0m


Early stopping, best iteration is:
[44]	training's l1: 7.30105	valid_1's l1: 9.5557
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 10.3682	training's rmse: 13.1735	valid_1's l1: 10.4902	valid_1's rmse: 13.1271
[1000]	training's l1: 9.57436	training's rmse: 12.2985	valid_1's l1: 9.71352	valid_1's rmse: 12.2912


[32m[I 2023-03-09 14:17:11,378][0m Trial 10 finished with value: 12.096658256998731 and parameters: {'objective': 'rmse', 'n_estimators': 1311, 'reg_alpha': 4.854422626881078e-05, 'reg_lambda': 7.887162689705155, 'colsample_bytree': 0.64, 'num_leaves': 1000, 'feature_fraction': 0.10330557969293208, 'bagging_fraction': 0.5937753647138042, 'bagging_freq': 15, 'min_child_samples': 147, 'subsample': 0.59, 'learning_rate': 0.010368436819235724, 'max_depth': 23, 'random_state': 42, 'n_jobs': 4}. Best is trial 4 with value: 11.810295273292537.[0m


Did not meet early stopping. Best iteration is:
[1311]	training's l1: 9.37041	training's rmse: 12.0752	valid_1's l1: 9.54605	valid_1's rmse: 12.0967
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.69331	training's rmse: 11.3016	valid_1's l1: 9.43671	valid_1's rmse: 12.0629


[32m[I 2023-03-09 14:17:13,844][0m Trial 11 finished with value: 11.814249059295536 and parameters: {'objective': 'rmse', 'n_estimators': 3814, 'reg_alpha': 0.0009314371313486148, 'reg_lambda': 0.11577441226608164, 'colsample_bytree': 1.0, 'num_leaves': 929, 'feature_fraction': 0.9419241516771741, 'bagging_fraction': 0.4832579040805594, 'bagging_freq': 12, 'min_child_samples': 126, 'subsample': 0.47, 'learning_rate': 0.05745486190990423, 'max_depth': 6, 'random_state': 42, 'n_jobs': 4}. Best is trial 4 with value: 11.810295273292537.[0m


Early stopping, best iteration is:
[65]	training's l1: 9.2631	training's rmse: 11.9825	valid_1's l1: 9.23257	valid_1's rmse: 11.8142
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 9.10513	training's rmse: 11.8364	valid_1's l1: 9.24836	valid_1's rmse: 11.8452


[32m[I 2023-03-09 14:17:15,421][0m Trial 12 finished with value: 11.81705527662424 and parameters: {'objective': 'rmse', 'n_estimators': 3919, 'reg_alpha': 0.01317978580868753, 'reg_lambda': 0.17263919067311395, 'colsample_bytree': 0.9600000000000001, 'num_leaves': 983, 'feature_fraction': 0.867899640133249, 'bagging_fraction': 0.5475436284629154, 'bagging_freq': 13, 'min_child_samples': 130, 'subsample': 1.0, 'learning_rate': 0.052017822961170336, 'max_depth': 2, 'random_state': 42, 'n_jobs': 4}. Best is trial 4 with value: 11.810295273292537.[0m


Early stopping, best iteration is:
[351]	training's l1: 9.15042	training's rmse: 11.9088	valid_1's l1: 9.20275	valid_1's rmse: 11.8171
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.91317	training's rmse: 11.5901	valid_1's l1: 9.29031	valid_1's rmse: 11.8499


[32m[I 2023-03-09 14:17:17,871][0m Trial 13 finished with value: 11.77682085910787 and parameters: {'objective': 'rmse', 'n_estimators': 2868, 'reg_alpha': 1.4266413904116099e-05, 'reg_lambda': 0.4333846484572322, 'colsample_bytree': 0.31, 'num_leaves': 73, 'feature_fraction': 0.9916534837202523, 'bagging_fraction': 0.533427663303321, 'bagging_freq': 12, 'min_child_samples': 180, 'subsample': 0.48, 'learning_rate': 0.032671626300517544, 'max_depth': 24, 'random_state': 42, 'n_jobs': 4}. Best is trial 13 with value: 11.77682085910787.[0m


Early stopping, best iteration is:
[200]	training's l1: 9.1093	training's rmse: 11.8442	valid_1's l1: 9.19177	valid_1's rmse: 11.7768
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.83432	training's rmse: 11.488	valid_1's l1: 9.27307	valid_1's rmse: 11.8429


[32m[I 2023-03-09 14:17:20,682][0m Trial 14 finished with value: 11.778394769498139 and parameters: {'objective': 'rmse', 'n_estimators': 2962, 'reg_alpha': 1.193557701702041e-05, 'reg_lambda': 0.9191287837461071, 'colsample_bytree': 0.27, 'num_leaves': 32, 'feature_fraction': 0.7923371776840348, 'bagging_fraction': 0.703733943653242, 'bagging_freq': 12, 'min_child_samples': 178, 'subsample': 0.14, 'learning_rate': 0.026958099572810052, 'max_depth': 27, 'random_state': 42, 'n_jobs': 4}. Best is trial 13 with value: 11.77682085910787.[0m


Early stopping, best iteration is:
[191]	training's l1: 9.09608	training's rmse: 11.8069	valid_1's l1: 9.2067	valid_1's rmse: 11.7784
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 9.13919	training's rmse: 11.8562	valid_1's l1: 9.26655	valid_1's rmse: 11.84


[32m[I 2023-03-09 14:17:22,482][0m Trial 15 finished with value: 11.798479821741182 and parameters: {'objective': 'rmse', 'n_estimators': 2726, 'reg_alpha': 4.993606799656694e-06, 'reg_lambda': 0.0037076198684627285, 'colsample_bytree': 0.26, 'num_leaves': 4, 'feature_fraction': 0.7581046958977714, 'bagging_fraction': 0.7054430070649886, 'bagging_freq': 15, 'min_child_samples': 189, 'subsample': 0.1, 'learning_rate': 0.03329818642973715, 'max_depth': 27, 'random_state': 42, 'n_jobs': 4}. Best is trial 13 with value: 11.77682085910787.[0m


Early stopping, best iteration is:
[344]	training's l1: 9.17035	training's rmse: 11.9291	valid_1's l1: 9.2158	valid_1's rmse: 11.7985
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.96036	training's rmse: 11.6509	valid_1's l1: 9.23723	valid_1's rmse: 11.7918


[32m[I 2023-03-09 14:17:24,297][0m Trial 16 finished with value: 11.75806473547123 and parameters: {'objective': 'rmse', 'n_estimators': 2136, 'reg_alpha': 2.4126280499148727e-06, 'reg_lambda': 0.9825528920012386, 'colsample_bytree': 0.28, 'num_leaves': 16, 'feature_fraction': 0.9876820587955093, 'bagging_fraction': 0.7183120564251952, 'bagging_freq': 12, 'min_child_samples': 293, 'subsample': 0.12000000000000001, 'learning_rate': 0.03676305450855212, 'max_depth': 60, 'random_state': 42, 'n_jobs': 4}. Best is trial 16 with value: 11.75806473547123.[0m


Early stopping, best iteration is:
[144]	training's l1: 9.20484	training's rmse: 11.9405	valid_1's l1: 9.17394	valid_1's rmse: 11.7581
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 9.12656	training's rmse: 11.7758	valid_1's l1: 9.4711	valid_1's rmse: 12.0706


[32m[I 2023-03-09 14:17:25,955][0m Trial 17 finished with value: 11.938279433632575 and parameters: {'objective': 'rmse', 'n_estimators': 2206, 'reg_alpha': 9.401853887819864e-07, 'reg_lambda': 0.009763904137589981, 'colsample_bytree': 0.29, 'num_leaves': 136, 'feature_fraction': 0.9678051560118597, 'bagging_fraction': 0.49233127879505373, 'bagging_freq': 5, 'min_child_samples': 292, 'subsample': 0.63, 'learning_rate': 0.0817960502729469, 'max_depth': 63, 'random_state': 42, 'n_jobs': 4}. Best is trial 16 with value: 11.75806473547123.[0m


Early stopping, best iteration is:
[215]	training's l1: 9.24181	training's rmse: 11.9595	valid_1's l1: 9.37218	valid_1's rmse: 11.9383
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.96264	training's rmse: 11.6393	valid_1's l1: 9.28188	valid_1's rmse: 11.8348


[32m[I 2023-03-09 14:17:28,075][0m Trial 18 finished with value: 11.789297356106543 and parameters: {'objective': 'rmse', 'n_estimators': 747, 'reg_alpha': 2.7128929407107315e-06, 'reg_lambda': 0.5689054131635087, 'colsample_bytree': 0.53, 'num_leaves': 232, 'feature_fraction': 0.5622087830339986, 'bagging_fraction': 0.8022594607027989, 'bagging_freq': 13, 'min_child_samples': 297, 'subsample': 0.41000000000000003, 'learning_rate': 0.04040387248235547, 'max_depth': 56, 'random_state': 42, 'n_jobs': 4}. Best is trial 16 with value: 11.75806473547123.[0m


Early stopping, best iteration is:
[156]	training's l1: 9.19921	training's rmse: 11.9269	valid_1's l1: 9.21635	valid_1's rmse: 11.7893
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.883	training's rmse: 11.5297	valid_1's l1: 9.27604	valid_1's rmse: 11.8271


[32m[I 2023-03-09 14:17:30,205][0m Trial 19 finished with value: 11.78295624056928 and parameters: {'objective': 'rmse', 'n_estimators': 1547, 'reg_alpha': 6.736954856876336e-05, 'reg_lambda': 5.469969513146517e-05, 'colsample_bytree': 0.33, 'num_leaves': 160, 'feature_fraction': 0.9965879095204401, 'bagging_fraction': 0.6849959087715414, 'bagging_freq': 11, 'min_child_samples': 260, 'subsample': 0.2, 'learning_rate': 0.045446281889459934, 'max_depth': 68, 'random_state': 42, 'n_jobs': 4}. Best is trial 16 with value: 11.75806473547123.[0m


Early stopping, best iteration is:
[132]	training's l1: 9.17748	training's rmse: 11.8998	valid_1's l1: 9.18228	valid_1's rmse: 11.783
Training until validation scores don't improve for 500 rounds


[32m[I 2023-03-09 14:17:31,440][0m Trial 20 finished with value: 12.015954610136797 and parameters: {'objective': 'rmse', 'n_estimators': 2377, 'reg_alpha': 2.2768588235908566e-07, 'reg_lambda': 0.008757650130279961, 'colsample_bytree': 0.2, 'num_leaves': 127, 'feature_fraction': 0.7988330883042422, 'bagging_fraction': 0.24611042994143023, 'bagging_freq': 6, 'min_child_samples': 200, 'subsample': 0.39, 'learning_rate': 0.07427147035826014, 'max_depth': 87, 'random_state': 42, 'n_jobs': 4}. Best is trial 16 with value: 11.75806473547123.[0m


[500]	training's l1: 9.26596	training's rmse: 11.9808	valid_1's l1: 9.5712	valid_1's rmse: 12.1282
Early stopping, best iteration is:
[102]	training's l1: 9.49368	training's rmse: 12.2472	valid_1's l1: 9.39606	valid_1's rmse: 12.016
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.83515	training's rmse: 11.484	valid_1's l1: 9.30042	valid_1's rmse: 11.8526


[32m[I 2023-03-09 14:17:33,939][0m Trial 21 finished with value: 11.782633207192552 and parameters: {'objective': 'rmse', 'n_estimators': 3238, 'reg_alpha': 3.121273391886688e-05, 'reg_lambda': 0.7420700201481897, 'colsample_bytree': 0.36, 'num_leaves': 30, 'feature_fraction': 0.8234907951725574, 'bagging_fraction': 0.7181460458072143, 'bagging_freq': 13, 'min_child_samples': 179, 'subsample': 0.11, 'learning_rate': 0.0261309768232691, 'max_depth': 32, 'random_state': 42, 'n_jobs': 4}. Best is trial 16 with value: 11.75806473547123.[0m


Early stopping, best iteration is:
[192]	training's l1: 9.08204	training's rmse: 11.8004	valid_1's l1: 9.20505	valid_1's rmse: 11.7826
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.86433	training's rmse: 11.5214	valid_1's l1: 9.29115	valid_1's rmse: 11.8791


[32m[I 2023-03-09 14:17:36,618][0m Trial 22 finished with value: 11.789050411258993 and parameters: {'objective': 'rmse', 'n_estimators': 3047, 'reg_alpha': 6.1261444803953315e-06, 'reg_lambda': 1.7014948367300375, 'colsample_bytree': 0.22000000000000003, 'num_leaves': 69, 'feature_fraction': 0.8792492589030425, 'bagging_fraction': 0.6185890607149485, 'bagging_freq': 11, 'min_child_samples': 165, 'subsample': 0.2, 'learning_rate': 0.02623274030490553, 'max_depth': 56, 'random_state': 42, 'n_jobs': 4}. Best is trial 16 with value: 11.75806473547123.[0m


Early stopping, best iteration is:
[188]	training's l1: 9.11437	training's rmse: 11.8266	valid_1's l1: 9.21281	valid_1's rmse: 11.7891
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.77778	training's rmse: 11.407	valid_1's l1: 9.33958	valid_1's rmse: 11.9357


[32m[I 2023-03-09 14:17:39,368][0m Trial 23 finished with value: 11.783856394680184 and parameters: {'objective': 'rmse', 'n_estimators': 2595, 'reg_alpha': 6.570233172372166e-06, 'reg_lambda': 0.07424695359438366, 'colsample_bytree': 0.53, 'num_leaves': 205, 'feature_fraction': 0.9342682510955859, 'bagging_fraction': 0.7607402623204249, 'bagging_freq': 14, 'min_child_samples': 210, 'subsample': 0.17, 'learning_rate': 0.03816907791623252, 'max_depth': 17, 'random_state': 42, 'n_jobs': 4}. Best is trial 16 with value: 11.75806473547123.[0m


Early stopping, best iteration is:
[140]	training's l1: 9.10865	training's rmse: 11.8199	valid_1's l1: 9.20001	valid_1's rmse: 11.7839
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.60555	training's rmse: 11.1978	valid_1's l1: 9.30256	valid_1's rmse: 11.8961


[32m[I 2023-03-09 14:17:43,741][0m Trial 24 finished with value: 11.787149781397103 and parameters: {'objective': 'rmse', 'n_estimators': 1927, 'reg_alpha': 8.114704451609162e-05, 'reg_lambda': 2.795972845112726, 'colsample_bytree': 0.05, 'num_leaves': 354, 'feature_fraction': 0.7686411397426198, 'bagging_fraction': 0.6350141252159053, 'bagging_freq': 11, 'min_child_samples': 92, 'subsample': 0.33, 'learning_rate': 0.02126392667455391, 'max_depth': 29, 'random_state': 42, 'n_jobs': 4}. Best is trial 16 with value: 11.75806473547123.[0m


Early stopping, best iteration is:
[207]	training's l1: 8.95464	training's rmse: 11.6151	valid_1's l1: 9.2296	valid_1's rmse: 11.7871
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.80913	training's rmse: 11.4441	valid_1's l1: 9.2755	valid_1's rmse: 11.848


[32m[I 2023-03-09 14:17:47,289][0m Trial 25 finished with value: 11.786612558261323 and parameters: {'objective': 'rmse', 'n_estimators': 3034, 'reg_alpha': 1.3878182678301719e-06, 'reg_lambda': 0.3082584034260783, 'colsample_bytree': 0.35, 'num_leaves': 91, 'feature_fraction': 0.9859884740608733, 'bagging_fraction': 0.995152942235974, 'bagging_freq': 12, 'min_child_samples': 271, 'subsample': 0.5, 'learning_rate': 0.032299058418064706, 'max_depth': 34, 'random_state': 42, 'n_jobs': 4}. Best is trial 16 with value: 11.75806473547123.[0m


Early stopping, best iteration is:
[211]	training's l1: 9.039	training's rmse: 11.7353	valid_1's l1: 9.20948	valid_1's rmse: 11.7866
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.94122	training's rmse: 11.6101	valid_1's l1: 9.31615	valid_1's rmse: 11.8739


[32m[I 2023-03-09 14:17:49,146][0m Trial 26 finished with value: 11.825292644899587 and parameters: {'objective': 'rmse', 'n_estimators': 2378, 'reg_alpha': 1.2476837649515388e-05, 'reg_lambda': 1.826124899676338, 'colsample_bytree': 0.48, 'num_leaves': 547, 'feature_fraction': 0.9232933758909432, 'bagging_fraction': 0.528887736208675, 'bagging_freq': 9, 'min_child_samples': 224, 'subsample': 0.17, 'learning_rate': 0.047913524994376455, 'max_depth': 52, 'random_state': 42, 'n_jobs': 4}. Best is trial 16 with value: 11.75806473547123.[0m


Early stopping, best iteration is:
[102]	training's l1: 9.2788	training's rmse: 12.0015	valid_1's l1: 9.23236	valid_1's rmse: 11.8253
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 9.33543	training's rmse: 12.0955	valid_1's l1: 9.25994	valid_1's rmse: 11.8335
[1000]	training's l1: 9.22266	training's rmse: 11.9859	valid_1's l1: 9.21714	valid_1's rmse: 11.7987
[1500]	training's l1: 9.18428	training's rmse: 11.9281	valid_1's l1: 9.23112	valid_1's rmse: 11.8041


[32m[I 2023-03-09 14:17:52,306][0m Trial 27 finished with value: 11.786812713935962 and parameters: {'objective': 'rmse', 'n_estimators': 4274, 'reg_alpha': 3.039645166649787e-07, 'reg_lambda': 0.03923848836473931, 'colsample_bytree': 0.26, 'num_leaves': 3, 'feature_fraction': 0.8318969401174845, 'bagging_fraction': 0.7537465490204122, 'bagging_freq': 14, 'min_child_samples': 258, 'subsample': 0.64, 'learning_rate': 0.015225409587980587, 'max_depth': 18, 'random_state': 42, 'n_jobs': 4}. Best is trial 16 with value: 11.75806473547123.[0m


Early stopping, best iteration is:
[1288]	training's l1: 9.19202	training's rmse: 11.9502	valid_1's l1: 9.21428	valid_1's rmse: 11.7868
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.71125	training's rmse: 11.3284	valid_1's l1: 9.29606	valid_1's rmse: 11.8714


[32m[I 2023-03-09 14:17:56,070][0m Trial 28 finished with value: 11.805891516743289 and parameters: {'objective': 'rmse', 'n_estimators': 1203, 'reg_alpha': 0.00010927610908182821, 'reg_lambda': 0.1851596005107168, 'colsample_bytree': 0.16999999999999998, 'num_leaves': 208, 'feature_fraction': 0.7297124470214318, 'bagging_fraction': 0.8701699641638159, 'bagging_freq': 12, 'min_child_samples': 159, 'subsample': 0.26, 'learning_rate': 0.02275979152137522, 'max_depth': 37, 'random_state': 42, 'n_jobs': 4}. Best is trial 16 with value: 11.75806473547123.[0m


Early stopping, best iteration is:
[199]	training's l1: 9.01256	training's rmse: 11.6948	valid_1's l1: 9.23254	valid_1's rmse: 11.8059
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.12849	training's rmse: 10.5683	valid_1's l1: 9.42062	valid_1's rmse: 12.0244


[32m[I 2023-03-09 14:18:02,364][0m Trial 29 finished with value: 11.831971173213084 and parameters: {'objective': 'rmse', 'n_estimators': 1722, 'reg_alpha': 1.194923487133325e-07, 'reg_lambda': 8.96823946979217, 'colsample_bytree': 0.6100000000000001, 'num_leaves': 92, 'feature_fraction': 0.8088060036485755, 'bagging_fraction': 0.8969048713187127, 'bagging_freq': 10, 'min_child_samples': 93, 'subsample': 0.38, 'learning_rate': 0.034974284441852375, 'max_depth': 94, 'random_state': 42, 'n_jobs': 4}. Best is trial 16 with value: 11.75806473547123.[0m
[32m[I 2023-03-09 14:18:02,379][0m A new study created in memory with name: no-name-8fc1c47c-199f-4006-bd8d-7e66b9e242ec[0m


Early stopping, best iteration is:
[150]	training's l1: 8.7353	training's rmse: 11.3611	valid_1's l1: 9.26049	valid_1's rmse: 11.832
Number of finished trials: 30
Best LGBM trial parameters: {'objective': 'rmse', 'n_estimators': 2136, 'reg_alpha': 2.4126280499148727e-06, 'reg_lambda': 0.9825528920012386, 'colsample_bytree': 0.28, 'num_leaves': 16, 'feature_fraction': 0.9876820587955093, 'bagging_fraction': 0.7183120564251952, 'bagging_freq': 12, 'min_child_samples': 293, 'subsample': 0.12000000000000001, 'learning_rate': 0.03676305450855212, 'max_depth': 60, 'random_state': 42, 'n_jobs': 4}
Best score: 11.75806473547123


[32m[I 2023-03-09 14:18:02,692][0m Trial 0 finished with value: 11.790006400173874 and parameters: {'learning_rate': 0.12180312804917959, 'l2_leaf_reg': 3.247918417062669, 'bagging_temperature': 1.6728816248481448, 'random_strength': 1.2903753399439, 'depth': 3, 'min_data_in_leaf': 181}. Best is trial 0 with value: 11.790006400173874.[0m
[32m[I 2023-03-09 14:18:02,858][0m Trial 1 finished with value: 11.873613133954489 and parameters: {'learning_rate': 0.9045290414301004, 'l2_leaf_reg': 2.186021560358404, 'bagging_temperature': 12.5694013750239, 'random_strength': 1.3891154670197983, 'depth': 2, 'min_data_in_leaf': 176}. Best is trial 0 with value: 11.790006400173874.[0m
[32m[I 2023-03-09 14:18:03,061][0m Trial 2 finished with value: 11.79208360281892 and parameters: {'learning_rate': 0.22245938131793974, 'l2_leaf_reg': 33.21351567602374, 'bagging_temperature': 0.26818477298698223, 'random_strength': 1.1682438627533505, 'depth': 4, 'min_data_in_leaf': 132}. Best is trial 0 with

Number of finished trials: 30
Best Cat trial parameters: {'learning_rate': 0.12560437453534345, 'l2_leaf_reg': 42.53962135449095, 'bagging_temperature': 5.622566206398244, 'random_strength': 1.79297844224526, 'depth': 8, 'min_data_in_leaf': 237}
Best score: 11.743682049675485
CPU times: user 37min 5s, sys: 1min 52s, total: 38min 58s
Wall time: 12min 16s


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Train Models with Cross Validation</h1>
</div>

In [26]:
train = create_folds(train, Config.N_FOLDS)
# train = create_strat_folds(train, TARGET, Config.N_FOLDS)

n_folds=5, seed=42


In [27]:
all_cv_scores = pd.DataFrame(
    {
        "Model": pd.Series(dtype="str"),
        "Score": pd.Series(dtype="float"),
        "StdDev": pd.Series(dtype="float"),
        "RunTime": pd.Series(dtype="float"),
    }
)

oof = train[[ID, TARGET, "fold"]].copy().reset_index(drop=True).copy()
oof.set_index(ID, inplace=True)
oof.head()

Unnamed: 0_level_0,Strength,fold
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,10.38,2
1,23.52,3
2,36.96,3
3,39.05,4
4,74.19,4


In [28]:
def show_tree_model_fi(model, features:List[str]) -> None:
    print("\n=== Model Feature Importance ===")
    for i in model.feature_importances_.argsort()[::-1]:
        print(features[i], model.feature_importances_[i]/model.feature_importances_.sum())

def save_oof_predictions(model_name:str, final_valid_predictions, oof:pd.DataFrame) -> pd.DataFrame:
    final_valid_predictions_df = process_valid_predictions(
        final_valid_predictions, ID, model_name
    )
    display(final_valid_predictions_df.head())
    oof[f"pred_{model_name}"] = final_valid_predictions_df[f"pred_{model_name}"]

    return oof

def save_test_predictions(model_name:str, final_test_predictions, submission_df:pd.DataFrame, result_field:str=TARGET) -> None:
    result = merge_test_predictions(final_test_predictions, Config.calc_probability)
    # result[:20]
    submission_df[f"target_{model_name}"] = result #.astype(int)
    #     submission_df.head(10)
    ss = submission_df[[ID, f"target_{model_name}"]].copy().reset_index(drop=True)
    ss.rename(columns={f"target_{model_name}": result_field}, inplace=True)
    ss.to_csv(
        f"submission_{model_name}.csv", index=False
    )  # Can submit the individual model
    print("=== Target Value Counts ===")
#     display(ss[TARGET].value_counts())
    ss.head(10)

def process_valid_predictions(final_valid_predictions, train_id, model_name:str) -> pd.DataFrame:
    model = f"pred_{model_name}"
    final_valid_predictions_df = pd.DataFrame.from_dict(
        final_valid_predictions, orient="index"
    ).reset_index()
    final_valid_predictions_df.columns = [train_id, model]
    final_valid_predictions_df.set_index(train_id, inplace=True)
    final_valid_predictions_df.sort_index(inplace=True)
    final_valid_predictions_df.to_csv(f"train_pred_{model_name}.csv", index=True)

    return final_valid_predictions_df

def add_score(score_df:pd.DataFrame, model_name:str, score:float, std:float):
    dict1 = {"Model": model_name, "Score": cv_score, "StdDev": std_dev}
    score_df = score_df.append(dict1, ignore_index=True)
    return score_df

In [29]:
def train_cv_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid,
    params,
    n_folds:int=5,
    seed:int=42,
):

    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(
            drop=True
        )  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)
        xtest = test.copy()

        valid_ids = xvalid.id.values.tolist()  # Id's of everything in validation fold

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[FEATURES]
        xvalid = xvalid[FEATURES]

        scaler = preprocessing.StandardScaler()
#         scaler = preprocessing.MinMaxScaler()
        xtrain = scaler.fit(xtrain).transform(xtrain)
        xvalid = scaler.transform(xvalid)
        xtest = scaler.transform(xtest)

        model = get_model_fn # ()

        model.fit(
            xtrain,
            ytrain,
        )
        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        preds_valid_class = model.predict(xvalid)
        
        final_test_predictions.append(test_preds)
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

#         fold_score = metrics.accuracy_score(yvalid, preds_valid_class)  # Validation Set Score
        fold_score = metrics.mean_absolute_error(
            yvalid, preds_valid
        ) 
#         fold_score = metrics.roc_auc_score(yvalid.values, preds_valid)  # Validation Set Score
#         show_classification_scores(yvalid.values, preds_valid_class)

#         fold_score = metrics.roc_auc_score(yvalid, preds_valid)  # Validation Set Score
#         fold_score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        fold_scores.append(fold_score)
        #         importance_list.append(model.coef_.ravel())

        fi = []
        # Feature importance
#         fi = pd.DataFrame(
#             index=FEATURES,
#             data=model.coef_.ravel(),
#             columns=[f"{fold}_importance"],
#         )
        
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )


def train_xgb_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid:str,
    params,
    n_folds:int=5,
    seed:int=42,
):

    print(params)
    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(
            drop=True
        )  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)
        xtest = test.copy()

        valid_ids = xvalid.id.values.tolist()  # Id's of everything in validation fold

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[FEATURES]
        xvalid = xvalid[FEATURES]

        model = get_model_fn # (params)

        model.fit(
            xtrain,
            ytrain,
            eval_set=[(xvalid, yvalid)],
            #             eval_metric="acc",  # auc
            verbose=0,
            #             early_stopping_rounds=3000,
            #             callbacks=[
            #                 xgb.log_evaluation(0),
            #                 xgb.early_stopping(500, False, True),
            #             ],
        )

        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        preds_valid_class = model.predict(xvalid)
        
        final_test_predictions.append(test_preds)
        if Config.debug:
            print(f"GT Type: {type(yvalid.values)}")
            print(f"Preds Type: {type(preds_valid_class)}")
            print(f"         GT:{yvalid.values[:20]}")
            print(f"Preds Class:{preds_valid_class[:20]}")
            print(f"Preds Prob:{preds_valid[:20]}")
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid_class)))

#         fold_score = metrics.cohen_kappa_score(yvalid,  preds_valid_class, weights = "quadratic")
#         fold_score = metrics.roc_auc_score(yvalid.values, preds_valid)  # Validation Set Score
#         show_classification_scores(yvalid.values, preds_valid_class)
        fold_score = metrics.mean_absolute_error(
            yvalid, preds_valid
        )  # Validation Set Score
#         fold_score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        fold_scores.append(fold_score)

        # Feature importance
        fi = pd.DataFrame(
            index=FEATURES,
            data=model.feature_importances_,
            columns=[f"{fold}_importance"],
        )
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )        

In [30]:
def run_linear_model(model_dict, model_name:str, features:List[str], oof:pd.DataFrame) -> (float, float, pd.DataFrame):
    (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_cv_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        False, #Config.calc_probability,
        ID,
        {},
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    oof.head()
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof


def run_tree_model(model_dict, model_name:str, features:List[str], params, oof:pd.DataFrame) -> (float, float, pd.DataFrame):
    (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_xgb_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        Config.calc_probability,
        ID,
        params,
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)
    show_tree_model_fi(model, features)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    oof.head()
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof

In [31]:
%%time

def run_models4features(model_dict, model_lst:List[str], target:str, feature_lst:List[str], all_cv_scores:pd.DataFrame, linear_models:bool=True) -> pd.DataFrame:

    oof = train[[ID, target, "fold"]].copy().reset_index(drop=True).copy()
    oof.set_index(ID, inplace=True)

    for idx, m in enumerate(model_lst):
        model = model_lst[idx]
        start_time = time.time()

        print(f"Model={model}")

        params = {}
        if linear_models:
                cv_score, std_dev, oof = run_linear_model(model_dict, model, feature_lst, oof)

        else:
            cv_score, std_dev, oof = run_tree_model(model_dict, model, feature_lst, params, oof)

        run_time = time.time() - start_time

        score_dict = {"Model": model, "Score": cv_score, "StdDev": std_dev, "RunTime": run_time}
        all_cv_scores = all_cv_scores.append(score_dict, ignore_index=True)
        print(f"Model Run Time: {run_time:.2f}")

    return all_cv_scores




CPU times: user 19 µs, sys: 0 ns, total: 19 µs
Wall time: 24.8 µs


In [32]:
lgbm_params = {'n_estimators': Config.N_ESTIMATORS,
                 'num_rounds': 404,
                 'learning_rate': 0.19,
                 'num_leaves': 17,
                 'max_depth': 8,
                 'min_data_in_leaf': 36,
                 'lambda_l1': 0.96,
                 'lambda_l2': 0.01,
                 'min_gain_to_split': 11.32,
                 'bagging_fraction': 0.6,
                 'feature_fraction': 0.9}


lgbm_params3 = {
    "n_estimators": Config.N_ESTIMATORS,
    'max_depth': 9,
    'learning_rate': 0.01,
    'min_data_in_leaf': 36, 
    'num_leaves': 100, 
    'feature_fraction': 0.8, 
    'bagging_fraction': 0.89, 
    'bagging_freq': 5, 
    'lambda_l2': 28,
    
    'seed': Config.seed,
    'objective': 'regression',
#     'boosting_type': 'gbdt',
#     'device': 'gpu', 
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0,
    'n_jobs': -1,
    'metric': 'rmse',
    'verbose': -1
}
    
lgbm_params = gpu_ify_lgbm(lgbm_params)

In [33]:
xgb_params = {
    "n_estimators": Config.N_ESTIMATORS,  # 10_000,
    "max_depth": 10,  # 10
    "objective": "reg:squarederror",
    #     "enable_categorical": True,  # Only works with gpu_hist
    #     "eval_metric": "mae",
    #     "metric": "mae",
    #     "enable_categorical": True,
    "n_jobs": 8,  # 4
    "seed": Config.seed,
    "tree_method": "hist",
    #         "gpu_id": 0,
    "subsample": 0.9,  # 0.7
    "colsample_bytree": 0.7,
    "use_label_encoder": False,
    "learning_rate": 0.05,  # 0.01
}

xgb_params3 = {
    'n_estimators': Config.N_ESTIMATORS,
    'learning_rate': 0.05,
    'max_depth': 10,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:squarederror'
}

if Config.gpu:
    xgb_params["tree_method"] = "gpu_hist"
else:
    xgb_params["tree_method"] = "hist"

In [34]:
cb_params = {
    #     "learning_rate": 0.3277295792305584,
    "learning_rate": 0.05,
    "l2_leaf_reg": 3.1572972266001518,
    "bagging_temperature": 0.6799604234141348,
    "random_strength": 1.99590400593318,
    "depth": 10,
    "min_data_in_leaf": 93,
    # "iterations": 100,  # 10000
    "n_estimators": Config.N_ESTIMATORS,  # 10000
    "use_best_model": True,
    #     "task_type": "GPU",
    "random_seed": Config.seed,
}

cb_params = gpu_ify_cb(cb_params)

In [35]:
model_estimator_dict = {
    "xgb2": xgb.XGBRegressor(**xgb_params),
    "xgb_best_params": xgb.XGBRegressor(**best_xgb_params),
    "xgb3": xgb.XGBRegressor(**xgb_params3),


    "lgbm1": lgb.LGBMRegressor(**lgbm_params),

    "cat1": cb.CatBoostRegressor(),
    "cat2": cb.CatBoostRegressor(**cb_params),
    "cat_best_params": cb.CatBoostRegressor(**best_cb_params),

    "xgb1": xgb.XGBRegressor(),
    "lgbm0": lgb.LGBMRegressor(),
    "lgbm3": lgb.LGBMRegressor(lgbm_params3),
    "lgbm2": lgb.LGBMRegressor(
        learning_rate=0.05,
        max_depth=15,
        num_leaves=11,
        feature_fraction=0.3,
        subsample=0.1,
        n_jobs=-1,
    ),
    "lgbm3": lgb.LGBMRegressor(**lgbm_params),
    "lgbm_best_params": lgb.LGBMRegressor(**best_lgbm_params),


    "lin_reg": linear_model.LinearRegression(),
    "lasso": linear_model.Lasso(),
    "ridge": linear_model.Ridge(max_iter=7000),
    "ridge_25": linear_model.Ridge(fit_intercept=True, solver='auto', alpha=0.25, max_iter=7000),
    "ridge_50": linear_model.Ridge(fit_intercept=True, solver='auto', alpha=0.5, max_iter=7000),
}

## Tree Models

In [36]:
%%time

# model_lst = ["xgb3","xgb_best_params", "lgbm_best_params", "cat_best_params", "xgb1", "xgb2", "lgbm1", "lgbm2", "cat1", "cat2"]
model_lst = ["xgb_best_params", "lgbm_best_params", "cat_best_params","xgb3", "xgb1", "xgb2", "lgbm0", "lgbm1", "lgbm2", "lgbm3", "cat1", "cat2"]
# model_lst = = []
all_cv_scores = run_models4features(model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=False)    

all_cv_scores.sort_values(by=["Score"], ascending=False)

Model=xgb_best_params
{}
fold: 1, Score: 9.409036894320561, Run Time: 8.32
fold: 2, Score: 9.2928264615452, Run Time: 8.21
fold: 3, Score: 9.371602597902706, Run Time: 8.39
fold: 4, Score: 9.483317996721153, Run Time: 9.42
fold: 5, Score: 9.7292257627616, Run Time: 8.25
Scores -> Adjusted: 9.30797364 , mean: 9.45720194, std: 0.14922830

=== Model Feature Importance ===
AgeInDays 0.60631526
SuperplasticizerComponent 0.08268008
CementComponent 0.06433448
WaterComponent 0.054532167
BlastFurnaceSlag 0.050775565
FlyAshComponent 0.04870969
CoarseAggregateComponent 0.047657117
FineAggregateComponent 0.044995666


Unnamed: 0_level_0,pred_xgb_best_params
id,Unnamed: 1_level_1
0,22.13408
1,36.87865
2,41.96119
3,41.2819
4,45.38861


Mode
=== Target Value Counts ===
Model Run Time: 42.77
Model=lgbm_best_params
{}
fold: 1, Score: 9.33430753535656, Run Time: 5.67
fold: 2, Score: 9.252310997936647, Run Time: 5.58
fold: 3, Score: 9.39964587921021, Run Time: 5.98
fold: 4, Score: 9.533593266231636, Run Time: 6.61
fold: 5, Score: 9.715435010797792, Run Time: 6.76
Scores -> Adjusted: 9.28433082 , mean: 9.44705854, std: 0.16272771

=== Model Feature Importance ===
FineAggregateComponent 0.2231605132233569
CementComponent 0.1866980885048442
CoarseAggregateComponent 0.18355590468709085
WaterComponent 0.12496726891856506
AgeInDays 0.08398795496203194
BlastFurnaceSlag 0.08182770358732652
SuperplasticizerComponent 0.07266300078554595
FlyAshComponent 0.04313956533123854


Unnamed: 0_level_0,pred_lgbm_best_params
id,Unnamed: 1_level_1
0,22.30303
1,33.60191
2,41.57354
3,42.08131
4,45.97864


Mode
=== Target Value Counts ===
Model Run Time: 30.75
Model=cat_best_params
{}
fold: 1, Score: 9.216152903197527, Run Time: 4.32
fold: 2, Score: 9.218171382472054, Run Time: 4.43
fold: 3, Score: 9.343340029051454, Run Time: 4.40
fold: 4, Score: 9.355010391430222, Run Time: 4.52
fold: 5, Score: 9.540395721268492, Run Time: 4.71
Scores -> Adjusted: 9.21592947 , mean: 9.33461409, std: 0.11868461

=== Model Feature Importance ===
AgeInDays 0.6295079073223016
CementComponent 0.08379424980280639
SuperplasticizerComponent 0.06885192167959814
WaterComponent 0.05914209180276117
FineAggregateComponent 0.05305249677252544
CoarseAggregateComponent 0.0462193330698278
BlastFurnaceSlag 0.031071235459093575
FlyAshComponent 0.02836076409108591


Unnamed: 0_level_0,pred_cat_best_params
id,Unnamed: 1_level_1
0,20.92891
1,35.22385
2,37.3686
3,42.69714
4,45.42625


Mode
=== Target Value Counts ===
Model Run Time: 22.53
Model=xgb3
{}
fold: 1, Score: 9.591613470877824, Run Time: 1.28
fold: 2, Score: 9.415113115680857, Run Time: 1.30
fold: 3, Score: 9.526932664812989, Run Time: 1.32
fold: 4, Score: 9.568835225665493, Run Time: 1.32
fold: 5, Score: 9.88312261281468, Run Time: 1.29
Scores -> Adjusted: 9.44176537 , mean: 9.59712342, std: 0.15535805

=== Model Feature Importance ===
AgeInDays 0.634637
SuperplasticizerComponent 0.08174761
CoarseAggregateComponent 0.053952668
FineAggregateComponent 0.051661685
WaterComponent 0.050079975
CementComponent 0.047334425
FlyAshComponent 0.043003216
BlastFurnaceSlag 0.03758341


Unnamed: 0_level_0,pred_xgb3
id,Unnamed: 1_level_1
0,27.88871
1,35.15163
2,37.53448
3,40.34013
4,46.67713


Mode
=== Target Value Counts ===
Model Run Time: 6.67
Model=xgb1
{}
fold: 1, Score: 9.84209825928242, Run Time: 0.85
fold: 2, Score: 9.702522084170921, Run Time: 0.87
fold: 3, Score: 9.79027357074974, Run Time: 0.88
fold: 4, Score: 9.864710404892744, Run Time: 0.88
fold: 5, Score: 10.264338967842047, Run Time: 0.89
Scores -> Adjusted: 9.69882049 , mean: 9.89278866, std: 0.19396817

=== Model Feature Importance ===
AgeInDays 0.54361254
SuperplasticizerComponent 0.10752861
FineAggregateComponent 0.06606827
WaterComponent 0.06416693
CementComponent 0.060588468
CoarseAggregateComponent 0.06044707
FlyAshComponent 0.04993739
BlastFurnaceSlag 0.04765066


Unnamed: 0_level_0,pred_xgb1
id,Unnamed: 1_level_1
0,27.3127
1,33.55209
2,38.14085
3,40.91367
4,47.12215


Mode
=== Target Value Counts ===
Model Run Time: 4.53
Model=xgb2
{}
fold: 1, Score: 9.537639794869694, Run Time: 1.53
fold: 2, Score: 9.590117954740684, Run Time: 1.54
fold: 3, Score: 9.563113373113273, Run Time: 1.56
fold: 4, Score: 9.54308209552465, Run Time: 1.48
fold: 5, Score: 9.805689799567265, Run Time: 1.52
Scores -> Adjusted: 9.50734958 , mean: 9.60792860, std: 0.10057902

=== Model Feature Importance ===
AgeInDays 0.74570817
SuperplasticizerComponent 0.05106596
WaterComponent 0.04345242
FineAggregateComponent 0.03621618
CoarseAggregateComponent 0.033899147
CementComponent 0.030861726
FlyAshComponent 0.029724082
BlastFurnaceSlag 0.029072277


Unnamed: 0_level_0,pred_xgb2
id,Unnamed: 1_level_1
0,27.08389
1,33.94482
2,38.654
3,39.83421
4,45.35452


Mode
=== Target Value Counts ===
Model Run Time: 7.81
Model=lgbm0
{}
fold: 1, Score: 9.39625791932152, Run Time: 0.74
fold: 2, Score: 9.420380264436687, Run Time: 0.72
fold: 3, Score: 9.322624712720717, Run Time: 0.90
fold: 4, Score: 9.620820265945634, Run Time: 0.87
fold: 5, Score: 9.702745000107736, Run Time: 0.79
Scores -> Adjusted: 9.34835021 , mean: 9.49256563, std: 0.14421542

=== Model Feature Importance ===
FineAggregateComponent 0.163
CementComponent 0.162
CoarseAggregateComponent 0.15933333333333333
WaterComponent 0.13733333333333334
SuperplasticizerComponent 0.11466666666666667
AgeInDays 0.10666666666666667
BlastFurnaceSlag 0.08766666666666667
FlyAshComponent 0.06933333333333333


Unnamed: 0_level_0,pred_lgbm0
id,Unnamed: 1_level_1
0,21.67873
1,34.60661
2,40.2935
3,41.25614
4,47.89707


Mode
=== Target Value Counts ===
Model Run Time: 4.17
Model=lgbm1
{}
fold: 1, Score: 9.417488572731047, Run Time: 0.78
fold: 2, Score: 9.448855006985498, Run Time: 0.74
fold: 3, Score: 9.355521851267918, Run Time: 0.82
fold: 4, Score: 9.654747967273734, Run Time: 0.77
fold: 5, Score: 9.769280049993588, Run Time: 0.95
Scores -> Adjusted: 9.37265420 , mean: 9.52917869, std: 0.15652449

=== Model Feature Importance ===
FineAggregateComponent 0.18101659751037344
CementComponent 0.15871369294605808
CoarseAggregateComponent 0.15404564315352698
WaterComponent 0.1400414937759336
SuperplasticizerComponent 0.11514522821576763
AgeInDays 0.09024896265560166
BlastFurnaceSlag 0.08402489626556017
FlyAshComponent 0.07676348547717843


Unnamed: 0_level_0,pred_lgbm1
id,Unnamed: 1_level_1
0,23.60652
1,35.5162
2,40.99849
3,40.25634
4,47.27227


Mode
=== Target Value Counts ===
Model Run Time: 4.22
Model=lgbm2
{}
fold: 1, Score: 9.946166290378631, Run Time: 0.36
fold: 2, Score: 9.905541427748044, Run Time: 0.40
fold: 3, Score: 10.014246391238743, Run Time: 0.49
fold: 4, Score: 9.93594895440473, Run Time: 0.67
fold: 5, Score: 9.971055719158814, Run Time: 0.55
Scores -> Adjusted: 9.91811104 , mean: 9.95459176, std: 0.03648072

=== Model Feature Importance ===
FineAggregateComponent 0.214
CementComponent 0.178
SuperplasticizerComponent 0.164
AgeInDays 0.106
CoarseAggregateComponent 0.095
FlyAshComponent 0.095
BlastFurnaceSlag 0.081
WaterComponent 0.067


Unnamed: 0_level_0,pred_lgbm2
id,Unnamed: 1_level_1
0,27.9079
1,37.75881
2,34.09795
3,42.3843
4,40.2823


Mode
=== Target Value Counts ===
Model Run Time: 2.63
Model=lgbm3
{}
fold: 1, Score: 9.417488572731047, Run Time: 0.78
fold: 2, Score: 9.448855006985498, Run Time: 0.78
fold: 3, Score: 9.355521851267918, Run Time: 0.80
fold: 4, Score: 9.654747967273734, Run Time: 0.74
fold: 5, Score: 9.769280049993588, Run Time: 0.85
Scores -> Adjusted: 9.37265420 , mean: 9.52917869, std: 0.15652449

=== Model Feature Importance ===
FineAggregateComponent 0.18101659751037344
CementComponent 0.15871369294605808
CoarseAggregateComponent 0.15404564315352698
WaterComponent 0.1400414937759336
SuperplasticizerComponent 0.11514522821576763
AgeInDays 0.09024896265560166
BlastFurnaceSlag 0.08402489626556017
FlyAshComponent 0.07676348547717843


Unnamed: 0_level_0,pred_lgbm3
id,Unnamed: 1_level_1
0,23.60652
1,35.5162
2,40.99849
3,40.25634
4,47.27227


Mode
=== Target Value Counts ===
Model Run Time: 4.08
Model=cat1
{}
fold: 1, Score: 9.19104246464739, Run Time: 2.29
fold: 2, Score: 9.174669405745702, Run Time: 2.24
fold: 3, Score: 9.338993063036588, Run Time: 2.34
fold: 4, Score: 9.29695125676774, Run Time: 2.39
fold: 5, Score: 9.521023900602884, Run Time: 2.40
Scores -> Adjusted: 9.17974985 , mean: 9.30453602, std: 0.12478617

=== Model Feature Importance ===
AgeInDays 0.5790244848373505
CementComponent 0.09899117001309628
SuperplasticizerComponent 0.08080817745726981
WaterComponent 0.06362168764728943
FineAggregateComponent 0.058179071466406634
CoarseAggregateComponent 0.04994742137378766
BlastFurnaceSlag 0.03925279814818057
FlyAshComponent 0.030175189056619138


Unnamed: 0_level_0,pred_cat1
id,Unnamed: 1_level_1
0,21.46285
1,36.4238
2,37.67066
3,43.58667
4,44.68501


Mode
=== Target Value Counts ===
Model Run Time: 11.81
Model=cat2
{}
fold: 1, Score: 9.209502263196073, Run Time: 1.29
fold: 2, Score: 9.230605090297756, Run Time: 1.31
fold: 3, Score: 9.359731268825279, Run Time: 1.33
fold: 4, Score: 9.375407671927572, Run Time: 1.29
fold: 5, Score: 9.544256348679172, Run Time: 1.28
Scores -> Adjusted: 9.22366366 , mean: 9.34390053, std: 0.12023686

=== Model Feature Importance ===
AgeInDays 0.5816677854443589
CementComponent 0.08898353961545107
SuperplasticizerComponent 0.06091363799535688
FineAggregateComponent 0.06061727869546013
WaterComponent 0.05927220093461391
BlastFurnaceSlag 0.051278372576675106
CoarseAggregateComponent 0.050446508261201804
FlyAshComponent 0.046820676476882366


Unnamed: 0_level_0,pred_cat2
id,Unnamed: 1_level_1
0,23.31344
1,33.12457
2,38.51715
3,42.00106
4,43.55274


Mode
=== Target Value Counts ===
Model Run Time: 6.66
CPU times: user 5min 56s, sys: 40.3 s, total: 6min 36s
Wall time: 2min 28s


Unnamed: 0,Model,Score,StdDev,RunTime
8,lgbm2,9.95459,0.03648,2.62803
4,xgb1,9.89279,0.19397,4.52739
5,xgb2,9.60793,0.10058,7.80784
3,xgb3,9.59712,0.15536,6.674
7,lgbm1,9.52918,0.15652,4.21517
9,lgbm3,9.52918,0.15652,4.07905
6,lgbm0,9.49257,0.14422,4.17473
0,xgb_best_params,9.4572,0.14923,42.77025
1,lgbm_best_params,9.44706,0.16273,30.75158
11,cat2,9.3439,0.12024,6.65564


## Linear Models

In [37]:
model_lst = ["lin_reg", "lasso", "ridge", "ridge_25", "ridge_50"]
model_lst = ["lasso", "ridge",  "ridge_50"]
# model_lst = []
# all_cv_scores = run_models4features(model_lst, TARGET, FEATURES, all_cv_scores, linear_models=True)    
all_cv_scores = run_models4features(model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=True)    

all_cv_scores.head()

Model=lasso
fold: 1, Score: 11.822677488454982, Run Time: 0.04
fold: 2, Score: 11.641289509965215, Run Time: 0.07
fold: 3, Score: 11.791907114570881, Run Time: 0.06
fold: 4, Score: 11.646972288117684, Run Time: 0.07
fold: 5, Score: 11.598603795765076, Run Time: 0.05
Scores -> Adjusted: 11.61080673 , mean: 11.70029004, std: 0.08948331


Unnamed: 0_level_0,pred_lasso
id,Unnamed: 1_level_1
0,31.98752
1,34.68057
2,31.58559
3,55.02455
4,35.1059


Mode
=== Target Value Counts ===
Model Run Time: 0.54
Model=ridge
fold: 1, Score: 11.463442883896986, Run Time: 0.06
fold: 2, Score: 11.479100804595532, Run Time: 0.06
fold: 3, Score: 11.600767405949217, Run Time: 0.07
fold: 4, Score: 11.29006557587803, Run Time: 0.06
fold: 5, Score: 11.354798113056587, Run Time: 0.07
Scores -> Adjusted: 11.33026985 , mean: 11.43763496, std: 0.10736510


Unnamed: 0_level_0,pred_ridge
id,Unnamed: 1_level_1
0,33.20416
1,35.6737
2,31.70702
3,58.51655
4,36.64969


Mode
=== Target Value Counts ===
Model Run Time: 0.58
Model=ridge_50
fold: 1, Score: 11.463355083703659, Run Time: 0.04
fold: 2, Score: 11.479104311599201, Run Time: 0.07
fold: 3, Score: 11.600747681554758, Run Time: 0.06
fold: 4, Score: 11.289980742512466, Run Time: 0.06
fold: 5, Score: 11.354759002595294, Run Time: 0.06
Scores -> Adjusted: 11.33020485 , mean: 11.43758936, std: 0.10738452


Unnamed: 0_level_0,pred_ridge_50
id,Unnamed: 1_level_1
0,33.20397
1,35.67351
2,31.70739
3,58.51938
4,36.65004


Mode
=== Target Value Counts ===
Model Run Time: 0.53


Unnamed: 0,Model,Score,StdDev,RunTime
0,xgb_best_params,9.4572,0.14923,42.77025
1,lgbm_best_params,9.44706,0.16273,30.75158
2,cat_best_params,9.33461,0.11868,22.53126
3,xgb3,9.59712,0.15536,6.674
4,xgb1,9.89279,0.19397,4.52739


In [38]:
sample_submission.head(20)

Unnamed: 0,id,Strength,target_xgb_best_params,target_lgbm_best_params,target_cat_best_params,target_xgb3,target_xgb1,target_xgb2,target_lgbm0,target_lgbm1,target_lgbm2,target_lgbm3,target_cat1,target_cat2,target_lasso,target_ridge,target_ridge_50
0,5407,35.452,48.29399,48.75241,46.55209,45.84471,43.26007,45.36552,46.94644,49.61002,43.91831,49.61002,47.85299,45.03017,34.55594,35.23381,35.23382
1,5408,35.452,18.2487,18.74569,17.84006,19.19124,15.33723,19.12546,18.54209,18.02727,24.7372,18.02727,18.78228,19.52258,29.61738,26.89847,26.89708
2,5409,35.452,32.67299,32.82362,33.65968,31.57042,31.07011,31.22095,32.7445,32.72242,31.99861,32.72242,33.59458,33.4355,30.29926,26.20377,26.20286
3,5410,35.452,45.74353,45.86399,46.90075,45.73254,45.13286,45.76934,45.77196,44.727,42.0995,44.727,46.51212,46.00301,39.21824,38.68754,38.68819
4,5411,35.452,26.69867,23.16961,30.93609,24.19537,26.446,23.73058,29.08497,25.36981,31.0911,25.36981,28.4028,31.4482,32.83587,31.91047,31.90966
5,5412,35.452,43.66986,44.0188,38.93839,43.8257,43.3175,41.39675,44.11692,41.72337,40.55855,41.72337,39.88214,39.45014,34.43591,34.81558,34.81534
6,5413,35.452,30.90171,31.6491,32.17914,28.74687,22.61499,28.58735,27.93785,27.65293,34.81447,27.65293,32.63767,33.61909,30.92351,26.85027,26.84672
7,5414,35.452,21.38211,20.24984,21.69677,23.05063,18.60314,24.01565,20.97477,21.29445,25.77507,21.29445,21.57829,21.96462,29.86829,31.18245,31.18225
8,5415,35.452,46.5256,44.63243,40.98491,39.86927,48.45671,44.90824,45.31784,50.09437,41.89705,50.09437,43.45535,41.97435,35.12752,39.92727,39.92773
9,5416,35.452,35.16926,34.31454,37.52943,30.57672,29.428,32.00682,36.73164,35.8751,35.73388,35.8751,37.70526,37.2384,33.81522,32.64063,32.63977


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Blend Models</h1>
</div>

In [39]:
all_blend_scores = pd.DataFrame(
    {
        "Model": pd.Series(dtype="str"),
        "Score": pd.Series(dtype="float"),
        "StdDev": pd.Series(dtype="float"),
    }
)

In [40]:
model_lst

['lasso', 'ridge', 'ridge_50']

In [41]:
model_lst = ["xgb1", "xgb2", "cat1", "lgbm0", "lgbm1"]

In [42]:
len(model_lst)

5

In [43]:
target_names = [f"target_{model}" for model in model_lst]
target_names

['target_xgb1', 'target_xgb2', 'target_cat1', 'target_lgbm0', 'target_lgbm1']

In [44]:
sample_submission[TARGET] = sample_submission[target_names].sum(axis=1) / len(model_lst)

In [45]:
sample_submission[[ID, TARGET]].to_csv("submission_models_wt_avg.csv", index=False)
sample_submission[[ID, TARGET]].tail(8)

Unnamed: 0,id,Strength
3597,9004,18.05123
3598,9005,38.89848
3599,9006,16.50065
3600,9007,26.95775
3601,9008,32.75979
3602,9009,40.77611
3603,9010,29.23822
3604,9011,20.96626


In [46]:
sample_submission[TARGET] = (
#     (sample_submission["target_xgb_bp"] * 2 )
#     + (sample_submission["target_lgbm_bp"]  )
    (sample_submission["target_xgb1"] * 3 )
    + (sample_submission["target_lgbm1"])
#     + (sample_submission["target_lgbm2"])    
#     + (sample_submission["target_lgbm2"])
    + (sample_submission["target_cat1"] )
    + (sample_submission["target_cat2"] )    
#     + (sample_submission["target_cat_bp"] )
#     + (sample_submission["target_svc"] )
#     + (sample_submission["target_log_reg3"] )
#     + (sample_submission["target_cat2"] )
)/6

# sample_submission[TARGET] = sample_submission[TARGET].astype(int)

In [47]:
sample_submission[[ID, TARGET]].to_csv("submission_wt_avg.csv", index=False)
sample_submission[[ID, TARGET]].tail(8)

Unnamed: 0,id,Strength
3597,9004,18.00752
3598,9005,38.90091
3599,9006,16.78782
3600,9007,26.19372
3601,9008,31.3641
3602,9009,41.99019
3603,9010,29.03283
3604,9011,21.42395


In [48]:
all_cv_scores.sort_values(by=["Score"], ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime
12,lasso,11.70029,0.08948,0.53848
13,ridge,11.43763,0.10737,0.58045
14,ridge_50,11.43759,0.10738,0.53023
8,lgbm2,9.95459,0.03648,2.62803
4,xgb1,9.89279,0.19397,4.52739
5,xgb2,9.60793,0.10058,7.80784
3,xgb3,9.59712,0.15536,6.674
7,lgbm1,9.52918,0.15652,4.21517
9,lgbm3,9.52918,0.15652,4.07905
6,lgbm0,9.49257,0.14422,4.17473


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Level 1 Stack Models</h1>
</div>

In [49]:
## TODO: Generate these dictionaries from model names

train_oof_dict = {
    "train_pred_cat1": "train_pred_cat1.csv",
    "train_pred_cat2": "train_pred_cat2.csv",
    "train_pred_lgbm1": "train_pred_lgbm1.csv",    
    "train_pred_lgbm2": "train_pred_lgbm2.csv",    
    "train_pred_xgb1": "train_pred_xgb1.csv"
}

test_pred_dict = {
    "submission_cat1": "submission_cat1.csv",
    "submission_cat2": "submission_cat2.csv",
    "submission_lgbm1": "submission_lgbm1.csv",
    "submission_lgbm2": "submission_lgbm2.csv",
    "submission_xgb1": "submission_xgb1.csv",
}

In [50]:
def blend_results(train_oof_dict, test_pred_dict):
    oof_df = pd.DataFrame()
    test_preds_df = pd.DataFrame()

    for name, train_oof_fname in train_oof_dict.items():
        fname = "../working/" + train_oof_fname
        print(f"Processing {name}, {train_oof_fname}")
        df = pd.read_csv(fname)
        print(df.head())
#         print(df.iloc[:,1])
        preds = pd.Series(df.iloc[:,1], name=name)
#         print(preds[:5])
        oof_df = pd.concat([oof_df, preds], axis=1)
    #     oof_df = pd.concat([oof_df, pd.Series(np.load(TRAIN_PATH / train_oof), name=name)], axis=1)

    for name, test_pred_fname in test_pred_dict.items():
        fname = "../working/" + test_pred_fname
        print(f"{name}, {test_pred_fname}")
        df = pd.read_csv(fname)
        print(df.head())
        preds = pd.Series(df.iloc[:,1], name=name)
        test_preds_df = pd.concat([test_preds_df, preds], axis=1)

    print("=== oof ===")
    print(oof_df.head())
    print("=== test_preds ===")
    print(test_preds_df.head())
    return oof_df, test_preds_df
    
# (oof_df, preds_df) = blend_results(train_oof_dict, test_pred_dict)    

In [51]:
def load_oof_results(train_oof_dict, test_pred_dict):
    oof_df = pd.DataFrame()
    test_preds_df = pd.DataFrame()

    for name, train_oof_fname in train_oof_dict.items():
        fname = "../working/" + train_oof_fname
        print(f"Processing {name}, {train_oof_fname}")
        df = pd.read_csv(fname)
        print(df.head())
#         print(df.iloc[:,1])
        preds = pd.Series(df.iloc[:,1], name=name)
#         print(preds[:5])
        oof_df = pd.concat([oof_df, preds], axis=1)
    #     oof_df = pd.concat([oof_df, pd.Series(np.load(TRAIN_PATH / train_oof), name=name)], axis=1)

    for name, test_pred_fname in test_pred_dict.items():
        fname = "../working/" + test_pred_fname
        print(f"{name}, {test_pred_fname}")
        df = pd.read_csv(fname)
        print(df.head())
        preds = pd.Series(df.iloc[:,1], name=name)
        test_preds_df = pd.concat([test_preds_df, preds], axis=1)

    print("=== oof ===")
    print(oof_df.head())
    print("=== test_preds ===")
    print(test_preds_df.head())
    return oof_df, test_preds_df
    
(oof_df, preds_df) = load_oof_results(train_oof_dict, test_pred_dict) 

Processing train_pred_cat1, train_pred_cat1.csv
   id  pred_cat1
0   0   21.46285
1   1   36.42380
2   2   37.67066
3   3   43.58667
4   4   44.68501
Processing train_pred_cat2, train_pred_cat2.csv
   id  pred_cat2
0   0   23.31344
1   1   33.12457
2   2   38.51715
3   3   42.00106
4   4   43.55274
Processing train_pred_lgbm1, train_pred_lgbm1.csv
   id  pred_lgbm1
0   0    23.60652
1   1    35.51620
2   2    40.99849
3   3    40.25634
4   4    47.27227
Processing train_pred_lgbm2, train_pred_lgbm2.csv
   id  pred_lgbm2
0   0    27.90790
1   1    37.75881
2   2    34.09795
3   3    42.38430
4   4    40.28230
Processing train_pred_xgb1, train_pred_xgb1.csv
   id  pred_xgb1
0   0   27.31270
1   1   33.55209
2   2   38.14085
3   3   40.91367
4   4   47.12215
submission_cat1, submission_cat1.csv
     id  Strength
0  5407  47.85299
1  5408  18.78228
2  5409  33.59458
3  5410  46.51212
4  5411  28.40280
submission_cat2, submission_cat2.csv
     id  Strength
0  5407  45.03017
1  5408  19.5225

In [52]:
oof_df.head()

Unnamed: 0,train_pred_cat1,train_pred_cat2,train_pred_lgbm1,train_pred_lgbm2,train_pred_xgb1
0,21.46285,23.31344,23.60652,27.9079,27.3127
1,36.4238,33.12457,35.5162,37.75881,33.55209
2,37.67066,38.51715,40.99849,34.09795,38.14085
3,43.58667,42.00106,40.25634,42.3843,40.91367
4,44.68501,43.55274,47.27227,40.2823,47.12215


In [53]:
preds_df.head()

Unnamed: 0,submission_cat1,submission_cat2,submission_lgbm1,submission_lgbm2,submission_xgb1
0,47.85299,45.03017,49.61002,43.91831,43.26007
1,18.78228,19.52258,18.02727,24.7372,15.33723
2,33.59458,33.4355,32.72242,31.99861,31.07011
3,46.51212,46.00301,44.727,42.0995,45.13286
4,28.4028,31.4482,25.36981,31.0911,26.446


In [54]:
type(preds_df)

pandas.core.frame.DataFrame

In [55]:
def run_lr(useful_features:List[str], TARGET:str, train_df:pd.DataFrame, test_df:pd.DataFrame) -> (List[float],List[float]):
    final_predictions = []
    scores = []

    kfold = model_selection.KFold(n_splits=Config.N_FOLDS, shuffle=True, random_state=Config.seed)

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(train_df)):
        xtrain = train_df.iloc[train_idx].reset_index(drop=True)
        xvalid = train_df.iloc[valid_idx].reset_index(drop=True)

        xtest = test_df[useful_features].copy()

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]

#         model = LogisticRegression()
        model = linear_model.LinearRegression()
        # Smaller C means more regularization; default=1.0
        # 2947.0517025518097
#         model = LogisticRegression(max_iter=500, C=2947.0517025518097, penalty='l2',solver='newton-cg')
#         model = LogisticRegression(C = 2947.0517025518097,
#                         max_iter = 500,
#                         penalty = 'l2',
#                         solver = 'liblinear')
        model.fit(xtrain, ytrain)

        preds_valid = model.predict_proba(xvalid)[:,-1]
        test_preds = model.predict_proba(xtest)[:,-1]

        final_predictions.append(test_preds)
#         score = metrics.roc_auc_score(yvalid, preds_valid)
        score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        print(f"Fold={fold}, Score={score}")
        scores.append(score)
    return scores, final_predictions


In [56]:
# useful_features = ["pred_lda", "pred_gbc","pred_gbc2", "pred_cat_bp", "pred_cat1", "pred_lgbm1", "pred_lgbm2", "pred_lgbm_bp", "pred_xgb1", "pred_xgb_bp"]
useful_features = [ "train_pred_cat1", "train_pred_cat2", "train_pred_lgbm1", "train_pred_lgbm2", "train_pred_xgb1"]

In [57]:
oof_df[useful_features].head()

Unnamed: 0,train_pred_cat1,train_pred_cat2,train_pred_lgbm1,train_pred_lgbm2,train_pred_xgb1
0,21.46285,23.31344,23.60652,27.9079,27.3127
1,36.4238,33.12457,35.5162,37.75881,33.55209
2,37.67066,38.51715,40.99849,34.09795,38.14085
3,43.58667,42.00106,40.25634,42.3843,40.91367
4,44.68501,43.55274,47.27227,40.2823,47.12215


In [58]:
# preds_df[useful_features].head()

In [59]:
# fold_scores, final_predictions = run_lr(useful_features, TARGET, oof_df, preds_df)
# test_preds = np.mean(np.column_stack(final_predictions), axis=1)
# cv_score, std_dev = show_fold_scores(fold_scores)
# create_submission("level1_lr", TARGET, test_preds)

In [60]:
pd.options.display.max_colwidth = 100
pd.set_option("display.max_rows", 999)
pd.set_option("display.precision", 5)
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.max_colwidth

100

In [61]:
all_cv_scores.sort_values(by=["Score"], ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime
12,lasso,11.7,0.09,0.54
13,ridge,11.44,0.11,0.58
14,ridge_50,11.44,0.11,0.53
8,lgbm2,9.95,0.04,2.63
4,xgb1,9.89,0.19,4.53
5,xgb2,9.61,0.1,7.81
3,xgb3,9.6,0.16,6.67
7,lgbm1,9.53,0.16,4.22
9,lgbm3,9.53,0.16,4.08
6,lgbm0,9.49,0.14,4.17
