<a href="https://www.kaggle.com/code/mmellinger66/s3e9-concrete-strength-models?scriptVersionId=120985886" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

 <div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Playground Season 3: Episode 9 - Concrete Strength Models</h1>
</div>

## Problem Type

Regression

## Evaluation Metric

$$RMSE = \sqrt{\frac{1}{N} \sum_{i=1}^N (y_i - \hat{y_i})^2}$$

```python
score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
```

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Import Libraries</h1>
</div>

In [1]:
from typing import List, Set, Dict, Tuple, Optional

import os
import time
from pathlib import Path
import glob
import gc

import pandas as pd
import numpy as np

from sklearn import impute
from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import cluster
from sklearn import model_selection
from sklearn import ensemble
from sklearn import datasets

import xgboost as xgb
import catboost as cb
import lightgbm as lgb

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Visualization Libraries
import matplotlib as mpl
import matplotlib.pylab as plt
import seaborn as sns
import missingno as msno
from folium import Map
from folium.plugins import HeatMap
from IPython.display import display_html, display_markdown, display_latex
from colorama import Fore, Style

import warnings
warnings.filterwarnings('ignore')

pd.set_option("display.max_rows", 999)
pd.set_option("display.precision", 5)

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Configuration</h1>
</div>

In [2]:
TARGET="Strength"
ID="id"

# Optuna
objective_direction = "minimize"  # minimize, maximize

In [3]:
class Config:
    path:str = "../input/playground-series-s3e9/"
    gpu:bool = False
    optimize:bool = True
    n_optuna_trials:int = 30 # 5, 10, 30
    fast_render:bool = False
    calc_probability:bool = False
    debug:bool = False
    seed:int = 42
    N_ESTIMATORS:int = 500  # 100, 300, 1000, 2000, 5000, 15_000, 20_000 GBDT
    GPU_N_ESTIMATORS:int = 2000 # Want models to run fast during dev
    N_FOLDS:int = 5
        

In [4]:
class clr:
    S = Style.BRIGHT + Fore.LIGHTRED_EX
    E = Style.RESET_ALL

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Library</h1>
</div>

In [5]:
def read_data(path: str, analyze:bool=True) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    data_dir = Path(path)

    train = pd.read_csv(data_dir / "train.csv")
    test = pd.read_csv(data_dir / "test.csv")
    submission_df = pd.read_csv(data_dir / "sample_submission.csv")

    if analyze:
        print(clr.S + "=== Shape of Data ==="+clr.E)
        print(f" train data: Rows={train.shape[0]}, Columns={train.shape[1]}")
        print(f" test data : Rows={test.shape[0]}, Columns={test.shape[1]}")

        print(clr.S + "\n=== Train Data: First 5 Rows ===\n"+clr.E)
        display(train.head())
        print(f"\n{clr.S}=== Train Column Names ==={clr.E}\n")
        display(train.columns)
        print(f"\n{clr.S}=== Features/Explanatory Variables ==={clr.E}\n")
        eval_features(train)
        print(f"\n{clr.S}=== Skewness ==={clr.E}\n")
        check_skew(train)
    return train, test, submission_df

def create_submission(model_name: str, target, preds, seed:int=42, nfolds:int=5) -> pd.DataFrame:
    sample_submission[target] = preds #.astype(int)

    if len(model_name) > 0:
        fname = f"submission_{model_name}_k{nfolds}_s{seed}.csv"
    else:
        fname = "submission.csv"

    sample_submission.to_csv(fname, index=False)

    return sample_submission

def show_classification_scores(ground_truth:List[int], yhat:List[int]) -> None:
    accuracy = metrics.accuracy_score(ground_truth, yhat)
    precision = metrics.precision_score(ground_truth, yhat)
    recall = metrics.recall_score(ground_truth, yhat)
    roc = metrics.roc_auc_score(ground_truth, yhat)
    f1 = metrics.f1_score(ground_truth, yhat)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"ROC: {roc:.4f}")
    print(f"f1: {f1:.4f}")
    

def label_encoder(train:pd.DataFrame, test:pd.DataFrame, columns:List[str]) -> (pd.DataFrame, pd.DataFrame) :
    for col in columns:
        train[col] = train[col].astype(str)
        test[col] = test[col].astype(str)
        train[col] = preprocessing.LabelEncoder().fit_transform(train[col])
        test[col] = preprocessing.LabelEncoder().fit_transform(test[col])
    return train, test   

def create_strat_folds(df:pd.DataFrame, TARGET, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    print(f"TARGET={TARGET}, n_folds={n_folds}, seed={seed}")
    df["fold"] = -1

    kf = model_selection.StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    # kf = GroupKFold(n_splits=Config.N_FOLDS)
    for fold, (train_idx, valid_idx) in enumerate(kf.split(df, df[TARGET])):
        df.loc[valid_idx, "fold"] = fold

    # df.to_csv(f"train_fold{num_folds}.csv", index=False)
    return df


def create_folds(df:pd.DataFrame, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    print(f"n_folds={n_folds}, seed={seed}")
    df["fold"] = -1

    kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=seed)

    for fold, (train_idx, valid_idx) in enumerate(kf.split(df)):
        df.loc[valid_idx, "fold"] = fold

    # df.to_csv(f"train_fold{num_folds}.csv", index=False)
    return df

def show_fold_scores(scores: List[float]) -> (float, float):
    cv_score = np.mean(scores)  # Used in filename
    std_dev = np.std(scores)
    print(
        f"Scores -> Adjusted: {np.mean(scores) - np.std(scores):.8f} , mean: {np.mean(scores):.8f}, std: {np.std(scores):.8f}"
    )
    return cv_score, std_dev


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    continuous_features = list(df.select_dtypes(include=['int64', 'float64', 'uint8']).columns)
    categorical_features = list(df.select_dtypes(include=['object', 'bool']).columns)
    if display:
        print(f"{clr.S}Continuous Features={continuous_features}{clr.E}\n")
        print(f"{clr.S}Categorical Features={categorical_features}{clr.E}")
    return continuous_features, categorical_features   

def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    print("=== Cardinality ===")
    print(df[features].nunique())

## === Model Support ===    

from scipy.stats import mode


def merge_test_predictions(final_test_predictions:List[float], calc_probability:bool=True) -> List[float]:

    if calc_probability:
        print("Mean")
        result = np.mean(np.column_stack(final_test_predictions), axis=1)
    else:
        print("Mode")
        mode_result = mode(np.column_stack(final_test_predictions), axis=1)
        result = mode_result[0].ravel()

    return result

def summary_statistics(X:pd.DataFrame, enhanced=True) -> None:
    desc = X.describe()
    if enhanced:
        desc.loc["var"] = X.var(numeric_only=True).tolist()
        desc.loc["skew"] = X.skew(numeric_only=True).tolist()
        desc.loc["kurt"] = X.kurtosis(numeric_only=True).tolist()

    with pd.option_context("display.precision", 2):
        style = desc.transpose().style.background_gradient(
            cmap="coolwarm"
        )  # .set_precision(4)
    display(style)
    
def show_missing_features(df:pd.DataFrame) -> None:
    missing_vals = df.isna().sum().sort_values(ascending=False)
    print(missing_vals[missing_vals > 0])


def show_duplicate_records(df:pd.DataFrame) -> None:
    dups = df.duplicated()
    print(dups.sum())


def eval_features(df:pd.DataFrame) -> (List[str], List[str], List[str]):
    ## Separate Categorical and Numerical Features
    categorical_features = list(
        df.select_dtypes(include=["category", "object"]).columns
    )
    continuous_features = list(df.select_dtypes(include=["number"]).columns)

    print(f"{clr.S}Continuous features:{clr.E} {continuous_features}")
    print(f"{clr.S}Categorical features:{clr.E} {categorical_features}")
    print("\n --- Cardinality of Categorical Features ---\n")

    for feature in categorical_features:
        cardinality = df[feature].nunique()
        if cardinality < 10:
            print(f"{clr.S}{feature}{clr.E}: cardinality={cardinality}, {df[feature].unique()}")
        else:
            print(f"{clr.S}{feature}{clr.E}: cardinality={cardinality}")
    all_features = categorical_features + continuous_features
    return all_features, categorical_features, continuous_features


def show_feature_importance(feature_importance_lst:List[str]) -> None:
    fis_df = pd.concat(feature_importance_lst, axis=1)

    fis_df.sort_values("0_importance", ascending=True).head(40).plot(
        kind="barh", figsize=(12, 12), title="Feature Importance Across Folds"
    )
    plt.show()


def show_feature_target_crosstab(df:pd.DataFrame, feature_lst:List[str], target:str) -> None:
    for feature in feature_lst:
        print(f"\n=== {feature} vs {target} ===\n")
        display(
            pd.crosstab(df[feature], df[target], margins=True)
        )  # display keeps bold formatting


def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    print(f"{clr.S}=== Cardinality ==={clr.E}")
    print(df[features].nunique())


def show_unique_features(df:pd.DataFrame, features:List[str]) -> None:
    for col in features:
        print(col, sorted(df[col].dropna().unique()))


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    continuous_features = list(
        df.select_dtypes(include=["int64", "float64", "uint8"]).columns
    )
    categorical_features = list(df.select_dtypes(include=["object", "bool"]).columns)
    if display:
        print(f"{clr.S}Continuous Features={clr.E}{continuous_features}\n")
        print(f"{clr.S}Categorical Features={clr.E}{categorical_features}")
    return continuous_features, categorical_features


def describe(X:pd.DataFrame) -> None:
    """Deprecated: Use summary_statistics()"""
    desc = X.describe()
    desc.loc['var'] = X.var(numeric_only=True).tolist()
    desc.loc['skew'] = X.skew(numeric_only=True).tolist()
    desc.loc['kurt'] = X.kurtosis(numeric_only=True).tolist()

    with pd.option_context('display.precision', 2):
        style = desc.transpose().style.background_gradient(cmap='coolwarm') #.set_precision(4)
    display(style)
  

def check_skew(df:pd.DataFrame) -> None:
    skew = df.skew(skipna=True,numeric_only=True).sort_values(ascending=False)
    print(skew)
    
def gpu_ify_lgbm(lgbm_dict):
    if Config.gpu:
        lgbm_dict["device"] = "gpu"
        lgbm_dict["boosting_type"] = "gbdt"
        lgbm_dict["gpu_platform_id"] = 0
        lgbm_dict["gpu_device_id"] = 0
    return lgbm_dict

def gpu_ify_cb(params):
    if Config.gpu:
        params["task_type"] = "GPU"
    return params    


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Optuna Hyperparameter Optimization Library</h1>
</div>

In [6]:
def objective_xgb(trial, X_train, X_valid, y_train, y_valid):

    xgb_params = {
        #         "objective": trial.suggest_categorical("objective", ["multi:softmax"]),
        #         "eval_metric": "mlogloss",
        #         "objective": "multi:softmax",
#         "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),

        "eval_metric": "rmse",  # auc, rmse, mae
        "objective": "reg:squarederror", # Normal Distribution
#         "objective": "reg:gamma", # Gamma Distribution

        #         "enable_categorical": trial.suggest_categorical("use_label_encoder", [True]),
        "use_label_encoder": trial.suggest_categorical("use_label_encoder", [False]),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 5000, 100),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "max_depth": trial.suggest_int("max_depth", 1, 20),  # 10
        "gamma": trial.suggest_float("gamma", 0, 100, step=0.1),
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "tree_method": trial.suggest_categorical(
            "tree_method", ["hist"]
        ),  # hist, gpu_hist
#         "predictor": "gpu_predictor",
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 100),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }

    # Model loading and training
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=5000,
        verbose=0,
    )

    print(f"Number of boosting rounds: {model.best_iteration}")
    #     oof = model.predict_proba(X_valid)[:, 1] # Probability
    oof = model.predict(X_valid)  # Classification: 0,1

    return metrics.mean_squared_error(y_valid, oof, squared=False)


def objective_lgbm(trial, X_train, X_valid, y_train, y_valid):

    lgbm_params = {
        "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [1_000]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [5000]),
        "n_estimators": trial.suggest_int("n_estimators", 700, 5000),
        "importance_type": "gain",
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 300),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        #         'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }

    # Model loading and training
    model = lgb.LGBMRegressor(**lgbm_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric="mae",
        callbacks=[
            lgb.log_evaluation(500),
            lgb.early_stopping(500, False, True),
        ],
    )

    #     print(f"Number of boosting rounds: {model.best_iteration}")
    oof = model.predict(X_valid)

    return metrics.mean_squared_error(y_valid, oof, squared=False)
#     return metrics.mean_absolute_error(y_valid, oof)


def objective_clf_lgbm(trial, X_train, X_valid, y_train, y_valid):

    params = {
        "boosting_type": "gbdt",
        # "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),
        #         "objective": trial.suggest_categorical("objective", ["multi:softprob"]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [1_000]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [5000]),
        "n_estimators": trial.suggest_int("n_estimators", 700, 1000),
        "importance_type": "gain",
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 300),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        #         'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }
    if Config.gpu:
        params["device_type"] = "gpu"

    # Model loading and training
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="mae",
        callbacks=[
            lgb.log_evaluation(500),
            lgb.early_stopping(500, False, True),
        ],
    )

    #     print(f"Number of boosting rounds: {model.best_iteration}")
    oof = model.predict(X_valid)

    #     return accuracy_score(y_valid, oof)
    return metrics.roc_auc_score(y_valid, oof)


def objective_cb(trial, X_train, X_valid, y_train, y_valid):

    cb_params = {
        "iterations": 100,
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.1, 1.0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 100),
        "bagging_temperature": trial.suggest_loguniform(
            "bagging_temperature", 0.1, 20.0
        ),
        "random_strength": trial.suggest_float("random_strength", 1.0, 2.0),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
          "use_best_model": True,
#         "task_type": "GPU",
        "random_seed": 42,
    }

    # Model loading and training
    #  model = CatBoostClassifier(**cb_params)
    model = cb.CatBoostRegressor(**cb_params)

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="accuracy",
        early_stopping_rounds=500,
        verbose=False,
    )

#     print(f"Number of boosting rounds: {model.best_iteration}")
    # oof = model.predict_proba(X_valid)[:, 1]
    oof = model.predict(X_valid)  # Classification
    return metrics.mean_squared_error(y_valid, oof, squared=False)
#     return metrics.mean_absolute_error(y_valid, oof)
# 
#     return accuracy_score(y_valid, oof)

def objective_clf_cb(trial, X_train, X_valid, y_train, y_valid):

    cb_params = {
        "iterations": 10,  # 1000
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.1, 1.0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 100),
        "bagging_temperature": trial.suggest_loguniform(
            "bagging_temperature", 0.1, 20.0
        ),
        "random_strength": trial.suggest_float("random_strength", 1.0, 2.0),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
        "use_best_model": True,
#             "task_type": "GPU",
        "random_seed": 42,
    }

    # Model loading and training
    model = cb.CatBoostClassifier(**cb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="accuracy",
        early_stopping_rounds=500,
        verbose=False,
    )

    # print(f"Number of boosting rounds: {model.best_iteration}")
    # oof = model.predict_proba(X_valid)[:, 1]
    oof = model.predict(X_valid)  # Classification

    return metrics.accuracy_score(y_valid, oof)

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Load Train/Test Data and Analyze</h1>
</div>

## Load the following files

 - train.csv - Data used to build our machine learning model
 - test.csv - Data used to build our machine learning model. Does not contain the target variable
 - sample_submission.csv - A file in the proper format to submit test predictions

In [7]:
%%time
train, test, sample_submission = read_data(Config.path, analyze=True)                                

[1m[91m=== Shape of Data ===[0m
 train data: Rows=5407, Columns=10
 test data : Rows=3605, Columns=9
[1m[91m
=== Train Data: First 5 Rows ===
[0m


Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength
0,0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38
1,1,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52
2,2,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96
3,3,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05
4,4,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19



[1m[91m=== Train Column Names ===[0m



Index(['id', 'CementComponent', 'BlastFurnaceSlag', 'FlyAshComponent',
       'WaterComponent', 'SuperplasticizerComponent',
       'CoarseAggregateComponent', 'FineAggregateComponent', 'AgeInDays',
       'Strength'],
      dtype='object')


[1m[91m=== Features/Explanatory Variables ===[0m

[1m[91mContinuous features:[0m ['id', 'CementComponent', 'BlastFurnaceSlag', 'FlyAshComponent', 'WaterComponent', 'SuperplasticizerComponent', 'CoarseAggregateComponent', 'FineAggregateComponent', 'AgeInDays', 'Strength']
[1m[91mCategorical features:[0m []

 --- Cardinality of Categorical Features ---


[1m[91m=== Skewness ===[0m

AgeInDays                    2.74687
SuperplasticizerComponent    1.41169
FlyAshComponent              1.30469
BlastFurnaceSlag             1.12120
Strength                     0.38073
CementComponent              0.34128
id                           0.00000
CoarseAggregateComponent    -0.08145
WaterComponent              -0.21528
FineAggregateComponent      -0.44738
dtype: float64
CPU times: user 53.7 ms, sys: 10.3 ms, total: 64 ms
Wall time: 97.2 ms


In [8]:
train.head()

Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength
0,0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38
1,1,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52
2,2,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96
3,3,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05
4,4,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19


In [9]:
# original = pd.read_csv("../input/gemstone-price-prediction/cubic_zirconia.csv", index_col=[0])
# original = original[-original.depth.isna()]
# original.head()

In [10]:
# original.shape

In [11]:
# train['is_original']    = 0
# test['is_original']     = 0
# original['is_original'] = 1
# combined = pd.concat([train, original], ignore_index=True).drop_duplicates()
# train = combined

In [12]:
# combined.head()

In [13]:
summary_statistics(train.drop(columns=[ID], axis=1), enhanced=True)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,var,skew,kurt
CementComponent,5407.0,299.17,105.54,102.0,213.7,297.2,375.0,540.0,11138.2,0.34,-0.55
BlastFurnaceSlag,5407.0,58.61,83.42,0.0,0.0,0.0,122.6,359.4,6958.53,1.12,0.0
FlyAshComponent,5407.0,31.87,54.61,0.0,0.0,0.0,79.0,200.1,2981.71,1.3,0.1
WaterComponent,5407.0,185.08,18.52,121.8,175.1,187.4,192.0,247.0,342.9,-0.22,0.84
SuperplasticizerComponent,5407.0,4.11,5.69,0.0,0.0,0.0,8.05,32.2,32.4,1.41,2.2
CoarseAggregateComponent,5407.0,992.0,77.15,801.0,938.2,978.0,1047.0,1145.0,5951.82,-0.08,-0.56
FineAggregateComponent,5407.0,771.22,78.73,594.0,734.3,781.2,821.0,992.6,6197.67,-0.45,-0.01
AgeInDays,5407.0,51.75,70.01,1.0,7.0,28.0,56.0,365.0,4900.98,2.75,8.27
Strength,5407.0,35.45,16.4,2.33,23.64,33.95,45.85,82.6,269.02,0.38,-0.36


## Outlier Detection

In [14]:
# https://www.kaggle.com/code/lyasdemir/best-algorithm-for-prediction-xgboost
    
def iqr(data:pd.DataFrame, var:str):# outliers detecion .
    q1 = np.quantile(data[var], 0.25)
    q3 = np.quantile(data[var], 0.75)
    diff = q3 - q1
    lower_t = q1 - (1.5 * diff)
    upper_t = q3 + (1.5 * diff)
    return data[(data[var] < lower_t) | (data[var] > upper_t)]

# iqr(train, "squareMeters")

In [15]:
# https://www.kaggle.com/code/sujithmandala/playground-s3-e8-ensemble-model-98-accuracy

def detect_outliers(data:pd.DataFrame) -> pd.DataFrame:
    outlier_percents = {}
    for column in data.columns:
        if data[column].dtype != object:
            q1 = np.quantile(data[column], 0.25)
            q3 = np.quantile(data[column], 0.75)
            iqr = q3 - q1
            upper_bound = q3 + (1.5 * iqr)
            lower_bound = q1 - (1.5 * iqr)
            outliers = data[(data[column] > upper_bound) | (data[column] < lower_bound)][column]
            outlier_percentage = len(outliers) / len(data[column]) * 100
            outlier_percents[column] = outlier_percentage
            outlier_dataframe = pd.DataFrame(data = outlier_percents.values() ,index=outlier_percents.keys() ,columns=['Outlier_percentage'])
    
    return outlier_dataframe.sort_values(by = 'Outlier_percentage', ascending = False)

detect_outliers(train)


Unnamed: 0,Outlier_percentage
WaterComponent,8.93286
AgeInDays,8.63695
FineAggregateComponent,2.82967
SuperplasticizerComponent,1.38709
Strength,0.61032
BlastFurnaceSlag,0.38839
FlyAshComponent,0.09247
id,0.0
CementComponent,0.0
CoarseAggregateComponent,0.0


In [16]:
# https://www.kaggle.com/code/sujithmandala/playground-s3-e8-ensemble-model-98-accuracy
    
def detect_outliers(data:pd.DataFrame) -> pd.DataFrame:
    outlier_percents = {}
    for column in data.columns:
        if data[column].dtype != object:
            q1 = np.quantile(data[column], 0.25)
            q3 = np.quantile(data[column], 0.75)
            iqr = q3 - q1
            upper_bound = q3 + (1.5 * iqr)
            lower_bound = q1 - (1.5 * iqr)
            outliers = data[(data[column] > upper_bound) | (data[column] < lower_bound)][column]
            outlier_percentage = len(outliers) / len(data[column]) * 100
            outlier_percents[column] = outlier_percentage
            outlier_dataframe = pd.DataFrame(data = outlier_percents.values() ,index=outlier_percents.keys() ,columns=['Outlier_percentage'])
    
    return outlier_dataframe.sort_values(by = 'Outlier_percentage', ascending = False)

detect_outliers(test)


Unnamed: 0,Outlier_percentage
FineAggregateComponent,8.54369
WaterComponent,8.2663
AgeInDays,7.93343
SuperplasticizerComponent,1.47018
BlastFurnaceSlag,0.41609
id,0.0
CementComponent,0.0
FlyAshComponent,0.0
CoarseAggregateComponent,0.0


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Feature Engineering</h1>
</div>

## Categorical/Numerical Variables

In [17]:
# train.drop(['cityCode'], axis=1, inplace=True)
# test.drop(['cityCode'], axis=1, inplace=True)


## Handle Outliers
- https://www.kaggle.com/code/lyasdemir/best-algorithm-for-prediction-xgboost
- https://www.kaggle.com/code/mnokno/paris-housing-price-prediction-using-xgboost

In [18]:
# features_with_outliers = ['attic', 'garage', 'made', 'basement', 'floors', 'cityCode', 'squareMeters']
# features_with_outliers = ['attic', 'garage', 'made', 'basement', 'floors',  'squareMeters']

In [19]:
# https://www.kaggle.com/code/mnokno/paris-housing-price-prediction-using-xgboost

def remove_outliers(df:pd.DataFrame) -> pd.DataFrame:
    for c in features_with_outliers:
        if c == 'garage':
            first_percentile = df[c].quantile(0.001)
            df = df[df[c] > first_percentile]

        ninety_ninth_percentile = df[c].quantile(0.999)
        df = df[df[c] < ninety_ninth_percentile]
        #df_t = df_t[(df_t[c] > first_percentile) & (df_t[c] < ninety_ninth_percentile)]
    return df


In [20]:
# print(f'Before: {len(train)}')
# train = remove_outliers(train)
# print(f'After: {len(train)}')

In [21]:
train.head(10)

Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength
0,0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38
1,1,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52
2,2,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96
3,3,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05
4,4,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19
5,5,350.0,0.0,0.0,203.0,0.0,1055.0,775.0,7,37.43
6,6,135.7,203.5,0.0,185.7,0.0,1076.2,759.3,28,35.1
7,7,332.5,142.5,0.0,228.0,0.0,932.0,594.0,28,45.94
8,8,322.0,0.0,0.0,203.0,0.0,974.0,800.0,180,42.14
9,9,133.0,200.0,0.0,192.0,0.0,927.4,839.2,3,6.94


In [22]:
train = train.reset_index(drop=True).copy()
train.head(10)

Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength
0,0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38
1,1,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52
2,2,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96
3,3,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05
4,4,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19
5,5,350.0,0.0,0.0,203.0,0.0,1055.0,775.0,7,37.43
6,6,135.7,203.5,0.0,185.7,0.0,1076.2,759.3,28,35.1
7,7,332.5,142.5,0.0,228.0,0.0,932.0,594.0,28,45.94
8,8,322.0,0.0,0.0,203.0,0.0,974.0,800.0,180,42.14
9,9,133.0,200.0,0.0,192.0,0.0,927.4,839.2,3,6.94


In [23]:
excluded_features = [TARGET, ID, "fold"]

In [24]:
cont_features, cat_features = feature_distribution_types(train, display=True)
show_cardinality(train, cat_features)

cont_features = [feature for feature in cont_features if feature not in excluded_features]
cat_features = [feature for feature in cat_features if feature not in excluded_features]

FEATURES = cont_features + cat_features
FEATURES

[1m[91mContinuous Features=[0m['id', 'CementComponent', 'BlastFurnaceSlag', 'FlyAshComponent', 'WaterComponent', 'SuperplasticizerComponent', 'CoarseAggregateComponent', 'FineAggregateComponent', 'AgeInDays', 'Strength']

[1m[91mCategorical Features=[0m[]
[1m[91m=== Cardinality ===[0m
Series([], dtype: float64)


['CementComponent',
 'BlastFurnaceSlag',
 'FlyAshComponent',
 'WaterComponent',
 'SuperplasticizerComponent',
 'CoarseAggregateComponent',
 'FineAggregateComponent',
 'AgeInDays']

In [25]:
train, test = label_encoder(train, test, cat_features)
# train = pd.get_dummies(train,columns=['cut','color','clarity']) # Will remove original feature names
# test = pd.get_dummies(test,columns=['cut','color','clarity'])

In [26]:
train.head()

Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength
0,0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38
1,1,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52
2,2,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96
3,3,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05
4,4,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19


In [27]:
# cont_features, cat_features = feature_distribution_types(train, display=True)
# show_cardinality(train, cat_features)

# cont_features = [feature for feature in cont_features if feature not in excluded_features]
# cat_features = [feature for feature in cat_features if feature not in excluded_features]

# FEATURES = cont_features + cat_features
# FEATURES

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Optuna Hyperparameter Optimization</h1>
</div>

In [28]:
%%time

if Config.optimize:
    y = train[TARGET]
    X = train[FEATURES].copy()

    X_test = test[FEATURES].copy()
    X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
        X, y, test_size=0.2, random_state=Config.seed
    )

# === XGB ===

time_limit = 3600 * 3
# best_xgb_params = {}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction)
    study.optimize(
        lambda trial: objective_xgb(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best XGB trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_xgb_params = study.best_trial.params

## === LGBM ===

time_limit = 3600 * 3
best_lgbm_params = {}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction)
    study.optimize(
        lambda trial: objective_lgbm(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best LGBM trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_lgbm_params = study.best_trial.params

## === CatBoost

time_limit = 3600 * 3
# best_cb_params = {}
best_cb_params = {'learning_rate': 0.45743264601999495,
                  'l2_leaf_reg': 41.338946049390074,
                  'bagging_temperature': 0.3472567739474319,
                  'random_strength': 1.7332249677756242, 
                  'depth': 1,
                  'min_data_in_leaf': 6}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction)
    study.optimize(
        lambda trial: objective_cb(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best Cat trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_cb_params = study.best_trial.params

[32m[I 2023-03-04 01:24:33,573][0m A new study created in memory with name: no-name-c70519c8-fb0e-4ff1-b8db-52d7a3164624[0m
[32m[I 2023-03-04 01:24:50,629][0m Trial 0 finished with value: 11.839232760919627 and parameters: {'use_label_encoder': False, 'n_estimators': 2800, 'learning_rate': 0.12008061092832557, 'subsample': 0.38, 'colsample_bytree': 0.37, 'max_depth': 13, 'gamma': 86.7, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.3986302001550659, 'reg_alpha': 6.20176639137037e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 156.47430269317343}. Best is trial 0 with value: 11.839232760919627.[0m


Number of boosting rounds: 232


[32m[I 2023-03-04 01:25:18,506][0m Trial 1 finished with value: 11.826278494585656 and parameters: {'use_label_encoder': False, 'n_estimators': 4700, 'learning_rate': 0.14333282717619428, 'subsample': 0.35, 'colsample_bytree': 0.9900000000000001, 'max_depth': 1, 'gamma': 18.7, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.00013346577910730995, 'reg_alpha': 35.5879127433262, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.26085825899936316}. Best is trial 1 with value: 11.826278494585656.[0m


Number of boosting rounds: 342


[32m[I 2023-03-04 01:25:32,903][0m Trial 2 finished with value: 12.091740390004365 and parameters: {'use_label_encoder': False, 'n_estimators': 2100, 'learning_rate': 0.044236933515633646, 'subsample': 0.13, 'colsample_bytree': 0.31, 'max_depth': 8, 'gamma': 70.2, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.07020104240391262, 'reg_alpha': 13.831806774624036, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.23183974196121274}. Best is trial 1 with value: 11.826278494585656.[0m


Number of boosting rounds: 231


[32m[I 2023-03-04 01:25:46,179][0m Trial 3 finished with value: 12.08019781559042 and parameters: {'use_label_encoder': False, 'n_estimators': 1700, 'learning_rate': 0.024387045116059443, 'subsample': 0.79, 'colsample_bytree': 0.6000000000000001, 'max_depth': 19, 'gamma': 12.4, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.0020563383244945896, 'reg_alpha': 1.5000873604135772e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 18.674451415395406}. Best is trial 1 with value: 11.826278494585656.[0m


Number of boosting rounds: 207


[32m[I 2023-03-04 01:26:06,248][0m Trial 4 finished with value: 11.852834308017798 and parameters: {'use_label_encoder': False, 'n_estimators': 2700, 'learning_rate': 0.043768430275869366, 'subsample': 0.39, 'colsample_bytree': 0.9400000000000001, 'max_depth': 7, 'gamma': 68.9, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 3.142517568540083e-08, 'reg_alpha': 1.6876375194896414e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.30794737945847944}. Best is trial 1 with value: 11.826278494585656.[0m


Number of boosting rounds: 77


[32m[I 2023-03-04 01:26:28,198][0m Trial 5 finished with value: 11.90753282833453 and parameters: {'use_label_encoder': False, 'n_estimators': 3800, 'learning_rate': 0.02856808705470679, 'subsample': 0.66, 'colsample_bytree': 0.22000000000000003, 'max_depth': 2, 'gamma': 70.0, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 1.83821707082052, 'reg_alpha': 0.0002525243878134322, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 2.285751650304226}. Best is trial 1 with value: 11.826278494585656.[0m


Number of boosting rounds: 1320


[32m[I 2023-03-04 01:26:58,525][0m Trial 6 finished with value: 11.843882979537328 and parameters: {'use_label_encoder': False, 'n_estimators': 4800, 'learning_rate': 0.05819284211518995, 'subsample': 0.64, 'colsample_bytree': 0.5700000000000001, 'max_depth': 5, 'gamma': 75.2, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 1.2367201519276162e-05, 'reg_alpha': 0.00047360057588738977, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 2.166489772273216}. Best is trial 1 with value: 11.826278494585656.[0m


Number of boosting rounds: 121


[32m[I 2023-03-04 01:27:18,191][0m Trial 7 finished with value: 11.942468147566952 and parameters: {'use_label_encoder': False, 'n_estimators': 3100, 'learning_rate': 0.10441715350613222, 'subsample': 0.6, 'colsample_bytree': 0.26, 'max_depth': 4, 'gamma': 3.8000000000000003, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 4.184956536231227e-07, 'reg_alpha': 4.771749823924653, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.28971327612637465}. Best is trial 1 with value: 11.826278494585656.[0m


Number of boosting rounds: 166


[32m[I 2023-03-04 01:27:28,208][0m Trial 8 finished with value: 11.865736314112079 and parameters: {'use_label_encoder': False, 'n_estimators': 1500, 'learning_rate': 0.0913171591449225, 'subsample': 0.56, 'colsample_bytree': 0.65, 'max_depth': 9, 'gamma': 59.800000000000004, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 6.499029675653817e-05, 'reg_alpha': 2.0482142361373075e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 56.30350964235324}. Best is trial 1 with value: 11.826278494585656.[0m


Number of boosting rounds: 42


[32m[I 2023-03-04 01:27:47,809][0m Trial 9 finished with value: 16.025547475112145 and parameters: {'use_label_encoder': False, 'n_estimators': 3300, 'learning_rate': 0.2163909666169746, 'subsample': 0.4, 'colsample_bytree': 0.8700000000000001, 'max_depth': 7, 'gamma': 42.6, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.00010167783948646896, 'reg_alpha': 0.008210517966158473, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 859.1725759935913}. Best is trial 1 with value: 11.826278494585656.[0m


Number of boosting rounds: 660


[32m[I 2023-03-04 01:28:19,286][0m Trial 10 finished with value: 11.872824697951017 and parameters: {'use_label_encoder': False, 'n_estimators': 5000, 'learning_rate': 0.014360927439169488, 'subsample': 0.12000000000000001, 'colsample_bytree': 0.060000000000000005, 'max_depth': 13, 'gamma': 31.3, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 89.8779923651078, 'reg_alpha': 89.66362946368423, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 2.0670146218558423}. Best is trial 1 with value: 11.826278494585656.[0m


Number of boosting rounds: 4917


[32m[I 2023-03-04 01:28:45,956][0m Trial 11 finished with value: 11.812964538059077 and parameters: {'use_label_encoder': False, 'n_estimators': 4200, 'learning_rate': 0.21909267929282106, 'subsample': 0.32, 'colsample_bytree': 0.77, 'max_depth': 14, 'gamma': 93.4, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.029000863208597565, 'reg_alpha': 1.8525483595504976e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 134.0883929741389}. Best is trial 11 with value: 11.812964538059077.[0m


Number of boosting rounds: 26


[32m[I 2023-03-04 01:29:10,887][0m Trial 12 finished with value: 12.187720912575347 and parameters: {'use_label_encoder': False, 'n_estimators': 4200, 'learning_rate': 0.24756407461943647, 'subsample': 1.0, 'colsample_bytree': 0.78, 'max_depth': 18, 'gamma': 98.7, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.0046509204630114384, 'reg_alpha': 2.5740912835810228e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 13.952400133986595}. Best is trial 11 with value: 11.812964538059077.[0m


Number of boosting rounds: 14


[32m[I 2023-03-04 01:29:36,504][0m Trial 13 finished with value: 11.951343997658528 and parameters: {'use_label_encoder': False, 'n_estimators': 4300, 'learning_rate': 0.15910065432258466, 'subsample': 0.27, 'colsample_bytree': 0.9800000000000001, 'max_depth': 14, 'gamma': 26.200000000000003, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.003705767662335669, 'reg_alpha': 0.15848774819523223, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 255.29458161964558}. Best is trial 11 with value: 11.812964538059077.[0m


Number of boosting rounds: 388


[32m[I 2023-03-04 01:30:12,192][0m Trial 14 finished with value: 12.870976464454714 and parameters: {'use_label_encoder': False, 'n_estimators': 3800, 'learning_rate': 0.1695554057883031, 'subsample': 0.26, 'colsample_bytree': 0.7500000000000001, 'max_depth': 16, 'gamma': 47.2, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 5.761088687072841e-06, 'reg_alpha': 0.024562363604553574, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.1208861464049579}. Best is trial 11 with value: 11.812964538059077.[0m


Number of boosting rounds: 17


[32m[I 2023-03-04 01:30:38,281][0m Trial 15 finished with value: 11.78556326402402 and parameters: {'use_label_encoder': False, 'n_estimators': 4500, 'learning_rate': 0.23945909225405923, 'subsample': 0.28, 'colsample_bytree': 0.8200000000000001, 'max_depth': 1, 'gamma': 20.8, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.00029232112424727264, 'reg_alpha': 1.596734075355574e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 55.375223701820374}. Best is trial 15 with value: 11.78556326402402.[0m


Number of boosting rounds: 239


[32m[I 2023-03-04 01:31:03,450][0m Trial 16 finished with value: 11.950339763371783 and parameters: {'use_label_encoder': False, 'n_estimators': 3800, 'learning_rate': 0.2436020834202456, 'subsample': 0.21000000000000002, 'colsample_bytree': 0.7500000000000001, 'max_depth': 12, 'gamma': 34.0, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.053984534422149504, 'reg_alpha': 1.1398813492531075e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 55.64455435020249}. Best is trial 15 with value: 11.78556326402402.[0m


Number of boosting rounds: 63


[32m[I 2023-03-04 01:31:09,524][0m Trial 17 finished with value: 11.965095327920597 and parameters: {'use_label_encoder': False, 'n_estimators': 1000, 'learning_rate': 0.08428704648291188, 'subsample': 0.48, 'colsample_bytree': 0.43, 'max_depth': 10, 'gamma': 97.60000000000001, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.0004915770788349197, 'reg_alpha': 1.355652276266819e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 430.7550754555326}. Best is trial 15 with value: 11.78556326402402.[0m


Number of boosting rounds: 484


[32m[I 2023-03-04 01:31:38,913][0m Trial 18 finished with value: 11.822410856610494 and parameters: {'use_label_encoder': False, 'n_estimators': 4300, 'learning_rate': 0.17824861562296287, 'subsample': 0.49, 'colsample_bytree': 0.8200000000000001, 'max_depth': 15, 'gamma': 56.400000000000006, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.015178450078726835, 'reg_alpha': 1.183321631352644e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 75.20913915457817}. Best is trial 15 with value: 11.78556326402402.[0m


Number of boosting rounds: 35


[32m[I 2023-03-04 01:32:01,276][0m Trial 19 finished with value: 12.005608966101645 and parameters: {'use_label_encoder': False, 'n_estimators': 3500, 'learning_rate': 0.13239472782836648, 'subsample': 0.22, 'colsample_bytree': 0.7100000000000001, 'max_depth': 17, 'gamma': 0.30000000000000004, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.0010416544154575312, 'reg_alpha': 1.695441908212453e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 135.23690949330862}. Best is trial 15 with value: 11.78556326402402.[0m


Number of boosting rounds: 339


[32m[I 2023-03-04 01:32:28,324][0m Trial 20 finished with value: 16.49602867991335 and parameters: {'use_label_encoder': False, 'n_estimators': 4600, 'learning_rate': 0.19149190611082367, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.8800000000000001, 'max_depth': 11, 'gamma': 85.0, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 2.9528491135826906e-06, 'reg_alpha': 5.537530477325495e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 943.9464462136162}. Best is trial 15 with value: 11.78556326402402.[0m


Number of boosting rounds: 16


[32m[I 2023-03-04 01:32:57,263][0m Trial 21 finished with value: 11.900165628502439 and parameters: {'use_label_encoder': False, 'n_estimators': 4300, 'learning_rate': 0.16942763120553359, 'subsample': 0.47, 'colsample_bytree': 0.8700000000000001, 'max_depth': 15, 'gamma': 61.900000000000006, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.012000846951184143, 'reg_alpha': 1.0780062662223457e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 80.45526429322351}. Best is trial 15 with value: 11.78556326402402.[0m


Number of boosting rounds: 35


[32m[I 2023-03-04 01:33:25,842][0m Trial 22 finished with value: 12.090624232954294 and parameters: {'use_label_encoder': False, 'n_estimators': 4200, 'learning_rate': 0.24630784629645225, 'subsample': 0.45999999999999996, 'colsample_bytree': 0.48, 'max_depth': 15, 'gamma': 55.900000000000006, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.01734063411948322, 'reg_alpha': 8.073192211577314e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 36.43589077444479}. Best is trial 15 with value: 11.78556326402402.[0m


Number of boosting rounds: 40


[32m[I 2023-03-04 01:33:53,502][0m Trial 23 finished with value: 12.041023901328636 and parameters: {'use_label_encoder': False, 'n_estimators': 4500, 'learning_rate': 0.19754974889927507, 'subsample': 0.18, 'colsample_bytree': 0.81, 'max_depth': 17, 'gamma': 42.900000000000006, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.0006492277254328036, 'reg_alpha': 2.539847567623201e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 126.31317330727211}. Best is trial 15 with value: 11.78556326402402.[0m


Number of boosting rounds: 45


[32m[I 2023-03-04 01:34:17,326][0m Trial 24 finished with value: 11.965432253456111 and parameters: {'use_label_encoder': False, 'n_estimators': 3900, 'learning_rate': 0.12878827850063956, 'subsample': 0.73, 'colsample_bytree': 0.67, 'max_depth': 20, 'gamma': 84.10000000000001, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.4626815999812784, 'reg_alpha': 9.70796412665548e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 23.064962999710943}. Best is trial 15 with value: 11.78556326402402.[0m


Number of boosting rounds: 26


[32m[I 2023-03-04 01:34:47,961][0m Trial 25 finished with value: 11.9346740802571 and parameters: {'use_label_encoder': False, 'n_estimators': 5000, 'learning_rate': 0.19370733500466727, 'subsample': 0.32, 'colsample_bytree': 0.8500000000000001, 'max_depth': 11, 'gamma': 15.100000000000001, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.020390476797556352, 'reg_alpha': 4.085957040234086e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 279.73611053702837}. Best is trial 15 with value: 11.78556326402402.[0m


Number of boosting rounds: 50


[32m[I 2023-03-04 01:35:10,442][0m Trial 26 finished with value: 11.80820800464627 and parameters: {'use_label_encoder': False, 'n_estimators': 3500, 'learning_rate': 0.15884619918733153, 'subsample': 0.52, 'colsample_bytree': 0.54, 'max_depth': 4, 'gamma': 36.7, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.00046206322985373123, 'reg_alpha': 1.189241421118628e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 32.43757205331431}. Best is trial 15 with value: 11.78556326402402.[0m


Number of boosting rounds: 36


[32m[I 2023-03-04 01:35:33,065][0m Trial 27 finished with value: 11.836520216472818 and parameters: {'use_label_encoder': False, 'n_estimators': 3500, 'learning_rate': 0.10357977724928373, 'subsample': 0.86, 'colsample_bytree': 0.5, 'max_depth': 5, 'gamma': 24.1, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.0004187837114340476, 'reg_alpha': 8.741568431170826e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 29.22858073833243}. Best is trial 15 with value: 11.78556326402402.[0m


Number of boosting rounds: 97


[32m[I 2023-03-04 01:35:56,976][0m Trial 28 finished with value: 11.795825047364893 and parameters: {'use_label_encoder': False, 'n_estimators': 4000, 'learning_rate': 0.14478906789629997, 'subsample': 0.43000000000000005, 'colsample_bytree': 0.5800000000000001, 'max_depth': 3, 'gamma': 37.0, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 2.6802895247743154e-05, 'reg_alpha': 5.40872807828743e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 9.685011398816306}. Best is trial 15 with value: 11.78556326402402.[0m


Number of boosting rounds: 129


[32m[I 2023-03-04 01:36:12,355][0m Trial 29 finished with value: 11.829412248142624 and parameters: {'use_label_encoder': False, 'n_estimators': 2400, 'learning_rate': 0.12828199366116858, 'subsample': 0.4, 'colsample_bytree': 0.39, 'max_depth': 3, 'gamma': 37.2, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 2.294951282114358e-05, 'reg_alpha': 4.6586046795337933e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 12.469241391215622}. Best is trial 15 with value: 11.78556326402402.[0m
[32m[I 2023-03-04 01:36:12,376][0m A new study created in memory with name: no-name-8aa9677f-2c1b-4fe9-a923-8049fe69f0bb[0m


Number of boosting rounds: 80
Number of finished trials: 30
Best XGB trial parameters: {'use_label_encoder': False, 'n_estimators': 4500, 'learning_rate': 0.23945909225405923, 'subsample': 0.28, 'colsample_bytree': 0.8200000000000001, 'max_depth': 1, 'gamma': 20.8, 'booster': 'gbtree', 'tree_method': 'hist', 'reg_lambda': 0.00029232112424727264, 'reg_alpha': 1.596734075355574e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 55.375223701820374}
Best score: 11.78556326402402
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.92568	training's rmse: 11.6075	valid_1's l1: 9.26179	valid_1's rmse: 11.8441


[32m[I 2023-03-04 01:36:13,755][0m Trial 0 finished with value: 11.767030770953275 and parameters: {'objective': 'rmse', 'n_estimators': 2535, 'reg_alpha': 0.0012119503588163708, 'reg_lambda': 1.786523924375265e-08, 'colsample_bytree': 0.7300000000000001, 'num_leaves': 621, 'feature_fraction': 0.683980716117509, 'bagging_fraction': 0.4591863247313571, 'bagging_freq': 1, 'min_child_samples': 210, 'subsample': 0.66, 'learning_rate': 0.0640344829979919, 'max_depth': 6, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


Early stopping, best iteration is:
[149]	training's l1: 9.14469	training's rmse: 11.8931	valid_1's l1: 9.18155	valid_1's rmse: 11.767
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.24589	training's rmse: 10.7127	valid_1's l1: 9.41894	valid_1's rmse: 12.0522


[32m[I 2023-03-04 01:36:16,810][0m Trial 1 finished with value: 11.819050782657703 and parameters: {'objective': 'rmse', 'n_estimators': 1766, 'reg_alpha': 7.154543020675008e-08, 'reg_lambda': 0.030544248004081957, 'colsample_bytree': 0.39, 'num_leaves': 37, 'feature_fraction': 0.7206098356468035, 'bagging_fraction': 0.8749400689109591, 'bagging_freq': 4, 'min_child_samples': 105, 'subsample': 0.69, 'learning_rate': 0.032433250995786096, 'max_depth': 85, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


Early stopping, best iteration is:
[135]	training's l1: 8.86711	training's rmse: 11.5016	valid_1's l1: 9.26293	valid_1's rmse: 11.8191
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 9.13733	valid_1's l1: 9.35426


[32m[I 2023-03-04 01:36:17,743][0m Trial 2 finished with value: 12.028020452538373 and parameters: {'objective': 'mae', 'n_estimators': 3942, 'reg_alpha': 0.03296702607486116, 'reg_lambda': 0.017945209248368496, 'colsample_bytree': 0.44, 'num_leaves': 416, 'feature_fraction': 0.8134445942829929, 'bagging_fraction': 0.2631115498953501, 'bagging_freq': 1, 'min_child_samples': 245, 'subsample': 0.47, 'learning_rate': 0.06507388359906627, 'max_depth': 98, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


Early stopping, best iteration is:
[152]	training's l1: 9.30858	valid_1's l1: 9.25724
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 12.5825	valid_1's l1: 12.7882


[32m[I 2023-03-04 01:36:18,433][0m Trial 3 finished with value: 15.581956592322571 and parameters: {'objective': 'mae', 'n_estimators': 764, 'reg_alpha': 0.1088191911043753, 'reg_lambda': 1.1023069118371728e-07, 'colsample_bytree': 0.38, 'num_leaves': 535, 'feature_fraction': 0.6622155419490668, 'bagging_fraction': 0.15318103361504634, 'bagging_freq': 4, 'min_child_samples': 242, 'subsample': 0.5700000000000001, 'learning_rate': 0.014411823195038457, 'max_depth': 75, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


Did not meet early stopping. Best iteration is:
[756]	training's l1: 12.4215	valid_1's l1: 12.6219
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.90278	valid_1's l1: 9.12729


[32m[I 2023-03-04 01:36:20,151][0m Trial 4 finished with value: 11.819499399951644 and parameters: {'objective': 'mae', 'n_estimators': 2693, 'reg_alpha': 0.010458946280336534, 'reg_lambda': 0.009789055512629822, 'colsample_bytree': 0.8300000000000001, 'num_leaves': 505, 'feature_fraction': 0.9204501589549366, 'bagging_fraction': 0.6502254148660712, 'bagging_freq': 6, 'min_child_samples': 273, 'subsample': 0.47, 'learning_rate': 0.022122386808176308, 'max_depth': 94, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


Early stopping, best iteration is:
[239]	training's l1: 9.04128	valid_1's l1: 9.09069
Training until validation scores don't improve for 500 rounds


[32m[I 2023-03-04 01:36:21,257][0m Trial 5 finished with value: 11.864427113302392 and parameters: {'objective': 'mae', 'n_estimators': 4466, 'reg_alpha': 0.014354422702518556, 'reg_lambda': 2.6661824115277572e-06, 'colsample_bytree': 0.9600000000000001, 'num_leaves': 705, 'feature_fraction': 0.9015523261414601, 'bagging_fraction': 0.31080739500148197, 'bagging_freq': 13, 'min_child_samples': 122, 'subsample': 0.6, 'learning_rate': 0.17334134915810012, 'max_depth': 70, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


[500]	training's l1: 8.44916	valid_1's l1: 9.20914
Early stopping, best iteration is:
[42]	training's l1: 9.06069	valid_1's l1: 9.16983
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.22119	training's rmse: 10.6615	valid_1's l1: 9.53548	valid_1's rmse: 12.1285


[32m[I 2023-03-04 01:36:23,067][0m Trial 6 finished with value: 11.81811667821032 and parameters: {'objective': 'rmse', 'n_estimators': 2485, 'reg_alpha': 3.3479180255067665e-06, 'reg_lambda': 0.005638464576392926, 'colsample_bytree': 0.1, 'num_leaves': 404, 'feature_fraction': 0.4206999498325035, 'bagging_fraction': 0.7064690029658114, 'bagging_freq': 15, 'min_child_samples': 141, 'subsample': 0.92, 'learning_rate': 0.12481623381488552, 'max_depth': 70, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


Early stopping, best iteration is:
[67]	training's l1: 8.98427	training's rmse: 11.6762	valid_1's l1: 9.23571	valid_1's rmse: 11.8181
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 9.11476	training's rmse: 11.8487	valid_1's l1: 9.21498	valid_1's rmse: 11.7886


[32m[I 2023-03-04 01:36:25,136][0m Trial 7 finished with value: 11.781022291554427 and parameters: {'objective': 'rmse', 'n_estimators': 2701, 'reg_alpha': 2.6850308024042835e-07, 'reg_lambda': 5.164100054326421e-06, 'colsample_bytree': 0.39, 'num_leaves': 530, 'feature_fraction': 0.8044522196458971, 'bagging_fraction': 0.28554069438207463, 'bagging_freq': 1, 'min_child_samples': 119, 'subsample': 0.78, 'learning_rate': 0.01692942038522001, 'max_depth': 63, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


Early stopping, best iteration is:
[424]	training's l1: 9.14347	training's rmse: 11.8832	valid_1's l1: 9.20546	valid_1's rmse: 11.781
Training until validation scores don't improve for 500 rounds


[32m[I 2023-03-04 01:36:26,253][0m Trial 8 finished with value: 11.830394764827366 and parameters: {'objective': 'mae', 'n_estimators': 3473, 'reg_alpha': 0.006601019931023633, 'reg_lambda': 2.555178143238346e-05, 'colsample_bytree': 0.7200000000000001, 'num_leaves': 141, 'feature_fraction': 0.32021018471071205, 'bagging_fraction': 0.6605041219686502, 'bagging_freq': 8, 'min_child_samples': 286, 'subsample': 0.91, 'learning_rate': 0.0922025895513653, 'max_depth': 38, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


[500]	training's l1: 8.7406	valid_1's l1: 9.18216
Early stopping, best iteration is:
[71]	training's l1: 9.09808	valid_1's l1: 9.10315
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 9.41961	training's rmse: 12.1898	valid_1's l1: 9.40964	valid_1's rmse: 12.0547
[1000]	training's l1: 9.39315	training's rmse: 12.0871	valid_1's l1: 9.45873	valid_1's rmse: 12.0414


[32m[I 2023-03-04 01:36:27,572][0m Trial 9 finished with value: 12.02929995751502 and parameters: {'objective': 'rmse', 'n_estimators': 2199, 'reg_alpha': 3.52333879723585e-08, 'reg_lambda': 0.3354370622559683, 'colsample_bytree': 0.14, 'num_leaves': 107, 'feature_fraction': 0.2920371040495371, 'bagging_fraction': 0.14575387350666788, 'bagging_freq': 12, 'min_child_samples': 160, 'subsample': 0.84, 'learning_rate': 0.1612859672557736, 'max_depth': 100, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


Early stopping, best iteration is:
[737]	training's l1: 9.33217	training's rmse: 12.1596	valid_1's l1: 9.36071	valid_1's rmse: 12.0293
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 9.41559	training's rmse: 12.1584	valid_1's l1: 9.43309	valid_1's rmse: 11.9964
[1000]	training's l1: 9.26256	training's rmse: 11.9914	valid_1's l1: 9.30454	valid_1's rmse: 11.8935
Did not meet early stopping. Best iteration is:
[1188]	training's l1: 9.21454	training's rmse: 11.9664	valid_1's l1: 9.27202	valid_1's rmse: 11.8804


[32m[I 2023-03-04 01:36:28,736][0m Trial 10 finished with value: 11.88037149128063 and parameters: {'objective': 'rmse', 'n_estimators': 1206, 'reg_alpha': 6.030005358120916, 'reg_lambda': 2.372589462017527e-08, 'colsample_bytree': 0.64, 'num_leaves': 942, 'feature_fraction': 0.13488630418733316, 'bagging_fraction': 0.51104098189319, 'bagging_freq': 9, 'min_child_samples': 4, 'subsample': 0.2, 'learning_rate': 0.04761854081686318, 'max_depth': 1, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.96875	training's rmse: 11.643	valid_1's l1: 9.26719	valid_1's rmse: 11.8223


[32m[I 2023-03-04 01:36:33,074][0m Trial 11 finished with value: 11.817185011933331 and parameters: {'objective': 'rmse', 'n_estimators': 3223, 'reg_alpha': 4.867082727870175e-05, 'reg_lambda': 1.0317256761713962e-06, 'colsample_bytree': 0.56, 'num_leaves': 690, 'feature_fraction': 0.6006244449830316, 'bagging_fraction': 0.43776646697137195, 'bagging_freq': 0, 'min_child_samples': 190, 'subsample': 0.73, 'learning_rate': 0.0119582502551764, 'max_depth': 40, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


[1000]	training's l1: 8.7544	training's rmse: 11.3754	valid_1's l1: 9.30046	valid_1's rmse: 11.8753
Early stopping, best iteration is:
[537]	training's l1: 8.94217	training's rmse: 11.6151	valid_1's l1: 9.25894	valid_1's rmse: 11.8172
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 9.33021	training's rmse: 12.105	valid_1's l1: 9.29275	valid_1's rmse: 11.8841
[1000]	training's l1: 9.24081	training's rmse: 12.009	valid_1's l1: 9.25521	valid_1's rmse: 11.8532


[32m[I 2023-03-04 01:36:34,783][0m Trial 12 finished with value: 11.84834026095597 and parameters: {'objective': 'rmse', 'n_estimators': 1932, 'reg_alpha': 6.138086215146273e-05, 'reg_lambda': 2.0500155202410088e-08, 'colsample_bytree': 0.26, 'num_leaves': 720, 'feature_fraction': 0.7234440089239789, 'bagging_fraction': 0.3960843831336418, 'bagging_freq': 2, 'min_child_samples': 68, 'subsample': 0.33999999999999997, 'learning_rate': 0.03545020884620815, 'max_depth': 1, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


[1500]	training's l1: 9.19715	training's rmse: 11.9526	valid_1's l1: 9.2728	valid_1's rmse: 11.8653
Early stopping, best iteration is:
[1084]	training's l1: 9.22239	training's rmse: 11.998	valid_1's l1: 9.24028	valid_1's rmse: 11.8483
Training until validation scores don't improve for 500 rounds


[32m[I 2023-03-04 01:36:35,886][0m Trial 13 finished with value: 11.955558739861324 and parameters: {'objective': 'rmse', 'n_estimators': 3114, 'reg_alpha': 1.0668937725293697e-06, 'reg_lambda': 3.9796899030108636e-05, 'colsample_bytree': 0.78, 'num_leaves': 285, 'feature_fraction': 0.5482980357266252, 'bagging_fraction': 0.33149527682477303, 'bagging_freq': 3, 'min_child_samples': 199, 'subsample': 0.78, 'learning_rate': 0.07050808855828877, 'max_depth': 25, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


[500]	training's l1: 9.21366	training's rmse: 11.8727	valid_1's l1: 9.43211	valid_1's rmse: 12.0228
Early stopping, best iteration is:
[104]	training's l1: 9.42087	training's rmse: 12.1668	valid_1's l1: 9.36906	valid_1's rmse: 11.9556
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.05209	training's rmse: 10.4651	valid_1's l1: 9.42802	valid_1's rmse: 12.0649


[32m[I 2023-03-04 01:36:40,605][0m Trial 14 finished with value: 11.830309400215066 and parameters: {'objective': 'rmse', 'n_estimators': 4959, 'reg_alpha': 0.0004301197545932296, 'reg_lambda': 3.3937976038079794e-07, 'colsample_bytree': 1.0, 'num_leaves': 858, 'feature_fraction': 0.9606074581784647, 'bagging_fraction': 0.48283583942961344, 'bagging_freq': 0, 'min_child_samples': 77, 'subsample': 0.97, 'learning_rate': 0.02219082116951267, 'max_depth': 55, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


Early stopping, best iteration is:
[146]	training's l1: 8.77657	training's rmse: 11.3735	valid_1's l1: 9.2763	valid_1's rmse: 11.8303
Training until validation scores don't improve for 500 rounds


[32m[I 2023-03-04 01:36:41,477][0m Trial 15 finished with value: 11.959702279358272 and parameters: {'objective': 'rmse', 'n_estimators': 3762, 'reg_alpha': 9.569368597940617e-07, 'reg_lambda': 1.29172291323561e-08, 'colsample_bytree': 0.54, 'num_leaves': 610, 'feature_fraction': 0.8250942660932632, 'bagging_fraction': 0.24283189133835276, 'bagging_freq': 6, 'min_child_samples': 196, 'subsample': 0.6799999999999999, 'learning_rate': 0.24350391116098882, 'max_depth': 21, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


[500]	training's l1: 9.10988	training's rmse: 11.785	valid_1's l1: 9.54358	valid_1's rmse: 12.1942
Early stopping, best iteration is:
[40]	training's l1: 9.48182	training's rmse: 12.2683	valid_1's l1: 9.36709	valid_1's rmse: 11.9597
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 7.38142	training's rmse: 9.68269	valid_1's l1: 9.69835	valid_1's rmse: 12.5038


[32m[I 2023-03-04 01:36:45,376][0m Trial 16 finished with value: 11.886428834074767 and parameters: {'objective': 'rmse', 'n_estimators': 1479, 'reg_alpha': 0.0005683103465908196, 'reg_lambda': 6.225443264966211e-06, 'colsample_bytree': 0.29, 'num_leaves': 277, 'feature_fraction': 0.998875693099493, 'bagging_fraction': 0.3956424789094028, 'bagging_freq': 6, 'min_child_samples': 31, 'subsample': 0.45999999999999996, 'learning_rate': 0.05459704129772355, 'max_depth': 57, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


Early stopping, best iteration is:
[50]	training's l1: 8.89319	training's rmse: 11.5243	valid_1's l1: 9.30736	valid_1's rmse: 11.8864
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.99509	training's rmse: 11.6931	valid_1's l1: 9.2231	valid_1's rmse: 11.7976


[32m[I 2023-03-04 01:36:47,673][0m Trial 17 finished with value: 11.783695749768418 and parameters: {'objective': 'rmse', 'n_estimators': 2727, 'reg_alpha': 1.8412483536139508e-08, 'reg_lambda': 0.0002671560855668307, 'colsample_bytree': 0.8800000000000001, 'num_leaves': 757, 'feature_fraction': 0.8020788402060405, 'bagging_fraction': 0.5805427154808274, 'bagging_freq': 2, 'min_child_samples': 165, 'subsample': 0.83, 'learning_rate': 0.016845887678279103, 'max_depth': 20, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


Early stopping, best iteration is:
[296]	training's l1: 9.12966	training's rmse: 11.8464	valid_1's l1: 9.2116	valid_1's rmse: 11.7837
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 9.30156	training's rmse: 12.0279	valid_1's l1: 9.26813	valid_1's rmse: 11.8186
[1000]	training's l1: 9.1325	training's rmse: 11.861	valid_1's l1: 9.22769	valid_1's rmse: 11.7911


[32m[I 2023-03-04 01:36:50,950][0m Trial 18 finished with value: 11.784137310495424 and parameters: {'objective': 'rmse', 'n_estimators': 2240, 'reg_alpha': 1.128776959750403e-05, 'reg_lambda': 4.4550641491609137e-07, 'colsample_bytree': 0.68, 'num_leaves': 589, 'feature_fraction': 0.5490399987823426, 'bagging_fraction': 0.5527607890729442, 'bagging_freq': 4, 'min_child_samples': 228, 'subsample': 0.63, 'learning_rate': 0.010034932759269118, 'max_depth': 41, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


Early stopping, best iteration is:
[936]	training's l1: 9.14019	training's rmse: 11.8761	valid_1's l1: 9.22157	valid_1's rmse: 11.7841
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 9.1335	training's rmse: 11.8401	valid_1's l1: 9.31382	valid_1's rmse: 11.8707


[32m[I 2023-03-04 01:36:52,100][0m Trial 19 finished with value: 11.883299682438428 and parameters: {'objective': 'rmse', 'n_estimators': 3021, 'reg_alpha': 5.815303679740849e-07, 'reg_lambda': 1.5241584342062626e-07, 'colsample_bytree': 0.6200000000000001, 'num_leaves': 410, 'feature_fraction': 0.664393796828805, 'bagging_fraction': 0.19066207137387284, 'bagging_freq': 11, 'min_child_samples': 92, 'subsample': 0.14, 'learning_rate': 0.036440113287855784, 'max_depth': 10, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


Early stopping, best iteration is:
[132]	training's l1: 9.31013	training's rmse: 12.0985	valid_1's l1: 9.26732	valid_1's rmse: 11.8833
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 7.48449	training's rmse: 9.77712	valid_1's l1: 9.54325	valid_1's rmse: 12.2374


[32m[I 2023-03-04 01:36:59,157][0m Trial 20 finished with value: 11.882173542633502 and parameters: {'objective': 'rmse', 'n_estimators': 4190, 'reg_alpha': 2.1256105577890324e-07, 'reg_lambda': 3.6070571708602645e-06, 'colsample_bytree': 0.49, 'num_leaves': 875, 'feature_fraction': 0.8354353107005754, 'bagging_fraction': 0.10849042419382163, 'bagging_freq': 0, 'min_child_samples': 49, 'subsample': 0.30000000000000004, 'learning_rate': 0.024444796338159054, 'max_depth': 60, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


Early stopping, best iteration is:
[131]	training's l1: 8.51081	training's rmse: 11.0421	valid_1's l1: 9.32827	valid_1's rmse: 11.8822
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.98206	training's rmse: 11.6662	valid_1's l1: 9.23044	valid_1's rmse: 11.8114


[32m[I 2023-03-04 01:37:02,406][0m Trial 21 finished with value: 11.787443920266481 and parameters: {'objective': 'rmse', 'n_estimators': 2599, 'reg_alpha': 1.2863113836693392e-08, 'reg_lambda': 0.00024499904186749, 'colsample_bytree': 0.8700000000000001, 'num_leaves': 795, 'feature_fraction': 0.7751787461105153, 'bagging_fraction': 0.5473037647140888, 'bagging_freq': 2, 'min_child_samples': 146, 'subsample': 0.82, 'learning_rate': 0.01646719350274596, 'max_depth': 17, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


Early stopping, best iteration is:
[290]	training's l1: 9.12681	training's rmse: 11.8308	valid_1's l1: 9.21116	valid_1's rmse: 11.7874
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 9.197	training's rmse: 11.9299	valid_1's l1: 9.21861	valid_1's rmse: 11.7838


[32m[I 2023-03-04 01:37:04,806][0m Trial 22 finished with value: 11.77923516316003 and parameters: {'objective': 'rmse', 'n_estimators': 2842, 'reg_alpha': 1.0464471782509315e-07, 'reg_lambda': 0.00010143002212040106, 'colsample_bytree': 0.92, 'num_leaves': 633, 'feature_fraction': 0.8505124831329371, 'bagging_fraction': 0.3505123448939352, 'bagging_freq': 2, 'min_child_samples': 174, 'subsample': 0.77, 'learning_rate': 0.015616623027404342, 'max_depth': 29, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


Early stopping, best iteration is:
[474]	training's l1: 9.2014	training's rmse: 11.9397	valid_1's l1: 9.21492	valid_1's rmse: 11.7792
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 9.23006	training's rmse: 11.979	valid_1's l1: 9.2232	valid_1's rmse: 11.7988


[32m[I 2023-03-04 01:37:06,777][0m Trial 23 finished with value: 11.792872253464449 and parameters: {'objective': 'rmse', 'n_estimators': 3315, 'reg_alpha': 1.5639892865633581e-07, 'reg_lambda': 3.660295353489754e-05, 'colsample_bytree': 0.76, 'num_leaves': 617, 'feature_fraction': 0.8857199897801749, 'bagging_fraction': 0.347533165337656, 'bagging_freq': 3, 'min_child_samples': 176, 'subsample': 0.75, 'learning_rate': 0.01318759353326365, 'max_depth': 33, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


Early stopping, best iteration is:
[489]	training's l1: 9.23297	training's rmse: 11.9849	valid_1's l1: 9.21745	valid_1's rmse: 11.7929
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 9.54376	training's rmse: 12.2804	valid_1's l1: 9.41588	valid_1's rmse: 12.0067
[1000]	training's l1: 9.4117	training's rmse: 12.1504	valid_1's l1: 9.37295	valid_1's rmse: 11.9647


[32m[I 2023-03-04 01:37:09,047][0m Trial 24 finished with value: 11.960857677528427 and parameters: {'objective': 'rmse', 'n_estimators': 2158, 'reg_alpha': 1.007715790731171e-05, 'reg_lambda': 0.0008810046386344578, 'colsample_bytree': 0.92, 'num_leaves': 308, 'feature_fraction': 0.8789203530293646, 'bagging_fraction': 0.25915963478163706, 'bagging_freq': 1, 'min_child_samples': 216, 'subsample': 0.6799999999999999, 'learning_rate': 0.01021663192690305, 'max_depth': 9, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


Early stopping, best iteration is:
[872]	training's l1: 9.42683	training's rmse: 12.1713	valid_1's l1: 9.36496	valid_1's rmse: 11.9609
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.89975	training's rmse: 11.579	valid_1's l1: 9.27065	valid_1's rmse: 11.8404


[32m[I 2023-03-04 01:37:11,110][0m Trial 25 finished with value: 11.788427240232291 and parameters: {'objective': 'rmse', 'n_estimators': 2883, 'reg_alpha': 1.1471156825227754e-07, 'reg_lambda': 7.925290696025868e-08, 'colsample_bytree': 0.8, 'num_leaves': 633, 'feature_fraction': 0.7519551140316281, 'bagging_fraction': 0.4247048728392847, 'bagging_freq': 1, 'min_child_samples': 129, 'subsample': 1.0, 'learning_rate': 0.027353875766243978, 'max_depth': 48, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


Early stopping, best iteration is:
[216]	training's l1: 9.09385	training's rmse: 11.8211	valid_1's l1: 9.20989	valid_1's rmse: 11.7884
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.8201	valid_1's l1: 9.14211


[32m[I 2023-03-04 01:37:13,176][0m Trial 26 finished with value: 11.835289068332205 and parameters: {'objective': 'mae', 'n_estimators': 3486, 'reg_alpha': 1.6292101318674802e-08, 'reg_lambda': 8.503788538225659e-07, 'colsample_bytree': 0.24, 'num_leaves': 486, 'feature_fraction': 0.955608016244999, 'bagging_fraction': 0.3456356411962576, 'bagging_freq': 5, 'min_child_samples': 110, 'subsample': 0.9, 'learning_rate': 0.01915831798408998, 'max_depth': 29, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


Early stopping, best iteration is:
[297]	training's l1: 8.9454	valid_1's l1: 9.12195
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 9.61721	training's rmse: 12.3716	valid_1's l1: 9.45833	valid_1's rmse: 12.0588
[1000]	training's l1: 9.51118	training's rmse: 12.2743	valid_1's l1: 9.40354	valid_1's rmse: 12.0039


[32m[I 2023-03-04 01:37:15,132][0m Trial 27 finished with value: 12.002257647871781 and parameters: {'objective': 'rmse', 'n_estimators': 1717, 'reg_alpha': 2.973776780661784e-06, 'reg_lambda': 9.059750843405985e-06, 'colsample_bytree': 0.5900000000000001, 'num_leaves': 987, 'feature_fraction': 0.8490478425870175, 'bagging_fraction': 0.21358964005671868, 'bagging_freq': 3, 'min_child_samples': 255, 'subsample': 0.53, 'learning_rate': 0.017016212365340972, 'max_depth': 14, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


Early stopping, best iteration is:
[987]	training's l1: 9.50697	training's rmse: 12.2766	valid_1's l1: 9.39998	valid_1's rmse: 12.0023
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 9.39032	training's rmse: 12.0983	valid_1's l1: 9.48384	valid_1's rmse: 12.0257


[32m[I 2023-03-04 01:37:16,294][0m Trial 28 finished with value: 12.004500954439989 and parameters: {'objective': 'rmse', 'n_estimators': 2476, 'reg_alpha': 2.436233195861919e-07, 'reg_lambda': 1.3955571167343825e-06, 'colsample_bytree': 0.7000000000000001, 'num_leaves': 541, 'feature_fraction': 0.7183251526350674, 'bagging_fraction': 0.2919727962473456, 'bagging_freq': 9, 'min_child_samples': 215, 'subsample': 0.64, 'learning_rate': 0.02892292498500479, 'max_depth': 49, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m


Early stopping, best iteration is:
[264]	training's l1: 9.46049	training's rmse: 12.1961	valid_1's l1: 9.43953	valid_1's rmse: 12.0045
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 8.51552	training's rmse: 11.0646	valid_1's l1: 9.37412	valid_1's rmse: 11.9619


[32m[I 2023-03-04 01:37:18,672][0m Trial 29 finished with value: 11.8083331677816 and parameters: {'objective': 'rmse', 'n_estimators': 1984, 'reg_alpha': 9.031226496284448e-08, 'reg_lambda': 8.541190597960264e-05, 'colsample_bytree': 0.37, 'num_leaves': 826, 'feature_fraction': 0.7662890167951467, 'bagging_fraction': 0.9292482930828743, 'bagging_freq': 5, 'min_child_samples': 179, 'subsample': 0.71, 'learning_rate': 0.0410247897345418, 'max_depth': 87, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 11.767030770953275.[0m
[32m[I 2023-03-04 01:37:18,685][0m A new study created in memory with name: no-name-f113fe3d-6818-4df9-a60d-e733f1c77ea8[0m


Early stopping, best iteration is:
[125]	training's l1: 8.98746	training's rmse: 11.6696	valid_1's l1: 9.23711	valid_1's rmse: 11.8083
Number of finished trials: 30
Best LGBM trial parameters: {'objective': 'rmse', 'n_estimators': 2535, 'reg_alpha': 0.0012119503588163708, 'reg_lambda': 1.786523924375265e-08, 'colsample_bytree': 0.7300000000000001, 'num_leaves': 621, 'feature_fraction': 0.683980716117509, 'bagging_fraction': 0.4591863247313571, 'bagging_freq': 1, 'min_child_samples': 210, 'subsample': 0.66, 'learning_rate': 0.0640344829979919, 'max_depth': 6, 'random_state': 42, 'n_jobs': 4}
Best score: 11.767030770953275


[32m[I 2023-03-04 01:37:20,068][0m Trial 0 finished with value: 11.906801459012655 and parameters: {'learning_rate': 0.14668493390006054, 'l2_leaf_reg': 1.3967730339951894, 'bagging_temperature': 0.9561849665473519, 'random_strength': 1.1920759239547785, 'depth': 10, 'min_data_in_leaf': 141}. Best is trial 0 with value: 11.906801459012655.[0m
[32m[I 2023-03-04 01:37:20,317][0m Trial 1 finished with value: 11.735592401366445 and parameters: {'learning_rate': 0.41819554307304574, 'l2_leaf_reg': 92.35291929846832, 'bagging_temperature': 0.4456459626783069, 'random_strength': 1.0685821968275158, 'depth': 6, 'min_data_in_leaf': 117}. Best is trial 1 with value: 11.735592401366445.[0m
[32m[I 2023-03-04 01:37:20,582][0m Trial 2 finished with value: 11.782583201182897 and parameters: {'learning_rate': 0.12060590790996903, 'l2_leaf_reg': 39.621871653060914, 'bagging_temperature': 13.192034140643006, 'random_strength': 1.2002048813209156, 'depth': 6, 'min_data_in_leaf': 286}. Best is tri

Number of finished trials: 30
Best Cat trial parameters: {'learning_rate': 0.41819554307304574, 'l2_leaf_reg': 92.35291929846832, 'bagging_temperature': 0.4456459626783069, 'random_strength': 1.0685821968275158, 'depth': 6, 'min_data_in_leaf': 117}
Best score: 11.735592401366445
CPU times: user 47min 2s, sys: 45.7 s, total: 47min 48s
Wall time: 12min 57s


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Train Models with Cross Validation</h1>
</div>

In [29]:
train = create_folds(train, Config.N_FOLDS)
# train = create_strat_folds(train, TARGET, Config.N_FOLDS)

n_folds=5, seed=42


In [30]:
all_cv_scores = pd.DataFrame(
    {
        "Model": pd.Series(dtype="str"),
        "Score": pd.Series(dtype="float"),
        "StdDev": pd.Series(dtype="float"),
        "RunTime": pd.Series(dtype="float"),
    }
)

oof = train[[ID, TARGET, "fold"]].copy().reset_index(drop=True).copy()
oof.set_index(ID, inplace=True)
oof.head()

Unnamed: 0_level_0,Strength,fold
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,10.38,2
1,23.52,3
2,36.96,3
3,39.05,4
4,74.19,4


In [31]:
def show_tree_model_fi(model, features:List[str]) -> None:
    print("\n=== Model Feature Importance ===")
    for i in model.feature_importances_.argsort()[::-1]:
        print(features[i], model.feature_importances_[i]/model.feature_importances_.sum())

def save_oof_predictions(model_name:str, final_valid_predictions, oof:pd.DataFrame) -> pd.DataFrame:
    final_valid_predictions_df = process_valid_predictions(
        final_valid_predictions, ID, model_name
    )
    display(final_valid_predictions_df.head())
    oof[f"pred_{model_name}"] = final_valid_predictions_df[f"pred_{model_name}"]

    return oof

def save_test_predictions(model_name:str, final_test_predictions, submission_df:pd.DataFrame, result_field:str=TARGET) -> None:
    result = merge_test_predictions(final_test_predictions, Config.calc_probability)
    # result[:20]
    submission_df[f"target_{model_name}"] = result #.astype(int)
    #     submission_df.head(10)
    ss = submission_df[[ID, f"target_{model_name}"]].copy().reset_index(drop=True)
    ss.rename(columns={f"target_{model_name}": result_field}, inplace=True)
    ss.to_csv(
        f"submission_{model_name}.csv", index=False
    )  # Can submit the individual model
    print("=== Target Value Counts ===")
#     display(ss[TARGET].value_counts())
    ss.head(10)

def process_valid_predictions(final_valid_predictions, train_id, model_name:str) -> pd.DataFrame:
    model = f"pred_{model_name}"
    final_valid_predictions_df = pd.DataFrame.from_dict(
        final_valid_predictions, orient="index"
    ).reset_index()
    final_valid_predictions_df.columns = [train_id, model]
    final_valid_predictions_df.set_index(train_id, inplace=True)
    final_valid_predictions_df.sort_index(inplace=True)
    final_valid_predictions_df.to_csv(f"train_pred_{model_name}.csv", index=True)

    return final_valid_predictions_df

def add_score(score_df:pd.DataFrame, model_name:str, score:float, std:float):
    dict1 = {"Model": model_name, "Score": cv_score, "StdDev": std_dev}
    score_df = score_df.append(dict1, ignore_index=True)
    return score_df

In [32]:
def train_cv_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid,
    params,
    n_folds:int=5,
    seed:int=42,
):

    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(
            drop=True
        )  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)
        xtest = test.copy()

        valid_ids = xvalid.id.values.tolist()  # Id's of everything in validation fold

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[FEATURES]
        xvalid = xvalid[FEATURES]

        scaler = preprocessing.StandardScaler()
#         scaler = preprocessing.MinMaxScaler()
        xtrain = scaler.fit(xtrain).transform(xtrain)
        xvalid = scaler.transform(xvalid)
        xtest = scaler.transform(xtest)

        model = get_model_fn # ()

        model.fit(
            xtrain,
            ytrain,
        )
        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        preds_valid_class = model.predict(xvalid)
        
        final_test_predictions.append(test_preds)
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

#         fold_score = metrics.accuracy_score(yvalid, preds_valid_class)  # Validation Set Score
        fold_score = metrics.mean_absolute_error(
            yvalid, preds_valid
        ) 
#         fold_score = metrics.roc_auc_score(yvalid.values, preds_valid)  # Validation Set Score
#         show_classification_scores(yvalid.values, preds_valid_class)

#         fold_score = metrics.roc_auc_score(yvalid, preds_valid)  # Validation Set Score
#         fold_score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        fold_scores.append(fold_score)
        #         importance_list.append(model.coef_.ravel())

        fi = []
        # Feature importance
#         fi = pd.DataFrame(
#             index=FEATURES,
#             data=model.coef_.ravel(),
#             columns=[f"{fold}_importance"],
#         )
        
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )


def train_xgb_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid:str,
    params,
    n_folds:int=5,
    seed:int=42,
):

    print(params)
    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(
            drop=True
        )  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)
        xtest = test.copy()

        valid_ids = xvalid.id.values.tolist()  # Id's of everything in validation fold

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[FEATURES]
        xvalid = xvalid[FEATURES]

        model = get_model_fn # (params)

        model.fit(
            xtrain,
            ytrain,
            eval_set=[(xvalid, yvalid)],
            #             eval_metric="acc",  # auc
            verbose=0,
            #             early_stopping_rounds=3000,
            #             callbacks=[
            #                 xgb.log_evaluation(0),
            #                 xgb.early_stopping(500, False, True),
            #             ],
        )

        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        preds_valid_class = model.predict(xvalid)
        
        final_test_predictions.append(test_preds)
        if Config.debug:
            print(f"GT Type: {type(yvalid.values)}")
            print(f"Preds Type: {type(preds_valid_class)}")
            print(f"         GT:{yvalid.values[:20]}")
            print(f"Preds Class:{preds_valid_class[:20]}")
            print(f"Preds Prob:{preds_valid[:20]}")
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid_class)))

#         fold_score = metrics.cohen_kappa_score(yvalid,  preds_valid_class, weights = "quadratic")
#         fold_score = metrics.roc_auc_score(yvalid.values, preds_valid)  # Validation Set Score
#         show_classification_scores(yvalid.values, preds_valid_class)
        fold_score = metrics.mean_absolute_error(
            yvalid, preds_valid
        )  # Validation Set Score
#         fold_score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        fold_scores.append(fold_score)

        # Feature importance
        fi = pd.DataFrame(
            index=FEATURES,
            data=model.feature_importances_,
            columns=[f"{fold}_importance"],
        )
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )        

In [33]:
def run_linear_model(model_dict, model_name:str, features:List[str], oof:pd.DataFrame) -> (float, float, pd.DataFrame):
    (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_cv_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        False, #Config.calc_probability,
        ID,
        {},
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    oof.head()
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof


def run_tree_model(model_dict, model_name:str, features:List[str], params, oof:pd.DataFrame) -> (float, float, pd.DataFrame):
    (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_xgb_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        Config.calc_probability,
        ID,
        params,
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)
    show_tree_model_fi(model, features)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    oof.head()
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof

In [34]:
%%time

def run_models4features(model_dict, model_lst:List[str], target:str, feature_lst:List[str], all_cv_scores:pd.DataFrame, linear_models:bool=True) -> pd.DataFrame:

    oof = train[[ID, target, "fold"]].copy().reset_index(drop=True).copy()
    oof.set_index(ID, inplace=True)

    for idx, m in enumerate(model_lst):
        model = model_lst[idx]
        start_time = time.time()

        print(f"Model={model}")

        params = {}
        if linear_models:
                cv_score, std_dev, oof = run_linear_model(model_dict, model, feature_lst, oof)

        else:
            cv_score, std_dev, oof = run_tree_model(model_dict, model, feature_lst, params, oof)

        run_time = time.time() - start_time

        score_dict = {"Model": model, "Score": cv_score, "StdDev": std_dev, "RunTime": run_time}
        all_cv_scores = all_cv_scores.append(score_dict, ignore_index=True)
        print(f"Model Run Time: {run_time:.2f}")

    return all_cv_scores




CPU times: user 26 µs, sys: 1e+03 ns, total: 27 µs
Wall time: 30.8 µs


In [35]:
lgbm_params = {'n_estimators': Config.N_ESTIMATORS,
                 'num_rounds': 404,
                 'learning_rate': 0.19,
                 'num_leaves': 17,
                 'max_depth': 8,
                 'min_data_in_leaf': 36,
                 'lambda_l1': 0.96,
                 'lambda_l2': 0.01,
                 'min_gain_to_split': 11.32,
                 'bagging_fraction': 0.6,
                 'feature_fraction': 0.9}


lgbm_params3 = {
    "n_estimators": Config.N_ESTIMATORS,
    'max_depth': 9,
    'learning_rate': 0.01,
    'min_data_in_leaf': 36, 
    'num_leaves': 100, 
    'feature_fraction': 0.8, 
    'bagging_fraction': 0.89, 
    'bagging_freq': 5, 
    'lambda_l2': 28,
    
    'seed': Config.seed,
    'objective': 'regression',
#     'boosting_type': 'gbdt',
#     'device': 'gpu', 
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0,
    'n_jobs': -1,
    'metric': 'rmse',
    'verbose': -1
}
    
lgbm_params = gpu_ify_lgbm(lgbm_params)

In [36]:
xgb_params = {
    "n_estimators": Config.N_ESTIMATORS,  # 10_000,
    "max_depth": 10,  # 10
    "objective": "reg:squarederror",
    #     "enable_categorical": True,  # Only works with gpu_hist
    #     "eval_metric": "mae",
    #     "metric": "mae",
    #     "enable_categorical": True,
    "n_jobs": 8,  # 4
    "seed": Config.seed,
    "tree_method": "hist",
    #         "gpu_id": 0,
    "subsample": 0.9,  # 0.7
    "colsample_bytree": 0.7,
    "use_label_encoder": False,
    "learning_rate": 0.05,  # 0.01
}

xgb_params3 = {
    'n_estimators': Config.N_ESTIMATORS,
    'learning_rate': 0.05,
    'max_depth': 10,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:squarederror'
}

if Config.gpu:
    xgb_params["tree_method"] = "gpu_hist"
else:
    xgb_params["tree_method"] = "hist"

In [37]:
cb_params = {
    #     "learning_rate": 0.3277295792305584,
    "learning_rate": 0.05,
    "l2_leaf_reg": 3.1572972266001518,
    "bagging_temperature": 0.6799604234141348,
    "random_strength": 1.99590400593318,
    "depth": 10,
    "min_data_in_leaf": 93,
    # "iterations": 100,  # 10000
    "n_estimators": Config.N_ESTIMATORS,  # 10000
    "use_best_model": True,
    #     "task_type": "GPU",
    "random_seed": Config.seed,
}

cb_params = gpu_ify_cb(cb_params)

In [38]:
model_estimator_dict = {
    "xgb2": xgb.XGBRegressor(**xgb_params),
    "xgb_best_params": xgb.XGBRegressor(**best_xgb_params),
    "xgb3": xgb.XGBRegressor(**xgb_params3),


    "lgbm1": lgb.LGBMRegressor(**lgbm_params),

    "cat1": cb.CatBoostRegressor(),
    "cat2": cb.CatBoostRegressor(**cb_params),
    "cat_best_params": cb.CatBoostRegressor(**best_cb_params),

    "xgb1": xgb.XGBRegressor(),
    "lgbm0": lgb.LGBMRegressor(),
    "lgbm3": lgb.LGBMRegressor(lgbm_params3),
    "lgbm2": lgb.LGBMRegressor(
        learning_rate=0.05,
        max_depth=15,
        num_leaves=11,
        feature_fraction=0.3,
        subsample=0.1,
        n_jobs=-1,
    ),
    "lgbm3": lgb.LGBMRegressor(**lgbm_params),
    "lgbm_best_params": lgb.LGBMRegressor(**best_lgbm_params),


    "lin_reg": linear_model.LinearRegression(),
    "lasso": linear_model.Lasso(),
    "ridge": linear_model.Ridge(max_iter=7000),
    "ridge_25": linear_model.Ridge(fit_intercept=True, solver='auto', alpha=0.25, max_iter=7000),
    "ridge_50": linear_model.Ridge(fit_intercept=True, solver='auto', alpha=0.5, max_iter=7000),
}

## Tree Models

In [39]:
%%time

# model_lst = ["xgb3","xgb_best_params", "lgbm_best_params", "cat_best_params", "xgb1", "xgb2", "lgbm1", "lgbm2", "cat1", "cat2"]
model_lst = ["xgb_best_params", "lgbm_best_params", "cat_best_params","xgb3", "xgb1", "xgb2", "lgbm0", "lgbm1", "lgbm2", "lgbm3", "cat1", "cat2"]
# model_lst = = []
all_cv_scores = run_models4features(model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=False)    

all_cv_scores.sort_values(by=["Score"], ascending=False)

Model=xgb_best_params
{}
fold: 1, Score: 9.481948653113601, Run Time: 18.72
fold: 2, Score: 9.391805223105354, Run Time: 18.74
fold: 3, Score: 9.532995392925535, Run Time: 17.84
fold: 4, Score: 9.638148265577486, Run Time: 18.69
fold: 5, Score: 9.685478982616639, Run Time: 17.96
Scores -> Adjusted: 9.44025150 , mean: 9.54607530, std: 0.10582380

=== Model Feature Importance ===
AgeInDays 0.2533945
SuperplasticizerComponent 0.113955066
CoarseAggregateComponent 0.107391626
CementComponent 0.10717395
FlyAshComponent 0.10671706
WaterComponent 0.10496383
FineAggregateComponent 0.10453541
BlastFurnaceSlag 0.101868555


Unnamed: 0_level_0,pred_xgb_best_params
id,Unnamed: 1_level_1
0,20.06859
1,31.06961
2,37.85812
3,43.0791
4,45.13815


Mode
=== Target Value Counts ===
Model Run Time: 92.13
Model=lgbm_best_params
{}
fold: 1, Score: 9.3988586206226, Run Time: 5.82
fold: 2, Score: 9.347401500802984, Run Time: 5.10
fold: 3, Score: 9.442804389768083, Run Time: 5.17
fold: 4, Score: 9.58360764915207, Run Time: 4.96
fold: 5, Score: 9.775701426733617, Run Time: 5.02
Scores -> Adjusted: 9.35519447 , mean: 9.50967472, std: 0.15448025

=== Model Feature Importance ===
FineAggregateComponent 0.21388977381331634
CementComponent 0.2049060210258044
CoarseAggregateComponent 0.1937559732398853
WaterComponent 0.13163427843262185
BlastFurnaceSlag 0.07817776361898694
SuperplasticizerComponent 0.07123287671232877
AgeInDays 0.07110544759477541
FlyAshComponent 0.03529786556228098


Unnamed: 0_level_0,pred_lgbm_best_params
id,Unnamed: 1_level_1
0,20.47193
1,35.24541
2,38.11814
3,41.74591
4,45.2236


Mode
=== Target Value Counts ===
Model Run Time: 26.22
Model=cat_best_params
{}
fold: 1, Score: 9.233596285920974, Run Time: 1.99
fold: 2, Score: 9.213950397174434, Run Time: 2.04
fold: 3, Score: 9.314460385940018, Run Time: 2.05
fold: 4, Score: 9.30742201420632, Run Time: 2.07
fold: 5, Score: 9.507924831421617, Run Time: 2.30
Scores -> Adjusted: 9.21143792 , mean: 9.31547078, std: 0.10403287

=== Model Feature Importance ===
AgeInDays 0.7040895533804734
SuperplasticizerComponent 0.08462103494536989
CementComponent 0.07553747889504661
WaterComponent 0.03945064927615014
FineAggregateComponent 0.03319777955775732
CoarseAggregateComponent 0.028718577744917564
BlastFurnaceSlag 0.019634764540601247
FlyAshComponent 0.014750161659683746


Unnamed: 0_level_0,pred_cat_best_params
id,Unnamed: 1_level_1
0,22.68451
1,36.51328
2,37.78211
3,43.81183
4,42.9902


Mode
=== Target Value Counts ===
Model Run Time: 10.59
Model=xgb3
{}
fold: 1, Score: 10.091091658861933, Run Time: 6.08
fold: 2, Score: 10.052488552594143, Run Time: 6.07
fold: 3, Score: 10.04896128609487, Run Time: 6.22
fold: 4, Score: 10.11183313533843, Run Time: 6.16
fold: 5, Score: 10.52715707648363, Run Time: 6.20
Scores -> Adjusted: 9.98433811 , mean: 10.16630634, std: 0.18196823

=== Model Feature Importance ===
AgeInDays 0.35628125
SuperplasticizerComponent 0.12839037
CoarseAggregateComponent 0.09888333
FineAggregateComponent 0.098086454
WaterComponent 0.08681369
FlyAshComponent 0.08379879
CementComponent 0.075342216
BlastFurnaceSlag 0.07240394


Unnamed: 0_level_0,pred_xgb3
id,Unnamed: 1_level_1
0,29.41444
1,33.95918
2,41.74735
3,40.83153
4,46.67374


Mode
=== Target Value Counts ===
Model Run Time: 31.00
Model=xgb1
{}
fold: 1, Score: 9.84209825928242, Run Time: 0.84
fold: 2, Score: 9.702522084170921, Run Time: 0.86
fold: 3, Score: 9.79027357074974, Run Time: 0.86
fold: 4, Score: 9.864710404892744, Run Time: 0.86
fold: 5, Score: 10.264338967842047, Run Time: 0.85
Scores -> Adjusted: 9.69882049 , mean: 9.89278866, std: 0.19396817

=== Model Feature Importance ===
AgeInDays 0.54361254
SuperplasticizerComponent 0.10752861
FineAggregateComponent 0.06606827
WaterComponent 0.06416693
CementComponent 0.060588468
CoarseAggregateComponent 0.06044707
FlyAshComponent 0.04993739
BlastFurnaceSlag 0.04765066


Unnamed: 0_level_0,pred_xgb1
id,Unnamed: 1_level_1
0,27.3127
1,33.55209
2,38.14085
3,40.91367
4,47.12215


Mode
=== Target Value Counts ===
Model Run Time: 4.43
Model=xgb2
{}
fold: 1, Score: 10.01058655567839, Run Time: 6.89
fold: 2, Score: 10.165726267767042, Run Time: 6.74
fold: 3, Score: 10.063503279037546, Run Time: 6.90
fold: 4, Score: 10.036424198838763, Run Time: 7.03
fold: 5, Score: 10.35826922186429, Run Time: 6.84
Scores -> Adjusted: 9.99980114 , mean: 10.12690190, std: 0.12710077

=== Model Feature Importance ===
AgeInDays 0.4475332
SuperplasticizerComponent 0.09791386
WaterComponent 0.08898847
CoarseAggregateComponent 0.0823762
FineAggregateComponent 0.08062078
FlyAshComponent 0.07351577
CementComponent 0.06876296
BlastFurnaceSlag 0.060288776


Unnamed: 0_level_0,pred_xgb2
id,Unnamed: 1_level_1
0,29.50892
1,33.15569
2,44.59445
3,40.74338
4,46.37291


Mode
=== Target Value Counts ===
Model Run Time: 34.65
Model=lgbm0
{}
fold: 1, Score: 9.39625791932152, Run Time: 0.59
fold: 2, Score: 9.420380264436687, Run Time: 0.58
fold: 3, Score: 9.322624712720717, Run Time: 0.61
fold: 4, Score: 9.620820265945634, Run Time: 0.61
fold: 5, Score: 9.702745000107736, Run Time: 0.63
Scores -> Adjusted: 9.34835021 , mean: 9.49256563, std: 0.14421542

=== Model Feature Importance ===
FineAggregateComponent 0.163
CementComponent 0.162
CoarseAggregateComponent 0.15933333333333333
WaterComponent 0.13733333333333334
SuperplasticizerComponent 0.11466666666666667
AgeInDays 0.10666666666666667
BlastFurnaceSlag 0.08766666666666667
FlyAshComponent 0.06933333333333333


Unnamed: 0_level_0,pred_lgbm0
id,Unnamed: 1_level_1
0,21.67873
1,34.60661
2,40.2935
3,41.25614
4,47.89707


Mode
=== Target Value Counts ===
Model Run Time: 3.19
Model=lgbm1
{}
fold: 1, Score: 9.417488572731047, Run Time: 0.61
fold: 2, Score: 9.448855006985498, Run Time: 0.58
fold: 3, Score: 9.355521851267918, Run Time: 0.57
fold: 4, Score: 9.654747967273734, Run Time: 0.54
fold: 5, Score: 9.769280049993588, Run Time: 0.54
Scores -> Adjusted: 9.37265420 , mean: 9.52917869, std: 0.15652449

=== Model Feature Importance ===
FineAggregateComponent 0.18101659751037344
CementComponent 0.15871369294605808
CoarseAggregateComponent 0.15404564315352698
WaterComponent 0.1400414937759336
SuperplasticizerComponent 0.11514522821576763
AgeInDays 0.09024896265560166
BlastFurnaceSlag 0.08402489626556017
FlyAshComponent 0.07676348547717843


Unnamed: 0_level_0,pred_lgbm1
id,Unnamed: 1_level_1
0,23.60652
1,35.5162
2,40.99849
3,40.25634
4,47.27227


Mode
=== Target Value Counts ===
Model Run Time: 2.99
Model=lgbm2
{}
fold: 1, Score: 9.946166290378631, Run Time: 0.25
fold: 2, Score: 9.905541427748044, Run Time: 0.25
fold: 3, Score: 10.014246391238743, Run Time: 0.25
fold: 4, Score: 9.93594895440473, Run Time: 0.27
fold: 5, Score: 9.971055719158814, Run Time: 0.34
Scores -> Adjusted: 9.91811104 , mean: 9.95459176, std: 0.03648072

=== Model Feature Importance ===
FineAggregateComponent 0.214
CementComponent 0.178
SuperplasticizerComponent 0.164
AgeInDays 0.106
CoarseAggregateComponent 0.095
FlyAshComponent 0.095
BlastFurnaceSlag 0.081
WaterComponent 0.067


Unnamed: 0_level_0,pred_lgbm2
id,Unnamed: 1_level_1
0,27.9079
1,37.75881
2,34.09795
3,42.3843
4,40.2823


Mode
=== Target Value Counts ===
Model Run Time: 1.51
Model=lgbm3
{}
fold: 1, Score: 9.417488572731047, Run Time: 0.61
fold: 2, Score: 9.448855006985498, Run Time: 0.58
fold: 3, Score: 9.355521851267918, Run Time: 0.56
fold: 4, Score: 9.654747967273734, Run Time: 0.47
fold: 5, Score: 9.769280049993588, Run Time: 0.53
Scores -> Adjusted: 9.37265420 , mean: 9.52917869, std: 0.15652449

=== Model Feature Importance ===
FineAggregateComponent 0.18101659751037344
CementComponent 0.15871369294605808
CoarseAggregateComponent 0.15404564315352698
WaterComponent 0.1400414937759336
SuperplasticizerComponent 0.11514522821576763
AgeInDays 0.09024896265560166
BlastFurnaceSlag 0.08402489626556017
FlyAshComponent 0.07676348547717843


Unnamed: 0_level_0,pred_lgbm3
id,Unnamed: 1_level_1
0,23.60652
1,35.5162
2,40.99849
3,40.25634
4,47.27227


Mode
=== Target Value Counts ===
Model Run Time: 2.89
Model=cat1
{}
fold: 1, Score: 9.19104246464739, Run Time: 2.11
fold: 2, Score: 9.174669405745702, Run Time: 2.14
fold: 3, Score: 9.338993063036588, Run Time: 2.03
fold: 4, Score: 9.29695125676774, Run Time: 2.06
fold: 5, Score: 9.521023900602884, Run Time: 2.00
Scores -> Adjusted: 9.17974985 , mean: 9.30453602, std: 0.12478617

=== Model Feature Importance ===
AgeInDays 0.5790244848373505
CementComponent 0.09899117001309628
SuperplasticizerComponent 0.08080817745726981
WaterComponent 0.06362168764728943
FineAggregateComponent 0.058179071466406634
CoarseAggregateComponent 0.04994742137378766
BlastFurnaceSlag 0.03925279814818057
FlyAshComponent 0.030175189056619138


Unnamed: 0_level_0,pred_cat1
id,Unnamed: 1_level_1
0,21.46285
1,36.4238
2,37.67066
3,43.58667
4,44.68501


Mode
=== Target Value Counts ===
Model Run Time: 10.48
Model=cat2
{}
fold: 1, Score: 9.209502263196073, Run Time: 6.21
fold: 2, Score: 9.230605090297756, Run Time: 5.77
fold: 3, Score: 9.3322001648631, Run Time: 5.79
fold: 4, Score: 9.347803937945946, Run Time: 5.72
fold: 5, Score: 9.529095850137951, Run Time: 5.74
Scores -> Adjusted: 9.21638671 , mean: 9.32984146, std: 0.11345475

=== Model Feature Importance ===
AgeInDays 0.5482160712949171
CementComponent 0.09169667051442237
FineAggregateComponent 0.06694947917712207
SuperplasticizerComponent 0.06660258008244736
WaterComponent 0.06318902698492701
BlastFurnaceSlag 0.057525103289272954
CoarseAggregateComponent 0.05746696677805677
FlyAshComponent 0.048354101878834335


Unnamed: 0_level_0,pred_cat2
id,Unnamed: 1_level_1
0,23.64196
1,33.15506
2,38.78602
3,41.95718
4,44.01749


Mode
=== Target Value Counts ===
Model Run Time: 29.37
CPU times: user 10min 33s, sys: 43.6 s, total: 11min 17s
Wall time: 4min 9s


Unnamed: 0,Model,Score,StdDev,RunTime
3,xgb3,10.16631,0.18197,30.99562
5,xgb2,10.1269,0.1271,34.64897
8,lgbm2,9.95459,0.03648,1.50999
4,xgb1,9.89279,0.19397,4.42751
0,xgb_best_params,9.54608,0.10582,92.13266
7,lgbm1,9.52918,0.15652,2.98532
9,lgbm3,9.52918,0.15652,2.89308
1,lgbm_best_params,9.50967,0.15448,26.21809
6,lgbm0,9.49257,0.14422,3.18612
11,cat2,9.32984,0.11345,29.37384


## Linear Models

In [40]:
model_lst = ["lin_reg", "lasso", "ridge", "ridge_25", "ridge_50"]
model_lst = ["lasso", "ridge",  "ridge_50"]
# model_lst = []
# all_cv_scores = run_models4features(model_lst, TARGET, FEATURES, all_cv_scores, linear_models=True)    
all_cv_scores = run_models4features(model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=True)    

all_cv_scores.head()

Model=lasso
fold: 1, Score: 11.822677488454982, Run Time: 0.04
fold: 2, Score: 11.641289509965215, Run Time: 0.06
fold: 3, Score: 11.791907114570881, Run Time: 0.06
fold: 4, Score: 11.646972288117684, Run Time: 0.06
fold: 5, Score: 11.598603795765076, Run Time: 0.06
Scores -> Adjusted: 11.61080673 , mean: 11.70029004, std: 0.08948331


Unnamed: 0_level_0,pred_lasso
id,Unnamed: 1_level_1
0,31.98752
1,34.68057
2,31.58559
3,55.02455
4,35.1059


Mode
=== Target Value Counts ===
Model Run Time: 0.50
Model=ridge
fold: 1, Score: 11.463442883896986, Run Time: 0.06
fold: 2, Score: 11.479100804595532, Run Time: 0.06
fold: 3, Score: 11.600767405949217, Run Time: 0.06
fold: 4, Score: 11.29006557587803, Run Time: 0.06
fold: 5, Score: 11.354798113056587, Run Time: 0.06
Scores -> Adjusted: 11.33026985 , mean: 11.43763496, std: 0.10736510


Unnamed: 0_level_0,pred_ridge
id,Unnamed: 1_level_1
0,33.20416
1,35.6737
2,31.70702
3,58.51655
4,36.64969


Mode
=== Target Value Counts ===
Model Run Time: 0.53
Model=ridge_50
fold: 1, Score: 11.463355083703659, Run Time: 0.04
fold: 2, Score: 11.479104311599201, Run Time: 0.06
fold: 3, Score: 11.600747681554758, Run Time: 0.06
fold: 4, Score: 11.289980742512466, Run Time: 0.06
fold: 5, Score: 11.354759002595294, Run Time: 0.06
Scores -> Adjusted: 11.33020485 , mean: 11.43758936, std: 0.10738452


Unnamed: 0_level_0,pred_ridge_50
id,Unnamed: 1_level_1
0,33.20397
1,35.67351
2,31.70739
3,58.51938
4,36.65004


Mode
=== Target Value Counts ===
Model Run Time: 0.50


Unnamed: 0,Model,Score,StdDev,RunTime
0,xgb_best_params,9.54608,0.10582,92.13266
1,lgbm_best_params,9.50967,0.15448,26.21809
2,cat_best_params,9.31547,0.10403,10.59165
3,xgb3,10.16631,0.18197,30.99562
4,xgb1,9.89279,0.19397,4.42751


In [41]:
sample_submission.head(20)

Unnamed: 0,id,Strength,target_xgb_best_params,target_lgbm_best_params,target_cat_best_params,target_xgb3,target_xgb1,target_xgb2,target_lgbm0,target_lgbm1,target_lgbm2,target_lgbm3,target_cat1,target_cat2,target_lasso,target_ridge,target_ridge_50
0,5407,35.452,41.27099,48.20132,47.35529,42.91861,43.26007,45.34205,46.94644,49.61002,43.91831,49.61002,47.85299,46.12442,34.55594,35.23381,35.23382
1,5408,35.452,21.64617,18.48537,18.42723,20.01837,15.33723,20.29089,18.54209,18.02727,24.7372,18.02727,18.78228,19.26305,29.61738,26.89847,26.89708
2,5409,35.452,31.3607,32.99111,34.20375,31.82274,31.07011,31.70139,32.7445,32.72242,31.99861,32.72242,33.59458,33.4355,30.29926,26.20377,26.20286
3,5410,35.452,46.52755,44.41577,46.97602,45.75269,45.13286,45.45034,45.77196,44.727,42.0995,44.727,46.51212,46.12856,39.21824,38.68754,38.68819
4,5411,35.452,26.79205,24.99973,29.02477,23.04825,26.446,23.4889,29.08497,25.36981,31.0911,25.36981,28.4028,31.25659,32.83587,31.91047,31.90966
5,5412,35.452,43.55702,42.55169,39.59243,47.72512,43.3175,45.478,44.11692,41.72337,40.55855,41.72337,39.88214,39.45014,34.43591,34.81558,34.81534
6,5413,35.452,26.72514,31.62282,30.04489,27.46004,22.61499,26.61395,27.93785,27.65293,34.81447,27.65293,32.63767,33.61909,30.92351,26.85027,26.84672
7,5414,35.452,20.71156,19.61079,20.2634,20.55914,18.60314,21.80172,20.97477,21.29445,25.77507,21.29445,21.57829,21.6947,29.86829,31.18245,31.18225
8,5415,35.452,45.15852,44.94705,41.187,40.63682,48.45671,44.79938,45.31784,50.09437,41.89705,50.09437,43.45535,42.71252,35.12752,39.92727,39.92773
9,5416,35.452,35.05716,33.03719,37.30857,27.41819,29.428,29.98704,36.73164,35.8751,35.73388,35.8751,37.70526,37.2384,33.81522,32.64063,32.63977


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Blend Models</h1>
</div>

In [42]:
all_blend_scores = pd.DataFrame(
    {
        "Model": pd.Series(dtype="str"),
        "Score": pd.Series(dtype="float"),
        "StdDev": pd.Series(dtype="float"),
    }
)

In [43]:
model_lst

['lasso', 'ridge', 'ridge_50']

In [44]:
model_lst = ["xgb1", "xgb2", "cat1", "lgbm0", "lgbm1"]

In [45]:
len(model_lst)

5

In [46]:
target_names = [f"target_{model}" for model in model_lst]
target_names

['target_xgb1', 'target_xgb2', 'target_cat1', 'target_lgbm0', 'target_lgbm1']

In [47]:
sample_submission[TARGET] = sample_submission[target_names].sum(axis=1) / len(model_lst)

In [48]:
sample_submission[[ID, TARGET]].to_csv("submission_models_wt_avg.csv", index=False)
sample_submission[[ID, TARGET]].tail(8)

Unnamed: 0,id,Strength
3597,9004,17.58183
3598,9005,39.03937
3599,9006,16.48738
3600,9007,27.41077
3601,9008,32.15866
3602,9009,41.12958
3603,9010,28.79385
3604,9011,21.18603


In [49]:
sample_submission[TARGET] = (
#     (sample_submission["target_xgb_bp"] * 2 )
#     + (sample_submission["target_lgbm_bp"]  )
    (sample_submission["target_xgb1"] * 3 )
    + (sample_submission["target_lgbm1"])
#     + (sample_submission["target_lgbm2"])    
#     + (sample_submission["target_lgbm2"])
    + (sample_submission["target_cat1"] )
    + (sample_submission["target_cat2"] )    
#     + (sample_submission["target_cat_bp"] )
#     + (sample_submission["target_svc"] )
#     + (sample_submission["target_log_reg3"] )
#     + (sample_submission["target_cat2"] )
)/6

# sample_submission[TARGET] = sample_submission[TARGET].astype(int)

In [50]:
sample_submission[[ID, TARGET]].to_csv("submission_wt_avg.csv", index=False)
sample_submission[[ID, TARGET]].tail(8)

Unnamed: 0,id,Strength
3597,9004,17.9493
3598,9005,38.85651
3599,9006,16.64019
3600,9007,26.14645
3601,9008,31.3641
3602,9009,42.12145
3603,9010,29.00549
3604,9011,21.29932


In [51]:
all_cv_scores.sort_values(by=["Score"], ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime
12,lasso,11.70029,0.08948,0.50498
13,ridge,11.43763,0.10737,0.53013
14,ridge_50,11.43759,0.10738,0.50499
3,xgb3,10.16631,0.18197,30.99562
5,xgb2,10.1269,0.1271,34.64897
8,lgbm2,9.95459,0.03648,1.50999
4,xgb1,9.89279,0.19397,4.42751
0,xgb_best_params,9.54608,0.10582,92.13266
7,lgbm1,9.52918,0.15652,2.98532
9,lgbm3,9.52918,0.15652,2.89308


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Level 1 Stack Models</h1>
</div>

In [52]:
## TODO: Generate these dictionaries from model names

train_oof_dict = {
    "train_pred_cat1": "train_pred_cat1.csv",
    "train_pred_cat2": "train_pred_cat2.csv",
    "train_pred_lgbm1": "train_pred_lgbm1.csv",    
    "train_pred_lgbm2": "train_pred_lgbm2.csv",    
    "train_pred_xgb1": "train_pred_xgb1.csv"
}

test_pred_dict = {
    "submission_cat1": "submission_cat1.csv",
    "submission_cat2": "submission_cat2.csv",
    "submission_lgbm1": "submission_lgbm1.csv",
    "submission_lgbm2": "submission_lgbm2.csv",
    "submission_xgb1": "submission_xgb1.csv",
}

In [53]:
def blend_results(train_oof_dict, test_pred_dict):
    oof_df = pd.DataFrame()
    test_preds_df = pd.DataFrame()

    for name, train_oof_fname in train_oof_dict.items():
        fname = "../working/" + train_oof_fname
        print(f"Processing {name}, {train_oof_fname}")
        df = pd.read_csv(fname)
        print(df.head())
#         print(df.iloc[:,1])
        preds = pd.Series(df.iloc[:,1], name=name)
#         print(preds[:5])
        oof_df = pd.concat([oof_df, preds], axis=1)
    #     oof_df = pd.concat([oof_df, pd.Series(np.load(TRAIN_PATH / train_oof), name=name)], axis=1)

    for name, test_pred_fname in test_pred_dict.items():
        fname = "../working/" + test_pred_fname
        print(f"{name}, {test_pred_fname}")
        df = pd.read_csv(fname)
        print(df.head())
        preds = pd.Series(df.iloc[:,1], name=name)
        test_preds_df = pd.concat([test_preds_df, preds], axis=1)

    print("=== oof ===")
    print(oof_df.head())
    print("=== test_preds ===")
    print(test_preds_df.head())
    return oof_df, test_preds_df
    
# (oof_df, preds_df) = blend_results(train_oof_dict, test_pred_dict)    

In [54]:
def load_oof_results(train_oof_dict, test_pred_dict):
    oof_df = pd.DataFrame()
    test_preds_df = pd.DataFrame()

    for name, train_oof_fname in train_oof_dict.items():
        fname = "../working/" + train_oof_fname
        print(f"Processing {name}, {train_oof_fname}")
        df = pd.read_csv(fname)
        print(df.head())
#         print(df.iloc[:,1])
        preds = pd.Series(df.iloc[:,1], name=name)
#         print(preds[:5])
        oof_df = pd.concat([oof_df, preds], axis=1)
    #     oof_df = pd.concat([oof_df, pd.Series(np.load(TRAIN_PATH / train_oof), name=name)], axis=1)

    for name, test_pred_fname in test_pred_dict.items():
        fname = "../working/" + test_pred_fname
        print(f"{name}, {test_pred_fname}")
        df = pd.read_csv(fname)
        print(df.head())
        preds = pd.Series(df.iloc[:,1], name=name)
        test_preds_df = pd.concat([test_preds_df, preds], axis=1)

    print("=== oof ===")
    print(oof_df.head())
    print("=== test_preds ===")
    print(test_preds_df.head())
    return oof_df, test_preds_df
    
(oof_df, preds_df) = load_oof_results(train_oof_dict, test_pred_dict) 

Processing train_pred_cat1, train_pred_cat1.csv
   id  pred_cat1
0   0   21.46285
1   1   36.42380
2   2   37.67066
3   3   43.58667
4   4   44.68501
Processing train_pred_cat2, train_pred_cat2.csv
   id  pred_cat2
0   0   23.64196
1   1   33.15506
2   2   38.78602
3   3   41.95718
4   4   44.01749
Processing train_pred_lgbm1, train_pred_lgbm1.csv
   id  pred_lgbm1
0   0    23.60652
1   1    35.51620
2   2    40.99849
3   3    40.25634
4   4    47.27227
Processing train_pred_lgbm2, train_pred_lgbm2.csv
   id  pred_lgbm2
0   0    27.90790
1   1    37.75881
2   2    34.09795
3   3    42.38430
4   4    40.28230
Processing train_pred_xgb1, train_pred_xgb1.csv
   id  pred_xgb1
0   0   27.31270
1   1   33.55209
2   2   38.14085
3   3   40.91367
4   4   47.12215
submission_cat1, submission_cat1.csv
     id  Strength
0  5407  47.85299
1  5408  18.78228
2  5409  33.59458
3  5410  46.51212
4  5411  28.40280
submission_cat2, submission_cat2.csv
     id  Strength
0  5407  46.12442
1  5408  19.2630

In [55]:
oof_df.head()

Unnamed: 0,train_pred_cat1,train_pred_cat2,train_pred_lgbm1,train_pred_lgbm2,train_pred_xgb1
0,21.46285,23.64196,23.60652,27.9079,27.3127
1,36.4238,33.15506,35.5162,37.75881,33.55209
2,37.67066,38.78602,40.99849,34.09795,38.14085
3,43.58667,41.95718,40.25634,42.3843,40.91367
4,44.68501,44.01749,47.27227,40.2823,47.12215


In [56]:
preds_df.head()

Unnamed: 0,submission_cat1,submission_cat2,submission_lgbm1,submission_lgbm2,submission_xgb1
0,47.85299,46.12442,49.61002,43.91831,43.26007
1,18.78228,19.26305,18.02727,24.7372,15.33723
2,33.59458,33.4355,32.72242,31.99861,31.07011
3,46.51212,46.12856,44.727,42.0995,45.13286
4,28.4028,31.25659,25.36981,31.0911,26.446


In [57]:
type(preds_df)

pandas.core.frame.DataFrame

In [58]:
def run_lr(useful_features:List[str], TARGET:str, train_df:pd.DataFrame, test_df:pd.DataFrame) -> (List[float],List[float]):
    final_predictions = []
    scores = []

    kfold = model_selection.KFold(n_splits=Config.N_FOLDS, shuffle=True, random_state=Config.seed)

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(train_df)):
        xtrain = train_df.iloc[train_idx].reset_index(drop=True)
        xvalid = train_df.iloc[valid_idx].reset_index(drop=True)

        xtest = test_df[useful_features].copy()

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]

#         model = LogisticRegression()
        model = linear_model.LinearRegression()
        # Smaller C means more regularization; default=1.0
        # 2947.0517025518097
#         model = LogisticRegression(max_iter=500, C=2947.0517025518097, penalty='l2',solver='newton-cg')
#         model = LogisticRegression(C = 2947.0517025518097,
#                         max_iter = 500,
#                         penalty = 'l2',
#                         solver = 'liblinear')
        model.fit(xtrain, ytrain)

        preds_valid = model.predict_proba(xvalid)[:,-1]
        test_preds = model.predict_proba(xtest)[:,-1]

        final_predictions.append(test_preds)
#         score = metrics.roc_auc_score(yvalid, preds_valid)
        score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        print(f"Fold={fold}, Score={score}")
        scores.append(score)
    return scores, final_predictions


In [59]:
# useful_features = ["pred_lda", "pred_gbc","pred_gbc2", "pred_cat_bp", "pred_cat1", "pred_lgbm1", "pred_lgbm2", "pred_lgbm_bp", "pred_xgb1", "pred_xgb_bp"]
useful_features = [ "train_pred_cat1", "train_pred_cat2", "train_pred_lgbm1", "train_pred_lgbm2", "train_pred_xgb1"]

In [60]:
oof_df[useful_features].head()

Unnamed: 0,train_pred_cat1,train_pred_cat2,train_pred_lgbm1,train_pred_lgbm2,train_pred_xgb1
0,21.46285,23.64196,23.60652,27.9079,27.3127
1,36.4238,33.15506,35.5162,37.75881,33.55209
2,37.67066,38.78602,40.99849,34.09795,38.14085
3,43.58667,41.95718,40.25634,42.3843,40.91367
4,44.68501,44.01749,47.27227,40.2823,47.12215


In [61]:
# preds_df[useful_features].head()

In [62]:
# fold_scores, final_predictions = run_lr(useful_features, TARGET, oof_df, preds_df)
# test_preds = np.mean(np.column_stack(final_predictions), axis=1)
# cv_score, std_dev = show_fold_scores(fold_scores)
# create_submission("level1_lr", TARGET, test_preds)

In [63]:
pd.options.display.max_colwidth = 100
pd.set_option("display.max_rows", 999)
pd.set_option("display.precision", 5)
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.max_colwidth

100

In [64]:
all_cv_scores.sort_values(by=["Score"], ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime
12,lasso,11.7,0.09,0.5
13,ridge,11.44,0.11,0.53
14,ridge_50,11.44,0.11,0.5
3,xgb3,10.17,0.18,31.0
5,xgb2,10.13,0.13,34.65
8,lgbm2,9.95,0.04,1.51
4,xgb1,9.89,0.19,4.43
0,xgb_best_params,9.55,0.11,92.13
7,lgbm1,9.53,0.16,2.99
9,lgbm3,9.53,0.16,2.89
