<a href="https://www.kaggle.com/code/mmellinger66/s3e9-concrete-strength-models?scriptVersionId=121800268" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

 <div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Playground Season 3: Episode 9 - Concrete Strength Models</h1>
</div>

## Problem Type

Regression

## Evaluation Metric

$$RMSE = \sqrt{\frac{1}{N} \sum_{i=1}^N (y_i - \hat{y_i})^2}$$

```python
score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
```

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Import Libraries</h1>
</div>

In [1]:
from typing import List, Set, Dict, Tuple, Optional

import os
import time
from pathlib import Path
import glob
import gc

import pandas as pd
import numpy as np

from sklearn import impute
from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import cluster
from sklearn import model_selection
from sklearn import ensemble
from sklearn import datasets

import xgboost as xgb
import catboost as cb
import lightgbm as lgb

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Visualization Libraries
import matplotlib as mpl
import matplotlib.pylab as plt
import seaborn as sns
import missingno as msno
from folium import Map
from folium.plugins import HeatMap
from IPython.display import display_html, display_markdown, display_latex
from colorama import Fore, Style

import warnings
warnings.filterwarnings('ignore')

pd.set_option("display.max_rows", 999)
pd.set_option("display.precision", 5)

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Configuration</h1>
</div>

In [2]:
TARGET="Strength"
ID="id"

# Optuna
objective_direction = "minimize"  # minimize, maximize

In [3]:
class Config:
    path:str = "../input/playground-series-s3e9/"
    load_original_data:bool = True # Some Competitions use synthetic data, based on real data
    original_data_path:str = "../input/predict-concrete-strength/ConcreteStrengthData.csv"
    gpu:bool = False
    optimize:bool = False
    n_optuna_trials:int = 50 # 5, 10, 30
    fast_render:bool = False
    calc_probability:bool = False
    debug:bool = False
    seed:int = 42
    N_ESTIMATORS:int = 2000  # 100, 300, 1000, 2000, 5000, 15_000, 20_000 GBDT
    GPU_N_ESTIMATORS:int = 2000 # Want models to run fast during dev
    N_FOLDS:int = 5
        

In [4]:
class clr:
    S = Style.BRIGHT + Fore.LIGHTRED_EX
    E = Style.RESET_ALL

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Library</h1>
</div>

In [5]:
def read_data(path: str, analyze:bool=True) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    data_dir = Path(path)

    train = pd.read_csv(data_dir / "train.csv")
    test = pd.read_csv(data_dir / "test.csv")
    submission_df = pd.read_csv(data_dir / "sample_submission.csv")

    if analyze:
        print(clr.S + "=== Shape of Data ==="+clr.E)
        print(f" train data: Rows={train.shape[0]}, Columns={train.shape[1]}")
        print(f" test data : Rows={test.shape[0]}, Columns={test.shape[1]}")

        print(clr.S + "\n=== Train Data: First 5 Rows ===\n"+clr.E)
        display(train.head())
        print(f"\n{clr.S}=== Train Column Names ==={clr.E}\n")
        display(train.columns)
        print(f"\n{clr.S}=== Features/Explanatory Variables ==={clr.E}\n")
        eval_features(train)
        print(f"\n{clr.S}=== Skewness ==={clr.E}\n")
        check_skew(train)
    return train, test, submission_df

def create_submission(model_name: str, target, preds, seed:int=42, nfolds:int=5) -> pd.DataFrame:
    sample_submission[target] = preds #.astype(int)

    if len(model_name) > 0:
        fname = f"submission_{model_name}_k{nfolds}_s{seed}.csv"
    else:
        fname = "submission.csv"

    sample_submission.to_csv(fname, index=False)

    return sample_submission

def show_classification_scores(ground_truth:List[int], yhat:List[int]) -> None:
    accuracy = metrics.accuracy_score(ground_truth, yhat)
    precision = metrics.precision_score(ground_truth, yhat)
    recall = metrics.recall_score(ground_truth, yhat)
    roc = metrics.roc_auc_score(ground_truth, yhat)
    f1 = metrics.f1_score(ground_truth, yhat)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"ROC: {roc:.4f}")
    print(f"f1: {f1:.4f}")
    

def label_encoder(train:pd.DataFrame, test:pd.DataFrame, columns:List[str]) -> (pd.DataFrame, pd.DataFrame) :
    for col in columns:
        train[col] = train[col].astype(str)
        test[col] = test[col].astype(str)
        train[col] = preprocessing.LabelEncoder().fit_transform(train[col])
        test[col] = preprocessing.LabelEncoder().fit_transform(test[col])
    return train, test   

def create_strat_folds(df:pd.DataFrame, TARGET, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    print(f"TARGET={TARGET}, n_folds={n_folds}, seed={seed}")
    df["fold"] = -1

    kf = model_selection.StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    # kf = GroupKFold(n_splits=Config.N_FOLDS)
    for fold, (train_idx, valid_idx) in enumerate(kf.split(df, df[TARGET])):
        df.loc[valid_idx, "fold"] = fold

    # df.to_csv(f"train_fold{num_folds}.csv", index=False)
    return df


def create_folds(df:pd.DataFrame, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    print(f"n_folds={n_folds}, seed={seed}")
    df["fold"] = -1

    kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=seed)

    for fold, (train_idx, valid_idx) in enumerate(kf.split(df)):
        df.loc[valid_idx, "fold"] = fold

    # df.to_csv(f"train_fold{num_folds}.csv", index=False)
    return df

def show_fold_scores(scores: List[float]) -> (float, float):
    cv_score = np.mean(scores)  # Used in filename
    std_dev = np.std(scores)
    print(
        f"Scores -> Adjusted: {np.mean(scores) - np.std(scores):.8f} , mean: {np.mean(scores):.8f}, std: {np.std(scores):.8f}"
    )
    return cv_score, std_dev


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    continuous_features = list(df.select_dtypes(include=['int64', 'float64', 'uint8']).columns)
    categorical_features = list(df.select_dtypes(include=['object', 'bool']).columns)
    if display:
        print(f"{clr.S}Continuous Features={continuous_features}{clr.E}\n")
        print(f"{clr.S}Categorical Features={categorical_features}{clr.E}")
    return continuous_features, categorical_features   

def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    print("=== Cardinality ===")
    print(df[features].nunique())

## === Model Support ===    

from scipy.stats import mode


def merge_test_predictions(final_test_predictions:List[float], calc_probability:bool=True) -> List[float]:

    if calc_probability:
        print("Mean")
        result = np.mean(np.column_stack(final_test_predictions), axis=1)
    else:
        print("Mode")
        mode_result = mode(np.column_stack(final_test_predictions), axis=1)
        result = mode_result[0].ravel()

    return result

def summary_statistics(X:pd.DataFrame, enhanced=True) -> None:
    desc = X.describe()
    if enhanced:
        desc.loc["var"] = X.var(numeric_only=True).tolist()
        desc.loc["skew"] = X.skew(numeric_only=True).tolist()
        desc.loc["kurt"] = X.kurtosis(numeric_only=True).tolist()

    with pd.option_context("display.precision", 2):
        style = desc.transpose().style.background_gradient(
            cmap="coolwarm"
        )  # .set_precision(4)
    display(style)
    
def show_missing_features(df:pd.DataFrame) -> None:
    missing_vals = df.isna().sum().sort_values(ascending=False)
    print(missing_vals[missing_vals > 0])


def show_duplicate_records(df:pd.DataFrame) -> None:
    dups = df.duplicated()
    print(dups.sum())


def eval_features(df:pd.DataFrame) -> (List[str], List[str], List[str]):
    ## Separate Categorical and Numerical Features
    categorical_features = list(
        df.select_dtypes(include=["category", "object"]).columns
    )
    continuous_features = list(df.select_dtypes(include=["number"]).columns)

    print(f"{clr.S}Continuous features:{clr.E} {continuous_features}")
    print(f"{clr.S}Categorical features:{clr.E} {categorical_features}")
    print("\n --- Cardinality of Categorical Features ---\n")

    for feature in categorical_features:
        cardinality = df[feature].nunique()
        if cardinality < 10:
            print(f"{clr.S}{feature}{clr.E}: cardinality={cardinality}, {df[feature].unique()}")
        else:
            print(f"{clr.S}{feature}{clr.E}: cardinality={cardinality}")
    all_features = categorical_features + continuous_features
    return all_features, categorical_features, continuous_features


def show_feature_importance(feature_importance_lst:List[str]) -> None:
    fis_df = pd.concat(feature_importance_lst, axis=1)

    fis_df.sort_values("0_importance", ascending=True).head(40).plot(
        kind="barh", figsize=(12, 12), title="Feature Importance Across Folds"
    )
    plt.show()


def show_feature_target_crosstab(df:pd.DataFrame, feature_lst:List[str], target:str) -> None:
    for feature in feature_lst:
        print(f"\n=== {feature} vs {target} ===\n")
        display(
            pd.crosstab(df[feature], df[target], margins=True)
        )  # display keeps bold formatting


def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    print(f"{clr.S}=== Cardinality ==={clr.E}")
    print(df[features].nunique())


def show_unique_features(df:pd.DataFrame, features:List[str]) -> None:
    for col in features:
        print(col, sorted(df[col].dropna().unique()))


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    continuous_features = list(
        df.select_dtypes(include=["int64", "float64", "uint8"]).columns
    )
    categorical_features = list(df.select_dtypes(include=["object", "bool"]).columns)
    if display:
        print(f"{clr.S}Continuous Features={clr.E}{continuous_features}\n")
        print(f"{clr.S}Categorical Features={clr.E}{categorical_features}")
    return continuous_features, categorical_features


def describe(X:pd.DataFrame) -> None:
    """Deprecated: Use summary_statistics()"""
    desc = X.describe()
    desc.loc['var'] = X.var(numeric_only=True).tolist()
    desc.loc['skew'] = X.skew(numeric_only=True).tolist()
    desc.loc['kurt'] = X.kurtosis(numeric_only=True).tolist()

    with pd.option_context('display.precision', 2):
        style = desc.transpose().style.background_gradient(cmap='coolwarm') #.set_precision(4)
    display(style)
  

def check_skew(df:pd.DataFrame) -> None:
    skew = df.skew(skipna=True,numeric_only=True).sort_values(ascending=False)
    print(skew)
    
def gpu_ify_lgbm(lgbm_dict):
    if Config.gpu:
        lgbm_dict["device"] = "gpu"
        lgbm_dict["boosting_type"] = "gbdt"
        lgbm_dict["gpu_platform_id"] = 0
        lgbm_dict["gpu_device_id"] = 0
    return lgbm_dict

def gpu_ify_cb(params):
    if Config.gpu:
        params["task_type"] = "GPU"
    return params    


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Optuna Hyperparameter Optimization Library</h1>
</div>

In [6]:
def objective_xgb(trial, X_train, X_valid, y_train, y_valid):

    xgb_params = {
        #         "objective": trial.suggest_categorical("objective", ["multi:softmax"]),
        #         "eval_metric": "mlogloss",
        #         "objective": "multi:softmax",
#         "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),

        "eval_metric": "rmse",  # auc, rmse, mae
        "objective": "reg:squarederror", # Normal Distribution
#         "objective": "reg:gamma", # Gamma Distribution

        #         "enable_categorical": trial.suggest_categorical("use_label_encoder", [True]),
        "use_label_encoder": trial.suggest_categorical("use_label_encoder", [False]),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 5000, 100),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "max_depth": trial.suggest_int("max_depth", 1, 20),  # 10
        "gamma": trial.suggest_float("gamma", 0, 100, step=0.1),
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "tree_method": trial.suggest_categorical(
            "tree_method", ["hist"]
        ),  # hist, gpu_hist
#         "predictor": "gpu_predictor",
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 100),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }

    # Model loading and training
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=5000,
        verbose=0,
    )

    print(f"Number of boosting rounds: {model.best_iteration}")
    #     oof = model.predict_proba(X_valid)[:, 1] # Probability
    oof = model.predict(X_valid)  # Classification: 0,1

    return metrics.mean_squared_error(y_valid, oof, squared=False)


def objective_lgbm(trial, X_train, X_valid, y_train, y_valid):

    lgbm_params = {
        "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [1_000]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [5000]),
        "n_estimators": trial.suggest_int("n_estimators", 700, 5000),
        "importance_type": "gain",
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 300),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        #         'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }

    # Model loading and training
    model = lgb.LGBMRegressor(**lgbm_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric="mae",
        callbacks=[
            lgb.log_evaluation(500),
            lgb.early_stopping(500, False, True),
        ],
    )

    #     print(f"Number of boosting rounds: {model.best_iteration}")
    oof = model.predict(X_valid)

    return metrics.mean_squared_error(y_valid, oof, squared=False)
#     return metrics.mean_absolute_error(y_valid, oof)


def objective_clf_lgbm(trial, X_train, X_valid, y_train, y_valid):

    params = {
        "boosting_type": "gbdt",
        # "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),
        #         "objective": trial.suggest_categorical("objective", ["multi:softprob"]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [1_000]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [5000]),
        "n_estimators": trial.suggest_int("n_estimators", 700, 1000),
        "importance_type": "gain",
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 300),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        #         'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }
    if Config.gpu:
        params["device_type"] = "gpu"

    # Model loading and training
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="mae",
        callbacks=[
            lgb.log_evaluation(500),
            lgb.early_stopping(500, False, True),
        ],
    )

    #     print(f"Number of boosting rounds: {model.best_iteration}")
    oof = model.predict(X_valid)

    #     return accuracy_score(y_valid, oof)
    return metrics.roc_auc_score(y_valid, oof)


def objective_cb(trial, X_train, X_valid, y_train, y_valid):

    cb_params = {
        "iterations": 100,
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.1, 1.0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 100),
        "bagging_temperature": trial.suggest_loguniform(
            "bagging_temperature", 0.1, 20.0
        ),
        "random_strength": trial.suggest_float("random_strength", 1.0, 2.0),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
          "use_best_model": True,
#         "task_type": "GPU",
        "random_seed": 42,
    }

    # Model loading and training
    #  model = CatBoostClassifier(**cb_params)
    model = cb.CatBoostRegressor(**cb_params)

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="accuracy",
        early_stopping_rounds=500,
        verbose=False,
    )

#     print(f"Number of boosting rounds: {model.best_iteration}")
    # oof = model.predict_proba(X_valid)[:, 1]
    oof = model.predict(X_valid)  # Classification
    return metrics.mean_squared_error(y_valid, oof, squared=False)
#     return metrics.mean_absolute_error(y_valid, oof)
# 
#     return accuracy_score(y_valid, oof)

def objective_clf_cb(trial, X_train, X_valid, y_train, y_valid):

    cb_params = {
        "iterations": 10,  # 1000
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.1, 1.0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 100),
        "bagging_temperature": trial.suggest_loguniform(
            "bagging_temperature", 0.1, 20.0
        ),
        "random_strength": trial.suggest_float("random_strength", 1.0, 2.0),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
        "use_best_model": True,
#             "task_type": "GPU",
        "random_seed": 42,
    }

    # Model loading and training
    model = cb.CatBoostClassifier(**cb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="accuracy",
        early_stopping_rounds=500,
        verbose=False,
    )

    # print(f"Number of boosting rounds: {model.best_iteration}")
    # oof = model.predict_proba(X_valid)[:, 1]
    oof = model.predict(X_valid)  # Classification

    return metrics.accuracy_score(y_valid, oof)

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Load Train/Test Data and Analyze</h1>
</div>

## Load the following files

 - train.csv - Data used to build our machine learning model
 - test.csv - Data used to build our machine learning model. Does not contain the target variable
 - sample_submission.csv - A file in the proper format to submit test predictions

In [7]:
%%time
train, test, sample_submission = read_data(Config.path, analyze=True)                                

[1m[91m=== Shape of Data ===[0m
 train data: Rows=5407, Columns=10
 test data : Rows=3605, Columns=9
[1m[91m
=== Train Data: First 5 Rows ===
[0m


Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength
0,0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38
1,1,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52
2,2,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96
3,3,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05
4,4,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19



[1m[91m=== Train Column Names ===[0m



Index(['id', 'CementComponent', 'BlastFurnaceSlag', 'FlyAshComponent',
       'WaterComponent', 'SuperplasticizerComponent',
       'CoarseAggregateComponent', 'FineAggregateComponent', 'AgeInDays',
       'Strength'],
      dtype='object')


[1m[91m=== Features/Explanatory Variables ===[0m

[1m[91mContinuous features:[0m ['id', 'CementComponent', 'BlastFurnaceSlag', 'FlyAshComponent', 'WaterComponent', 'SuperplasticizerComponent', 'CoarseAggregateComponent', 'FineAggregateComponent', 'AgeInDays', 'Strength']
[1m[91mCategorical features:[0m []

 --- Cardinality of Categorical Features ---


[1m[91m=== Skewness ===[0m

AgeInDays                    2.74687
SuperplasticizerComponent    1.41169
FlyAshComponent              1.30469
BlastFurnaceSlag             1.12120
Strength                     0.38073
CementComponent              0.34128
id                           0.00000
CoarseAggregateComponent    -0.08145
WaterComponent              -0.21528
FineAggregateComponent      -0.44738
dtype: float64
CPU times: user 54.9 ms, sys: 8 ms, total: 62.9 ms
Wall time: 105 ms


In [8]:
train.head()

Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength
0,0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38
1,1,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52
2,2,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96
3,3,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05
4,4,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19


In [9]:
def load_original_data(path:str) -> pd.DataFrame:
#     original = pd.read_csv(path, index_col=[0])
    original = pd.read_csv(path)

    original = original.reset_index()
    original['id'] = original['index'] + 100000
    original = original.drop(columns = ['index'])
    original = original.rename(columns = {'CementComponent ':'CementComponent'})
    original.set_index('id', inplace=True)
#     original = original[-original.depth.isna()]
    print(f"Shape={original.shape}")
    return original
#     original.head()

if Config.load_original_data:    
    original = load_original_data(Config.original_data_path)
    display(original.head())

Shape=(1030, 9)


Unnamed: 0_level_0,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
100000,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
100001,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
100002,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
100003,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
100004,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [10]:
if Config.load_original_data:
    train['is_original']    = 0
    test['is_original']     = 0
    original['is_original'] = 1
#     combined = pd.concat([train, original], ignore_index=True) #.drop_duplicates()
    combined = pd.concat([train, original])

    train = combined
#     combined.head()
    print(f"Shape={combined.shape}")


Shape=(6437, 11)


In [11]:
summary_statistics(train.drop(columns=[ID], axis=1), enhanced=True)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,var,skew,kurt
CementComponent,6437.0,296.29,105.57,102.0,212.5,295.8,374.0,540.0,11145.39,0.37,-0.55
BlastFurnaceSlag,6437.0,61.06,84.06,0.0,0.0,0.0,129.9,359.4,7066.49,1.06,-0.11
FlyAshComponent,6437.0,35.44,56.8,0.0,0.0,0.0,94.0,200.1,3226.28,1.16,-0.28
WaterComponent,6437.0,184.51,19.04,121.8,173.0,186.0,192.0,247.0,362.59,-0.17,0.67
SuperplasticizerComponent,6437.0,4.44,5.79,0.0,0.0,0.0,8.7,32.2,33.51,1.3,1.92
CoarseAggregateComponent,6437.0,988.95,77.56,801.0,938.0,975.6,1047.0,1145.0,6014.85,-0.08,-0.56
FineAggregateComponent,6437.0,771.6,78.96,594.0,734.3,781.0,821.0,992.6,6234.31,-0.41,-0.02
AgeInDays,6437.0,50.78,68.99,1.0,7.0,28.0,56.0,365.0,4759.62,2.82,8.76
Strength,6437.0,35.51,16.45,2.33,23.69,33.96,45.85,82.6,270.61,0.39,-0.35
is_original,6437.0,0.16,0.37,0.0,0.0,0.0,0.0,1.0,0.13,1.86,1.44


## Outlier Detection

In [12]:
# https://www.kaggle.com/code/lyasdemir/best-algorithm-for-prediction-xgboost
    
def iqr(data:pd.DataFrame, var:str):# outliers detecion .
    q1 = np.quantile(data[var], 0.25)
    q3 = np.quantile(data[var], 0.75)
    diff = q3 - q1
    lower_t = q1 - (1.5 * diff)
    upper_t = q3 + (1.5 * diff)
    return data[(data[var] < lower_t) | (data[var] > upper_t)]

# iqr(train, "squareMeters")

In [13]:
# # https://www.kaggle.com/code/sujithmandala/playground-s3-e8-ensemble-model-98-accuracy

# def detect_outliers(data:pd.DataFrame) -> pd.DataFrame:
#     outlier_percents = {}
#     for column in data.columns:
#         if data[column].dtype != object:
#             q1 = np.quantile(data[column], 0.25)
#             q3 = np.quantile(data[column], 0.75)
#             iqr = q3 - q1
#             upper_bound = q3 + (1.5 * iqr)
#             lower_bound = q1 - (1.5 * iqr)
#             outliers = data[(data[column] > upper_bound) | (data[column] < lower_bound)][column]
#             outlier_percentage = len(outliers) / len(data[column]) * 100
#             outlier_percents[column] = outlier_percentage
#             outlier_dataframe = pd.DataFrame(data = outlier_percents.values() ,index=outlier_percents.keys() ,columns=['Outlier_percentage'])
    
#     return outlier_dataframe.sort_values(by = 'Outlier_percentage', ascending = False)

# detect_outliers(train)


In [14]:
# https://www.kaggle.com/code/sujithmandala/playground-s3-e8-ensemble-model-98-accuracy
    
def detect_outliers(data:pd.DataFrame) -> pd.DataFrame:
    outlier_percents = {}
    for column in data.columns:
        if data[column].dtype != object:
            q1 = np.quantile(data[column], 0.25)
            q3 = np.quantile(data[column], 0.75)
            iqr = q3 - q1
            upper_bound = q3 + (1.5 * iqr)
            lower_bound = q1 - (1.5 * iqr)
            outliers = data[(data[column] > upper_bound) | (data[column] < lower_bound)][column]
            outlier_percentage = len(outliers) / len(data[column]) * 100
            outlier_percents[column] = outlier_percentage
            outlier_dataframe = pd.DataFrame(data = outlier_percents.values() ,index=outlier_percents.keys() ,columns=['Outlier_percentage'])
    
    return outlier_dataframe.sort_values(by = 'Outlier_percentage', ascending = False)

detect_outliers(test)

Unnamed: 0,Outlier_percentage
FineAggregateComponent,8.54369
WaterComponent,8.2663
AgeInDays,7.93343
SuperplasticizerComponent,1.47018
BlastFurnaceSlag,0.41609
id,0.0
CementComponent,0.0
FlyAshComponent,0.0
CoarseAggregateComponent,0.0
is_original,0.0


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Feature Engineering</h1>
</div>

## Categorical/Numerical Variables

## Handle Outliers
- https://www.kaggle.com/code/lyasdemir/best-algorithm-for-prediction-xgboost
- https://www.kaggle.com/code/mnokno/paris-housing-price-prediction-using-xgboost

In [15]:
#https://www.kaggle.com/code/mmellinger66/s3e9-concrete-strength-models

def add_features(df):
    df['Water_Cement'] = df['WaterComponent'] / df['CementComponent']
    df['Coarse_Fine'] = df['CoarseAggregateComponent'] / df['FineAggregateComponent']
    df['Aggregate'] = df['CoarseAggregateComponent'] + df['FineAggregateComponent']
    df['Aggregate_Cement'] = df['Aggregate'] / df['CementComponent']
    df['Slag_Cement'] = df['BlastFurnaceSlag'] / df['CementComponent']
    df['Ash_Cement'] = df['FlyAshComponent'] / df['CementComponent']
    df['Plastic_Cement'] = df['SuperplasticizerComponent'] / df['CementComponent']
    df['Age_Water'] = df['AgeInDays'] / df['WaterComponent']
    return df

train = add_features(train)
test = add_features(test)

In [16]:
# features_with_outliers = []

In [17]:
# https://www.kaggle.com/code/mnokno/paris-housing-price-prediction-using-xgboost

def remove_outliers(df:pd.DataFrame) -> pd.DataFrame:
    for c in features_with_outliers:
        if c == 'garage':
            first_percentile = df[c].quantile(0.001)
            df = df[df[c] > first_percentile]

        ninety_ninth_percentile = df[c].quantile(0.999)
        df = df[df[c] < ninety_ninth_percentile]
        #df_t = df_t[(df_t[c] > first_percentile) & (df_t[c] < ninety_ninth_percentile)]
    return df


In [18]:
# print(f'Before: {len(train)}')
# train = remove_outliers(train)
# print(f'After: {len(train)}')

In [19]:
train.head(10)

Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength,is_original,Water_Cement,Coarse_Fine,Aggregate,Aggregate_Cement,Slag_Cement,Ash_Cement,Plastic_Cement,Age_Water
0,0.0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38,0,0.35429,1.83524,1738.0,3.31048,0.0,0.0,0.0,0.01613
1,1.0,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52,0,1.33566,1.50389,1610.0,11.25874,1.18182,1.0,0.05594,0.1466
2,2.0,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96,0,0.64256,1.35169,1870.3,6.47163,0.46609,0.0,0.0,0.15078
3,3.0,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05,0,0.75,1.39104,1602.0,5.26974,0.25,0.0,0.0,1.60088
4,4.0,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19,0,1.22293,1.19739,1716.6,10.93376,1.50318,0.0,0.0,0.46875
5,5.0,350.0,0.0,0.0,203.0,0.0,1055.0,775.0,7,37.43,0,0.58,1.36129,1830.0,5.22857,0.0,0.0,0.0,0.03448
6,6.0,135.7,203.5,0.0,185.7,0.0,1076.2,759.3,28,35.1,0,1.36846,1.41736,1835.5,13.52616,1.49963,0.0,0.0,0.15078
7,7.0,332.5,142.5,0.0,228.0,0.0,932.0,594.0,28,45.94,0,0.68571,1.56902,1526.0,4.58947,0.42857,0.0,0.0,0.12281
8,8.0,322.0,0.0,0.0,203.0,0.0,974.0,800.0,180,42.14,0,0.63043,1.2175,1774.0,5.50932,0.0,0.0,0.0,0.8867
9,9.0,133.0,200.0,0.0,192.0,0.0,927.4,839.2,3,6.94,0,1.44361,1.1051,1766.6,13.28271,1.50376,0.0,0.0,0.01562


In [20]:
train = train.reset_index(drop=True).copy()
train.head(10)

Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength,is_original,Water_Cement,Coarse_Fine,Aggregate,Aggregate_Cement,Slag_Cement,Ash_Cement,Plastic_Cement,Age_Water
0,0.0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38,0,0.35429,1.83524,1738.0,3.31048,0.0,0.0,0.0,0.01613
1,1.0,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52,0,1.33566,1.50389,1610.0,11.25874,1.18182,1.0,0.05594,0.1466
2,2.0,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96,0,0.64256,1.35169,1870.3,6.47163,0.46609,0.0,0.0,0.15078
3,3.0,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05,0,0.75,1.39104,1602.0,5.26974,0.25,0.0,0.0,1.60088
4,4.0,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19,0,1.22293,1.19739,1716.6,10.93376,1.50318,0.0,0.0,0.46875
5,5.0,350.0,0.0,0.0,203.0,0.0,1055.0,775.0,7,37.43,0,0.58,1.36129,1830.0,5.22857,0.0,0.0,0.0,0.03448
6,6.0,135.7,203.5,0.0,185.7,0.0,1076.2,759.3,28,35.1,0,1.36846,1.41736,1835.5,13.52616,1.49963,0.0,0.0,0.15078
7,7.0,332.5,142.5,0.0,228.0,0.0,932.0,594.0,28,45.94,0,0.68571,1.56902,1526.0,4.58947,0.42857,0.0,0.0,0.12281
8,8.0,322.0,0.0,0.0,203.0,0.0,974.0,800.0,180,42.14,0,0.63043,1.2175,1774.0,5.50932,0.0,0.0,0.0,0.8867
9,9.0,133.0,200.0,0.0,192.0,0.0,927.4,839.2,3,6.94,0,1.44361,1.1051,1766.6,13.28271,1.50376,0.0,0.0,0.01562


In [21]:
excluded_features = [TARGET, ID, "fold", "is_original"]

In [22]:
cont_features, cat_features = feature_distribution_types(train, display=True)
show_cardinality(train, cat_features)

cont_features = [feature for feature in cont_features if feature not in excluded_features]
cat_features = [feature for feature in cat_features if feature not in excluded_features]

FEATURES = cont_features + cat_features
FEATURES

[1m[91mContinuous Features=[0m['id', 'CementComponent', 'BlastFurnaceSlag', 'FlyAshComponent', 'WaterComponent', 'SuperplasticizerComponent', 'CoarseAggregateComponent', 'FineAggregateComponent', 'AgeInDays', 'Strength', 'is_original', 'Water_Cement', 'Coarse_Fine', 'Aggregate', 'Aggregate_Cement', 'Slag_Cement', 'Ash_Cement', 'Plastic_Cement', 'Age_Water']

[1m[91mCategorical Features=[0m[]
[1m[91m=== Cardinality ===[0m
Series([], dtype: float64)


['CementComponent',
 'BlastFurnaceSlag',
 'FlyAshComponent',
 'WaterComponent',
 'SuperplasticizerComponent',
 'CoarseAggregateComponent',
 'FineAggregateComponent',
 'AgeInDays',
 'Water_Cement',
 'Coarse_Fine',
 'Aggregate',
 'Aggregate_Cement',
 'Slag_Cement',
 'Ash_Cement',
 'Plastic_Cement',
 'Age_Water']

In [23]:
train, test = label_encoder(train, test, cat_features)
# train = pd.get_dummies(train,columns=[]) # Will remove original feature names
# test = pd.get_dummies(test,columns=[])

In [24]:
train.head()

Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength,is_original,Water_Cement,Coarse_Fine,Aggregate,Aggregate_Cement,Slag_Cement,Ash_Cement,Plastic_Cement,Age_Water
0,0.0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38,0,0.35429,1.83524,1738.0,3.31048,0.0,0.0,0.0,0.01613
1,1.0,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52,0,1.33566,1.50389,1610.0,11.25874,1.18182,1.0,0.05594,0.1466
2,2.0,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96,0,0.64256,1.35169,1870.3,6.47163,0.46609,0.0,0.0,0.15078
3,3.0,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05,0,0.75,1.39104,1602.0,5.26974,0.25,0.0,0.0,1.60088
4,4.0,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19,0,1.22293,1.19739,1716.6,10.93376,1.50318,0.0,0.0,0.46875


In [25]:
# cont_features, cat_features = feature_distribution_types(train, display=True)
# show_cardinality(train, cat_features)

# cont_features = [feature for feature in cont_features if feature not in excluded_features]
# cat_features = [feature for feature in cat_features if feature not in excluded_features]

# FEATURES = cont_features + cat_features
# FEATURES

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Optuna Hyperparameter Optimization</h1>
</div>

In [26]:
%%time

if Config.optimize:
    y = train[TARGET]
    X = train[FEATURES].copy()

    X_test = test[FEATURES].copy()
    X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
        X, y, test_size=0.2, random_state=Config.seed
    )

# === XGB ===

time_limit = 3600 * 3
best_xgb_params = {}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction)
    study.optimize(
        lambda trial: objective_xgb(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best XGB trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_xgb_params = study.best_trial.params

## === LGBM ===

time_limit = 3600 * 3
best_lgbm_params = {}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction)
    study.optimize(
        lambda trial: objective_lgbm(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best LGBM trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_lgbm_params = study.best_trial.params

## === CatBoost

time_limit = 3600 * 3
# best_cb_params = {}
best_cb_params = {'learning_rate': 0.45743264601999495,
                  'l2_leaf_reg': 41.338946049390074,
                  'bagging_temperature': 0.3472567739474319,
                  'random_strength': 1.7332249677756242, 
                  'depth': 1,
                  'min_data_in_leaf': 6}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction)
    study.optimize(
        lambda trial: objective_cb(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best Cat trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_cb_params = study.best_trial.params

CPU times: user 16 µs, sys: 0 ns, total: 16 µs
Wall time: 20.3 µs


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Train Models with Cross Validation</h1>
</div>

In [27]:
def show_tree_model_fi(model, features:List[str]) -> None:
    print("\n=== Model Feature Importance ===")
    for i in model.feature_importances_.argsort()[::-1]:
        print(features[i], model.feature_importances_[i]/model.feature_importances_.sum())

def save_oof_predictions(model_name:str, final_valid_predictions, oof:pd.DataFrame) -> pd.DataFrame:
    final_valid_predictions_df = process_valid_predictions(
        final_valid_predictions, ID, model_name
    )
    display(final_valid_predictions_df.head())
    oof[f"pred_{model_name}"] = final_valid_predictions_df[f"pred_{model_name}"]

    return oof

def save_test_predictions(model_name:str, final_test_predictions, submission_df:pd.DataFrame, result_field:str=TARGET) -> None:
    result = merge_test_predictions(final_test_predictions, Config.calc_probability)
    # result[:20]
    submission_df[f"target_{model_name}"] = result #.astype(int)
    #     submission_df.head(10)
    ss = submission_df[[ID, f"target_{model_name}"]].copy().reset_index(drop=True)
    ss.rename(columns={f"target_{model_name}": result_field}, inplace=True)
    ss.to_csv(
        f"submission_{model_name}.csv", index=False
    )  # Can submit the individual model
    print("=== Target Value Counts ===")
#     display(ss[TARGET].value_counts())
    ss.head(10)

def process_valid_predictions(final_valid_predictions, train_id, model_name:str) -> pd.DataFrame:
    model = f"pred_{model_name}"
    final_valid_predictions_df = pd.DataFrame.from_dict(
        final_valid_predictions, orient="index"
    ).reset_index()
    final_valid_predictions_df.columns = [train_id, model]
    final_valid_predictions_df.set_index(train_id, inplace=True)
    final_valid_predictions_df.sort_index(inplace=True)
    final_valid_predictions_df.to_csv(f"train_pred_{model_name}.csv", index=True)

    return final_valid_predictions_df

def add_score(score_df:pd.DataFrame, model_name:str, score:float, std:float):
    dict1 = {"Model": model_name, "Score": cv_score, "StdDev": std_dev}
    score_df = score_df.append(dict1, ignore_index=True)
    return score_df

In [28]:
def train_cv_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid,
    params,
    n_folds:int=5,
    seed:int=42,
):

    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(
            drop=True
        )  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)
        xtest = test.copy()

        valid_ids = xvalid.id.values.tolist()  # Id's of everything in validation fold

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[FEATURES]
        xvalid = xvalid[FEATURES]

        scaler = preprocessing.StandardScaler()
#         scaler = preprocessing.MinMaxScaler()
        xtrain = scaler.fit(xtrain).transform(xtrain)
        xvalid = scaler.transform(xvalid)
        xtest = scaler.transform(xtest)

        model = get_model_fn # ()

        model.fit(
            xtrain,
            ytrain,
        )
        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        preds_valid_class = model.predict(xvalid)
        
        final_test_predictions.append(test_preds)
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

#         fold_score = metrics.accuracy_score(yvalid, preds_valid_class)  # Validation Set Score
        fold_score = metrics.mean_absolute_error(
            yvalid, preds_valid
        ) 
#         fold_score = metrics.roc_auc_score(yvalid.values, preds_valid)  # Validation Set Score
#         show_classification_scores(yvalid.values, preds_valid_class)

#         fold_score = metrics.roc_auc_score(yvalid, preds_valid)  # Validation Set Score
#         fold_score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        fold_scores.append(fold_score)
        #         importance_list.append(model.coef_.ravel())

        fi = []
        # Feature importance
#         fi = pd.DataFrame(
#             index=FEATURES,
#             data=model.coef_.ravel(),
#             columns=[f"{fold}_importance"],
#         )
        
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )


def train_xgb_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid:str,
    params,
    n_folds:int=5,
    seed:int=42,
):

    print(params)
    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(
            drop=True
        )  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)
        xtest = test.copy()

        valid_ids = xvalid.id.values.tolist()  # Id's of everything in validation fold

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[FEATURES]
        xvalid = xvalid[FEATURES]

        model = get_model_fn # (params)

        model.fit(
            xtrain,
            ytrain,
            eval_set=[(xvalid, yvalid)],
            #             eval_metric="acc",  # auc
            verbose=0,
            #             early_stopping_rounds=3000,
            #             callbacks=[
            #                 xgb.log_evaluation(0),
            #                 xgb.early_stopping(500, False, True),
            #             ],
        )

        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        preds_valid_class = model.predict(xvalid)
        
        final_test_predictions.append(test_preds)
        if Config.debug:
            print(f"GT Type: {type(yvalid.values)}")
            print(f"Preds Type: {type(preds_valid_class)}")
            print(f"         GT:{yvalid.values[:20]}")
            print(f"Preds Class:{preds_valid_class[:20]}")
            print(f"Preds Prob:{preds_valid[:20]}")
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid_class)))

#         fold_score = metrics.cohen_kappa_score(yvalid,  preds_valid_class, weights = "quadratic")
#         fold_score = metrics.roc_auc_score(yvalid.values, preds_valid)  # Validation Set Score
#         show_classification_scores(yvalid.values, preds_valid_class)
        fold_score = metrics.mean_absolute_error(
            yvalid, preds_valid
        )  # Validation Set Score
#         fold_score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        fold_scores.append(fold_score)

        # Feature importance
        fi = pd.DataFrame(
            index=FEATURES,
            data=model.feature_importances_,
            columns=[f"{fold}_importance"],
        )
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )        

In [29]:
def run_linear_model(train:pd.DataFrame, test:pd.DataFrame, model_dict, model_name:str, features:List[str], oof:pd.DataFrame) -> (float, float, pd.DataFrame):
    (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_cv_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        False, #Config.calc_probability,
        ID,
        {},
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    oof.head()
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof


def run_tree_model(train:pd.DataFrame, test:pd.DataFrame, model_dict, model_name:str, features:List[str], params, oof:pd.DataFrame) -> (float, float, pd.DataFrame):
    (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_xgb_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        Config.calc_probability,
        ID,
        params,
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)
    show_tree_model_fi(model, features)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    oof.head()
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof

In [30]:
%%time

def run_models4features(train:pd.DataFrame, test:pd.DataFrame, model_dict, model_lst:List[str], target:str, feature_lst:List[str], all_cv_scores:pd.DataFrame, linear_models:bool=True) -> pd.DataFrame:

    oof = train[[ID, target, "fold"]].copy().reset_index(drop=True).copy()
    oof.set_index(ID, inplace=True)

    for idx, m in enumerate(model_lst):
        model = model_lst[idx]
        start_time = time.time()

        print(f"Model={model}")

        params = {}
        if linear_models:
                cv_score, std_dev, oof = run_linear_model(train, test, model_dict, model, feature_lst, oof)

        else:
            cv_score, std_dev, oof = run_tree_model(train, test, model_dict, model, feature_lst, params, oof)

        run_time = time.time() - start_time

        score_dict = {"Model": model, "Score": cv_score, "StdDev": std_dev, "RunTime": run_time, "n_estimators": Config.N_ESTIMATORS, "n_folds": Config.N_FOLDS, "comments": ""}
        all_cv_scores = all_cv_scores.append(score_dict, ignore_index=True)
        print(f"Model Run Time: {run_time:.2f}")

    return all_cv_scores




CPU times: user 20 µs, sys: 0 ns, total: 20 µs
Wall time: 24.3 µs


In [31]:
lgbm_params = {'n_estimators': Config.N_ESTIMATORS,
                 'num_rounds': 404,
                 'learning_rate': 0.19,
                 'num_leaves': 17,
                 'max_depth': 8,
                 'min_data_in_leaf': 36,
                 'lambda_l1': 0.96,
                 'lambda_l2': 0.01,
                 'min_gain_to_split': 11.32,
                 'bagging_fraction': 0.6,
                 'feature_fraction': 0.9}


lgbm_params3 = {
    "n_estimators": Config.N_ESTIMATORS,
    'max_depth': 9,
    'learning_rate': 0.01,
    'min_data_in_leaf': 36, 
    'num_leaves': 100, 
    'feature_fraction': 0.8, 
    'bagging_fraction': 0.89, 
    'bagging_freq': 5, 
    'lambda_l2': 28,
    
    'seed': Config.seed,
    'objective': 'regression',
#     'boosting_type': 'gbdt',
#     'device': 'gpu', 
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0,
    'n_jobs': -1,
    'metric': 'rmse',
    'verbose': -1
}
    
lgbm_params = gpu_ify_lgbm(lgbm_params)

In [32]:
xgb_params = {
    "n_estimators": Config.N_ESTIMATORS,  # 10_000,
    "max_depth": 10,  # 10
    "objective": "reg:squarederror",
    #     "enable_categorical": True,  # Only works with gpu_hist
    #     "eval_metric": "mae",
    #     "metric": "mae",
    #     "enable_categorical": True,
    "n_jobs": 8,  # 4
    "seed": Config.seed,
    "tree_method": "hist",
    #         "gpu_id": 0,
    "subsample": 0.9,  # 0.7
    "colsample_bytree": 0.7,
    "use_label_encoder": False,
    "learning_rate": 0.05,  # 0.01
}

xgb_params3 = {
    'n_estimators': Config.N_ESTIMATORS,
    'learning_rate': 0.05,
    'max_depth': 10,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:squarederror'
}

if Config.gpu:
    xgb_params["tree_method"] = "gpu_hist"
else:
    xgb_params["tree_method"] = "hist"

In [33]:
cb_params = {
    #     "learning_rate": 0.3277295792305584,
    "learning_rate": 0.05,
    "l2_leaf_reg": 3.1572972266001518,
    "bagging_temperature": 0.6799604234141348,
    "random_strength": 1.99590400593318,
    "depth": 10,
    "min_data_in_leaf": 93,
    # "iterations": 100,  # 10000
    "n_estimators": Config.N_ESTIMATORS,  # 10000
    "use_best_model": True,
    #     "task_type": "GPU",
    "random_seed": Config.seed,
}

cb_params = gpu_ify_cb(cb_params)

In [34]:
model_estimator_dict = {
    "xgb2": xgb.XGBRegressor(**xgb_params),
    "xgb_best_params": xgb.XGBRegressor(**best_xgb_params),
    "xgb3": xgb.XGBRegressor(**xgb_params3),


    "lgbm1": lgb.LGBMRegressor(**lgbm_params),

    "cat1": cb.CatBoostRegressor(),
    "cat2": cb.CatBoostRegressor(**cb_params),
    "cat_best_params": cb.CatBoostRegressor(**best_cb_params),

    "xgb1": xgb.XGBRegressor(),
    "lgbm0": lgb.LGBMRegressor(),
    "lgbm3": lgb.LGBMRegressor(lgbm_params3),
    "lgbm2": lgb.LGBMRegressor(
        learning_rate=0.05,
        max_depth=15,
        num_leaves=11,
        feature_fraction=0.3,
        subsample=0.1,
        n_jobs=-1,
    ),
    "lgbm3": lgb.LGBMRegressor(**lgbm_params),
    "lgbm_best_params": lgb.LGBMRegressor(**best_lgbm_params),


    "lin_reg": linear_model.LinearRegression(),
    "lasso": linear_model.Lasso(),
    "ridge": linear_model.Ridge(max_iter=7000),
    "ridge_25": linear_model.Ridge(fit_intercept=True, solver='auto', alpha=0.25, max_iter=7000),
    "ridge_50": linear_model.Ridge(fit_intercept=True, solver='auto', alpha=0.5, max_iter=7000),
}

In [35]:
all_cv_scores = pd.DataFrame(
    {
        "Model": pd.Series(dtype="str"),
        "Score": pd.Series(dtype="float"),
        "StdDev": pd.Series(dtype="float"),
        "RunTime": pd.Series(dtype="float"),
        "n_estimators": pd.Series(dtype="int"),
        "n_folds": pd.Series(dtype="int"),
        "comments": pd.Series(dtype="str"),
    }
)



## Tree Models

In [36]:
%%time

# model_lst = ["xgb3","xgb_best_params", "lgbm_best_params", "cat_best_params", "xgb1", "xgb2", "lgbm1", "lgbm2", "cat1", "cat2"]
# model_lst = = []

def run_tree_models(X_tr, test, n_folds, model_lst, all_cv_scores):
        all_cv_scores = run_models4features(X_tr, test, model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=False)    

        all_cv_scores.sort_values(by=["Score"], ascending=False)
        return all_cv_scores

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 8.34 µs


## Linear Models

In [37]:
def run_linear_models(X_tr, test, n_folds, model_lst, all_cv_scores):
    for training in [train, train]:

    #     all_cv_scores = run_models4features(train, test, model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=False)    
        all_cv_scores = run_models4features(X_tr, test, model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=True)    
        return all_cv_scores  

In [38]:
%%time


tree_model_lst = ["xgb_best_params", "lgbm_best_params", "cat_best_params","xgb3", "xgb1", "xgb2", "lgbm0", "lgbm1", "lgbm2", "lgbm3", "cat1", "cat2"]
linear_model_lst = ["lin_reg", "lasso", "ridge", "ridge_25", "ridge_50"]
linear_model_lst = ["lasso", "ridge",  "ridge_50"]

Config.N_FOLDS = 10

for training in [train, train]:
    X_tr = create_folds(train, Config.N_FOLDS)
    
    oof = train[[ID, TARGET, "fold"]].copy().reset_index(drop=True).copy()
    oof.set_index(ID, inplace=True)
    oof.head()

    all_cv_scores = run_tree_models(X_tr, test, Config.N_FOLDS, tree_model_lst, all_cv_scores)
    all_cv_scores = run_linear_models(X_tr, test, Config.N_FOLDS, linear_model_lst, all_cv_scores)
    Config.N_FOLDS = 5



n_folds=10, seed=42
Model=xgb_best_params
{}
fold: 1, Score: 9.40939760131125, Run Time: 0.97
fold: 2, Score: 9.047081537631728, Run Time: 0.96
fold: 3, Score: 9.276157578296543, Run Time: 2.05
fold: 4, Score: 9.197350950181855, Run Time: 0.95
fold: 5, Score: 8.773080619432912, Run Time: 0.94
fold: 6, Score: 9.770478415607665, Run Time: 0.97
fold: 7, Score: 9.890915386158486, Run Time: 0.97
fold: 8, Score: 8.924111760496908, Run Time: 0.96
fold: 9, Score: 9.402192812514713, Run Time: 1.00
fold: 10, Score: 9.299300286692, Run Time: 0.98
Scores -> Adjusted: 8.96931869 , mean: 9.29900669, std: 0.32968800

=== Model Feature Importance ===
Age_Water 0.46525937
Water_Cement 0.10405575
Aggregate_Cement 0.04517857
FineAggregateComponent 0.04069826
Plastic_Cement 0.040458865
Ash_Cement 0.036044154
Coarse_Fine 0.035633285
SuperplasticizerComponent 0.035194296
Slag_Cement 0.033596754
BlastFurnaceSlag 0.029988864
CoarseAggregateComponent 0.029185941
Aggregate 0.027087746
WaterComponent 0.02587039


Unnamed: 0_level_0,pred_xgb_best_params
id,Unnamed: 1_level_1
0.0,26.64933
1.0,30.11574
2.0,41.29745
3.0,45.0497
4.0,45.37445


Mode
=== Target Value Counts ===
Model Run Time: 10.94
Model=lgbm_best_params
{}
fold: 1, Score: 9.181982433119156, Run Time: 0.85
fold: 2, Score: 8.577795535113337, Run Time: 0.93
fold: 3, Score: 8.940251098620836, Run Time: 1.00
fold: 4, Score: 8.763758013499409, Run Time: 0.83
fold: 5, Score: 8.591279310305096, Run Time: 0.90
fold: 6, Score: 9.458543508816517, Run Time: 0.83
fold: 7, Score: 9.576835494955603, Run Time: 0.83
fold: 8, Score: 8.715341523574672, Run Time: 0.94
fold: 9, Score: 9.247319110185964, Run Time: 0.92
fold: 10, Score: 9.084082721295868, Run Time: 0.91
Scores -> Adjusted: 8.67783297 , mean: 9.01371887, std: 0.33588590

=== Model Feature Importance ===
Age_Water 0.12633333333333333
CoarseAggregateComponent 0.08066666666666666
Water_Cement 0.07733333333333334
Plastic_Cement 0.07666666666666666
Coarse_Fine 0.07666666666666666
Aggregate 0.07633333333333334
WaterComponent 0.07366666666666667
Aggregate_Cement 0.06566666666666666
FineAggregateComponent 0.065666666666666

Unnamed: 0_level_0,pred_lgbm_best_params
id,Unnamed: 1_level_1
0.0,24.26539
1.0,31.9541
2.0,38.53354
3.0,44.37008
4.0,44.69272


Mode
=== Target Value Counts ===
Model Run Time: 9.10
Model=cat_best_params
{}
fold: 1, Score: 9.193253728717401, Run Time: 1.24
fold: 2, Score: 8.385788868484553, Run Time: 1.13
fold: 3, Score: 8.803081641149042, Run Time: 1.10
fold: 4, Score: 8.570620154145839, Run Time: 1.08
fold: 5, Score: 8.550440692758883, Run Time: 1.12
fold: 6, Score: 9.369728449769774, Run Time: 1.10
fold: 7, Score: 9.397783224129194, Run Time: 1.05
fold: 8, Score: 8.707338495300155, Run Time: 1.16
fold: 9, Score: 9.031551112788286, Run Time: 1.13
fold: 10, Score: 8.959161727674793, Run Time: 1.16
Scores -> Adjusted: 8.56269726 , mean: 8.89687481, std: 0.33417755

=== Model Feature Importance ===
AgeInDays 0.4135183329818248
Age_Water 0.3520667245138842
CementComponent 0.0812621831334086
SuperplasticizerComponent 0.04812414083821518
Aggregate_Cement 0.017985376686319453
WaterComponent 0.013786896332844072
Slag_Cement 0.013076343286992443
FineAggregateComponent 0.012840054869378772
CoarseAggregateComponent 0.01

Unnamed: 0_level_0,pred_cat_best_params
id,Unnamed: 1_level_1
0.0,22.25971
1.0,33.11788
2.0,38.44523
3.0,46.53688
4.0,43.46987


Mode
=== Target Value Counts ===
Model Run Time: 11.42
Model=xgb3
{}
fold: 1, Score: 9.934738608576497, Run Time: 41.40
fold: 2, Score: 9.431986164306261, Run Time: 40.70
fold: 3, Score: 9.904680393675099, Run Time: 41.82
fold: 4, Score: 9.754065631665057, Run Time: 42.41
fold: 5, Score: 9.283172465733118, Run Time: 41.06
fold: 6, Score: 9.878436516501148, Run Time: 41.46
fold: 7, Score: 10.174515031465091, Run Time: 41.67
fold: 8, Score: 9.442103311529797, Run Time: 40.73
fold: 9, Score: 9.982139926542576, Run Time: 40.95
fold: 10, Score: 9.777870653561818, Run Time: 41.34
Scores -> Adjusted: 9.48750272 , mean: 9.75637087, std: 0.26886815

=== Model Feature Importance ===
Age_Water 0.15925585
Water_Cement 0.0879827
Ash_Cement 0.07353867
Aggregate_Cement 0.07200009
Plastic_Cement 0.06949442
Aggregate 0.06318603
Coarse_Fine 0.06088707
AgeInDays 0.060317002
Slag_Cement 0.05982811
FineAggregateComponent 0.048746537
CoarseAggregateComponent 0.04825652
SuperplasticizerComponent 0.04712614
W

Unnamed: 0_level_0,pred_xgb3
id,Unnamed: 1_level_1
0.0,22.90501
1.0,29.50804
2.0,39.04432
3.0,45.69283
4.0,45.70449


Mode
=== Target Value Counts ===
Model Run Time: 414.33
Model=xgb1
{}
fold: 1, Score: 9.40939760131125, Run Time: 1.43
fold: 2, Score: 9.047081537631728, Run Time: 1.44
fold: 3, Score: 9.276157578296543, Run Time: 1.46
fold: 4, Score: 9.197350950181855, Run Time: 1.43
fold: 5, Score: 8.773080619432912, Run Time: 1.44
fold: 6, Score: 9.770478415607665, Run Time: 1.47
fold: 7, Score: 9.890915386158486, Run Time: 1.44
fold: 8, Score: 8.924111760496908, Run Time: 1.43
fold: 9, Score: 9.402192812514713, Run Time: 1.44
fold: 10, Score: 9.299300286692, Run Time: 1.44
Scores -> Adjusted: 8.96931869 , mean: 9.29900669, std: 0.32968800

=== Model Feature Importance ===
Age_Water 0.46525937
Water_Cement 0.10405575
Aggregate_Cement 0.04517857
FineAggregateComponent 0.04069826
Plastic_Cement 0.040458865
Ash_Cement 0.036044154
Coarse_Fine 0.035633285
SuperplasticizerComponent 0.035194296
Slag_Cement 0.033596754
BlastFurnaceSlag 0.029988864
CoarseAggregateComponent 0.029185941
Aggregate 0.027087746
W

Unnamed: 0_level_0,pred_xgb1
id,Unnamed: 1_level_1
0.0,26.64933
1.0,30.11574
2.0,41.29745
3.0,45.0497
4.0,45.37445


Mode
=== Target Value Counts ===
Model Run Time: 14.61
Model=xgb2
{}
fold: 1, Score: 9.94990672954121, Run Time: 36.41
fold: 2, Score: 9.364449841280162, Run Time: 36.60
fold: 3, Score: 9.810266041518739, Run Time: 38.34
fold: 4, Score: 9.649942250518325, Run Time: 37.14
fold: 5, Score: 9.30712855694457, Run Time: 36.40
fold: 6, Score: 9.906879338151919, Run Time: 37.33
fold: 7, Score: 10.240888635031185, Run Time: 38.14
fold: 8, Score: 9.377966684872403, Run Time: 36.88
fold: 9, Score: 9.984081833366286, Run Time: 38.00
fold: 10, Score: 9.82947534114725, Run Time: 38.60
Scores -> Adjusted: 9.44835235 , mean: 9.74209853, std: 0.29374618

=== Model Feature Importance ===
Age_Water 0.19119756
Water_Cement 0.086804666
Ash_Cement 0.08186238
AgeInDays 0.07256522
Plastic_Cement 0.06657799
Aggregate_Cement 0.06578737
Aggregate 0.055844724
Coarse_Fine 0.0532292
SuperplasticizerComponent 0.05056027
Slag_Cement 0.04855275
CoarseAggregateComponent 0.042849574
WaterComponent 0.041906837
FineAggreg

Unnamed: 0_level_0,pred_xgb2
id,Unnamed: 1_level_1
0.0,25.49742
1.0,29.6293
2.0,41.21247
3.0,45.55589
4.0,45.70273


Mode
=== Target Value Counts ===
Model Run Time: 374.68
Model=lgbm0
{}
fold: 1, Score: 9.181982433119156, Run Time: 0.83
fold: 2, Score: 8.577795535113337, Run Time: 0.99
fold: 3, Score: 8.940251098620836, Run Time: 1.07
fold: 4, Score: 8.763758013499409, Run Time: 1.11
fold: 5, Score: 8.591279310305096, Run Time: 1.08
fold: 6, Score: 9.458543508816517, Run Time: 0.93
fold: 7, Score: 9.576835494955603, Run Time: 1.21
fold: 8, Score: 8.715341523574672, Run Time: 1.23
fold: 9, Score: 9.247319110185964, Run Time: 1.14
fold: 10, Score: 9.084082721295868, Run Time: 1.01
Scores -> Adjusted: 8.67783297 , mean: 9.01371887, std: 0.33588590

=== Model Feature Importance ===
Age_Water 0.12633333333333333
CoarseAggregateComponent 0.08066666666666666
Water_Cement 0.07733333333333334
Plastic_Cement 0.07666666666666666
Coarse_Fine 0.07666666666666666
Aggregate 0.07633333333333334
WaterComponent 0.07366666666666667
Aggregate_Cement 0.06566666666666666
FineAggregateComponent 0.06566666666666666
Superpl

Unnamed: 0_level_0,pred_lgbm0
id,Unnamed: 1_level_1
0.0,24.26539
1.0,31.9541
2.0,38.53354
3.0,44.37008
4.0,44.69272


Mode
=== Target Value Counts ===
Model Run Time: 10.75
Model=lgbm1
{}
fold: 1, Score: 9.337218193116673, Run Time: 1.89
fold: 2, Score: 8.69469518117453, Run Time: 1.74
fold: 3, Score: 9.11772246343676, Run Time: 1.91
fold: 4, Score: 8.868081478457862, Run Time: 1.44
fold: 5, Score: 8.864983810888178, Run Time: 1.91
fold: 6, Score: 9.491959095384352, Run Time: 1.58
fold: 7, Score: 9.578024244165483, Run Time: 1.78
fold: 8, Score: 8.927281320808474, Run Time: 1.46
fold: 9, Score: 9.222709318293335, Run Time: 1.43
fold: 10, Score: 9.232971527623208, Run Time: 1.24
Scores -> Adjusted: 8.85728195 , mean: 9.13356466, std: 0.27628271

=== Model Feature Importance ===
Age_Water 0.10325752919483712
Water_Cement 0.09803318992009834
Aggregate_Cement 0.09157959434542103
CoarseAggregateComponent 0.08727719729563614
Aggregate 0.08082360172095882
Plastic_Cement 0.07098955132145052
WaterComponent 0.07068223724646588
Coarse_Fine 0.06760909649661954
FineAggregateComponent 0.059311616472034416
CementCom

Unnamed: 0_level_0,pred_lgbm1
id,Unnamed: 1_level_1
0.0,23.25874
1.0,32.23187
2.0,41.62932
3.0,42.44846
4.0,46.50088


Mode
=== Target Value Counts ===
Model Run Time: 16.54
Model=lgbm2
{}
fold: 1, Score: 9.154923289057276, Run Time: 0.44
fold: 2, Score: 8.383898821605783, Run Time: 0.50
fold: 3, Score: 8.78752600348418, Run Time: 0.46
fold: 4, Score: 8.588862087437908, Run Time: 0.46
fold: 5, Score: 8.53898084590378, Run Time: 0.51
fold: 6, Score: 9.422575007122132, Run Time: 0.48
fold: 7, Score: 9.400142394672613, Run Time: 0.58
fold: 8, Score: 8.719114279158307, Run Time: 0.48
fold: 9, Score: 9.052765729351712, Run Time: 0.42
fold: 10, Score: 9.096512864576733, Run Time: 0.42
Scores -> Adjusted: 8.57001185 , mean: 8.91453013, std: 0.34451828

=== Model Feature Importance ===
Age_Water 0.197
Water_Cement 0.097
Aggregate_Cement 0.093
FineAggregateComponent 0.066
CoarseAggregateComponent 0.06
SuperplasticizerComponent 0.055
Slag_Cement 0.052
Aggregate 0.052
BlastFurnaceSlag 0.049
CementComponent 0.047
Coarse_Fine 0.045
AgeInDays 0.044
WaterComponent 0.041
Plastic_Cement 0.039
Ash_Cement 0.034
FlyAshCom

Unnamed: 0_level_0,pred_lgbm2
id,Unnamed: 1_level_1
0.0,21.11052
1.0,34.68881
2.0,37.85319
3.0,45.04506
4.0,43.49492


Mode
=== Target Value Counts ===
Model Run Time: 4.90
Model=lgbm3
{}
fold: 1, Score: 9.337218193116673, Run Time: 1.43
fold: 2, Score: 8.69469518117453, Run Time: 1.50
fold: 3, Score: 9.11772246343676, Run Time: 1.46
fold: 4, Score: 8.868081478457862, Run Time: 1.64
fold: 5, Score: 8.864983810888178, Run Time: 1.30
fold: 6, Score: 9.491959095384352, Run Time: 1.17
fold: 7, Score: 9.578024244165483, Run Time: 1.46
fold: 8, Score: 8.927281320808474, Run Time: 1.47
fold: 9, Score: 9.222709318293335, Run Time: 1.38
fold: 10, Score: 9.232971527623208, Run Time: 1.38
Scores -> Adjusted: 8.85728195 , mean: 9.13356466, std: 0.27628271

=== Model Feature Importance ===
Age_Water 0.10325752919483712
Water_Cement 0.09803318992009834
Aggregate_Cement 0.09157959434542103
CoarseAggregateComponent 0.08727719729563614
Aggregate 0.08082360172095882
Plastic_Cement 0.07098955132145052
WaterComponent 0.07068223724646588
Coarse_Fine 0.06760909649661954
FineAggregateComponent 0.059311616472034416
CementComp

Unnamed: 0_level_0,pred_lgbm3
id,Unnamed: 1_level_1
0.0,23.25874
1.0,32.23187
2.0,41.62932
3.0,42.44846
4.0,46.50088


Mode
=== Target Value Counts ===
Model Run Time: 14.35
Model=cat1
{}
fold: 1, Score: 9.132565832915303, Run Time: 3.67
fold: 2, Score: 8.264931678026645, Run Time: 3.83
fold: 3, Score: 8.82137432369698, Run Time: 3.55
fold: 4, Score: 8.517142809840847, Run Time: 3.66
fold: 5, Score: 8.5226684186519, Run Time: 3.66
fold: 6, Score: 9.375892191821285, Run Time: 3.59
fold: 7, Score: 9.402333662977393, Run Time: 3.71
fold: 8, Score: 8.628503707684843, Run Time: 3.51
fold: 9, Score: 9.008765873210793, Run Time: 3.52
fold: 10, Score: 8.925678972064121, Run Time: 3.77
Scores -> Adjusted: 8.49921710 , mean: 8.85998575, std: 0.36076864

=== Model Feature Importance ===
AgeInDays 0.25224883127688114
Age_Water 0.24152081013958798
Water_Cement 0.1012838879103548
WaterComponent 0.0539173302386742
SuperplasticizerComponent 0.05071064481778539
CementComponent 0.036150252190233864
CoarseAggregateComponent 0.03603541219896678
Coarse_Fine 0.030203305036543733
BlastFurnaceSlag 0.029556171068202852
Aggrega

Unnamed: 0_level_0,pred_cat1
id,Unnamed: 1_level_1
0.0,22.0801
1.0,30.96249
2.0,38.41852
3.0,44.89601
4.0,45.30116


Mode
=== Target Value Counts ===
Model Run Time: 36.72
Model=cat2
{}
fold: 1, Score: 9.208558608159482, Run Time: 49.06
fold: 2, Score: 8.261037006424559, Run Time: 48.84
fold: 3, Score: 8.846097742723703, Run Time: 49.24
fold: 4, Score: 8.57169445998444, Run Time: 48.14
fold: 5, Score: 8.595108286721057, Run Time: 47.64
fold: 6, Score: 9.341957102797354, Run Time: 48.92
fold: 7, Score: 9.429762916986228, Run Time: 47.62
fold: 8, Score: 8.678846967303814, Run Time: 48.27
fold: 9, Score: 9.13528761345295, Run Time: 48.19
fold: 10, Score: 9.015225096955092, Run Time: 48.01
Scores -> Adjusted: 8.54780895 , mean: 8.90835758, std: 0.36054863

=== Model Feature Importance ===
Age_Water 0.27840817022195063
AgeInDays 0.18138049655122923
SuperplasticizerComponent 0.06064399101176945
Water_Cement 0.048585031360046445
WaterComponent 0.04634629907434852
CementComponent 0.043480236951683564
CoarseAggregateComponent 0.04247884244542892
Aggregate 0.03972817510138491
FineAggregateComponent 0.039459320

Unnamed: 0_level_0,pred_cat2
id,Unnamed: 1_level_1
0.0,23.21142
1.0,31.05683
2.0,37.37757
3.0,44.17025
4.0,45.10816


Mode
=== Target Value Counts ===
Model Run Time: 484.11
Model=lasso
fold: 1, Score: 11.159901417097469, Run Time: 0.04
fold: 2, Score: 11.100918912177981, Run Time: 0.08
fold: 3, Score: 11.030166267857476, Run Time: 0.08
fold: 4, Score: 10.98772976455165, Run Time: 0.08
fold: 5, Score: 11.61736203757655, Run Time: 0.08
fold: 6, Score: 11.869141909992226, Run Time: 0.08
fold: 7, Score: 11.698864774925925, Run Time: 0.08
fold: 8, Score: 11.39267356481623, Run Time: 0.08
fold: 9, Score: 11.422976701143503, Run Time: 0.08
fold: 10, Score: 11.037689205077474, Run Time: 0.09
Scores -> Adjusted: 11.03244981 , mean: 11.33174246, std: 0.29929265


Unnamed: 0_level_0,pred_lasso
id,Unnamed: 1_level_1
0.0,33.29168
1.0,33.77791
2.0,31.75394
3.0,53.22715
4.0,34.92906


Mode
=== Target Value Counts ===
Model Run Time: 1.01
Model=ridge
fold: 1, Score: 10.955789260145119, Run Time: 0.05
fold: 2, Score: 10.603833514901691, Run Time: 0.07
fold: 3, Score: 10.598623842133348, Run Time: 0.07
fold: 4, Score: 10.680772571375913, Run Time: 0.07
fold: 5, Score: 11.310848630858004, Run Time: 0.07
fold: 6, Score: 11.453136983753524, Run Time: 0.08
fold: 7, Score: 11.258910402649901, Run Time: 0.08
fold: 8, Score: 10.84200144942421, Run Time: 0.08
fold: 9, Score: 11.00288280789183, Run Time: 0.08
fold: 10, Score: 10.507295384865683, Run Time: 0.08
Scores -> Adjusted: 10.60567684 , mean: 10.92140948, std: 0.31573265


Unnamed: 0_level_0,pred_ridge
id,Unnamed: 1_level_1
0.0,31.49409
1.0,31.76225
2.0,32.17152
3.0,46.80792
4.0,36.63644


Mode
=== Target Value Counts ===
Model Run Time: 0.98
Model=ridge_50
fold: 1, Score: 10.95770220970431, Run Time: 0.04
fold: 2, Score: 10.603096342290872, Run Time: 0.07
fold: 3, Score: 10.596470232285498, Run Time: 0.07
fold: 4, Score: 10.679457378779562, Run Time: 0.07
fold: 5, Score: 11.310778108894485, Run Time: 0.07
fold: 6, Score: 11.45240922193942, Run Time: 0.07
fold: 7, Score: 11.25740223566257, Run Time: 0.08
fold: 8, Score: 10.841752388480673, Run Time: 0.08
fold: 9, Score: 11.001594638992717, Run Time: 0.08
fold: 10, Score: 10.505591700993804, Run Time: 0.09
Scores -> Adjusted: 10.60457170 , mean: 10.92062545, std: 0.31605375


Unnamed: 0_level_0,pred_ridge_50
id,Unnamed: 1_level_1
0.0,31.46102
1.0,31.73833
2.0,32.16742
3.0,46.6194
4.0,36.64137


Mode
=== Target Value Counts ===
Model Run Time: 0.96
n_folds=5, seed=42
Model=xgb_best_params
{}
fold: 1, Score: 9.298464608918064, Run Time: 1.35
fold: 2, Score: 9.322483535997616, Run Time: 1.38
fold: 3, Score: 9.357544527016783, Run Time: 1.35
fold: 4, Score: 9.428735450012516, Run Time: 1.38
fold: 5, Score: 9.549009119684415, Run Time: 1.37
Scores -> Adjusted: 9.30095114 , mean: 9.39124745, std: 0.09029630

=== Model Feature Importance ===
Age_Water 0.392688
Water_Cement 0.07263184
Aggregate_Cement 0.056668703
Ash_Cement 0.056484517
AgeInDays 0.05485874
SuperplasticizerComponent 0.05211665
CoarseAggregateComponent 0.038826674
FineAggregateComponent 0.038667068
Coarse_Fine 0.03560404
Aggregate 0.034851417
Plastic_Cement 0.033543367
Slag_Cement 0.032833382
BlastFurnaceSlag 0.03217292
WaterComponent 0.030027336
FlyAshComponent 0.02167429
CementComponent 0.01635104


Unnamed: 0_level_0,pred_xgb_best_params
id,Unnamed: 1_level_1
0.0,25.68961
1.0,31.66316
2.0,43.18766
3.0,44.94294
4.0,46.2775


Mode
=== Target Value Counts ===
Model Run Time: 7.02
Model=lgbm_best_params
{}
fold: 1, Score: 9.027413406267273, Run Time: 0.93
fold: 2, Score: 8.947172092686236, Run Time: 1.15
fold: 3, Score: 9.098100671395061, Run Time: 0.86
fold: 4, Score: 9.143622583619049, Run Time: 0.88
fold: 5, Score: 9.173745294253775, Run Time: 0.90
Scores -> Adjusted: 8.99609321 , mean: 9.07801081, std: 0.08191760

=== Model Feature Importance ===
Age_Water 0.138
CoarseAggregateComponent 0.083
Coarse_Fine 0.08066666666666666
FineAggregateComponent 0.07666666666666666
WaterComponent 0.07666666666666666
Aggregate_Cement 0.07233333333333333
Aggregate 0.071
Water_Cement 0.07
Plastic_Cement 0.065
SuperplasticizerComponent 0.057
CementComponent 0.05533333333333333
BlastFurnaceSlag 0.045
Ash_Cement 0.033666666666666664
Slag_Cement 0.03266666666666666
AgeInDays 0.022
FlyAshComponent 0.021


Unnamed: 0_level_0,pred_lgbm_best_params
id,Unnamed: 1_level_1
0.0,24.17547
1.0,33.07088
2.0,42.07375
3.0,43.70059
4.0,46.09805


Mode
=== Target Value Counts ===
Model Run Time: 4.87
Model=cat_best_params
{}
fold: 1, Score: 8.822272830446046, Run Time: 1.09
fold: 2, Score: 8.712533522180044, Run Time: 1.06
fold: 3, Score: 9.015057100159057, Run Time: 1.01
fold: 4, Score: 9.070212657830329, Run Time: 1.08
fold: 5, Score: 9.019882065724067, Run Time: 1.12
Scores -> Adjusted: 8.79098818 , mean: 8.92799164, std: 0.13700345

=== Model Feature Importance ===
AgeInDays 0.488297761823933
Age_Water 0.26087905358110525
CementComponent 0.07391692719175474
WaterComponent 0.04033672132251152
Aggregate_Cement 0.029997624601646417
SuperplasticizerComponent 0.026423314270593135
Slag_Cement 0.018096563463523858
CoarseAggregateComponent 0.013048249946372307
Water_Cement 0.010491999994975743
FineAggregateComponent 0.009099223073811454
Plastic_Cement 0.00821518101887054
BlastFurnaceSlag 0.00690520468776569
Ash_Cement 0.004696710736860876
Aggregate 0.004300065459049271
Coarse_Fine 0.004089169714069049
FlyAshComponent 0.0012062291131

Unnamed: 0_level_0,pred_cat_best_params
id,Unnamed: 1_level_1
0.0,21.84307
1.0,33.18813
2.0,38.51692
3.0,46.40614
4.0,44.52899


Mode
=== Target Value Counts ===
Model Run Time: 5.52
Model=xgb3
{}
fold: 1, Score: 9.754135684389505, Run Time: 38.45
fold: 2, Score: 9.814084395533023, Run Time: 39.72
fold: 3, Score: 9.63103161867404, Run Time: 38.97
fold: 4, Score: 9.925978507284238, Run Time: 38.14
fold: 5, Score: 10.022047255085528, Run Time: 38.19
Scores -> Adjusted: 9.69403787 , mean: 9.82945549, std: 0.13541762

=== Model Feature Importance ===
Age_Water 0.13834487
Ash_Cement 0.08233582
Water_Cement 0.079182826
Aggregate_Cement 0.072530836
AgeInDays 0.07049194
Plastic_Cement 0.067127794
Slag_Cement 0.06368249
Aggregate 0.06098471
Coarse_Fine 0.059396613
CoarseAggregateComponent 0.04942351
FineAggregateComponent 0.04922583
WaterComponent 0.048671898
SuperplasticizerComponent 0.047418695
FlyAshComponent 0.044787396
BlastFurnaceSlag 0.0367383
CementComponent 0.029656457


Unnamed: 0_level_0,pred_xgb3
id,Unnamed: 1_level_1
0.0,25.08919
1.0,29.90452
2.0,41.57548
3.0,45.16287
4.0,46.13198


Mode
=== Target Value Counts ===
Model Run Time: 194.24
Model=xgb1
{}
fold: 1, Score: 9.298464608918064, Run Time: 1.36
fold: 2, Score: 9.322483535997616, Run Time: 1.41
fold: 3, Score: 9.357544527016783, Run Time: 1.36
fold: 4, Score: 9.428735450012516, Run Time: 1.38
fold: 5, Score: 9.549009119684415, Run Time: 1.38
Scores -> Adjusted: 9.30095114 , mean: 9.39124745, std: 0.09029630

=== Model Feature Importance ===
Age_Water 0.392688
Water_Cement 0.07263184
Aggregate_Cement 0.056668703
Ash_Cement 0.056484517
AgeInDays 0.05485874
SuperplasticizerComponent 0.05211665
CoarseAggregateComponent 0.038826674
FineAggregateComponent 0.038667068
Coarse_Fine 0.03560404
Aggregate 0.034851417
Plastic_Cement 0.033543367
Slag_Cement 0.032833382
BlastFurnaceSlag 0.03217292
WaterComponent 0.030027336
FlyAshComponent 0.02167429
CementComponent 0.01635104


Unnamed: 0_level_0,pred_xgb1
id,Unnamed: 1_level_1
0.0,25.68961
1.0,31.66316
2.0,43.18766
3.0,44.94294
4.0,46.2775


Mode
=== Target Value Counts ===
Model Run Time: 7.09
Model=xgb2
{}
fold: 1, Score: 9.800223200424858, Run Time: 35.98
fold: 2, Score: 9.748905304011352, Run Time: 38.53
fold: 3, Score: 9.58737673539382, Run Time: 36.69
fold: 4, Score: 9.866913149995959, Run Time: 36.57
fold: 5, Score: 10.018805121456642, Run Time: 35.84
Scores -> Adjusted: 9.66297128 , mean: 9.80444470, std: 0.14147342

=== Model Feature Importance ===
Age_Water 0.18323264
Water_Cement 0.08564866
Ash_Cement 0.08002615
AgeInDays 0.07331764
Plastic_Cement 0.065817155
Aggregate_Cement 0.06407031
Aggregate 0.057710983
Coarse_Fine 0.052978896
Slag_Cement 0.05041072
SuperplasticizerComponent 0.049667995
CoarseAggregateComponent 0.04640172
FlyAshComponent 0.043386664
WaterComponent 0.041786905
FineAggregateComponent 0.04140058
BlastFurnaceSlag 0.035181034
CementComponent 0.028962014


Unnamed: 0_level_0,pred_xgb2
id,Unnamed: 1_level_1
0.0,25.58723
1.0,29.89815
2.0,42.64536
3.0,45.80999
4.0,45.89342


Mode
=== Target Value Counts ===
Model Run Time: 184.39
Model=lgbm0
{}
fold: 1, Score: 9.027413406267273, Run Time: 0.96
fold: 2, Score: 8.947172092686236, Run Time: 0.98
fold: 3, Score: 9.098100671395061, Run Time: 0.91
fold: 4, Score: 9.143622583619049, Run Time: 1.04
fold: 5, Score: 9.173745294253775, Run Time: 1.19
Scores -> Adjusted: 8.99609321 , mean: 9.07801081, std: 0.08191760

=== Model Feature Importance ===
Age_Water 0.138
CoarseAggregateComponent 0.083
Coarse_Fine 0.08066666666666666
FineAggregateComponent 0.07666666666666666
WaterComponent 0.07666666666666666
Aggregate_Cement 0.07233333333333333
Aggregate 0.071
Water_Cement 0.07
Plastic_Cement 0.065
SuperplasticizerComponent 0.057
CementComponent 0.05533333333333333
BlastFurnaceSlag 0.045
Ash_Cement 0.033666666666666664
Slag_Cement 0.03266666666666666
AgeInDays 0.022
FlyAshComponent 0.021


Unnamed: 0_level_0,pred_lgbm0
id,Unnamed: 1_level_1
0.0,24.17547
1.0,33.07088
2.0,42.07375
3.0,43.70059
4.0,46.09805


Mode
=== Target Value Counts ===
Model Run Time: 5.23
Model=lgbm1
{}
fold: 1, Score: 9.129509000309017, Run Time: 1.29
fold: 2, Score: 9.028930714987068, Run Time: 1.17
fold: 3, Score: 9.178780037683234, Run Time: 1.18
fold: 4, Score: 9.243143839827754, Run Time: 1.07
fold: 5, Score: 9.25213533734775, Run Time: 1.03
Scores -> Adjusted: 9.08446061 , mean: 9.16649979, std: 0.08203917

=== Model Feature Importance ===
Age_Water 0.11403197158081706
CoarseAggregateComponent 0.09094138543516873
Coarse_Fine 0.0866785079928952
Aggregate 0.08490230905861457
Water_Cement 0.08490230905861457
Aggregate_Cement 0.07708703374777975
WaterComponent 0.07708703374777975
Plastic_Cement 0.06394316163410302
CementComponent 0.059325044404973354
FineAggregateComponent 0.05719360568383659
SuperplasticizerComponent 0.05364120781527531
Slag_Cement 0.047246891651865006
Ash_Cement 0.03516873889875666
BlastFurnaceSlag 0.030550621669627
FlyAshComponent 0.02024866785079929
AgeInDays 0.01705150976909414


Unnamed: 0_level_0,pred_lgbm1
id,Unnamed: 1_level_1
0.0,24.6399
1.0,32.69157
2.0,43.18471
3.0,42.25463
4.0,46.7994


Mode
=== Target Value Counts ===
Model Run Time: 5.89
Model=lgbm2
{}
fold: 1, Score: 8.81636548946143, Run Time: 0.41
fold: 2, Score: 8.772669606020251, Run Time: 0.42
fold: 3, Score: 9.023824054520349, Run Time: 0.41
fold: 4, Score: 9.030618460760058, Run Time: 0.42
fold: 5, Score: 9.052764673938036, Run Time: 0.40
Scores -> Adjusted: 8.81988667 , mean: 8.93924846, std: 0.11936178

=== Model Feature Importance ===
Age_Water 0.199
Water_Cement 0.103
Aggregate_Cement 0.069
FineAggregateComponent 0.066
CoarseAggregateComponent 0.064
Aggregate 0.06
SuperplasticizerComponent 0.054
Slag_Cement 0.052
AgeInDays 0.05
WaterComponent 0.049
Plastic_Cement 0.046
BlastFurnaceSlag 0.046
Coarse_Fine 0.041
Ash_Cement 0.039
CementComponent 0.034
FlyAshComponent 0.028


Unnamed: 0_level_0,pred_lgbm2
id,Unnamed: 1_level_1
0.0,21.20956
1.0,34.74887
2.0,37.43049
3.0,44.77962
4.0,44.82637


Mode
=== Target Value Counts ===
Model Run Time: 2.21
Model=lgbm3
{}
fold: 1, Score: 9.129509000309017, Run Time: 1.07
fold: 2, Score: 9.028930714987068, Run Time: 1.35
fold: 3, Score: 9.178780037683234, Run Time: 1.32
fold: 4, Score: 9.243143839827754, Run Time: 1.38
fold: 5, Score: 9.25213533734775, Run Time: 1.28
Scores -> Adjusted: 9.08446061 , mean: 9.16649979, std: 0.08203917

=== Model Feature Importance ===
Age_Water 0.11403197158081706
CoarseAggregateComponent 0.09094138543516873
Coarse_Fine 0.0866785079928952
Aggregate 0.08490230905861457
Water_Cement 0.08490230905861457
Aggregate_Cement 0.07708703374777975
WaterComponent 0.07708703374777975
Plastic_Cement 0.06394316163410302
CementComponent 0.059325044404973354
FineAggregateComponent 0.05719360568383659
SuperplasticizerComponent 0.05364120781527531
Slag_Cement 0.047246891651865006
Ash_Cement 0.03516873889875666
BlastFurnaceSlag 0.030550621669627
FlyAshComponent 0.02024866785079929
AgeInDays 0.01705150976909414


Unnamed: 0_level_0,pred_lgbm3
id,Unnamed: 1_level_1
0.0,24.6399
1.0,32.69157
2.0,43.18471
3.0,42.25463
4.0,46.7994


Mode
=== Target Value Counts ===
Model Run Time: 6.55
Model=cat1
{}
fold: 1, Score: 8.75598934058948, Run Time: 3.40
fold: 2, Score: 8.7618910125595, Run Time: 3.54
fold: 3, Score: 8.976422252527591, Run Time: 3.78
fold: 4, Score: 8.976959352302464, Run Time: 3.50
fold: 5, Score: 9.030930793113809, Run Time: 3.56
Scores -> Adjusted: 8.78320530 , mean: 8.90043855, std: 0.11723325

=== Model Feature Importance ===
Age_Water 0.29950171403294557
AgeInDays 0.22078579333420836
Water_Cement 0.07161161401669147
WaterComponent 0.0547843324729957
SuperplasticizerComponent 0.04383944809925219
CementComponent 0.04007293188850124
CoarseAggregateComponent 0.03656785604049148
Aggregate 0.03399006530402966
Aggregate_Cement 0.03360644939064342
Coarse_Fine 0.032682134571556706
BlastFurnaceSlag 0.027863879906825515
FineAggregateComponent 0.02541833608815371
Plastic_Cement 0.024917088214743664
Slag_Cement 0.0205099887155887
Ash_Cement 0.020481506893860812
FlyAshComponent 0.01336686102951187


Unnamed: 0_level_0,pred_cat1
id,Unnamed: 1_level_1
0.0,23.36917
1.0,31.13019
2.0,36.83854
3.0,45.07084
4.0,45.22495


Mode
=== Target Value Counts ===
Model Run Time: 17.93
Model=cat2
{}
fold: 1, Score: 8.783210254474545, Run Time: 48.46
fold: 2, Score: 8.74632369043508, Run Time: 49.75
fold: 3, Score: 9.055123970478906, Run Time: 49.73
fold: 4, Score: 9.04568859160515, Run Time: 50.44
fold: 5, Score: 9.156131185604584, Run Time: 48.77
Scores -> Adjusted: 8.79497847 , mean: 8.95729554, std: 0.16231707

=== Model Feature Importance ===
Age_Water 0.3363804169210218
AgeInDays 0.1831646539953126
SuperplasticizerComponent 0.04812814269415321
Water_Cement 0.04713235950152005
CementComponent 0.0453282999953229
WaterComponent 0.03962105206204243
Aggregate 0.03539121698551139
Aggregate_Cement 0.03348988134907789
FineAggregateComponent 0.03337684702806721
Plastic_Cement 0.03300661523998395
Coarse_Fine 0.03255300000653317
CoarseAggregateComponent 0.03241057866981525
Slag_Cement 0.02965691298218381
Ash_Cement 0.027913570176555835
BlastFurnaceSlag 0.026899391373389467
FlyAshComponent 0.015547061019509027


Unnamed: 0_level_0,pred_cat2
id,Unnamed: 1_level_1
0.0,23.11491
1.0,31.77336
2.0,37.77528
3.0,44.24992
4.0,44.79392


Mode
=== Target Value Counts ===
Model Run Time: 247.31
Model=lasso
fold: 1, Score: 11.115643340722372, Run Time: 0.05
fold: 2, Score: 11.001168870318764, Run Time: 0.08
fold: 3, Score: 11.747670633136062, Run Time: 0.11
fold: 4, Score: 11.555320367018686, Run Time: 0.10
fold: 5, Score: 11.227621805637167, Run Time: 0.10
Scores -> Adjusted: 11.05028017 , mean: 11.32948500, std: 0.27920483


Unnamed: 0_level_0,pred_lasso
id,Unnamed: 1_level_1
0.0,33.14186
1.0,33.82038
2.0,31.61445
3.0,53.17541
4.0,35.01962


Mode
=== Target Value Counts ===
Model Run Time: 0.66
Model=ridge
fold: 1, Score: 10.770998025719994, Run Time: 0.05
fold: 2, Score: 10.654217451144538, Run Time: 0.08
fold: 3, Score: 11.39513807254392, Run Time: 0.08
fold: 4, Score: 11.048466221614829, Run Time: 0.08
fold: 5, Score: 10.741750226650721, Run Time: 0.09
Scores -> Adjusted: 10.65124416 , mean: 10.92211400, std: 0.27086984


Unnamed: 0_level_0,pred_ridge
id,Unnamed: 1_level_1
0.0,31.22234
1.0,31.45842
2.0,31.82823
3.0,47.00402
4.0,36.99448


Mode
=== Target Value Counts ===
Model Run Time: 0.63
Model=ridge_50
fold: 1, Score: 10.771532659095, Run Time: 0.05
fold: 2, Score: 10.652348889835322, Run Time: 0.08
fold: 3, Score: 11.394785928566236, Run Time: 0.10
fold: 4, Score: 11.047541477895903, Run Time: 0.09
fold: 5, Score: 10.739797290566475, Run Time: 0.10
Scores -> Adjusted: 10.64996925 , mean: 10.92120125, std: 0.27123200


Unnamed: 0_level_0,pred_ridge_50
id,Unnamed: 1_level_1
0.0,31.18599
1.0,31.4294
2.0,31.82143
3.0,46.79551
4.0,37.00087


Mode
=== Target Value Counts ===
Model Run Time: 0.64
CPU times: user 1h 11min 10s, sys: 4min 18s, total: 1h 15min 29s
Wall time: 34min 55s


In [39]:
sample_submission.head(20)

Unnamed: 0,id,Strength,target_xgb_best_params,target_lgbm_best_params,target_cat_best_params,target_xgb3,target_xgb1,target_xgb2,target_lgbm0,target_lgbm1,target_lgbm2,target_lgbm3,target_cat1,target_cat2,target_lasso,target_ridge,target_ridge_50
0,5407,35.452,45.18263,46.72108,44.97202,42.74757,45.18263,41.83805,46.72108,48.55177,47.49376,48.55177,46.26873,46.33394,33.95512,34.33324,34.34525
1,5408,35.452,18.40923,17.78635,18.09163,19.47861,18.40923,16.6569,17.78635,18.7116,19.12712,18.7116,17.98938,18.70558,28.83874,26.34597,26.34301
2,5409,35.452,28.47217,30.24316,31.4321,31.64135,28.47217,30.82581,30.24316,28.87665,32.33526,28.87665,31.18649,32.31088,28.56092,25.86969,25.87289
3,5410,35.452,46.43829,46.46367,47.09857,45.26033,46.43829,45.41586,46.46367,45.25143,45.53487,45.25143,46.11544,46.09568,38.79214,39.6157,39.63171
4,5411,35.452,13.64133,17.68775,23.86701,15.08529,13.64133,15.10946,17.68775,15.70477,24.86148,15.70477,20.91831,25.65119,30.7729,27.72353,27.72285
5,5412,35.452,32.86052,45.60994,40.44463,35.11235,32.86052,30.23767,45.60994,42.71671,41.06637,42.71671,39.20178,39.67758,34.44334,35.21317,35.21997
6,5413,35.452,24.30875,29.17719,31.1069,26.43482,24.30875,29.34583,29.17719,23.58885,32.08405,23.58885,33.36652,33.24956,29.04961,27.88248,27.89806
7,5414,35.452,20.96133,21.57562,20.67879,20.30683,20.96133,20.54254,21.57562,20.56315,21.5005,20.56315,20.5516,21.78558,30.28427,31.7795,31.77644
8,5415,35.452,41.32449,51.65125,44.92774,41.83418,41.32449,41.01867,51.65125,47.90912,46.80123,47.90912,47.62136,44.10945,36.90388,38.64076,38.59915
9,5416,35.452,37.13491,35.61054,37.58743,30.23344,37.13491,35.06651,35.61054,35.49412,38.80319,35.49412,38.26219,37.64488,31.53369,32.75968,32.78027


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Blend Models</h1>
</div>

In [40]:
all_blend_scores = pd.DataFrame(
    {
        "Model": pd.Series(dtype="str"),
        "Score": pd.Series(dtype="float"),
        "StdDev": pd.Series(dtype="float"),
    }
)

In [41]:
model_lst = ["xgb1", "xgb2", "cat1", "lgbm0", "lgbm1"]

In [42]:
len(model_lst)

5

In [43]:
target_names = [f"target_{model}" for model in model_lst]
target_names

['target_xgb1', 'target_xgb2', 'target_cat1', 'target_lgbm0', 'target_lgbm1']

In [44]:
sample_submission[TARGET] = sample_submission[target_names].sum(axis=1) / len(model_lst)

In [45]:
sample_submission[[ID, TARGET]].to_csv("submission_models_wt_avg.csv", index=False)
sample_submission[[ID, TARGET]].tail(8)

Unnamed: 0,id,Strength
3597,9004,17.4574
3598,9005,39.38163
3599,9006,17.7437
3600,9007,30.57971
3601,9008,29.21952
3602,9009,43.10089
3603,9010,30.96965
3604,9011,21.48313


In [46]:
sample_submission[TARGET] = (
#     (sample_submission["target_xgb_bp"] * 2 )
#     + (sample_submission["target_lgbm_bp"]  )
    (sample_submission["target_xgb1"] * 3 )
    + (sample_submission["target_lgbm1"])
#     + (sample_submission["target_lgbm2"])    
#     + (sample_submission["target_lgbm2"])
    + (sample_submission["target_cat1"] )
    + (sample_submission["target_cat2"] )    
#     + (sample_submission["target_cat_bp"] )
#     + (sample_submission["target_svc"] )
#     + (sample_submission["target_log_reg3"] )
#     + (sample_submission["target_cat2"] )
)/6

# sample_submission[TARGET] = sample_submission[TARGET].astype(int)

In [47]:
sample_submission[[ID, TARGET]].to_csv("submission_wt_avg.csv", index=False)
sample_submission[[ID, TARGET]].tail(8)

Unnamed: 0,id,Strength
3597,9004,17.74171
3598,9005,39.27365
3599,9006,17.58862
3600,9007,30.07489
3601,9008,30.93636
3602,9009,42.18046
3603,9010,31.06833
3604,9011,20.63296


In [48]:
all_cv_scores.sort_values(by=["Score"], ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime,n_estimators,n_folds,comments
12,lasso,11.33174,0.29929,1.00525,2000,10,
27,lasso,11.32949,0.2792,0.66441,2000,5,
28,ridge,10.92211,0.27087,0.62808,2000,5,
13,ridge,10.92141,0.31573,0.9779,2000,10,
29,ridge_50,10.9212,0.27123,0.64093,2000,5,
14,ridge_50,10.92063,0.31605,0.95766,2000,10,
18,xgb3,9.82946,0.13542,194.23841,2000,5,
20,xgb2,9.80444,0.14147,184.39099,2000,5,
3,xgb3,9.75637,0.26887,414.33012,2000,10,
5,xgb2,9.7421,0.29375,374.68482,2000,10,


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Level 1 Stack Models</h1>
</div>

In [49]:
## TODO: Generate these dictionaries from model names

train_oof_dict = {
    "train_pred_cat1": "train_pred_cat1.csv",
    "train_pred_cat2": "train_pred_cat2.csv",
    "train_pred_lgbm1": "train_pred_lgbm1.csv",    
    "train_pred_lgbm2": "train_pred_lgbm2.csv",    
    "train_pred_xgb1": "train_pred_xgb1.csv"
}

test_pred_dict = {
    "submission_cat1": "submission_cat1.csv",
    "submission_cat2": "submission_cat2.csv",
    "submission_lgbm1": "submission_lgbm1.csv",
    "submission_lgbm2": "submission_lgbm2.csv",
    "submission_xgb1": "submission_xgb1.csv",
}

In [50]:
def blend_results(train_oof_dict, test_pred_dict):
    oof_df = pd.DataFrame()
    test_preds_df = pd.DataFrame()

    for name, train_oof_fname in train_oof_dict.items():
        fname = "../working/" + train_oof_fname
        print(f"Processing {name}, {train_oof_fname}")
        df = pd.read_csv(fname)
        print(df.head())
#         print(df.iloc[:,1])
        preds = pd.Series(df.iloc[:,1], name=name)
#         print(preds[:5])
        oof_df = pd.concat([oof_df, preds], axis=1)
    #     oof_df = pd.concat([oof_df, pd.Series(np.load(TRAIN_PATH / train_oof), name=name)], axis=1)

    for name, test_pred_fname in test_pred_dict.items():
        fname = "../working/" + test_pred_fname
        print(f"{name}, {test_pred_fname}")
        df = pd.read_csv(fname)
        print(df.head())
        preds = pd.Series(df.iloc[:,1], name=name)
        test_preds_df = pd.concat([test_preds_df, preds], axis=1)

    print("=== oof ===")
    print(oof_df.head())
    print("=== test_preds ===")
    print(test_preds_df.head())
    return oof_df, test_preds_df
    
# (oof_df, preds_df) = blend_results(train_oof_dict, test_pred_dict)    

In [51]:
def load_oof_results(train_oof_dict, test_pred_dict):
    oof_df = pd.DataFrame()
    test_preds_df = pd.DataFrame()

    for name, train_oof_fname in train_oof_dict.items():
        fname = "../working/" + train_oof_fname
        print(f"Processing {name}, {train_oof_fname}")
        df = pd.read_csv(fname)
        print(df.head())
#         print(df.iloc[:,1])
        preds = pd.Series(df.iloc[:,1], name=name)
#         print(preds[:5])
        oof_df = pd.concat([oof_df, preds], axis=1)
    #     oof_df = pd.concat([oof_df, pd.Series(np.load(TRAIN_PATH / train_oof), name=name)], axis=1)

    for name, test_pred_fname in test_pred_dict.items():
        fname = "../working/" + test_pred_fname
        print(f"{name}, {test_pred_fname}")
        df = pd.read_csv(fname)
        print(df.head())
        preds = pd.Series(df.iloc[:,1], name=name)
        test_preds_df = pd.concat([test_preds_df, preds], axis=1)

    print("=== oof ===")
    print(oof_df.head())
    print("=== test_preds ===")
    print(test_preds_df.head())
    return oof_df, test_preds_df
    
(oof_df, preds_df) = load_oof_results(train_oof_dict, test_pred_dict) 

Processing train_pred_cat1, train_pred_cat1.csv
    id  pred_cat1
0  0.0   23.36917
1  1.0   31.13019
2  2.0   36.83854
3  3.0   45.07084
4  4.0   45.22495
Processing train_pred_cat2, train_pred_cat2.csv
    id  pred_cat2
0  0.0   23.11491
1  1.0   31.77336
2  2.0   37.77528
3  3.0   44.24992
4  4.0   44.79392
Processing train_pred_lgbm1, train_pred_lgbm1.csv
    id  pred_lgbm1
0  0.0    24.63990
1  1.0    32.69157
2  2.0    43.18471
3  3.0    42.25463
4  4.0    46.79940
Processing train_pred_lgbm2, train_pred_lgbm2.csv
    id  pred_lgbm2
0  0.0    21.20956
1  1.0    34.74887
2  2.0    37.43049
3  3.0    44.77962
4  4.0    44.82637
Processing train_pred_xgb1, train_pred_xgb1.csv
    id  pred_xgb1
0  0.0   25.68961
1  1.0   31.66316
2  2.0   43.18766
3  3.0   44.94294
4  4.0   46.27750
submission_cat1, submission_cat1.csv
     id  Strength
0  5407  46.26873
1  5408  17.98938
2  5409  31.18649
3  5410  46.11544
4  5411  20.91831
submission_cat2, submission_cat2.csv
     id  Strength
0  5

In [52]:
oof_df.head()

Unnamed: 0,train_pred_cat1,train_pred_cat2,train_pred_lgbm1,train_pred_lgbm2,train_pred_xgb1
0,23.36917,23.11491,24.6399,21.20956,25.68961
1,31.13019,31.77336,32.69157,34.74887,31.66316
2,36.83854,37.77528,43.18471,37.43049,43.18766
3,45.07084,44.24992,42.25463,44.77962,44.94294
4,45.22495,44.79392,46.7994,44.82637,46.2775


In [53]:
preds_df.head()

Unnamed: 0,submission_cat1,submission_cat2,submission_lgbm1,submission_lgbm2,submission_xgb1
0,46.26873,46.33394,48.55177,47.49376,45.18263
1,17.98938,18.70558,18.7116,19.12712,18.40922
2,31.18649,32.31088,28.87665,32.33526,28.47217
3,46.11544,46.09568,45.25143,45.53487,46.43829
4,20.91831,25.65119,15.70477,24.86148,13.64133


In [54]:
type(preds_df)

pandas.core.frame.DataFrame

In [55]:
def run_lr(useful_features:List[str], TARGET:str, train_df:pd.DataFrame, test_df:pd.DataFrame) -> (List[float],List[float]):
    final_predictions = []
    scores = []

    kfold = model_selection.KFold(n_splits=Config.N_FOLDS, shuffle=True, random_state=Config.seed)

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(train_df)):
        xtrain = train_df.iloc[train_idx].reset_index(drop=True)
        xvalid = train_df.iloc[valid_idx].reset_index(drop=True)

        xtest = test_df[useful_features].copy()

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]

#         model = LogisticRegression()
        model = linear_model.LinearRegression()
        # Smaller C means more regularization; default=1.0
        # 2947.0517025518097
#         model = LogisticRegression(max_iter=500, C=2947.0517025518097, penalty='l2',solver='newton-cg')
#         model = LogisticRegression(C = 2947.0517025518097,
#                         max_iter = 500,
#                         penalty = 'l2',
#                         solver = 'liblinear')
        model.fit(xtrain, ytrain)

        preds_valid = model.predict_proba(xvalid)[:,-1]
        test_preds = model.predict_proba(xtest)[:,-1]

        final_predictions.append(test_preds)
#         score = metrics.roc_auc_score(yvalid, preds_valid)
        score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        print(f"Fold={fold}, Score={score}")
        scores.append(score)
    return scores, final_predictions


In [56]:
# useful_features = ["pred_lda", "pred_gbc","pred_gbc2", "pred_cat_bp", "pred_cat1", "pred_lgbm1", "pred_lgbm2", "pred_lgbm_bp", "pred_xgb1", "pred_xgb_bp"]
useful_features = [ "train_pred_cat1", "train_pred_cat2", "train_pred_lgbm1", "train_pred_lgbm2", "train_pred_xgb1"]

In [57]:
oof_df[useful_features].head()

Unnamed: 0,train_pred_cat1,train_pred_cat2,train_pred_lgbm1,train_pred_lgbm2,train_pred_xgb1
0,23.36917,23.11491,24.6399,21.20956,25.68961
1,31.13019,31.77336,32.69157,34.74887,31.66316
2,36.83854,37.77528,43.18471,37.43049,43.18766
3,45.07084,44.24992,42.25463,44.77962,44.94294
4,45.22495,44.79392,46.7994,44.82637,46.2775


In [58]:
# preds_df[useful_features].head()

In [59]:
# fold_scores, final_predictions = run_lr(useful_features, TARGET, oof_df, preds_df)
# test_preds = np.mean(np.column_stack(final_predictions), axis=1)
# cv_score, std_dev = show_fold_scores(fold_scores)
# create_submission("level1_lr", TARGET, test_preds)

In [60]:
pd.options.display.max_colwidth = 100
pd.set_option("display.max_rows", 999)
pd.set_option("display.precision", 5)
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.max_colwidth

100

In [61]:
all_cv_scores.sort_values(by=["Score"], ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime,n_estimators,n_folds,comments
12,lasso,11.33,0.3,1.01,2000,10,
27,lasso,11.33,0.28,0.66,2000,5,
28,ridge,10.92,0.27,0.63,2000,5,
13,ridge,10.92,0.32,0.98,2000,10,
29,ridge_50,10.92,0.27,0.64,2000,5,
14,ridge_50,10.92,0.32,0.96,2000,10,
18,xgb3,9.83,0.14,194.24,2000,5,
20,xgb2,9.8,0.14,184.39,2000,5,
3,xgb3,9.76,0.27,414.33,2000,10,
5,xgb2,9.74,0.29,374.68,2000,10,
