<a href="https://www.kaggle.com/code/mmellinger66/s3e9-concrete-strength-models?scriptVersionId=121598281" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

 <div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Playground Season 3: Episode 9 - Concrete Strength Models</h1>
</div>

## Problem Type

Regression

## Evaluation Metric

$$RMSE = \sqrt{\frac{1}{N} \sum_{i=1}^N (y_i - \hat{y_i})^2}$$

```python
score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
```

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Import Libraries</h1>
</div>

In [1]:
from typing import List, Set, Dict, Tuple, Optional

import os
import time
from pathlib import Path
import glob
import gc

import pandas as pd
import numpy as np

from sklearn import impute
from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import cluster
from sklearn import model_selection
from sklearn import ensemble
from sklearn import datasets

import xgboost as xgb
import catboost as cb
import lightgbm as lgb

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Visualization Libraries
import matplotlib as mpl
import matplotlib.pylab as plt
import seaborn as sns
import missingno as msno
from folium import Map
from folium.plugins import HeatMap
from IPython.display import display_html, display_markdown, display_latex
from colorama import Fore, Style

import warnings
warnings.filterwarnings('ignore')

pd.set_option("display.max_rows", 999)
pd.set_option("display.precision", 5)

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Configuration</h1>
</div>

In [2]:
TARGET="Strength"
ID="id"

# Optuna
objective_direction = "minimize"  # minimize, maximize

In [3]:
class Config:
    path:str = "../input/playground-series-s3e9/"
    load_original_data:bool = True # Some Competitions use synthetic data, based on real data
    original_data_path:str = "../input/predict-concrete-strength/ConcreteStrengthData.csv"
    gpu:bool = False
    optimize:bool = False
    n_optuna_trials:int = 30 # 5, 10, 30
    fast_render:bool = False
    calc_probability:bool = False
    debug:bool = False
    seed:int = 42
    N_ESTIMATORS:int = 100  # 100, 300, 1000, 2000, 5000, 15_000, 20_000 GBDT
    GPU_N_ESTIMATORS:int = 2000 # Want models to run fast during dev
    N_FOLDS:int = 5
        

In [4]:
class clr:
    S = Style.BRIGHT + Fore.LIGHTRED_EX
    E = Style.RESET_ALL

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Library</h1>
</div>

In [5]:
def read_data(path: str, analyze:bool=True) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    data_dir = Path(path)

    train = pd.read_csv(data_dir / "train.csv")
    test = pd.read_csv(data_dir / "test.csv")
    submission_df = pd.read_csv(data_dir / "sample_submission.csv")

    if analyze:
        print(clr.S + "=== Shape of Data ==="+clr.E)
        print(f" train data: Rows={train.shape[0]}, Columns={train.shape[1]}")
        print(f" test data : Rows={test.shape[0]}, Columns={test.shape[1]}")

        print(clr.S + "\n=== Train Data: First 5 Rows ===\n"+clr.E)
        display(train.head())
        print(f"\n{clr.S}=== Train Column Names ==={clr.E}\n")
        display(train.columns)
        print(f"\n{clr.S}=== Features/Explanatory Variables ==={clr.E}\n")
        eval_features(train)
        print(f"\n{clr.S}=== Skewness ==={clr.E}\n")
        check_skew(train)
    return train, test, submission_df

def create_submission(model_name: str, target, preds, seed:int=42, nfolds:int=5) -> pd.DataFrame:
    sample_submission[target] = preds #.astype(int)

    if len(model_name) > 0:
        fname = f"submission_{model_name}_k{nfolds}_s{seed}.csv"
    else:
        fname = "submission.csv"

    sample_submission.to_csv(fname, index=False)

    return sample_submission

def show_classification_scores(ground_truth:List[int], yhat:List[int]) -> None:
    accuracy = metrics.accuracy_score(ground_truth, yhat)
    precision = metrics.precision_score(ground_truth, yhat)
    recall = metrics.recall_score(ground_truth, yhat)
    roc = metrics.roc_auc_score(ground_truth, yhat)
    f1 = metrics.f1_score(ground_truth, yhat)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"ROC: {roc:.4f}")
    print(f"f1: {f1:.4f}")
    

def label_encoder(train:pd.DataFrame, test:pd.DataFrame, columns:List[str]) -> (pd.DataFrame, pd.DataFrame) :
    for col in columns:
        train[col] = train[col].astype(str)
        test[col] = test[col].astype(str)
        train[col] = preprocessing.LabelEncoder().fit_transform(train[col])
        test[col] = preprocessing.LabelEncoder().fit_transform(test[col])
    return train, test   

def create_strat_folds(df:pd.DataFrame, TARGET, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    print(f"TARGET={TARGET}, n_folds={n_folds}, seed={seed}")
    df["fold"] = -1

    kf = model_selection.StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    # kf = GroupKFold(n_splits=Config.N_FOLDS)
    for fold, (train_idx, valid_idx) in enumerate(kf.split(df, df[TARGET])):
        df.loc[valid_idx, "fold"] = fold

    # df.to_csv(f"train_fold{num_folds}.csv", index=False)
    return df


def create_folds(df:pd.DataFrame, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    print(f"n_folds={n_folds}, seed={seed}")
    df["fold"] = -1

    kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=seed)

    for fold, (train_idx, valid_idx) in enumerate(kf.split(df)):
        df.loc[valid_idx, "fold"] = fold

    # df.to_csv(f"train_fold{num_folds}.csv", index=False)
    return df

def show_fold_scores(scores: List[float]) -> (float, float):
    cv_score = np.mean(scores)  # Used in filename
    std_dev = np.std(scores)
    print(
        f"Scores -> Adjusted: {np.mean(scores) - np.std(scores):.8f} , mean: {np.mean(scores):.8f}, std: {np.std(scores):.8f}"
    )
    return cv_score, std_dev


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    continuous_features = list(df.select_dtypes(include=['int64', 'float64', 'uint8']).columns)
    categorical_features = list(df.select_dtypes(include=['object', 'bool']).columns)
    if display:
        print(f"{clr.S}Continuous Features={continuous_features}{clr.E}\n")
        print(f"{clr.S}Categorical Features={categorical_features}{clr.E}")
    return continuous_features, categorical_features   

def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    print("=== Cardinality ===")
    print(df[features].nunique())

## === Model Support ===    

from scipy.stats import mode


def merge_test_predictions(final_test_predictions:List[float], calc_probability:bool=True) -> List[float]:

    if calc_probability:
        print("Mean")
        result = np.mean(np.column_stack(final_test_predictions), axis=1)
    else:
        print("Mode")
        mode_result = mode(np.column_stack(final_test_predictions), axis=1)
        result = mode_result[0].ravel()

    return result

def summary_statistics(X:pd.DataFrame, enhanced=True) -> None:
    desc = X.describe()
    if enhanced:
        desc.loc["var"] = X.var(numeric_only=True).tolist()
        desc.loc["skew"] = X.skew(numeric_only=True).tolist()
        desc.loc["kurt"] = X.kurtosis(numeric_only=True).tolist()

    with pd.option_context("display.precision", 2):
        style = desc.transpose().style.background_gradient(
            cmap="coolwarm"
        )  # .set_precision(4)
    display(style)
    
def show_missing_features(df:pd.DataFrame) -> None:
    missing_vals = df.isna().sum().sort_values(ascending=False)
    print(missing_vals[missing_vals > 0])


def show_duplicate_records(df:pd.DataFrame) -> None:
    dups = df.duplicated()
    print(dups.sum())


def eval_features(df:pd.DataFrame) -> (List[str], List[str], List[str]):
    ## Separate Categorical and Numerical Features
    categorical_features = list(
        df.select_dtypes(include=["category", "object"]).columns
    )
    continuous_features = list(df.select_dtypes(include=["number"]).columns)

    print(f"{clr.S}Continuous features:{clr.E} {continuous_features}")
    print(f"{clr.S}Categorical features:{clr.E} {categorical_features}")
    print("\n --- Cardinality of Categorical Features ---\n")

    for feature in categorical_features:
        cardinality = df[feature].nunique()
        if cardinality < 10:
            print(f"{clr.S}{feature}{clr.E}: cardinality={cardinality}, {df[feature].unique()}")
        else:
            print(f"{clr.S}{feature}{clr.E}: cardinality={cardinality}")
    all_features = categorical_features + continuous_features
    return all_features, categorical_features, continuous_features


def show_feature_importance(feature_importance_lst:List[str]) -> None:
    fis_df = pd.concat(feature_importance_lst, axis=1)

    fis_df.sort_values("0_importance", ascending=True).head(40).plot(
        kind="barh", figsize=(12, 12), title="Feature Importance Across Folds"
    )
    plt.show()


def show_feature_target_crosstab(df:pd.DataFrame, feature_lst:List[str], target:str) -> None:
    for feature in feature_lst:
        print(f"\n=== {feature} vs {target} ===\n")
        display(
            pd.crosstab(df[feature], df[target], margins=True)
        )  # display keeps bold formatting


def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    print(f"{clr.S}=== Cardinality ==={clr.E}")
    print(df[features].nunique())


def show_unique_features(df:pd.DataFrame, features:List[str]) -> None:
    for col in features:
        print(col, sorted(df[col].dropna().unique()))


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    continuous_features = list(
        df.select_dtypes(include=["int64", "float64", "uint8"]).columns
    )
    categorical_features = list(df.select_dtypes(include=["object", "bool"]).columns)
    if display:
        print(f"{clr.S}Continuous Features={clr.E}{continuous_features}\n")
        print(f"{clr.S}Categorical Features={clr.E}{categorical_features}")
    return continuous_features, categorical_features


def describe(X:pd.DataFrame) -> None:
    """Deprecated: Use summary_statistics()"""
    desc = X.describe()
    desc.loc['var'] = X.var(numeric_only=True).tolist()
    desc.loc['skew'] = X.skew(numeric_only=True).tolist()
    desc.loc['kurt'] = X.kurtosis(numeric_only=True).tolist()

    with pd.option_context('display.precision', 2):
        style = desc.transpose().style.background_gradient(cmap='coolwarm') #.set_precision(4)
    display(style)
  

def check_skew(df:pd.DataFrame) -> None:
    skew = df.skew(skipna=True,numeric_only=True).sort_values(ascending=False)
    print(skew)
    
def gpu_ify_lgbm(lgbm_dict):
    if Config.gpu:
        lgbm_dict["device"] = "gpu"
        lgbm_dict["boosting_type"] = "gbdt"
        lgbm_dict["gpu_platform_id"] = 0
        lgbm_dict["gpu_device_id"] = 0
    return lgbm_dict

def gpu_ify_cb(params):
    if Config.gpu:
        params["task_type"] = "GPU"
    return params    


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Optuna Hyperparameter Optimization Library</h1>
</div>

In [6]:
def objective_xgb(trial, X_train, X_valid, y_train, y_valid):

    xgb_params = {
        #         "objective": trial.suggest_categorical("objective", ["multi:softmax"]),
        #         "eval_metric": "mlogloss",
        #         "objective": "multi:softmax",
#         "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),

        "eval_metric": "rmse",  # auc, rmse, mae
        "objective": "reg:squarederror", # Normal Distribution
#         "objective": "reg:gamma", # Gamma Distribution

        #         "enable_categorical": trial.suggest_categorical("use_label_encoder", [True]),
        "use_label_encoder": trial.suggest_categorical("use_label_encoder", [False]),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 5000, 100),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "max_depth": trial.suggest_int("max_depth", 1, 20),  # 10
        "gamma": trial.suggest_float("gamma", 0, 100, step=0.1),
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "tree_method": trial.suggest_categorical(
            "tree_method", ["hist"]
        ),  # hist, gpu_hist
#         "predictor": "gpu_predictor",
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 100),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }

    # Model loading and training
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=5000,
        verbose=0,
    )

    print(f"Number of boosting rounds: {model.best_iteration}")
    #     oof = model.predict_proba(X_valid)[:, 1] # Probability
    oof = model.predict(X_valid)  # Classification: 0,1

    return metrics.mean_squared_error(y_valid, oof, squared=False)


def objective_lgbm(trial, X_train, X_valid, y_train, y_valid):

    lgbm_params = {
        "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [1_000]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [5000]),
        "n_estimators": trial.suggest_int("n_estimators", 700, 5000),
        "importance_type": "gain",
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 300),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        #         'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }

    # Model loading and training
    model = lgb.LGBMRegressor(**lgbm_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric="mae",
        callbacks=[
            lgb.log_evaluation(500),
            lgb.early_stopping(500, False, True),
        ],
    )

    #     print(f"Number of boosting rounds: {model.best_iteration}")
    oof = model.predict(X_valid)

    return metrics.mean_squared_error(y_valid, oof, squared=False)
#     return metrics.mean_absolute_error(y_valid, oof)


def objective_clf_lgbm(trial, X_train, X_valid, y_train, y_valid):

    params = {
        "boosting_type": "gbdt",
        # "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),
        #         "objective": trial.suggest_categorical("objective", ["multi:softprob"]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [1_000]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [5000]),
        "n_estimators": trial.suggest_int("n_estimators", 700, 1000),
        "importance_type": "gain",
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 300),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        #         'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }
    if Config.gpu:
        params["device_type"] = "gpu"

    # Model loading and training
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="mae",
        callbacks=[
            lgb.log_evaluation(500),
            lgb.early_stopping(500, False, True),
        ],
    )

    #     print(f"Number of boosting rounds: {model.best_iteration}")
    oof = model.predict(X_valid)

    #     return accuracy_score(y_valid, oof)
    return metrics.roc_auc_score(y_valid, oof)


def objective_cb(trial, X_train, X_valid, y_train, y_valid):

    cb_params = {
        "iterations": 100,
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.1, 1.0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 100),
        "bagging_temperature": trial.suggest_loguniform(
            "bagging_temperature", 0.1, 20.0
        ),
        "random_strength": trial.suggest_float("random_strength", 1.0, 2.0),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
          "use_best_model": True,
#         "task_type": "GPU",
        "random_seed": 42,
    }

    # Model loading and training
    #  model = CatBoostClassifier(**cb_params)
    model = cb.CatBoostRegressor(**cb_params)

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="accuracy",
        early_stopping_rounds=500,
        verbose=False,
    )

#     print(f"Number of boosting rounds: {model.best_iteration}")
    # oof = model.predict_proba(X_valid)[:, 1]
    oof = model.predict(X_valid)  # Classification
    return metrics.mean_squared_error(y_valid, oof, squared=False)
#     return metrics.mean_absolute_error(y_valid, oof)
# 
#     return accuracy_score(y_valid, oof)

def objective_clf_cb(trial, X_train, X_valid, y_train, y_valid):

    cb_params = {
        "iterations": 10,  # 1000
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.1, 1.0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 100),
        "bagging_temperature": trial.suggest_loguniform(
            "bagging_temperature", 0.1, 20.0
        ),
        "random_strength": trial.suggest_float("random_strength", 1.0, 2.0),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
        "use_best_model": True,
#             "task_type": "GPU",
        "random_seed": 42,
    }

    # Model loading and training
    model = cb.CatBoostClassifier(**cb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="accuracy",
        early_stopping_rounds=500,
        verbose=False,
    )

    # print(f"Number of boosting rounds: {model.best_iteration}")
    # oof = model.predict_proba(X_valid)[:, 1]
    oof = model.predict(X_valid)  # Classification

    return metrics.accuracy_score(y_valid, oof)

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Load Train/Test Data and Analyze</h1>
</div>

## Load the following files

 - train.csv - Data used to build our machine learning model
 - test.csv - Data used to build our machine learning model. Does not contain the target variable
 - sample_submission.csv - A file in the proper format to submit test predictions

In [7]:
%%time
train, test, sample_submission = read_data(Config.path, analyze=True)                                

[1m[91m=== Shape of Data ===[0m
 train data: Rows=5407, Columns=10
 test data : Rows=3605, Columns=9
[1m[91m
=== Train Data: First 5 Rows ===
[0m


Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength
0,0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38
1,1,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52
2,2,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96
3,3,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05
4,4,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19



[1m[91m=== Train Column Names ===[0m



Index(['id', 'CementComponent', 'BlastFurnaceSlag', 'FlyAshComponent',
       'WaterComponent', 'SuperplasticizerComponent',
       'CoarseAggregateComponent', 'FineAggregateComponent', 'AgeInDays',
       'Strength'],
      dtype='object')


[1m[91m=== Features/Explanatory Variables ===[0m

[1m[91mContinuous features:[0m ['id', 'CementComponent', 'BlastFurnaceSlag', 'FlyAshComponent', 'WaterComponent', 'SuperplasticizerComponent', 'CoarseAggregateComponent', 'FineAggregateComponent', 'AgeInDays', 'Strength']
[1m[91mCategorical features:[0m []

 --- Cardinality of Categorical Features ---


[1m[91m=== Skewness ===[0m

AgeInDays                    2.74687
SuperplasticizerComponent    1.41169
FlyAshComponent              1.30469
BlastFurnaceSlag             1.12120
Strength                     0.38073
CementComponent              0.34128
id                           0.00000
CoarseAggregateComponent    -0.08145
WaterComponent              -0.21528
FineAggregateComponent      -0.44738
dtype: float64
CPU times: user 36.2 ms, sys: 8.94 ms, total: 45.2 ms
Wall time: 75.5 ms


In [8]:
train.head()

Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength
0,0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38
1,1,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52
2,2,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96
3,3,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05
4,4,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19


In [9]:
def load_original_data(path:str) -> pd.DataFrame:
#     original = pd.read_csv(path, index_col=[0])
    original = pd.read_csv(path)

    original = original.reset_index()
    original['id'] = original['index'] + 100000
    original = original.drop(columns = ['index'])
    original = original.rename(columns = {'CementComponent ':'CementComponent'})
    original.set_index('id', inplace=True)
#     original = original[-original.depth.isna()]
    print(f"Shape={original.shape}")
    return original
#     original.head()

if Config.load_original_data:    
    original = load_original_data(Config.original_data_path)
    display(original.head())

Shape=(1030, 9)


Unnamed: 0_level_0,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
100000,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
100001,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
100002,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
100003,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
100004,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [10]:
if Config.load_original_data:
    train['is_original']    = 0
    test['is_original']     = 0
    original['is_original'] = 1
#     combined = pd.concat([train, original], ignore_index=True) #.drop_duplicates()
    combined = pd.concat([train, original])

    train = combined
#     combined.head()
    print(f"Shape={combined.shape}")


Shape=(6437, 11)


In [11]:
summary_statistics(train.drop(columns=[ID], axis=1), enhanced=True)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,var,skew,kurt
CementComponent,6437.0,296.29,105.57,102.0,212.5,295.8,374.0,540.0,11145.39,0.37,-0.55
BlastFurnaceSlag,6437.0,61.06,84.06,0.0,0.0,0.0,129.9,359.4,7066.49,1.06,-0.11
FlyAshComponent,6437.0,35.44,56.8,0.0,0.0,0.0,94.0,200.1,3226.28,1.16,-0.28
WaterComponent,6437.0,184.51,19.04,121.8,173.0,186.0,192.0,247.0,362.59,-0.17,0.67
SuperplasticizerComponent,6437.0,4.44,5.79,0.0,0.0,0.0,8.7,32.2,33.51,1.3,1.92
CoarseAggregateComponent,6437.0,988.95,77.56,801.0,938.0,975.6,1047.0,1145.0,6014.85,-0.08,-0.56
FineAggregateComponent,6437.0,771.6,78.96,594.0,734.3,781.0,821.0,992.6,6234.31,-0.41,-0.02
AgeInDays,6437.0,50.78,68.99,1.0,7.0,28.0,56.0,365.0,4759.62,2.82,8.76
Strength,6437.0,35.51,16.45,2.33,23.69,33.96,45.85,82.6,270.61,0.39,-0.35
is_original,6437.0,0.16,0.37,0.0,0.0,0.0,0.0,1.0,0.13,1.86,1.44


## Outlier Detection

In [12]:
# https://www.kaggle.com/code/lyasdemir/best-algorithm-for-prediction-xgboost
    
def iqr(data:pd.DataFrame, var:str):# outliers detecion .
    q1 = np.quantile(data[var], 0.25)
    q3 = np.quantile(data[var], 0.75)
    diff = q3 - q1
    lower_t = q1 - (1.5 * diff)
    upper_t = q3 + (1.5 * diff)
    return data[(data[var] < lower_t) | (data[var] > upper_t)]

# iqr(train, "squareMeters")

In [13]:
# # https://www.kaggle.com/code/sujithmandala/playground-s3-e8-ensemble-model-98-accuracy

# def detect_outliers(data:pd.DataFrame) -> pd.DataFrame:
#     outlier_percents = {}
#     for column in data.columns:
#         if data[column].dtype != object:
#             q1 = np.quantile(data[column], 0.25)
#             q3 = np.quantile(data[column], 0.75)
#             iqr = q3 - q1
#             upper_bound = q3 + (1.5 * iqr)
#             lower_bound = q1 - (1.5 * iqr)
#             outliers = data[(data[column] > upper_bound) | (data[column] < lower_bound)][column]
#             outlier_percentage = len(outliers) / len(data[column]) * 100
#             outlier_percents[column] = outlier_percentage
#             outlier_dataframe = pd.DataFrame(data = outlier_percents.values() ,index=outlier_percents.keys() ,columns=['Outlier_percentage'])
    
#     return outlier_dataframe.sort_values(by = 'Outlier_percentage', ascending = False)

# detect_outliers(train)


In [14]:
# https://www.kaggle.com/code/sujithmandala/playground-s3-e8-ensemble-model-98-accuracy
    
def detect_outliers(data:pd.DataFrame) -> pd.DataFrame:
    outlier_percents = {}
    for column in data.columns:
        if data[column].dtype != object:
            q1 = np.quantile(data[column], 0.25)
            q3 = np.quantile(data[column], 0.75)
            iqr = q3 - q1
            upper_bound = q3 + (1.5 * iqr)
            lower_bound = q1 - (1.5 * iqr)
            outliers = data[(data[column] > upper_bound) | (data[column] < lower_bound)][column]
            outlier_percentage = len(outliers) / len(data[column]) * 100
            outlier_percents[column] = outlier_percentage
            outlier_dataframe = pd.DataFrame(data = outlier_percents.values() ,index=outlier_percents.keys() ,columns=['Outlier_percentage'])
    
    return outlier_dataframe.sort_values(by = 'Outlier_percentage', ascending = False)

detect_outliers(test)

Unnamed: 0,Outlier_percentage
FineAggregateComponent,8.54369
WaterComponent,8.2663
AgeInDays,7.93343
SuperplasticizerComponent,1.47018
BlastFurnaceSlag,0.41609
id,0.0
CementComponent,0.0
FlyAshComponent,0.0
CoarseAggregateComponent,0.0
is_original,0.0


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Feature Engineering</h1>
</div>

## Categorical/Numerical Variables

## Handle Outliers
- https://www.kaggle.com/code/lyasdemir/best-algorithm-for-prediction-xgboost
- https://www.kaggle.com/code/mnokno/paris-housing-price-prediction-using-xgboost

In [15]:
# features_with_outliers = []

In [16]:
# https://www.kaggle.com/code/mnokno/paris-housing-price-prediction-using-xgboost

def remove_outliers(df:pd.DataFrame) -> pd.DataFrame:
    for c in features_with_outliers:
        if c == 'garage':
            first_percentile = df[c].quantile(0.001)
            df = df[df[c] > first_percentile]

        ninety_ninth_percentile = df[c].quantile(0.999)
        df = df[df[c] < ninety_ninth_percentile]
        #df_t = df_t[(df_t[c] > first_percentile) & (df_t[c] < ninety_ninth_percentile)]
    return df


In [17]:
# print(f'Before: {len(train)}')
# train = remove_outliers(train)
# print(f'After: {len(train)}')

In [18]:
train.head(10)

Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength,is_original
0,0.0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38,0
1,1.0,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52,0
2,2.0,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96,0
3,3.0,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05,0
4,4.0,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19,0
5,5.0,350.0,0.0,0.0,203.0,0.0,1055.0,775.0,7,37.43,0
6,6.0,135.7,203.5,0.0,185.7,0.0,1076.2,759.3,28,35.1,0
7,7.0,332.5,142.5,0.0,228.0,0.0,932.0,594.0,28,45.94,0
8,8.0,322.0,0.0,0.0,203.0,0.0,974.0,800.0,180,42.14,0
9,9.0,133.0,200.0,0.0,192.0,0.0,927.4,839.2,3,6.94,0


In [19]:
train = train.reset_index(drop=True).copy()
train.head(10)

Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength,is_original
0,0.0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38,0
1,1.0,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52,0
2,2.0,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96,0
3,3.0,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05,0
4,4.0,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19,0
5,5.0,350.0,0.0,0.0,203.0,0.0,1055.0,775.0,7,37.43,0
6,6.0,135.7,203.5,0.0,185.7,0.0,1076.2,759.3,28,35.1,0
7,7.0,332.5,142.5,0.0,228.0,0.0,932.0,594.0,28,45.94,0
8,8.0,322.0,0.0,0.0,203.0,0.0,974.0,800.0,180,42.14,0
9,9.0,133.0,200.0,0.0,192.0,0.0,927.4,839.2,3,6.94,0


In [20]:
excluded_features = [TARGET, ID, "fold"]

In [21]:
cont_features, cat_features = feature_distribution_types(train, display=True)
show_cardinality(train, cat_features)

cont_features = [feature for feature in cont_features if feature not in excluded_features]
cat_features = [feature for feature in cat_features if feature not in excluded_features]

FEATURES = cont_features + cat_features
FEATURES

[1m[91mContinuous Features=[0m['id', 'CementComponent', 'BlastFurnaceSlag', 'FlyAshComponent', 'WaterComponent', 'SuperplasticizerComponent', 'CoarseAggregateComponent', 'FineAggregateComponent', 'AgeInDays', 'Strength', 'is_original']

[1m[91mCategorical Features=[0m[]
[1m[91m=== Cardinality ===[0m
Series([], dtype: float64)


['CementComponent',
 'BlastFurnaceSlag',
 'FlyAshComponent',
 'WaterComponent',
 'SuperplasticizerComponent',
 'CoarseAggregateComponent',
 'FineAggregateComponent',
 'AgeInDays',
 'is_original']

In [22]:
train, test = label_encoder(train, test, cat_features)
# train = pd.get_dummies(train,columns=[]) # Will remove original feature names
# test = pd.get_dummies(test,columns=[])

In [23]:
train.head()

Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength,is_original
0,0.0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38,0
1,1.0,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52,0
2,2.0,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96,0
3,3.0,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05,0
4,4.0,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19,0


In [24]:
# cont_features, cat_features = feature_distribution_types(train, display=True)
# show_cardinality(train, cat_features)

# cont_features = [feature for feature in cont_features if feature not in excluded_features]
# cat_features = [feature for feature in cat_features if feature not in excluded_features]

# FEATURES = cont_features + cat_features
# FEATURES

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Optuna Hyperparameter Optimization</h1>
</div>

In [25]:
%%time

if Config.optimize:
    y = train[TARGET]
    X = train[FEATURES].copy()

    X_test = test[FEATURES].copy()
    X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
        X, y, test_size=0.2, random_state=Config.seed
    )

# === XGB ===

time_limit = 3600 * 3
best_xgb_params = {}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction)
    study.optimize(
        lambda trial: objective_xgb(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best XGB trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_xgb_params = study.best_trial.params

## === LGBM ===

time_limit = 3600 * 3
best_lgbm_params = {}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction)
    study.optimize(
        lambda trial: objective_lgbm(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best LGBM trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_lgbm_params = study.best_trial.params

## === CatBoost

time_limit = 3600 * 3
# best_cb_params = {}
best_cb_params = {'learning_rate': 0.45743264601999495,
                  'l2_leaf_reg': 41.338946049390074,
                  'bagging_temperature': 0.3472567739474319,
                  'random_strength': 1.7332249677756242, 
                  'depth': 1,
                  'min_data_in_leaf': 6}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction)
    study.optimize(
        lambda trial: objective_cb(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best Cat trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_cb_params = study.best_trial.params

CPU times: user 6 µs, sys: 1e+03 ns, total: 7 µs
Wall time: 9.06 µs


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Train Models with Cross Validation</h1>
</div>

In [26]:
def show_tree_model_fi(model, features:List[str]) -> None:
    print("\n=== Model Feature Importance ===")
    for i in model.feature_importances_.argsort()[::-1]:
        print(features[i], model.feature_importances_[i]/model.feature_importances_.sum())

def save_oof_predictions(model_name:str, final_valid_predictions, oof:pd.DataFrame) -> pd.DataFrame:
    final_valid_predictions_df = process_valid_predictions(
        final_valid_predictions, ID, model_name
    )
    display(final_valid_predictions_df.head())
    oof[f"pred_{model_name}"] = final_valid_predictions_df[f"pred_{model_name}"]

    return oof

def save_test_predictions(model_name:str, final_test_predictions, submission_df:pd.DataFrame, result_field:str=TARGET) -> None:
    result = merge_test_predictions(final_test_predictions, Config.calc_probability)
    # result[:20]
    submission_df[f"target_{model_name}"] = result #.astype(int)
    #     submission_df.head(10)
    ss = submission_df[[ID, f"target_{model_name}"]].copy().reset_index(drop=True)
    ss.rename(columns={f"target_{model_name}": result_field}, inplace=True)
    ss.to_csv(
        f"submission_{model_name}.csv", index=False
    )  # Can submit the individual model
    print("=== Target Value Counts ===")
#     display(ss[TARGET].value_counts())
    ss.head(10)

def process_valid_predictions(final_valid_predictions, train_id, model_name:str) -> pd.DataFrame:
    model = f"pred_{model_name}"
    final_valid_predictions_df = pd.DataFrame.from_dict(
        final_valid_predictions, orient="index"
    ).reset_index()
    final_valid_predictions_df.columns = [train_id, model]
    final_valid_predictions_df.set_index(train_id, inplace=True)
    final_valid_predictions_df.sort_index(inplace=True)
    final_valid_predictions_df.to_csv(f"train_pred_{model_name}.csv", index=True)

    return final_valid_predictions_df

def add_score(score_df:pd.DataFrame, model_name:str, score:float, std:float):
    dict1 = {"Model": model_name, "Score": cv_score, "StdDev": std_dev}
    score_df = score_df.append(dict1, ignore_index=True)
    return score_df

In [27]:
def train_cv_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid,
    params,
    n_folds:int=5,
    seed:int=42,
):

    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(
            drop=True
        )  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)
        xtest = test.copy()

        valid_ids = xvalid.id.values.tolist()  # Id's of everything in validation fold

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[FEATURES]
        xvalid = xvalid[FEATURES]

        scaler = preprocessing.StandardScaler()
#         scaler = preprocessing.MinMaxScaler()
        xtrain = scaler.fit(xtrain).transform(xtrain)
        xvalid = scaler.transform(xvalid)
        xtest = scaler.transform(xtest)

        model = get_model_fn # ()

        model.fit(
            xtrain,
            ytrain,
        )
        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        preds_valid_class = model.predict(xvalid)
        
        final_test_predictions.append(test_preds)
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

#         fold_score = metrics.accuracy_score(yvalid, preds_valid_class)  # Validation Set Score
        fold_score = metrics.mean_absolute_error(
            yvalid, preds_valid
        ) 
#         fold_score = metrics.roc_auc_score(yvalid.values, preds_valid)  # Validation Set Score
#         show_classification_scores(yvalid.values, preds_valid_class)

#         fold_score = metrics.roc_auc_score(yvalid, preds_valid)  # Validation Set Score
#         fold_score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        fold_scores.append(fold_score)
        #         importance_list.append(model.coef_.ravel())

        fi = []
        # Feature importance
#         fi = pd.DataFrame(
#             index=FEATURES,
#             data=model.coef_.ravel(),
#             columns=[f"{fold}_importance"],
#         )
        
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )


def train_xgb_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid:str,
    params,
    n_folds:int=5,
    seed:int=42,
):

    print(params)
    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(
            drop=True
        )  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)
        xtest = test.copy()

        valid_ids = xvalid.id.values.tolist()  # Id's of everything in validation fold

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[FEATURES]
        xvalid = xvalid[FEATURES]

        model = get_model_fn # (params)

        model.fit(
            xtrain,
            ytrain,
            eval_set=[(xvalid, yvalid)],
            #             eval_metric="acc",  # auc
            verbose=0,
            #             early_stopping_rounds=3000,
            #             callbacks=[
            #                 xgb.log_evaluation(0),
            #                 xgb.early_stopping(500, False, True),
            #             ],
        )

        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        preds_valid_class = model.predict(xvalid)
        
        final_test_predictions.append(test_preds)
        if Config.debug:
            print(f"GT Type: {type(yvalid.values)}")
            print(f"Preds Type: {type(preds_valid_class)}")
            print(f"         GT:{yvalid.values[:20]}")
            print(f"Preds Class:{preds_valid_class[:20]}")
            print(f"Preds Prob:{preds_valid[:20]}")
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid_class)))

#         fold_score = metrics.cohen_kappa_score(yvalid,  preds_valid_class, weights = "quadratic")
#         fold_score = metrics.roc_auc_score(yvalid.values, preds_valid)  # Validation Set Score
#         show_classification_scores(yvalid.values, preds_valid_class)
        fold_score = metrics.mean_absolute_error(
            yvalid, preds_valid
        )  # Validation Set Score
#         fold_score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        fold_scores.append(fold_score)

        # Feature importance
        fi = pd.DataFrame(
            index=FEATURES,
            data=model.feature_importances_,
            columns=[f"{fold}_importance"],
        )
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )        

In [28]:
def run_linear_model(train:pd.DataFrame, test:pd.DataFrame, model_dict, model_name:str, features:List[str], oof:pd.DataFrame) -> (float, float, pd.DataFrame):
    (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_cv_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        False, #Config.calc_probability,
        ID,
        {},
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    oof.head()
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof


def run_tree_model(train:pd.DataFrame, test:pd.DataFrame, model_dict, model_name:str, features:List[str], params, oof:pd.DataFrame) -> (float, float, pd.DataFrame):
    (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_xgb_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        Config.calc_probability,
        ID,
        params,
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)
    show_tree_model_fi(model, features)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    oof.head()
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof

In [29]:
%%time

def run_models4features(train:pd.DataFrame, test:pd.DataFrame, model_dict, model_lst:List[str], target:str, feature_lst:List[str], all_cv_scores:pd.DataFrame, linear_models:bool=True) -> pd.DataFrame:

    oof = train[[ID, target, "fold"]].copy().reset_index(drop=True).copy()
    oof.set_index(ID, inplace=True)

    for idx, m in enumerate(model_lst):
        model = model_lst[idx]
        start_time = time.time()

        print(f"Model={model}")

        params = {}
        if linear_models:
                cv_score, std_dev, oof = run_linear_model(train, test, model_dict, model, feature_lst, oof)

        else:
            cv_score, std_dev, oof = run_tree_model(train, test, model_dict, model, feature_lst, params, oof)

        run_time = time.time() - start_time

        score_dict = {"Model": model, "Score": cv_score, "StdDev": std_dev, "RunTime": run_time, "n_estimators": Config.N_ESTIMATORS, "n_folds": Config.N_FOLDS, "comments": ""}
        all_cv_scores = all_cv_scores.append(score_dict, ignore_index=True)
        print(f"Model Run Time: {run_time:.2f}")

    return all_cv_scores




CPU times: user 10 µs, sys: 2 µs, total: 12 µs
Wall time: 13.6 µs


In [30]:
lgbm_params = {'n_estimators': Config.N_ESTIMATORS,
                 'num_rounds': 404,
                 'learning_rate': 0.19,
                 'num_leaves': 17,
                 'max_depth': 8,
                 'min_data_in_leaf': 36,
                 'lambda_l1': 0.96,
                 'lambda_l2': 0.01,
                 'min_gain_to_split': 11.32,
                 'bagging_fraction': 0.6,
                 'feature_fraction': 0.9}


lgbm_params3 = {
    "n_estimators": Config.N_ESTIMATORS,
    'max_depth': 9,
    'learning_rate': 0.01,
    'min_data_in_leaf': 36, 
    'num_leaves': 100, 
    'feature_fraction': 0.8, 
    'bagging_fraction': 0.89, 
    'bagging_freq': 5, 
    'lambda_l2': 28,
    
    'seed': Config.seed,
    'objective': 'regression',
#     'boosting_type': 'gbdt',
#     'device': 'gpu', 
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0,
    'n_jobs': -1,
    'metric': 'rmse',
    'verbose': -1
}
    
lgbm_params = gpu_ify_lgbm(lgbm_params)

In [31]:
xgb_params = {
    "n_estimators": Config.N_ESTIMATORS,  # 10_000,
    "max_depth": 10,  # 10
    "objective": "reg:squarederror",
    #     "enable_categorical": True,  # Only works with gpu_hist
    #     "eval_metric": "mae",
    #     "metric": "mae",
    #     "enable_categorical": True,
    "n_jobs": 8,  # 4
    "seed": Config.seed,
    "tree_method": "hist",
    #         "gpu_id": 0,
    "subsample": 0.9,  # 0.7
    "colsample_bytree": 0.7,
    "use_label_encoder": False,
    "learning_rate": 0.05,  # 0.01
}

xgb_params3 = {
    'n_estimators': Config.N_ESTIMATORS,
    'learning_rate': 0.05,
    'max_depth': 10,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:squarederror'
}

if Config.gpu:
    xgb_params["tree_method"] = "gpu_hist"
else:
    xgb_params["tree_method"] = "hist"

In [32]:
cb_params = {
    #     "learning_rate": 0.3277295792305584,
    "learning_rate": 0.05,
    "l2_leaf_reg": 3.1572972266001518,
    "bagging_temperature": 0.6799604234141348,
    "random_strength": 1.99590400593318,
    "depth": 10,
    "min_data_in_leaf": 93,
    # "iterations": 100,  # 10000
    "n_estimators": Config.N_ESTIMATORS,  # 10000
    "use_best_model": True,
    #     "task_type": "GPU",
    "random_seed": Config.seed,
}

cb_params = gpu_ify_cb(cb_params)

In [33]:
model_estimator_dict = {
    "xgb2": xgb.XGBRegressor(**xgb_params),
    "xgb_best_params": xgb.XGBRegressor(**best_xgb_params),
    "xgb3": xgb.XGBRegressor(**xgb_params3),


    "lgbm1": lgb.LGBMRegressor(**lgbm_params),

    "cat1": cb.CatBoostRegressor(),
    "cat2": cb.CatBoostRegressor(**cb_params),
    "cat_best_params": cb.CatBoostRegressor(**best_cb_params),

    "xgb1": xgb.XGBRegressor(),
    "lgbm0": lgb.LGBMRegressor(),
    "lgbm3": lgb.LGBMRegressor(lgbm_params3),
    "lgbm2": lgb.LGBMRegressor(
        learning_rate=0.05,
        max_depth=15,
        num_leaves=11,
        feature_fraction=0.3,
        subsample=0.1,
        n_jobs=-1,
    ),
    "lgbm3": lgb.LGBMRegressor(**lgbm_params),
    "lgbm_best_params": lgb.LGBMRegressor(**best_lgbm_params),


    "lin_reg": linear_model.LinearRegression(),
    "lasso": linear_model.Lasso(),
    "ridge": linear_model.Ridge(max_iter=7000),
    "ridge_25": linear_model.Ridge(fit_intercept=True, solver='auto', alpha=0.25, max_iter=7000),
    "ridge_50": linear_model.Ridge(fit_intercept=True, solver='auto', alpha=0.5, max_iter=7000),
}

In [34]:
all_cv_scores = pd.DataFrame(
    {
        "Model": pd.Series(dtype="str"),
        "Score": pd.Series(dtype="float"),
        "StdDev": pd.Series(dtype="float"),
        "RunTime": pd.Series(dtype="float"),
        "n_estimators": pd.Series(dtype="int"),
        "n_folds": pd.Series(dtype="int"),
        "comments": pd.Series(dtype="str"),
    }
)



## Tree Models

In [35]:
%%time

# model_lst = ["xgb3","xgb_best_params", "lgbm_best_params", "cat_best_params", "xgb1", "xgb2", "lgbm1", "lgbm2", "cat1", "cat2"]
# model_lst = = []

def run_tree_models(X_tr, test, n_folds, model_lst, all_cv_scores):
        all_cv_scores = run_models4features(X_tr, test, model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=False)    

        all_cv_scores.sort_values(by=["Score"], ascending=False)
        return all_cv_scores

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.39 µs


## Linear Models

In [36]:
def run_linear_models(X_tr, test, n_folds, model_lst, all_cv_scores):
    for training in [train, train]:

    #     all_cv_scores = run_models4features(train, test, model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=False)    
        all_cv_scores = run_models4features(X_tr, test, model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=True)    
        return all_cv_scores  

In [37]:
%%time


tree_model_lst = ["xgb_best_params", "lgbm_best_params", "cat_best_params","xgb3", "xgb1", "xgb2", "lgbm0", "lgbm1", "lgbm2", "lgbm3", "cat1", "cat2"]
linear_model_lst = ["lin_reg", "lasso", "ridge", "ridge_25", "ridge_50"]
linear_model_lst = ["lasso", "ridge",  "ridge_50"]

Config.N_FOLDS = 10

for training in [train, train]:
    X_tr = create_folds(train, Config.N_FOLDS)
    
    oof = train[[ID, TARGET, "fold"]].copy().reset_index(drop=True).copy()
    oof.set_index(ID, inplace=True)
    oof.head()

    all_cv_scores = run_tree_models(X_tr, test, Config.N_FOLDS, tree_model_lst, all_cv_scores)
    all_cv_scores = run_linear_models(X_tr, test, Config.N_FOLDS, linear_model_lst, all_cv_scores)
    Config.N_FOLDS = 5



n_folds=10, seed=42
Model=xgb_best_params
{}
fold: 1, Score: 9.236125352930578, Run Time: 0.58
fold: 2, Score: 8.703685671548296, Run Time: 0.55
fold: 3, Score: 8.790395913894132, Run Time: 0.55
fold: 4, Score: 8.73717150128406, Run Time: 0.53
fold: 5, Score: 8.644312141637625, Run Time: 0.54
fold: 6, Score: 8.939861057056403, Run Time: 0.54
fold: 7, Score: 9.326767193338146, Run Time: 0.55
fold: 8, Score: 8.745134027223008, Run Time: 0.53
fold: 9, Score: 8.898621553507043, Run Time: 0.53
fold: 10, Score: 8.725353428468363, Run Time: 0.53
Scores -> Adjusted: 8.65400568 , mean: 8.87474278, std: 0.22073710

=== Model Feature Importance ===
AgeInDays 0.54389334
SuperplasticizerComponent 0.10120986
CementComponent 0.06652754
WaterComponent 0.06457985
is_original 0.048552986
FineAggregateComponent 0.04808326
FlyAshComponent 0.044471428
CoarseAggregateComponent 0.042678196
BlastFurnaceSlag 0.04000359


Unnamed: 0_level_0,pred_xgb_best_params
id,Unnamed: 1_level_1
0.0,24.19326
1.0,32.04249
2.0,40.09577
3.0,42.7277
4.0,46.48205


Mode
=== Target Value Counts ===
Model Run Time: 5.58
Model=lgbm_best_params
{}
fold: 1, Score: 9.055783731666184, Run Time: 0.77
fold: 2, Score: 8.12570301676443, Run Time: 0.79
fold: 3, Score: 8.41910621068709, Run Time: 0.92
fold: 4, Score: 8.280065603579596, Run Time: 0.78
fold: 5, Score: 8.376722433105561, Run Time: 0.70
fold: 6, Score: 8.875543922011909, Run Time: 0.93
fold: 7, Score: 8.981318038612422, Run Time: 0.90
fold: 8, Score: 8.606421036424909, Run Time: 0.74
fold: 9, Score: 8.68732967779213, Run Time: 0.72
fold: 10, Score: 8.516035214633478, Run Time: 0.78
Scores -> Adjusted: 8.30032440 , mean: 8.59240289, std: 0.29207849

=== Model Feature Importance ===
FineAggregateComponent 0.14966666666666667
CementComponent 0.14733333333333334
CoarseAggregateComponent 0.14466666666666667
WaterComponent 0.14266666666666666
BlastFurnaceSlag 0.10333333333333333
AgeInDays 0.10266666666666667
SuperplasticizerComponent 0.101
FlyAshComponent 0.056
is_original 0.05266666666666667


Unnamed: 0_level_0,pred_lgbm_best_params
id,Unnamed: 1_level_1
0.0,22.92241
1.0,34.31604
2.0,43.05568
3.0,43.02717
4.0,46.00049


Mode
=== Target Value Counts ===
Model Run Time: 8.16
Model=cat_best_params
{}
fold: 1, Score: 9.233453715297115, Run Time: 0.82
fold: 2, Score: 8.525399558143182, Run Time: 0.73
fold: 3, Score: 8.79768952131788, Run Time: 0.70
fold: 4, Score: 8.562201268087538, Run Time: 0.71
fold: 5, Score: 8.513408344424317, Run Time: 0.67
fold: 6, Score: 9.359479399632571, Run Time: 0.73
fold: 7, Score: 9.373405878030198, Run Time: 0.69
fold: 8, Score: 8.716261711224668, Run Time: 0.72
fold: 9, Score: 9.034441047007501, Run Time: 0.70
fold: 10, Score: 9.054186637776569, Run Time: 0.69
Scores -> Adjusted: 8.59551176 , mean: 8.91699271, std: 0.32148095

=== Model Feature Importance ===
AgeInDays 0.7139548017733675
CementComponent 0.11409509079489567
SuperplasticizerComponent 0.0640124568625145
WaterComponent 0.05121433729633509
BlastFurnaceSlag 0.024374411874988677
FineAggregateComponent 0.01634693956367337
CoarseAggregateComponent 0.010879005080865535
FlyAshComponent 0.004925938713044737
is_original

Unnamed: 0_level_0,pred_cat_best_params
id,Unnamed: 1_level_1
0.0,22.57442
1.0,32.96456
2.0,39.29092
3.0,46.93117
4.0,43.40729


Mode
=== Target Value Counts ===
Model Run Time: 7.30
Model=xgb3
{}
fold: 1, Score: 9.115072100858512, Run Time: 1.20
fold: 2, Score: 8.281834796319096, Run Time: 1.19
fold: 3, Score: 8.623970200319468, Run Time: 1.18
fold: 4, Score: 8.570510627320093, Run Time: 1.18
fold: 5, Score: 8.29798080177781, Run Time: 1.19
fold: 6, Score: 8.957940034807098, Run Time: 1.21
fold: 7, Score: 9.060638760157994, Run Time: 1.18
fold: 8, Score: 8.466117990858633, Run Time: 1.18
fold: 9, Score: 8.916727939030245, Run Time: 1.18
fold: 10, Score: 8.514586782025328, Run Time: 1.18
Scores -> Adjusted: 8.38725146 , mean: 8.68053800, std: 0.29328655

=== Model Feature Importance ===
AgeInDays 0.5902954
SuperplasticizerComponent 0.09135971
WaterComponent 0.05379744
CementComponent 0.049766593
FineAggregateComponent 0.04716308
FlyAshComponent 0.044641245
CoarseAggregateComponent 0.043902498
is_original 0.039931543
BlastFurnaceSlag 0.039142568


Unnamed: 0_level_0,pred_xgb3
id,Unnamed: 1_level_1
0.0,25.28448
1.0,34.14724
2.0,39.66658
3.0,42.89977
4.0,46.05143


Mode
=== Target Value Counts ===
Model Run Time: 12.00
Model=xgb1
{}
fold: 1, Score: 9.236125352930578, Run Time: 0.74
fold: 2, Score: 8.703685671548296, Run Time: 0.74
fold: 3, Score: 8.790395913894132, Run Time: 0.74
fold: 4, Score: 8.73717150128406, Run Time: 0.75
fold: 5, Score: 8.644312141637625, Run Time: 0.76
fold: 6, Score: 8.939861057056403, Run Time: 0.76
fold: 7, Score: 9.326767193338146, Run Time: 0.76
fold: 8, Score: 8.745134027223008, Run Time: 0.77
fold: 9, Score: 8.898621553507043, Run Time: 0.77
fold: 10, Score: 8.725353428468363, Run Time: 0.76
Scores -> Adjusted: 8.65400568 , mean: 8.87474278, std: 0.22073710

=== Model Feature Importance ===
AgeInDays 0.54389334
SuperplasticizerComponent 0.10120986
CementComponent 0.06652754
WaterComponent 0.06457985
is_original 0.048552986
FineAggregateComponent 0.04808326
FlyAshComponent 0.044471428
CoarseAggregateComponent 0.042678196
BlastFurnaceSlag 0.04000359


Unnamed: 0_level_0,pred_xgb1
id,Unnamed: 1_level_1
0.0,24.19326
1.0,32.04249
2.0,40.09577
3.0,42.7277
4.0,46.48205


Mode
=== Target Value Counts ===
Model Run Time: 7.68
Model=xgb2
{}
fold: 1, Score: 9.082725963888702, Run Time: 1.21
fold: 2, Score: 8.320198714925636, Run Time: 1.07
fold: 3, Score: 8.621016952265864, Run Time: 1.04
fold: 4, Score: 8.513195698305687, Run Time: 1.06
fold: 5, Score: 8.278828196821744, Run Time: 1.04
fold: 6, Score: 9.022919389594415, Run Time: 1.06
fold: 7, Score: 9.120848483743133, Run Time: 1.05
fold: 8, Score: 8.474450437121591, Run Time: 1.05
fold: 9, Score: 8.828915088113693, Run Time: 1.06
fold: 10, Score: 8.472513008325318, Run Time: 1.08
Scores -> Adjusted: 8.37307986 , mean: 8.67356119, std: 0.30048134

=== Model Feature Importance ===
AgeInDays 0.6973176
SuperplasticizerComponent 0.065859735
WaterComponent 0.041117102
CementComponent 0.03641771
is_original 0.035176545
CoarseAggregateComponent 0.03202776
BlastFurnaceSlag 0.031194871
FineAggregateComponent 0.030968588
FlyAshComponent 0.029920096


Unnamed: 0_level_0,pred_xgb2
id,Unnamed: 1_level_1
0.0,24.12436
1.0,33.35629
2.0,38.3877
3.0,42.50555
4.0,45.39285


Mode
=== Target Value Counts ===
Model Run Time: 10.86
Model=lgbm0
{}
fold: 1, Score: 9.055783731666184, Run Time: 0.77
fold: 2, Score: 8.12570301676443, Run Time: 0.80
fold: 3, Score: 8.41910621068709, Run Time: 0.76
fold: 4, Score: 8.280065603579596, Run Time: 0.82
fold: 5, Score: 8.376722433105561, Run Time: 0.83
fold: 6, Score: 8.875543922011909, Run Time: 0.95
fold: 7, Score: 8.981318038612422, Run Time: 0.86
fold: 8, Score: 8.606421036424909, Run Time: 0.76
fold: 9, Score: 8.68732967779213, Run Time: 0.76
fold: 10, Score: 8.516035214633478, Run Time: 0.75
Scores -> Adjusted: 8.30032440 , mean: 8.59240289, std: 0.29207849

=== Model Feature Importance ===
FineAggregateComponent 0.14966666666666667
CementComponent 0.14733333333333334
CoarseAggregateComponent 0.14466666666666667
WaterComponent 0.14266666666666666
BlastFurnaceSlag 0.10333333333333333
AgeInDays 0.10266666666666667
SuperplasticizerComponent 0.101
FlyAshComponent 0.056
is_original 0.05266666666666667


Unnamed: 0_level_0,pred_lgbm0
id,Unnamed: 1_level_1
0.0,22.92241
1.0,34.31604
2.0,43.05568
3.0,43.02717
4.0,46.00049


Mode
=== Target Value Counts ===
Model Run Time: 8.17
Model=lgbm1
{}
fold: 1, Score: 9.149096182972482, Run Time: 0.72
fold: 2, Score: 8.386915375190465, Run Time: 0.80
fold: 3, Score: 8.7326271354246, Run Time: 0.86
fold: 4, Score: 8.523533015867649, Run Time: 0.68
fold: 5, Score: 8.581593311077551, Run Time: 0.78
fold: 6, Score: 9.060754399783715, Run Time: 0.73
fold: 7, Score: 9.187189331925973, Run Time: 0.85
fold: 8, Score: 8.808269834090497, Run Time: 0.83
fold: 9, Score: 8.904172034143704, Run Time: 0.87
fold: 10, Score: 8.832885886383586, Run Time: 0.74
Scores -> Adjusted: 8.56208597 , mean: 8.81670365, std: 0.25461768

=== Model Feature Importance ===
CoarseAggregateComponent 0.19626615605552897
CementComponent 0.16419339396840593
WaterComponent 0.14648157012924845
FineAggregateComponent 0.14073719483006222
SuperplasticizerComponent 0.11680229775011967
BlastFurnaceSlag 0.09478219243657253
AgeInDays 0.0852082336045955
FlyAshComponent 0.04786979415988511
is_original 0.0076591670

Unnamed: 0_level_0,pred_lgbm1
id,Unnamed: 1_level_1
0.0,23.24892
1.0,32.43066
2.0,39.92815
3.0,43.543
4.0,45.65339


Mode
=== Target Value Counts ===
Model Run Time: 7.98
Model=lgbm2
{}
fold: 1, Score: 9.16817995985692, Run Time: 0.36
fold: 2, Score: 8.40666759997171, Run Time: 0.35
fold: 3, Score: 8.701669366084758, Run Time: 0.35
fold: 4, Score: 8.580113637331726, Run Time: 0.37
fold: 5, Score: 8.586103436072968, Run Time: 0.36
fold: 6, Score: 9.350413237916252, Run Time: 0.36
fold: 7, Score: 9.270965198898608, Run Time: 0.34
fold: 8, Score: 8.749308044524904, Run Time: 0.35
fold: 9, Score: 8.958358604702708, Run Time: 0.36
fold: 10, Score: 9.00700636896053, Run Time: 0.38
Scores -> Adjusted: 8.57314119 , mean: 8.87787855, std: 0.30473736

=== Model Feature Importance ===
AgeInDays 0.176
WaterComponent 0.15
CementComponent 0.149
CoarseAggregateComponent 0.112
FineAggregateComponent 0.102
SuperplasticizerComponent 0.102
FlyAshComponent 0.089
BlastFurnaceSlag 0.076
is_original 0.044


Unnamed: 0_level_0,pred_lgbm2
id,Unnamed: 1_level_1
0.0,22.64701
1.0,33.45124
2.0,35.71802
3.0,44.7279
4.0,42.36713


Mode
=== Target Value Counts ===
Model Run Time: 3.69
Model=lgbm3
{}
fold: 1, Score: 9.149096182972482, Run Time: 0.88
fold: 2, Score: 8.386915375190465, Run Time: 1.19
fold: 3, Score: 8.7326271354246, Run Time: 0.83
fold: 4, Score: 8.523533015867649, Run Time: 0.76
fold: 5, Score: 8.581593311077551, Run Time: 0.86
fold: 6, Score: 9.060754399783715, Run Time: 0.77
fold: 7, Score: 9.187189331925973, Run Time: 0.85
fold: 8, Score: 8.808269834090497, Run Time: 0.82
fold: 9, Score: 8.904172034143704, Run Time: 0.88
fold: 10, Score: 8.832885886383586, Run Time: 0.80
Scores -> Adjusted: 8.56208597 , mean: 8.81670365, std: 0.25461768

=== Model Feature Importance ===
CoarseAggregateComponent 0.19626615605552897
CementComponent 0.16419339396840593
WaterComponent 0.14648157012924845
FineAggregateComponent 0.14073719483006222
SuperplasticizerComponent 0.11680229775011967
BlastFurnaceSlag 0.09478219243657253
AgeInDays 0.0852082336045955
FlyAshComponent 0.04786979415988511
is_original 0.0076591670

Unnamed: 0_level_0,pred_lgbm3
id,Unnamed: 1_level_1
0.0,23.24892
1.0,32.43066
2.0,39.92815
3.0,43.543
4.0,45.65339


Mode
=== Target Value Counts ===
Model Run Time: 8.79
Model=cat1
{}
fold: 1, Score: 8.842385316986515, Run Time: 1.83
fold: 2, Score: 7.9850569512126235, Run Time: 1.76
fold: 3, Score: 8.455137852119595, Run Time: 1.77
fold: 4, Score: 8.22780940403716, Run Time: 1.78
fold: 5, Score: 8.210470434922678, Run Time: 1.69
fold: 6, Score: 8.749027982444677, Run Time: 1.67
fold: 7, Score: 8.839097959786251, Run Time: 1.73
fold: 8, Score: 8.379434844390742, Run Time: 1.84
fold: 9, Score: 8.639260675573494, Run Time: 1.78
fold: 10, Score: 8.521292199445387, Run Time: 1.80
Scores -> Adjusted: 8.21077584 , mean: 8.48489736, std: 0.27412152

=== Model Feature Importance ===
AgeInDays 0.4659912684569474
CementComponent 0.1450925884244876
WaterComponent 0.08228424883304461
SuperplasticizerComponent 0.06569281289389269
FineAggregateComponent 0.0556882130509553
is_original 0.055160898704424415
BlastFurnaceSlag 0.05356532705306306
CoarseAggregateComponent 0.047055417013662205
FlyAshComponent 0.029469225

Unnamed: 0_level_0,pred_cat1
id,Unnamed: 1_level_1
0.0,20.50967
1.0,34.97334
2.0,38.34672
3.0,44.36916
4.0,44.46599


Mode
=== Target Value Counts ===
Model Run Time: 17.82
Model=cat2
{}
fold: 1, Score: 8.882004718483344, Run Time: 0.88
fold: 2, Score: 7.977759088419156, Run Time: 0.90
fold: 3, Score: 8.416674977791695, Run Time: 0.89
fold: 4, Score: 8.272077847777137, Run Time: 0.89
fold: 5, Score: 8.278552727957184, Run Time: 0.88
fold: 6, Score: 9.081202663446089, Run Time: 0.90
fold: 7, Score: 8.967409469424815, Run Time: 1.05
fold: 8, Score: 8.536536695994418, Run Time: 0.90
fold: 9, Score: 8.815959292973847, Run Time: 0.87
fold: 10, Score: 8.578699546054292, Run Time: 0.85
Scores -> Adjusted: 8.24526028 , mean: 8.58068770, std: 0.33542742

=== Model Feature Importance ===
AgeInDays 0.5032084746157022
CementComponent 0.12009707901092799
WaterComponent 0.07202595607906485
is_original 0.06236545944307235
SuperplasticizerComponent 0.0623574286520829
BlastFurnaceSlag 0.049131756153854254
FineAggregateComponent 0.04804344794046502
CoarseAggregateComponent 0.047013309389005185
FlyAshComponent 0.0357570

Unnamed: 0_level_0,pred_cat2
id,Unnamed: 1_level_1
0.0,21.2265
1.0,35.53554
2.0,38.21673
3.0,43.40681
4.0,43.63925


Mode
=== Target Value Counts ===
Model Run Time: 9.13
Model=lasso
fold: 1, Score: 11.204515037347154, Run Time: 0.03
fold: 2, Score: 11.217786983001021, Run Time: 0.04
fold: 3, Score: 11.100359495215368, Run Time: 0.05
fold: 4, Score: 11.10732069883224, Run Time: 0.05
fold: 5, Score: 11.69170119923506, Run Time: 0.05
fold: 6, Score: 11.947710850062732, Run Time: 0.05
fold: 7, Score: 11.782637387492358, Run Time: 0.05
fold: 8, Score: 11.531262211198314, Run Time: 0.05
fold: 9, Score: 11.508755075128008, Run Time: 0.05
fold: 10, Score: 11.162851684415475, Run Time: 0.05
Scores -> Adjusted: 11.13271899 , mean: 11.42549006, std: 0.29277107


Unnamed: 0_level_0,pred_lasso
id,Unnamed: 1_level_1
0.0,33.93727
1.0,33.63498
2.0,32.0581
3.0,54.75081
4.0,34.67705


Mode
=== Target Value Counts ===
Model Run Time: 0.67
Model=ridge
fold: 1, Score: 10.997892105033113, Run Time: 0.06
fold: 2, Score: 10.869717132593685, Run Time: 0.04
fold: 3, Score: 10.827627026803574, Run Time: 0.05
fold: 4, Score: 10.940017480888818, Run Time: 0.05
fold: 5, Score: 11.45307944778129, Run Time: 0.05
fold: 6, Score: 11.582842537369332, Run Time: 0.04
fold: 7, Score: 11.465743733770681, Run Time: 0.05
fold: 8, Score: 11.182484659869058, Run Time: 0.05
fold: 9, Score: 11.257265503201364, Run Time: 0.05
fold: 10, Score: 10.85309697029356, Run Time: 0.05
Scores -> Adjusted: 10.87300713 , mean: 11.14297666, std: 0.26996953


Unnamed: 0_level_0,pred_ridge
id,Unnamed: 1_level_1
0.0,35.24243
1.0,34.90703
2.0,32.1831
3.0,58.32234
4.0,36.45027


Mode
=== Target Value Counts ===
Model Run Time: 0.69
Model=ridge_50
fold: 1, Score: 10.997908414405188, Run Time: 0.04
fold: 2, Score: 10.869648312563903, Run Time: 0.05
fold: 3, Score: 10.82760886302, Run Time: 0.05
fold: 4, Score: 10.939997935200502, Run Time: 0.05
fold: 5, Score: 11.453055147701754, Run Time: 0.04
fold: 6, Score: 11.58279325583396, Run Time: 0.05
fold: 7, Score: 11.465683747008704, Run Time: 0.05
fold: 8, Score: 11.182428493438755, Run Time: 0.05
fold: 9, Score: 11.257222975245227, Run Time: 0.05
fold: 10, Score: 10.853010573137023, Run Time: 0.05
Scores -> Adjusted: 10.87296790 , mean: 11.14293577, std: 0.26996788


Unnamed: 0_level_0,pred_ridge_50
id,Unnamed: 1_level_1
0.0,35.24257
1.0,34.90685
2.0,32.18391
3.0,58.32428
4.0,36.45071


Mode
=== Target Value Counts ===
Model Run Time: 0.68
n_folds=5, seed=42
Model=xgb_best_params
{}
fold: 1, Score: 8.992121124506367, Run Time: 0.71
fold: 2, Score: 8.733414161464442, Run Time: 0.71
fold: 3, Score: 8.956214543374454, Run Time: 0.71
fold: 4, Score: 9.118459456383764, Run Time: 0.72
fold: 5, Score: 9.041821897368887, Run Time: 0.72
Scores -> Adjusted: 8.83891636 , mean: 8.96840624, std: 0.12948987

=== Model Feature Importance ===
AgeInDays 0.5348705
SuperplasticizerComponent 0.09560716
WaterComponent 0.0679491
CementComponent 0.06512399
FineAggregateComponent 0.051458344
is_original 0.04879414
FlyAshComponent 0.045836072
BlastFurnaceSlag 0.045422416
CoarseAggregateComponent 0.04493825


Unnamed: 0_level_0,pred_xgb_best_params
id,Unnamed: 1_level_1
0.0,22.93364
1.0,32.67008
2.0,41.96727
3.0,40.7112
4.0,46.85235


Mode
=== Target Value Counts ===
Model Run Time: 3.68
Model=lgbm_best_params
{}
fold: 1, Score: 8.6469613848716, Run Time: 0.75
fold: 2, Score: 8.458575302905318, Run Time: 0.75
fold: 3, Score: 8.62103563334192, Run Time: 0.75
fold: 4, Score: 8.81514767626793, Run Time: 0.77
fold: 5, Score: 8.665335619615586, Run Time: 0.81
Scores -> Adjusted: 8.52771708 , mean: 8.64141112, std: 0.11369404

=== Model Feature Importance ===
FineAggregateComponent 0.15566666666666668
CementComponent 0.151
WaterComponent 0.14833333333333334
CoarseAggregateComponent 0.133
AgeInDays 0.103
BlastFurnaceSlag 0.10166666666666667
SuperplasticizerComponent 0.09533333333333334
FlyAshComponent 0.06633333333333333
is_original 0.04566666666666667


Unnamed: 0_level_0,pred_lgbm_best_params
id,Unnamed: 1_level_1
0.0,23.26141
1.0,33.89003
2.0,40.82895
3.0,42.66877
4.0,47.15585


Mode
=== Target Value Counts ===
Model Run Time: 3.95
Model=cat_best_params
{}
fold: 1, Score: 8.829294552514115, Run Time: 0.68
fold: 2, Score: 8.68781027375643, Run Time: 0.67
fold: 3, Score: 8.942823216886707, Run Time: 0.66
fold: 4, Score: 9.069369968251314, Run Time: 0.64
fold: 5, Score: 9.098976233421462, Run Time: 0.65
Scores -> Adjusted: 8.77278599 , mean: 8.92565485, std: 0.15286886

=== Model Feature Importance ===
AgeInDays 0.7142855442080787
CementComponent 0.1181208099035846
SuperplasticizerComponent 0.060677933628941645
WaterComponent 0.047660093992663015
BlastFurnaceSlag 0.02419024427993616
FineAggregateComponent 0.017551496288429013
CoarseAggregateComponent 0.01263324284488292
FlyAshComponent 0.004749914811015916
is_original 0.00013072004246801632


Unnamed: 0_level_0,pred_cat_best_params
id,Unnamed: 1_level_1
0.0,22.88352
1.0,31.81962
2.0,38.53497
3.0,46.96537
4.0,43.00581


Mode
=== Target Value Counts ===
Model Run Time: 3.42
Model=xgb3
{}
fold: 1, Score: 8.837577945667764, Run Time: 1.10
fold: 2, Score: 8.636069847234287, Run Time: 1.09
fold: 3, Score: 8.73279698480815, Run Time: 1.10
fold: 4, Score: 8.846140630258176, Run Time: 1.09
fold: 5, Score: 8.854638926321533, Run Time: 1.09
Scores -> Adjusted: 8.69636332 , mean: 8.78144487, std: 0.08508155

=== Model Feature Importance ===
AgeInDays 0.61559093
SuperplasticizerComponent 0.07770578
WaterComponent 0.055866014
CementComponent 0.052290294
FineAggregateComponent 0.046200264
CoarseAggregateComponent 0.042677015
FlyAshComponent 0.041050218
is_original 0.035590023
BlastFurnaceSlag 0.03302941


Unnamed: 0_level_0,pred_xgb3
id,Unnamed: 1_level_1
0.0,25.13547
1.0,34.39598
2.0,40.5027
3.0,41.39053
4.0,45.74621


Mode
=== Target Value Counts ===
Model Run Time: 5.61
Model=xgb1
{}
fold: 1, Score: 8.992121124506367, Run Time: 0.71
fold: 2, Score: 8.733414161464442, Run Time: 0.70
fold: 3, Score: 8.956214543374454, Run Time: 0.73
fold: 4, Score: 9.118459456383764, Run Time: 0.71
fold: 5, Score: 9.041821897368887, Run Time: 0.71
Scores -> Adjusted: 8.83891636 , mean: 8.96840624, std: 0.12948987

=== Model Feature Importance ===
AgeInDays 0.5348705
SuperplasticizerComponent 0.09560716
WaterComponent 0.0679491
CementComponent 0.06512399
FineAggregateComponent 0.051458344
is_original 0.04879414
FlyAshComponent 0.045836072
BlastFurnaceSlag 0.045422416
CoarseAggregateComponent 0.04493825


Unnamed: 0_level_0,pred_xgb1
id,Unnamed: 1_level_1
0.0,22.93364
1.0,32.67008
2.0,41.96727
3.0,40.7112
4.0,46.85235


Mode
=== Target Value Counts ===
Model Run Time: 3.69
Model=xgb2
{}
fold: 1, Score: 8.790288359511713, Run Time: 1.01
fold: 2, Score: 8.631925358031847, Run Time: 1.02
fold: 3, Score: 8.720934793521197, Run Time: 1.02
fold: 4, Score: 8.813794754475888, Run Time: 1.03
fold: 5, Score: 8.801228067506257, Run Time: 1.12
Scores -> Adjusted: 8.68367797 , mean: 8.75163427, std: 0.06795630

=== Model Feature Importance ===
AgeInDays 0.6741591
SuperplasticizerComponent 0.06544539
WaterComponent 0.044892494
CementComponent 0.039817963
CoarseAggregateComponent 0.038065217
is_original 0.036948487
FineAggregateComponent 0.034848586
FlyAshComponent 0.033209722
BlastFurnaceSlag 0.032613065


Unnamed: 0_level_0,pred_xgb2
id,Unnamed: 1_level_1
0.0,24.95548
1.0,32.44896
2.0,40.59565
3.0,42.27273
4.0,45.51218


Mode
=== Target Value Counts ===
Model Run Time: 5.33
Model=lgbm0
{}
fold: 1, Score: 8.6469613848716, Run Time: 0.88
fold: 2, Score: 8.458575302905318, Run Time: 0.72
fold: 3, Score: 8.62103563334192, Run Time: 0.78
fold: 4, Score: 8.81514767626793, Run Time: 0.77
fold: 5, Score: 8.665335619615586, Run Time: 0.82
Scores -> Adjusted: 8.52771708 , mean: 8.64141112, std: 0.11369404

=== Model Feature Importance ===
FineAggregateComponent 0.15566666666666668
CementComponent 0.151
WaterComponent 0.14833333333333334
CoarseAggregateComponent 0.133
AgeInDays 0.103
BlastFurnaceSlag 0.10166666666666667
SuperplasticizerComponent 0.09533333333333334
FlyAshComponent 0.06633333333333333
is_original 0.04566666666666667


Unnamed: 0_level_0,pred_lgbm0
id,Unnamed: 1_level_1
0.0,23.26141
1.0,33.89003
2.0,40.82895
3.0,42.66877
4.0,47.15585


Mode
=== Target Value Counts ===
Model Run Time: 4.10
Model=lgbm1
{}
fold: 1, Score: 8.781139640071684, Run Time: 0.74
fold: 2, Score: 8.709040659877498, Run Time: 0.67
fold: 3, Score: 8.80640952313024, Run Time: 0.72
fold: 4, Score: 8.96679561544107, Run Time: 0.66
fold: 5, Score: 8.987212454352488, Run Time: 0.70
Scores -> Adjusted: 8.74151036 , mean: 8.85011958, std: 0.10860922

=== Model Feature Importance ===
CoarseAggregateComponent 0.23054040654437283
FineAggregateComponent 0.15270203272186417
CementComponent 0.14774417451660882
WaterComponent 0.14377788795240456
SuperplasticizerComponent 0.09519087754090233
BlastFurnaceSlag 0.0842835894893406
AgeInDays 0.08031730292513634
FlyAshComponent 0.05899851264253842
is_original 0.006445215666831929


Unnamed: 0_level_0,pred_lgbm1
id,Unnamed: 1_level_1
0.0,23.09081
1.0,30.3692
2.0,43.27919
3.0,42.85879
4.0,46.98142


Mode
=== Target Value Counts ===
Model Run Time: 3.61
Model=lgbm2
{}
fold: 1, Score: 8.809860521234734, Run Time: 0.32
fold: 2, Score: 8.646333881713847, Run Time: 0.33
fold: 3, Score: 9.008811788025513, Run Time: 0.37
fold: 4, Score: 9.015505359172197, Run Time: 0.36
fold: 5, Score: 8.983053770048388, Run Time: 0.36
Scores -> Adjusted: 8.74829316 , mean: 8.89271306, std: 0.14441991

=== Model Feature Importance ===
AgeInDays 0.179
WaterComponent 0.148
CementComponent 0.141
CoarseAggregateComponent 0.121
FineAggregateComponent 0.108
SuperplasticizerComponent 0.105
FlyAshComponent 0.084
BlastFurnaceSlag 0.071
is_original 0.043


Unnamed: 0_level_0,pred_lgbm2
id,Unnamed: 1_level_1
0.0,22.90256
1.0,33.08388
2.0,36.38186
3.0,44.98889
4.0,42.88206


Mode
=== Target Value Counts ===
Model Run Time: 1.87
Model=lgbm3
{}
fold: 1, Score: 8.781139640071684, Run Time: 0.77
fold: 2, Score: 8.709040659877498, Run Time: 0.67
fold: 3, Score: 8.80640952313024, Run Time: 0.74
fold: 4, Score: 8.96679561544107, Run Time: 0.70
fold: 5, Score: 8.987212454352488, Run Time: 0.74
Scores -> Adjusted: 8.74151036 , mean: 8.85011958, std: 0.10860922

=== Model Feature Importance ===
CoarseAggregateComponent 0.23054040654437283
FineAggregateComponent 0.15270203272186417
CementComponent 0.14774417451660882
WaterComponent 0.14377788795240456
SuperplasticizerComponent 0.09519087754090233
BlastFurnaceSlag 0.0842835894893406
AgeInDays 0.08031730292513634
FlyAshComponent 0.05899851264253842
is_original 0.006445215666831929


Unnamed: 0_level_0,pred_lgbm3
id,Unnamed: 1_level_1
0.0,23.09081
1.0,30.3692
2.0,43.27919
3.0,42.85879
4.0,46.98142


Mode
=== Target Value Counts ===
Model Run Time: 3.75
Model=cat1
{}
fold: 1, Score: 8.431749298401597, Run Time: 1.71
fold: 2, Score: 8.388387899200259, Run Time: 1.66
fold: 3, Score: 8.61088159648203, Run Time: 1.68
fold: 4, Score: 8.647884863106468, Run Time: 1.67
fold: 5, Score: 8.616519213641785, Run Time: 1.70
Scores -> Adjusted: 8.43210907 , mean: 8.53908457, std: 0.10697550

=== Model Feature Importance ===
AgeInDays 0.5011350696894802
CementComponent 0.14023811194223915
WaterComponent 0.07174849751558048
SuperplasticizerComponent 0.06377808703707621
is_original 0.0548454901720651
BlastFurnaceSlag 0.05274810508637863
FineAggregateComponent 0.048259980049346736
CoarseAggregateComponent 0.04016481254801164
FlyAshComponent 0.027081845959821868


Unnamed: 0_level_0,pred_cat1
id,Unnamed: 1_level_1
0.0,21.56087
1.0,34.4374
2.0,38.23524
3.0,44.26784
4.0,45.42224


Mode
=== Target Value Counts ===
Model Run Time: 8.56
Model=cat2
{}
fold: 1, Score: 8.486452076257459, Run Time: 0.83
fold: 2, Score: 8.398097025948712, Run Time: 0.83
fold: 3, Score: 8.729117006170345, Run Time: 0.88
fold: 4, Score: 8.736941053990376, Run Time: 0.91
fold: 5, Score: 8.75059953339561, Run Time: 0.99
Scores -> Adjusted: 8.47211055 , mean: 8.62024134, std: 0.14813079

=== Model Feature Importance ===
AgeInDays 0.4920662095046869
CementComponent 0.1162907484817497
WaterComponent 0.07683575070505587
is_original 0.06263189700783435
SuperplasticizerComponent 0.05715114341889712
CoarseAggregateComponent 0.05447293703831537
FineAggregateComponent 0.053917382732094135
BlastFurnaceSlag 0.04940941702954806
FlyAshComponent 0.037224514081818566


Unnamed: 0_level_0,pred_cat2
id,Unnamed: 1_level_1
0.0,21.78553
1.0,36.22236
2.0,39.11054
3.0,44.02174
4.0,44.64331


Mode
=== Target Value Counts ===
Model Run Time: 4.56
Model=lasso
fold: 1, Score: 11.195269849347316, Run Time: 0.04
fold: 2, Score: 11.095622470716382, Run Time: 0.06
fold: 3, Score: 11.825413657275954, Run Time: 0.05
fold: 4, Score: 11.672098094494755, Run Time: 0.06
fold: 5, Score: 11.33590531548821, Run Time: 0.08
Scores -> Adjusted: 11.14536525 , mean: 11.42486188, std: 0.27949663


Unnamed: 0_level_0,pred_lasso
id,Unnamed: 1_level_1
0.0,33.79405
1.0,33.69611
2.0,31.94575
3.0,54.65122
4.0,34.76774


Mode
=== Target Value Counts ===
Model Run Time: 0.49
Model=ridge
fold: 1, Score: 10.924089987588308, Run Time: 0.04
fold: 2, Score: 10.895494126528128, Run Time: 0.06
fold: 3, Score: 11.526104063096483, Run Time: 0.07
fold: 4, Score: 11.3306436121218, Run Time: 0.06
fold: 5, Score: 11.047532527047172, Run Time: 0.08
Scores -> Adjusted: 10.89971862 , mean: 11.14477286, std: 0.24505425


Unnamed: 0_level_0,pred_ridge
id,Unnamed: 1_level_1
0.0,35.17172
1.0,35.082
2.0,31.94759
3.0,58.22468
4.0,36.62562


Mode
=== Target Value Counts ===
Model Run Time: 0.51
Model=ridge_50
fold: 1, Score: 10.924059453271678, Run Time: 0.04
fold: 2, Score: 10.895481824132732, Run Time: 0.05
fold: 3, Score: 11.52606366402642, Run Time: 0.05
fold: 4, Score: 11.33057166300521, Run Time: 0.07
fold: 5, Score: 11.047455427963495, Run Time: 0.08
Scores -> Adjusted: 10.89968152 , mean: 11.14472641, std: 0.24504488


Unnamed: 0_level_0,pred_ridge_50
id,Unnamed: 1_level_1
0.0,35.17185
1.0,35.08182
2.0,31.94836
3.0,58.22688
4.0,36.62616


Mode
=== Target Value Counts ===
Model Run Time: 0.48
CPU times: user 4min 9s, sys: 54.5 s, total: 5min 3s
Wall time: 2min 42s


In [38]:
sample_submission.head(20)

Unnamed: 0,id,Strength,target_xgb_best_params,target_lgbm_best_params,target_cat_best_params,target_xgb3,target_xgb1,target_xgb2,target_lgbm0,target_lgbm1,target_lgbm2,target_lgbm3,target_cat1,target_cat2,target_lasso,target_ridge,target_ridge_50
0,5407,35.452,42.92241,46.22262,44.76576,45.34196,42.92241,44.72165,46.22262,47.79622,46.32879,47.79622,47.22347,46.89835,33.58043,34.80228,34.80263
1,5408,35.452,15.81407,19.50923,19.8836,19.14786,15.81407,20.09818,19.50923,18.35496,20.34535,18.35496,19.44139,19.59299,29.08704,26.47911,26.47779
2,5409,35.452,30.90379,32.49093,30.79068,32.01067,30.90379,31.91684,32.49093,31.18286,32.63635,31.18286,32.3648,33.84536,29.10303,25.1279,25.12697
3,5410,35.452,45.73134,46.34991,45.79378,46.08764,45.73134,46.58258,46.34991,46.69429,44.36107,46.69429,46.98858,46.42902,38.28218,37.72009,37.72052
4,5411,35.452,21.63098,25.66189,25.89851,21.67327,21.63098,20.63199,25.66189,20.96297,26.67288,20.96297,28.10152,28.12529,31.12518,30.32645,30.32574
5,5412,35.452,44.30632,44.74317,40.5168,42.37252,44.30632,44.87197,44.74317,45.1892,41.9858,45.1892,39.89685,39.5019,34.09403,34.57516,34.57514
6,5413,35.452,26.66411,28.56655,29.23823,29.7678,26.66411,30.0085,28.56655,27.99033,33.42085,27.99033,30.83093,33.02627,29.21252,26.20105,26.19749
7,5414,35.452,21.06854,21.61859,23.10988,18.62709,21.06854,20.43243,21.61859,20.87957,22.61033,20.87957,22.06883,21.25458,30.36277,31.68782,31.68781
8,5415,35.452,44.88307,48.99787,45.87218,48.83333,44.88307,43.13742,48.99787,52.16774,45.27036,52.16774,44.28766,43.50256,36.89103,41.79684,41.79798
9,5416,35.452,30.00051,36.15427,36.37141,31.31363,30.00051,34.27808,36.15427,35.37121,38.09412,35.37121,38.20545,37.06937,32.34939,31.1574,31.1565


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Blend Models</h1>
</div>

In [39]:
all_blend_scores = pd.DataFrame(
    {
        "Model": pd.Series(dtype="str"),
        "Score": pd.Series(dtype="float"),
        "StdDev": pd.Series(dtype="float"),
    }
)

In [40]:
model_lst = ["xgb1", "xgb2", "cat1", "lgbm0", "lgbm1"]

In [41]:
len(model_lst)

5

In [42]:
target_names = [f"target_{model}" for model in model_lst]
target_names

['target_xgb1', 'target_xgb2', 'target_cat1', 'target_lgbm0', 'target_lgbm1']

In [43]:
sample_submission[TARGET] = sample_submission[target_names].sum(axis=1) / len(model_lst)

In [44]:
sample_submission[[ID, TARGET]].to_csv("submission_models_wt_avg.csv", index=False)
sample_submission[[ID, TARGET]].tail(8)

Unnamed: 0,id,Strength
3597,9004,18.2941
3598,9005,39.15304
3599,9006,16.89599
3600,9007,31.29621
3601,9008,32.9624
3602,9009,41.32826
3603,9010,31.09395
3604,9011,19.6966


In [45]:
sample_submission[TARGET] = (
#     (sample_submission["target_xgb_bp"] * 2 )
#     + (sample_submission["target_lgbm_bp"]  )
    (sample_submission["target_xgb1"] * 3 )
    + (sample_submission["target_lgbm1"])
#     + (sample_submission["target_lgbm2"])    
#     + (sample_submission["target_lgbm2"])
    + (sample_submission["target_cat1"] )
    + (sample_submission["target_cat2"] )    
#     + (sample_submission["target_cat_bp"] )
#     + (sample_submission["target_svc"] )
#     + (sample_submission["target_log_reg3"] )
#     + (sample_submission["target_cat2"] )
)/6

# sample_submission[TARGET] = sample_submission[TARGET].astype(int)

In [46]:
sample_submission[[ID, TARGET]].to_csv("submission_wt_avg.csv", index=False)
sample_submission[[ID, TARGET]].tail(8)

Unnamed: 0,id,Strength
3597,9004,18.16883
3598,9005,39.17904
3599,9006,17.40593
3600,9007,30.6512
3601,9008,32.51728
3602,9009,40.80834
3603,9010,30.51272
3604,9011,19.95918


In [47]:
all_cv_scores.sort_values(by=["Score"], ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime,n_estimators,n_folds,comments
12,lasso,11.42549,0.29277,0.6713,100,10,
27,lasso,11.42486,0.2795,0.48865,100,5,
28,ridge,11.14477,0.24505,0.51195,100,5,
29,ridge_50,11.14473,0.24504,0.47838,100,5,
13,ridge,11.14298,0.26997,0.69033,100,10,
14,ridge_50,11.14294,0.26997,0.68331,100,10,
19,xgb1,8.96841,0.12949,3.69252,100,5,
15,xgb_best_params,8.96841,0.12949,3.67921,100,5,
17,cat_best_params,8.92565,0.15287,3.41532,100,5,
2,cat_best_params,8.91699,0.32148,7.30316,100,10,


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Level 1 Stack Models</h1>
</div>

In [48]:
## TODO: Generate these dictionaries from model names

train_oof_dict = {
    "train_pred_cat1": "train_pred_cat1.csv",
    "train_pred_cat2": "train_pred_cat2.csv",
    "train_pred_lgbm1": "train_pred_lgbm1.csv",    
    "train_pred_lgbm2": "train_pred_lgbm2.csv",    
    "train_pred_xgb1": "train_pred_xgb1.csv"
}

test_pred_dict = {
    "submission_cat1": "submission_cat1.csv",
    "submission_cat2": "submission_cat2.csv",
    "submission_lgbm1": "submission_lgbm1.csv",
    "submission_lgbm2": "submission_lgbm2.csv",
    "submission_xgb1": "submission_xgb1.csv",
}

In [49]:
def blend_results(train_oof_dict, test_pred_dict):
    oof_df = pd.DataFrame()
    test_preds_df = pd.DataFrame()

    for name, train_oof_fname in train_oof_dict.items():
        fname = "../working/" + train_oof_fname
        print(f"Processing {name}, {train_oof_fname}")
        df = pd.read_csv(fname)
        print(df.head())
#         print(df.iloc[:,1])
        preds = pd.Series(df.iloc[:,1], name=name)
#         print(preds[:5])
        oof_df = pd.concat([oof_df, preds], axis=1)
    #     oof_df = pd.concat([oof_df, pd.Series(np.load(TRAIN_PATH / train_oof), name=name)], axis=1)

    for name, test_pred_fname in test_pred_dict.items():
        fname = "../working/" + test_pred_fname
        print(f"{name}, {test_pred_fname}")
        df = pd.read_csv(fname)
        print(df.head())
        preds = pd.Series(df.iloc[:,1], name=name)
        test_preds_df = pd.concat([test_preds_df, preds], axis=1)

    print("=== oof ===")
    print(oof_df.head())
    print("=== test_preds ===")
    print(test_preds_df.head())
    return oof_df, test_preds_df
    
# (oof_df, preds_df) = blend_results(train_oof_dict, test_pred_dict)    

In [50]:
def load_oof_results(train_oof_dict, test_pred_dict):
    oof_df = pd.DataFrame()
    test_preds_df = pd.DataFrame()

    for name, train_oof_fname in train_oof_dict.items():
        fname = "../working/" + train_oof_fname
        print(f"Processing {name}, {train_oof_fname}")
        df = pd.read_csv(fname)
        print(df.head())
#         print(df.iloc[:,1])
        preds = pd.Series(df.iloc[:,1], name=name)
#         print(preds[:5])
        oof_df = pd.concat([oof_df, preds], axis=1)
    #     oof_df = pd.concat([oof_df, pd.Series(np.load(TRAIN_PATH / train_oof), name=name)], axis=1)

    for name, test_pred_fname in test_pred_dict.items():
        fname = "../working/" + test_pred_fname
        print(f"{name}, {test_pred_fname}")
        df = pd.read_csv(fname)
        print(df.head())
        preds = pd.Series(df.iloc[:,1], name=name)
        test_preds_df = pd.concat([test_preds_df, preds], axis=1)

    print("=== oof ===")
    print(oof_df.head())
    print("=== test_preds ===")
    print(test_preds_df.head())
    return oof_df, test_preds_df
    
(oof_df, preds_df) = load_oof_results(train_oof_dict, test_pred_dict) 

Processing train_pred_cat1, train_pred_cat1.csv
    id  pred_cat1
0  0.0   21.56087
1  1.0   34.43740
2  2.0   38.23524
3  3.0   44.26784
4  4.0   45.42224
Processing train_pred_cat2, train_pred_cat2.csv
    id  pred_cat2
0  0.0   21.78553
1  1.0   36.22236
2  2.0   39.11054
3  3.0   44.02174
4  4.0   44.64331
Processing train_pred_lgbm1, train_pred_lgbm1.csv
    id  pred_lgbm1
0  0.0    23.09081
1  1.0    30.36920
2  2.0    43.27919
3  3.0    42.85879
4  4.0    46.98142
Processing train_pred_lgbm2, train_pred_lgbm2.csv
    id  pred_lgbm2
0  0.0    22.90256
1  1.0    33.08388
2  2.0    36.38186
3  3.0    44.98889
4  4.0    42.88206
Processing train_pred_xgb1, train_pred_xgb1.csv
    id  pred_xgb1
0  0.0   22.93364
1  1.0   32.67008
2  2.0   41.96727
3  3.0   40.71120
4  4.0   46.85235
submission_cat1, submission_cat1.csv
     id  Strength
0  5407  47.22347
1  5408  19.44139
2  5409  32.36480
3  5410  46.98858
4  5411  28.10152
submission_cat2, submission_cat2.csv
     id  Strength
0  5

In [51]:
oof_df.head()

Unnamed: 0,train_pred_cat1,train_pred_cat2,train_pred_lgbm1,train_pred_lgbm2,train_pred_xgb1
0,21.56087,21.78553,23.09081,22.90256,22.93364
1,34.4374,36.22236,30.3692,33.08388,32.67008
2,38.23524,39.11054,43.27919,36.38186,41.96727
3,44.26784,44.02174,42.85879,44.98889,40.7112
4,45.42224,44.64331,46.98142,42.88206,46.85235


In [52]:
preds_df.head()

Unnamed: 0,submission_cat1,submission_cat2,submission_lgbm1,submission_lgbm2,submission_xgb1
0,47.22347,46.89835,47.79622,46.32879,42.92241
1,19.44139,19.59299,18.35496,20.34535,15.81407
2,32.3648,33.84536,31.18286,32.63635,30.90379
3,46.98858,46.42902,46.69429,44.36107,45.73134
4,28.10152,28.12529,20.96297,26.67288,21.63098


In [53]:
type(preds_df)

pandas.core.frame.DataFrame

In [54]:
def run_lr(useful_features:List[str], TARGET:str, train_df:pd.DataFrame, test_df:pd.DataFrame) -> (List[float],List[float]):
    final_predictions = []
    scores = []

    kfold = model_selection.KFold(n_splits=Config.N_FOLDS, shuffle=True, random_state=Config.seed)

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(train_df)):
        xtrain = train_df.iloc[train_idx].reset_index(drop=True)
        xvalid = train_df.iloc[valid_idx].reset_index(drop=True)

        xtest = test_df[useful_features].copy()

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]

#         model = LogisticRegression()
        model = linear_model.LinearRegression()
        # Smaller C means more regularization; default=1.0
        # 2947.0517025518097
#         model = LogisticRegression(max_iter=500, C=2947.0517025518097, penalty='l2',solver='newton-cg')
#         model = LogisticRegression(C = 2947.0517025518097,
#                         max_iter = 500,
#                         penalty = 'l2',
#                         solver = 'liblinear')
        model.fit(xtrain, ytrain)

        preds_valid = model.predict_proba(xvalid)[:,-1]
        test_preds = model.predict_proba(xtest)[:,-1]

        final_predictions.append(test_preds)
#         score = metrics.roc_auc_score(yvalid, preds_valid)
        score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        print(f"Fold={fold}, Score={score}")
        scores.append(score)
    return scores, final_predictions


In [55]:
# useful_features = ["pred_lda", "pred_gbc","pred_gbc2", "pred_cat_bp", "pred_cat1", "pred_lgbm1", "pred_lgbm2", "pred_lgbm_bp", "pred_xgb1", "pred_xgb_bp"]
useful_features = [ "train_pred_cat1", "train_pred_cat2", "train_pred_lgbm1", "train_pred_lgbm2", "train_pred_xgb1"]

In [56]:
oof_df[useful_features].head()

Unnamed: 0,train_pred_cat1,train_pred_cat2,train_pred_lgbm1,train_pred_lgbm2,train_pred_xgb1
0,21.56087,21.78553,23.09081,22.90256,22.93364
1,34.4374,36.22236,30.3692,33.08388,32.67008
2,38.23524,39.11054,43.27919,36.38186,41.96727
3,44.26784,44.02174,42.85879,44.98889,40.7112
4,45.42224,44.64331,46.98142,42.88206,46.85235


In [57]:
# preds_df[useful_features].head()

In [58]:
# fold_scores, final_predictions = run_lr(useful_features, TARGET, oof_df, preds_df)
# test_preds = np.mean(np.column_stack(final_predictions), axis=1)
# cv_score, std_dev = show_fold_scores(fold_scores)
# create_submission("level1_lr", TARGET, test_preds)

In [59]:
pd.options.display.max_colwidth = 100
pd.set_option("display.max_rows", 999)
pd.set_option("display.precision", 5)
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.max_colwidth

100

In [60]:
all_cv_scores.sort_values(by=["Score"], ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime,n_estimators,n_folds,comments
12,lasso,11.43,0.29,0.67,100,10,
27,lasso,11.42,0.28,0.49,100,5,
28,ridge,11.14,0.25,0.51,100,5,
29,ridge_50,11.14,0.25,0.48,100,5,
13,ridge,11.14,0.27,0.69,100,10,
14,ridge_50,11.14,0.27,0.68,100,10,
19,xgb1,8.97,0.13,3.69,100,5,
15,xgb_best_params,8.97,0.13,3.68,100,5,
17,cat_best_params,8.93,0.15,3.42,100,5,
2,cat_best_params,8.92,0.32,7.3,100,10,
