<a href="https://www.kaggle.com/code/mmellinger66/s3e9-concrete-strength-models?scriptVersionId=121910475" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

 <div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Playground Season 3: Episode 9 - Concrete Strength Models</h1>
</div>

## Problem Type

Regression

## Evaluation Metric

$$RMSE = \sqrt{\frac{1}{N} \sum_{i=1}^N (y_i - \hat{y_i})^2}$$

```python
score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
```

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Import Libraries</h1>
</div>

In [1]:
from typing import List, Set, Dict, Tuple, Optional

import os
import time
from pathlib import Path
import glob
import gc

import pandas as pd
import numpy as np

from sklearn import impute
from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import cluster
from sklearn import model_selection
from sklearn import ensemble
from sklearn import datasets

import xgboost as xgb
import catboost as cb
import lightgbm as lgb

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Visualization Libraries
import matplotlib as mpl
import matplotlib.pylab as plt
import seaborn as sns
import missingno as msno
from folium import Map
from folium.plugins import HeatMap
from IPython.display import display_html, display_markdown, display_latex
from colorama import Fore, Style

import warnings
warnings.filterwarnings('ignore')

pd.set_option("display.max_rows", 999)
pd.set_option("display.precision", 5)

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Configuration</h1>
</div>

In [2]:
TARGET="Strength"
ID="id"

# Optuna
objective_direction = "minimize"  # minimize, maximize

In [3]:
class Config:
    path:str = "../input/playground-series-s3e9/"
    load_original_data:bool = True # Some Competitions use synthetic data, based on real data
    original_data_path:str = "../input/predict-concrete-strength/ConcreteStrengthData.csv"
    gpu:bool = False
    optimize:bool = False
    n_optuna_trials:int = 50 # 5, 10, 30
    fast_render:bool = False
    calc_probability:bool = False
    debug:bool = True
    seed:int = 42
    N_ESTIMATORS:int = 200  # 100, 300, 1000, 2000, 5000, 15_000, 20_000 GBDT
    GPU_N_ESTIMATORS:int = 2000 # Want models to run fast during dev
    N_FOLDS:int = 5
        

In [4]:
class clr:
    S = Style.BRIGHT + Fore.LIGHTRED_EX
    E = Style.RESET_ALL

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Library</h1>
</div>

In [5]:
def read_data(path: str, analyze:bool=True) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    data_dir = Path(path)

    train = pd.read_csv(data_dir / "train.csv")
    test = pd.read_csv(data_dir / "test.csv")
    submission_df = pd.read_csv(data_dir / "sample_submission.csv")

    if analyze:
        print(clr.S + "=== Shape of Data ==="+clr.E)
        print(f" train data: Rows={train.shape[0]}, Columns={train.shape[1]}")
        print(f" test data : Rows={test.shape[0]}, Columns={test.shape[1]}")

        print(clr.S + "\n=== Train Data: First 5 Rows ===\n"+clr.E)
        display(train.head())
        print(f"\n{clr.S}=== Train Column Names ==={clr.E}\n")
        display(train.columns)
        print(f"\n{clr.S}=== Features/Explanatory Variables ==={clr.E}\n")
        eval_features(train)
        print(f"\n{clr.S}=== Skewness ==={clr.E}\n")
        check_skew(train)
    return train, test, submission_df

def create_submission(model_name: str, target, preds, seed:int=42, nfolds:int=5) -> pd.DataFrame:
    sample_submission[target] = preds #.astype(int)

    if len(model_name) > 0:
        fname = f"submission_{model_name}_k{nfolds}_s{seed}.csv"
    else:
        fname = "submission.csv"

    sample_submission.to_csv(fname, index=False)

    return sample_submission

def show_classification_scores(ground_truth:List[int], yhat:List[int]) -> None:
    accuracy = metrics.accuracy_score(ground_truth, yhat)
    precision = metrics.precision_score(ground_truth, yhat)
    recall = metrics.recall_score(ground_truth, yhat)
    roc = metrics.roc_auc_score(ground_truth, yhat)
    f1 = metrics.f1_score(ground_truth, yhat)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"ROC: {roc:.4f}")
    print(f"f1: {f1:.4f}")
    

def label_encoder(train:pd.DataFrame, test:pd.DataFrame, columns:List[str]) -> (pd.DataFrame, pd.DataFrame) :
    for col in columns:
        train[col] = train[col].astype(str)
        test[col] = test[col].astype(str)
        train[col] = preprocessing.LabelEncoder().fit_transform(train[col])
        test[col] = preprocessing.LabelEncoder().fit_transform(test[col])
    return train, test   

def create_strat_folds(df:pd.DataFrame, TARGET, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    print(f"TARGET={TARGET}, n_folds={n_folds}, seed={seed}")
    df["fold"] = -1

    kf = model_selection.StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    # kf = GroupKFold(n_splits=Config.N_FOLDS)
    for fold, (train_idx, valid_idx) in enumerate(kf.split(df, df[TARGET])):
        df.loc[valid_idx, "fold"] = fold

    # df.to_csv(f"train_fold{num_folds}.csv", index=False)
    return df


def create_folds(df:pd.DataFrame, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    print(f"n_folds={n_folds}, seed={seed}")
    df["fold"] = -1

    kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=seed)

    for fold, (train_idx, valid_idx) in enumerate(kf.split(df)):
        df.loc[valid_idx, "fold"] = fold

    # df.to_csv(f"train_fold{num_folds}.csv", index=False)
    return df

def show_fold_scores(scores: List[float]) -> (float, float):
    cv_score = np.mean(scores)  # Used in filename
    std_dev = np.std(scores)
    print(
        f"Scores -> Adjusted: {np.mean(scores) - np.std(scores):.8f} , mean: {np.mean(scores):.8f}, std: {np.std(scores):.8f}"
    )
    return cv_score, std_dev


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    continuous_features = list(df.select_dtypes(include=['int64', 'float64', 'uint8']).columns)
    categorical_features = list(df.select_dtypes(include=['object', 'bool']).columns)
    if display:
        print(f"{clr.S}Continuous Features={continuous_features}{clr.E}\n")
        print(f"{clr.S}Categorical Features={categorical_features}{clr.E}")
    return continuous_features, categorical_features   

def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    print("=== Cardinality ===")
    print(df[features].nunique())

## === Model Support ===    

from scipy.stats import mode


def merge_test_predictions(final_test_predictions:List[float], calc_probability:bool=True) -> List[float]:

    if calc_probability:
        print("Mean")
        result = np.mean(np.column_stack(final_test_predictions), axis=1)
    else:
        print("Mode")
        mode_result = mode(np.column_stack(final_test_predictions), axis=1)
        result = mode_result[0].ravel()

    return result

def summary_statistics(X:pd.DataFrame, enhanced=True) -> None:
    desc = X.describe()
    if enhanced:
        desc.loc["var"] = X.var(numeric_only=True).tolist()
        desc.loc["skew"] = X.skew(numeric_only=True).tolist()
        desc.loc["kurt"] = X.kurtosis(numeric_only=True).tolist()

    with pd.option_context("display.precision", 2):
        style = desc.transpose().style.background_gradient(
            cmap="coolwarm"
        )  # .set_precision(4)
    display(style)
    
def show_missing_features(df:pd.DataFrame) -> None:
    missing_vals = df.isna().sum().sort_values(ascending=False)
    print(missing_vals[missing_vals > 0])


def show_duplicate_records(df:pd.DataFrame) -> None:
    dups = df.duplicated()
    print(dups.sum())


def eval_features(df:pd.DataFrame) -> (List[str], List[str], List[str]):
    ## Separate Categorical and Numerical Features
    categorical_features = list(
        df.select_dtypes(include=["category", "object"]).columns
    )
    continuous_features = list(df.select_dtypes(include=["number"]).columns)

    print(f"{clr.S}Continuous features:{clr.E} {continuous_features}")
    print(f"{clr.S}Categorical features:{clr.E} {categorical_features}")
    print("\n --- Cardinality of Categorical Features ---\n")

    for feature in categorical_features:
        cardinality = df[feature].nunique()
        if cardinality < 10:
            print(f"{clr.S}{feature}{clr.E}: cardinality={cardinality}, {df[feature].unique()}")
        else:
            print(f"{clr.S}{feature}{clr.E}: cardinality={cardinality}")
    all_features = categorical_features + continuous_features
    return all_features, categorical_features, continuous_features


def show_feature_importance(feature_importance_lst:List[str]) -> None:
    fis_df = pd.concat(feature_importance_lst, axis=1)

    fis_df.sort_values("0_importance", ascending=True).head(40).plot(
        kind="barh", figsize=(12, 12), title="Feature Importance Across Folds"
    )
    plt.show()


def show_feature_target_crosstab(df:pd.DataFrame, feature_lst:List[str], target:str) -> None:
    for feature in feature_lst:
        print(f"\n=== {feature} vs {target} ===\n")
        display(
            pd.crosstab(df[feature], df[target], margins=True)
        )  # display keeps bold formatting


def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    print(f"{clr.S}=== Cardinality ==={clr.E}")
    print(df[features].nunique())


def show_unique_features(df:pd.DataFrame, features:List[str]) -> None:
    for col in features:
        print(col, sorted(df[col].dropna().unique()))


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    continuous_features = list(
        df.select_dtypes(include=["int64", "float64", "uint8"]).columns
    )
    categorical_features = list(df.select_dtypes(include=["object", "bool"]).columns)
    if display:
        print(f"{clr.S}Continuous Features={clr.E}{continuous_features}\n")
        print(f"{clr.S}Categorical Features={clr.E}{categorical_features}")
    return continuous_features, categorical_features


def describe(X:pd.DataFrame) -> None:
    """Deprecated: Use summary_statistics()"""
    desc = X.describe()
    desc.loc['var'] = X.var(numeric_only=True).tolist()
    desc.loc['skew'] = X.skew(numeric_only=True).tolist()
    desc.loc['kurt'] = X.kurtosis(numeric_only=True).tolist()

    with pd.option_context('display.precision', 2):
        style = desc.transpose().style.background_gradient(cmap='coolwarm') #.set_precision(4)
    display(style)
  

def check_skew(df:pd.DataFrame) -> None:
    skew = df.skew(skipna=True,numeric_only=True).sort_values(ascending=False)
    print(skew)
    
def gpu_ify_lgbm(lgbm_dict):
    if Config.gpu:
        lgbm_dict["device"] = "gpu"
        lgbm_dict["boosting_type"] = "gbdt"
        lgbm_dict["gpu_platform_id"] = 0
        lgbm_dict["gpu_device_id"] = 0
    return lgbm_dict

def gpu_ify_cb(params):
    if Config.gpu:
        params["task_type"] = "GPU"
    return params    


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Optuna Hyperparameter Optimization Library</h1>
</div>

In [6]:
def objective_xgb(trial, X_train, X_valid, y_train, y_valid):

    xgb_params = {
        #         "objective": trial.suggest_categorical("objective", ["multi:softmax"]),
        #         "eval_metric": "mlogloss",
        #         "objective": "multi:softmax",
#         "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),

        "eval_metric": "rmse",  # auc, rmse, mae
        "objective": "reg:squarederror", # Normal Distribution
#         "objective": "reg:gamma", # Gamma Distribution

        #         "enable_categorical": trial.suggest_categorical("use_label_encoder", [True]),
        "use_label_encoder": trial.suggest_categorical("use_label_encoder", [False]),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 5000, 100),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "max_depth": trial.suggest_int("max_depth", 1, 20),  # 10
        "gamma": trial.suggest_float("gamma", 0, 100, step=0.1),
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "tree_method": trial.suggest_categorical(
            "tree_method", ["hist"]
        ),  # hist, gpu_hist
#         "predictor": "gpu_predictor",
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 100),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }

    # Model loading and training
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=500,
        verbose=0,
    )

    print(f"Number of boosting rounds: {model.best_iteration}")
    #     oof = model.predict_proba(X_valid)[:, 1] # Probability
    oof = model.predict(X_valid)  # Classification: 0,1

    return metrics.mean_squared_error(y_valid, oof, squared=False)


def objective_lgbm(trial, X_train, X_valid, y_train, y_valid):

    lgbm_params = {
        "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [1_000]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [5000]),
        "n_estimators": trial.suggest_int("n_estimators", 700, 5000),
        "importance_type": "gain",
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 300),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        #         'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }

    # Model loading and training
    model = lgb.LGBMRegressor(**lgbm_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric="mae",
        callbacks=[
            lgb.log_evaluation(500),
            lgb.early_stopping(500, False, True),
        ],
    )

    #     print(f"Number of boosting rounds: {model.best_iteration}")
    oof = model.predict(X_valid)

    return metrics.mean_squared_error(y_valid, oof, squared=False)
#     return metrics.mean_absolute_error(y_valid, oof)


def objective_clf_lgbm(trial, X_train, X_valid, y_train, y_valid):

    params = {
        "boosting_type": "gbdt",
        # "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),
        #         "objective": trial.suggest_categorical("objective", ["multi:softprob"]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [1_000]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [5000]),
        "n_estimators": trial.suggest_int("n_estimators", 700, 1000),
        "importance_type": "gain",
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 300),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        #         'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }
    if Config.gpu:
        params["device_type"] = "gpu"

    # Model loading and training
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="mae",
        callbacks=[
            lgb.log_evaluation(500),
            lgb.early_stopping(500, False, True),
        ],
    )

    #     print(f"Number of boosting rounds: {model.best_iteration}")
    oof = model.predict(X_valid)

    #     return accuracy_score(y_valid, oof)
    return metrics.roc_auc_score(y_valid, oof)


def objective_cb(trial, X_train, X_valid, y_train, y_valid):

    cb_params = {
        "iterations": 100,
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.1, 1.0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 100),
        "bagging_temperature": trial.suggest_loguniform(
            "bagging_temperature", 0.1, 20.0
        ),
        "random_strength": trial.suggest_float("random_strength", 1.0, 2.0),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
          "use_best_model": True,
#         "task_type": "GPU",
        "random_seed": 42,
    }

    # Model loading and training
    #  model = CatBoostClassifier(**cb_params)
    model = cb.CatBoostRegressor(**cb_params)

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="accuracy",
        early_stopping_rounds=500,
        verbose=False,
    )

#     print(f"Number of boosting rounds: {model.best_iteration}")
    # oof = model.predict_proba(X_valid)[:, 1]
    oof = model.predict(X_valid)  # Classification
    return metrics.mean_squared_error(y_valid, oof, squared=False)
#     return metrics.mean_absolute_error(y_valid, oof)
# 
#     return accuracy_score(y_valid, oof)

def objective_clf_cb(trial, X_train, X_valid, y_train, y_valid):

    cb_params = {
        "iterations": 10,  # 1000
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.1, 1.0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 100),
        "bagging_temperature": trial.suggest_loguniform(
            "bagging_temperature", 0.1, 20.0
        ),
        "random_strength": trial.suggest_float("random_strength", 1.0, 2.0),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
        "use_best_model": True,
#             "task_type": "GPU",
        "random_seed": 42,
    }

    # Model loading and training
    model = cb.CatBoostClassifier(**cb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="accuracy",
        early_stopping_rounds=500,
        verbose=False,
    )

    # print(f"Number of boosting rounds: {model.best_iteration}")
    # oof = model.predict_proba(X_valid)[:, 1]
    oof = model.predict(X_valid)  # Classification

    return metrics.accuracy_score(y_valid, oof)

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Load Train/Test Data and Analyze</h1>
</div>

## Load the following files

 - train.csv - Data used to build our machine learning model
 - test.csv - Data used to build our machine learning model. Does not contain the target variable
 - sample_submission.csv - A file in the proper format to submit test predictions

In [7]:
%%time
train, test, sample_submission = read_data(Config.path, analyze=True)                                

[1m[91m=== Shape of Data ===[0m
 train data: Rows=5407, Columns=10
 test data : Rows=3605, Columns=9
[1m[91m
=== Train Data: First 5 Rows ===
[0m


Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength
0,0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38
1,1,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52
2,2,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96
3,3,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05
4,4,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19



[1m[91m=== Train Column Names ===[0m



Index(['id', 'CementComponent', 'BlastFurnaceSlag', 'FlyAshComponent',
       'WaterComponent', 'SuperplasticizerComponent',
       'CoarseAggregateComponent', 'FineAggregateComponent', 'AgeInDays',
       'Strength'],
      dtype='object')


[1m[91m=== Features/Explanatory Variables ===[0m

[1m[91mContinuous features:[0m ['id', 'CementComponent', 'BlastFurnaceSlag', 'FlyAshComponent', 'WaterComponent', 'SuperplasticizerComponent', 'CoarseAggregateComponent', 'FineAggregateComponent', 'AgeInDays', 'Strength']
[1m[91mCategorical features:[0m []

 --- Cardinality of Categorical Features ---


[1m[91m=== Skewness ===[0m

AgeInDays                    2.74687
SuperplasticizerComponent    1.41169
FlyAshComponent              1.30469
BlastFurnaceSlag             1.12120
Strength                     0.38073
CementComponent              0.34128
id                           0.00000
CoarseAggregateComponent    -0.08145
WaterComponent              -0.21528
FineAggregateComponent      -0.44738
dtype: float64
CPU times: user 42.3 ms, sys: 8.33 ms, total: 50.6 ms
Wall time: 78.4 ms


In [8]:
train.head()

Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength
0,0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38
1,1,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52
2,2,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96
3,3,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05
4,4,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19


In [9]:
def load_original_data(path:str) -> pd.DataFrame:
#     original = pd.read_csv(path, index_col=[0])
    original = pd.read_csv(path)

    original = original.reset_index()
    original['id'] = original['index'] + 100000
    original = original.drop(columns = ['index'])
    original = original.rename(columns = {'CementComponent ':'CementComponent'})
    original.set_index('id', inplace=True)
#     original = original[-original.depth.isna()]
    print(f"Shape={original.shape}")
    return original
#     original.head()

if Config.load_original_data:    
    original = load_original_data(Config.original_data_path)
    display(original.head())

Shape=(1030, 9)


Unnamed: 0_level_0,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
100000,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
100001,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
100002,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
100003,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
100004,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [10]:
if Config.load_original_data:
    train['is_original']    = 0
    test['is_original']     = 0
    original['is_original'] = 1
#     combined = pd.concat([train, original], ignore_index=True) #.drop_duplicates()
    combined = pd.concat([train, original])

    train = combined
#     combined.head()
    print(f"Shape={combined.shape}")


Shape=(6437, 11)


In [11]:
summary_statistics(train.drop(columns=[ID], axis=1), enhanced=True)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,var,skew,kurt
CementComponent,6437.0,296.29,105.57,102.0,212.5,295.8,374.0,540.0,11145.39,0.37,-0.55
BlastFurnaceSlag,6437.0,61.06,84.06,0.0,0.0,0.0,129.9,359.4,7066.49,1.06,-0.11
FlyAshComponent,6437.0,35.44,56.8,0.0,0.0,0.0,94.0,200.1,3226.28,1.16,-0.28
WaterComponent,6437.0,184.51,19.04,121.8,173.0,186.0,192.0,247.0,362.59,-0.17,0.67
SuperplasticizerComponent,6437.0,4.44,5.79,0.0,0.0,0.0,8.7,32.2,33.51,1.3,1.92
CoarseAggregateComponent,6437.0,988.95,77.56,801.0,938.0,975.6,1047.0,1145.0,6014.85,-0.08,-0.56
FineAggregateComponent,6437.0,771.6,78.96,594.0,734.3,781.0,821.0,992.6,6234.31,-0.41,-0.02
AgeInDays,6437.0,50.78,68.99,1.0,7.0,28.0,56.0,365.0,4759.62,2.82,8.76
Strength,6437.0,35.51,16.45,2.33,23.69,33.96,45.85,82.6,270.61,0.39,-0.35
is_original,6437.0,0.16,0.37,0.0,0.0,0.0,0.0,1.0,0.13,1.86,1.44


## Outlier Detection

In [12]:
# https://www.kaggle.com/code/lyasdemir/best-algorithm-for-prediction-xgboost
    
def iqr(data:pd.DataFrame, var:str):# outliers detecion .
    q1 = np.quantile(data[var], 0.25)
    q3 = np.quantile(data[var], 0.75)
    diff = q3 - q1
    lower_t = q1 - (1.5 * diff)
    upper_t = q3 + (1.5 * diff)
    return data[(data[var] < lower_t) | (data[var] > upper_t)]

# iqr(train, "squareMeters")

In [13]:
# https://www.kaggle.com/code/sujithmandala/playground-s3-e8-ensemble-model-98-accuracy
    
def detect_outliers(data:pd.DataFrame) -> pd.DataFrame:
    outlier_percents = {}
    for column in data.columns:
        if data[column].dtype != object:
            q1 = np.quantile(data[column], 0.25)
            q3 = np.quantile(data[column], 0.75)
            iqr = q3 - q1
            upper_bound = q3 + (1.5 * iqr)
            lower_bound = q1 - (1.5 * iqr)
            outliers = data[(data[column] > upper_bound) | (data[column] < lower_bound)][column]
            outlier_percentage = len(outliers) / len(data[column]) * 100
            outlier_percents[column] = outlier_percentage
            outlier_dataframe = pd.DataFrame(data = outlier_percents.values() ,index=outlier_percents.keys() ,columns=['Outlier_percentage'])
    
    return outlier_dataframe.sort_values(by = 'Outlier_percentage', ascending = False)

detect_outliers(test)

Unnamed: 0,Outlier_percentage
FineAggregateComponent,8.54369
WaterComponent,8.2663
AgeInDays,7.93343
SuperplasticizerComponent,1.47018
BlastFurnaceSlag,0.41609
id,0.0
CementComponent,0.0
FlyAshComponent,0.0
CoarseAggregateComponent,0.0
is_original,0.0


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Feature Engineering</h1>
</div>

## Categorical/Numerical Variables

## Handle Outliers
- https://www.kaggle.com/code/lyasdemir/best-algorithm-for-prediction-xgboost
- https://www.kaggle.com/code/mnokno/paris-housing-price-prediction-using-xgboost

In [14]:
#https://www.kaggle.com/code/mmellinger66/s3e9-concrete-strength-models

def add_features(df):
    df['Water_Cement'] = df['WaterComponent'] / df['CementComponent']
    df['Coarse_Fine'] = df['CoarseAggregateComponent'] / df['FineAggregateComponent']
    df['Aggregate'] = df['CoarseAggregateComponent'] + df['FineAggregateComponent']
    df['Aggregate_Cement'] = df['Aggregate'] / df['CementComponent']
    df['Slag_Cement'] = df['BlastFurnaceSlag'] / df['CementComponent']
    df['Ash_Cement'] = df['FlyAshComponent'] / df['CementComponent']
    df['Plastic_Cement'] = df['SuperplasticizerComponent'] / df['CementComponent']
    df['Age_Water'] = df['AgeInDays'] / df['WaterComponent']
    return df

train = add_features(train)
test = add_features(test)

In [15]:
# features_with_outliers = []

In [16]:
# https://www.kaggle.com/code/mnokno/paris-housing-price-prediction-using-xgboost

def remove_outliers(df:pd.DataFrame) -> pd.DataFrame:
    for c in features_with_outliers:
        if c == 'garage':
            first_percentile = df[c].quantile(0.001)
            df = df[df[c] > first_percentile]

        ninety_ninth_percentile = df[c].quantile(0.999)
        df = df[df[c] < ninety_ninth_percentile]
        #df_t = df_t[(df_t[c] > first_percentile) & (df_t[c] < ninety_ninth_percentile)]
    return df


In [17]:
# print(f'Before: {len(train)}')
# train = remove_outliers(train)
# print(f'After: {len(train)}')

In [18]:
train.head(10)

Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength,is_original,Water_Cement,Coarse_Fine,Aggregate,Aggregate_Cement,Slag_Cement,Ash_Cement,Plastic_Cement,Age_Water
0,0.0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38,0,0.35429,1.83524,1738.0,3.31048,0.0,0.0,0.0,0.01613
1,1.0,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52,0,1.33566,1.50389,1610.0,11.25874,1.18182,1.0,0.05594,0.1466
2,2.0,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96,0,0.64256,1.35169,1870.3,6.47163,0.46609,0.0,0.0,0.15078
3,3.0,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05,0,0.75,1.39104,1602.0,5.26974,0.25,0.0,0.0,1.60088
4,4.0,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19,0,1.22293,1.19739,1716.6,10.93376,1.50318,0.0,0.0,0.46875
5,5.0,350.0,0.0,0.0,203.0,0.0,1055.0,775.0,7,37.43,0,0.58,1.36129,1830.0,5.22857,0.0,0.0,0.0,0.03448
6,6.0,135.7,203.5,0.0,185.7,0.0,1076.2,759.3,28,35.1,0,1.36846,1.41736,1835.5,13.52616,1.49963,0.0,0.0,0.15078
7,7.0,332.5,142.5,0.0,228.0,0.0,932.0,594.0,28,45.94,0,0.68571,1.56902,1526.0,4.58947,0.42857,0.0,0.0,0.12281
8,8.0,322.0,0.0,0.0,203.0,0.0,974.0,800.0,180,42.14,0,0.63043,1.2175,1774.0,5.50932,0.0,0.0,0.0,0.8867
9,9.0,133.0,200.0,0.0,192.0,0.0,927.4,839.2,3,6.94,0,1.44361,1.1051,1766.6,13.28271,1.50376,0.0,0.0,0.01562


In [19]:
train = train.reset_index(drop=True).copy()
train.head(10)

Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength,is_original,Water_Cement,Coarse_Fine,Aggregate,Aggregate_Cement,Slag_Cement,Ash_Cement,Plastic_Cement,Age_Water
0,0.0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38,0,0.35429,1.83524,1738.0,3.31048,0.0,0.0,0.0,0.01613
1,1.0,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52,0,1.33566,1.50389,1610.0,11.25874,1.18182,1.0,0.05594,0.1466
2,2.0,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96,0,0.64256,1.35169,1870.3,6.47163,0.46609,0.0,0.0,0.15078
3,3.0,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05,0,0.75,1.39104,1602.0,5.26974,0.25,0.0,0.0,1.60088
4,4.0,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19,0,1.22293,1.19739,1716.6,10.93376,1.50318,0.0,0.0,0.46875
5,5.0,350.0,0.0,0.0,203.0,0.0,1055.0,775.0,7,37.43,0,0.58,1.36129,1830.0,5.22857,0.0,0.0,0.0,0.03448
6,6.0,135.7,203.5,0.0,185.7,0.0,1076.2,759.3,28,35.1,0,1.36846,1.41736,1835.5,13.52616,1.49963,0.0,0.0,0.15078
7,7.0,332.5,142.5,0.0,228.0,0.0,932.0,594.0,28,45.94,0,0.68571,1.56902,1526.0,4.58947,0.42857,0.0,0.0,0.12281
8,8.0,322.0,0.0,0.0,203.0,0.0,974.0,800.0,180,42.14,0,0.63043,1.2175,1774.0,5.50932,0.0,0.0,0.0,0.8867
9,9.0,133.0,200.0,0.0,192.0,0.0,927.4,839.2,3,6.94,0,1.44361,1.1051,1766.6,13.28271,1.50376,0.0,0.0,0.01562


In [20]:
excluded_features = [TARGET, ID, "fold", "is_original"]

In [21]:
cont_features, cat_features = feature_distribution_types(train, display=True)
show_cardinality(train, cat_features)

cont_features = [feature for feature in cont_features if feature not in excluded_features]
cat_features = [feature for feature in cat_features if feature not in excluded_features]

FEATURES = cont_features + cat_features
FEATURES

[1m[91mContinuous Features=[0m['id', 'CementComponent', 'BlastFurnaceSlag', 'FlyAshComponent', 'WaterComponent', 'SuperplasticizerComponent', 'CoarseAggregateComponent', 'FineAggregateComponent', 'AgeInDays', 'Strength', 'is_original', 'Water_Cement', 'Coarse_Fine', 'Aggregate', 'Aggregate_Cement', 'Slag_Cement', 'Ash_Cement', 'Plastic_Cement', 'Age_Water']

[1m[91mCategorical Features=[0m[]
[1m[91m=== Cardinality ===[0m
Series([], dtype: float64)


['CementComponent',
 'BlastFurnaceSlag',
 'FlyAshComponent',
 'WaterComponent',
 'SuperplasticizerComponent',
 'CoarseAggregateComponent',
 'FineAggregateComponent',
 'AgeInDays',
 'Water_Cement',
 'Coarse_Fine',
 'Aggregate',
 'Aggregate_Cement',
 'Slag_Cement',
 'Ash_Cement',
 'Plastic_Cement',
 'Age_Water']

In [22]:
train, test = label_encoder(train, test, cat_features)
# train = pd.get_dummies(train,columns=[]) # Will remove original feature names
# test = pd.get_dummies(test,columns=[])

In [23]:
train.head()

Unnamed: 0,id,CementComponent,BlastFurnaceSlag,FlyAshComponent,WaterComponent,SuperplasticizerComponent,CoarseAggregateComponent,FineAggregateComponent,AgeInDays,Strength,is_original,Water_Cement,Coarse_Fine,Aggregate,Aggregate_Cement,Slag_Cement,Ash_Cement,Plastic_Cement,Age_Water
0,0.0,525.0,0.0,0.0,186.0,0.0,1125.0,613.0,3,10.38,0,0.35429,1.83524,1738.0,3.31048,0.0,0.0,0.0,0.01613
1,1.0,143.0,169.0,143.0,191.0,8.0,967.0,643.0,28,23.52,0,1.33566,1.50389,1610.0,11.25874,1.18182,1.0,0.05594,0.1466
2,2.0,289.0,134.7,0.0,185.7,0.0,1075.0,795.3,28,36.96,0,0.64256,1.35169,1870.3,6.47163,0.46609,0.0,0.0,0.15078
3,3.0,304.0,76.0,0.0,228.0,0.0,932.0,670.0,365,39.05,0,0.75,1.39104,1602.0,5.26974,0.25,0.0,0.0,1.60088
4,4.0,157.0,236.0,0.0,192.0,0.0,935.4,781.2,90,74.19,0,1.22293,1.19739,1716.6,10.93376,1.50318,0.0,0.0,0.46875


In [24]:
# if Config.debug:
#     train = train.sample(frac=0.4, random_state=Config.seed)
#     train.reset_index()
#     print(f"Debug Shape: {train.shape}")

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Optuna Hyperparameter Optimization</h1>
</div>

In [25]:
%%time

if Config.optimize:
    y = train[TARGET]
    X = train[FEATURES].copy()

    X_test = test[FEATURES].copy()
    X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
        X, y, test_size=0.2, random_state=Config.seed
    )

# === XGB ===

time_limit = 3600 * 3
best_xgb_params = {}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction)
    study.optimize(
        lambda trial: objective_xgb(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best XGB trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_xgb_params = study.best_trial.params

## === LGBM ===

time_limit = 3600 * 3
best_lgbm_params = {}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction)
    study.optimize(
        lambda trial: objective_lgbm(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best LGBM trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_lgbm_params = study.best_trial.params

## === CatBoost

time_limit = 3600 * 3
# best_cb_params = {}
best_cb_params = {'learning_rate': 0.45743264601999495,
                  'l2_leaf_reg': 41.338946049390074,
                  'bagging_temperature': 0.3472567739474319,
                  'random_strength': 1.7332249677756242, 
                  'depth': 1,
                  'min_data_in_leaf': 6}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction)
    study.optimize(
        lambda trial: objective_cb(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best Cat trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_cb_params = study.best_trial.params

CPU times: user 8 µs, sys: 0 ns, total: 8 µs
Wall time: 11.9 µs


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Train Models with Cross Validation</h1>
</div>

In [26]:
def show_tree_model_fi(model, features:List[str]) -> None:
    print("\n=== Model Feature Importance ===")
    for i in model.feature_importances_.argsort()[::-1]:
        print(features[i], model.feature_importances_[i]/model.feature_importances_.sum())

def save_oof_predictions(model_name:str, final_valid_predictions, oof:pd.DataFrame) -> pd.DataFrame:
    final_valid_predictions_df = process_valid_predictions(
        final_valid_predictions, ID, model_name
    )
    display(final_valid_predictions_df.head())
    oof[f"pred_{model_name}"] = final_valid_predictions_df[f"pred_{model_name}"]

    return oof

def save_test_predictions(model_name:str, final_test_predictions, submission_df:pd.DataFrame, result_field:str=TARGET) -> None:
    result = merge_test_predictions(final_test_predictions, Config.calc_probability)
    # result[:20]
    submission_df[f"target_{model_name}"] = result #.astype(int)
    #     submission_df.head(10)
    ss = submission_df[[ID, f"target_{model_name}"]].copy().reset_index(drop=True)
    ss.rename(columns={f"target_{model_name}": result_field}, inplace=True)
    ss.to_csv(
        f"submission_{model_name}.csv", index=False
    )  # Can submit the individual model
    print("=== Target Value Counts ===")
#     display(ss[TARGET].value_counts())
    ss.head(10)

def process_valid_predictions(final_valid_predictions, train_id, model_name:str) -> pd.DataFrame:
    model = f"pred_{model_name}"
    final_valid_predictions_df = pd.DataFrame.from_dict(
        final_valid_predictions, orient="index"
    ).reset_index()
    final_valid_predictions_df.columns = [train_id, model]
    final_valid_predictions_df.set_index(train_id, inplace=True)
    final_valid_predictions_df.sort_index(inplace=True)
    final_valid_predictions_df.to_csv(f"train_pred_{model_name}.csv", index=True)

    return final_valid_predictions_df

def add_score(score_df:pd.DataFrame, model_name:str, score:float, std:float):
    dict1 = {"Model": model_name, "Score": cv_score, "StdDev": std_dev}
    score_df = score_df.append(dict1, ignore_index=True)
    return score_df

In [27]:
def train_cv_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid,
    params,
    n_folds:int=5,
    seed:int=42,
):

    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(
            drop=True
        )  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)
        xtest = test.copy()

        valid_ids = xvalid.id.values.tolist()  # Id's of everything in validation fold

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[FEATURES]
        xvalid = xvalid[FEATURES]

        scaler = preprocessing.StandardScaler()
#         scaler = preprocessing.MinMaxScaler()
        xtrain = scaler.fit(xtrain).transform(xtrain)
        xvalid = scaler.transform(xvalid)
        xtest = scaler.transform(xtest)

        model = get_model_fn # ()

        model.fit(
            xtrain,
            ytrain,
        )
        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        preds_valid_class = model.predict(xvalid)
        
        final_test_predictions.append(test_preds)
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

#         fold_score = metrics.accuracy_score(yvalid, preds_valid_class)  # Validation Set Score
        fold_score = metrics.mean_absolute_error(
            yvalid, preds_valid
        ) 
#         fold_score = metrics.roc_auc_score(yvalid.values, preds_valid)  # Validation Set Score
#         show_classification_scores(yvalid.values, preds_valid_class)

#         fold_score = metrics.roc_auc_score(yvalid, preds_valid)  # Validation Set Score
#         fold_score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        fold_scores.append(fold_score)
        #         importance_list.append(model.coef_.ravel())

        fi = []
        # Feature importance
#         fi = pd.DataFrame(
#             index=FEATURES,
#             data=model.coef_.ravel(),
#             columns=[f"{fold}_importance"],
#         )
        
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )


def train_xgb_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid:str,
    params,
    n_folds:int=5,
    seed:int=42,
):

    print(params)
    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(
            drop=True
        )  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)
        xtest = test.copy()

        valid_ids = xvalid.id.values.tolist()  # Id's of everything in validation fold

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[FEATURES]
        xvalid = xvalid[FEATURES]

        model = get_model_fn # (params)

        model.fit(
            xtrain,
            ytrain,
            eval_set=[(xvalid, yvalid)],
            #             eval_metric="acc",  # auc
            verbose=0,
            early_stopping_rounds=500,
            #             early_stopping_rounds=3000,
            #             callbacks=[
            #                 xgb.log_evaluation(0),
            #                 xgb.early_stopping(500, False, True),
            #             ],
        )

        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        preds_valid_class = model.predict(xvalid)
        
        final_test_predictions.append(test_preds)
        if Config.debug:
            print(f"GT Type: {type(yvalid.values)}")
            print(f"Preds Type: {type(preds_valid_class)}")
            print(f"         GT:{yvalid.values[:20]}")
            print(f"Preds Class:{preds_valid_class[:20]}")
            print(f"Preds Prob:{preds_valid[:20]}")
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid_class)))

#         fold_score = metrics.cohen_kappa_score(yvalid,  preds_valid_class, weights = "quadratic")
#         fold_score = metrics.roc_auc_score(yvalid.values, preds_valid)  # Validation Set Score
#         show_classification_scores(yvalid.values, preds_valid_class)
        fold_score = metrics.mean_absolute_error(
            yvalid, preds_valid
        )  # Validation Set Score
#         fold_score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        fold_scores.append(fold_score)

        # Feature importance
        fi = pd.DataFrame(
            index=FEATURES,
            data=model.feature_importances_,
            columns=[f"{fold}_importance"],
        )
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )        

In [28]:
def run_linear_model(train:pd.DataFrame, test:pd.DataFrame, model_dict, model_name:str, features:List[str], oof:pd.DataFrame) -> (float, float, pd.DataFrame):
    (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_cv_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        False, #Config.calc_probability,
        ID,
        {},
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    oof.head()
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof


def run_tree_model(train:pd.DataFrame, test:pd.DataFrame, model_dict, model_name:str, features:List[str], params, oof:pd.DataFrame) -> (float, float, pd.DataFrame):
    (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_xgb_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        Config.calc_probability,
        ID,
        params,
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)
    show_tree_model_fi(model, features)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    oof.head()
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof

In [29]:
%%time

def run_models4features(train:pd.DataFrame, test:pd.DataFrame, model_dict, model_lst:List[str], target:str, feature_lst:List[str], all_cv_scores:pd.DataFrame, linear_models:bool=True) -> pd.DataFrame:

    oof = train[[ID, target, "fold"]].copy().reset_index(drop=True).copy()
    oof.set_index(ID, inplace=True)

    for idx, m in enumerate(model_lst):
        model = model_lst[idx]
        start_time = time.time()

        print(f"Model={model}")

        params = {}
        if linear_models:
                cv_score, std_dev, oof = run_linear_model(train, test, model_dict, model, feature_lst, oof)

        else:
            cv_score, std_dev, oof = run_tree_model(train, test, model_dict, model, feature_lst, params, oof)

        run_time = time.time() - start_time

        score_dict = {"Model": model, "Score": cv_score, "StdDev": std_dev, "RunTime": run_time, "n_estimators": Config.N_ESTIMATORS, "n_folds": Config.N_FOLDS, "comments": ""}
        all_cv_scores = all_cv_scores.append(score_dict, ignore_index=True)
        print(f"Model Run Time: {run_time:.2f}")

    return all_cv_scores




CPU times: user 13 µs, sys: 1e+03 ns, total: 14 µs
Wall time: 18.8 µs


In [30]:
lgbm_params = {'n_estimators': Config.N_ESTIMATORS,
                 'num_rounds': 404,
                 'learning_rate': 0.19,
                 'num_leaves': 17,
                 'max_depth': 8,
                 'min_data_in_leaf': 36,
                 'lambda_l1': 0.96,
                 'lambda_l2': 0.01,
                 'min_gain_to_split': 11.32,
                 'bagging_fraction': 0.6,
                 'feature_fraction': 0.9}


lgbm_params3 = {
    "n_estimators": Config.N_ESTIMATORS,
    'max_depth': 9,
    'learning_rate': 0.01,
    'min_data_in_leaf': 36, 
    'num_leaves': 100, 
    'feature_fraction': 0.8, 
    'bagging_fraction': 0.89, 
    'bagging_freq': 5, 
    'lambda_l2': 28,
    
    'seed': Config.seed,
    'objective': 'regression',
#     'boosting_type': 'gbdt',
#     'device': 'gpu', 
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0,
    'n_jobs': -1,
    'metric': 'rmse',
    'verbose': -1
}
    
lgbm_params = gpu_ify_lgbm(lgbm_params)

In [31]:
xgb_params = {
    "n_estimators": Config.N_ESTIMATORS,  # 10_000,
    "max_depth": 10,  # 10
    "objective": "reg:squarederror",
    #     "enable_categorical": True,  # Only works with gpu_hist
    #     "eval_metric": "mae",
    #     "metric": "mae",
    #     "enable_categorical": True,
    "n_jobs": 8,  # 4
    "seed": Config.seed,
    "tree_method": "hist",
    #         "gpu_id": 0,
    "subsample": 0.9,  # 0.7
    "colsample_bytree": 0.7,
    "use_label_encoder": False,
    "learning_rate": 0.05,  # 0.01
}

xgb_params3 = {
    'n_estimators': Config.N_ESTIMATORS,
    'learning_rate': 0.05,
    'max_depth': 10,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:squarederror'
}

if Config.gpu:
    xgb_params["tree_method"] = "gpu_hist"
else:
    xgb_params["tree_method"] = "hist"

In [32]:
cb_params = {
    #     "learning_rate": 0.3277295792305584,
    "learning_rate": 0.05,
    "l2_leaf_reg": 3.1572972266001518,
    "bagging_temperature": 0.6799604234141348,
    "random_strength": 1.99590400593318,
    "depth": 10,
    "min_data_in_leaf": 93,
    # "iterations": 100,  # 10000
    "n_estimators": Config.N_ESTIMATORS,  # 10000
    "use_best_model": True,
    #     "task_type": "GPU",
    "random_seed": Config.seed,
}

cb_params = gpu_ify_cb(cb_params)

In [33]:
model_estimator_dict = {
    "xgb2": xgb.XGBRegressor(**xgb_params),
    "xgb_best_params": xgb.XGBRegressor(**best_xgb_params),
    "xgb3": xgb.XGBRegressor(**xgb_params3),


    "lgbm1": lgb.LGBMRegressor(**lgbm_params),

    "cat1": cb.CatBoostRegressor(),
    "cat2": cb.CatBoostRegressor(**cb_params),
    "cat_best_params": cb.CatBoostRegressor(**best_cb_params),

    "xgb1": xgb.XGBRegressor(),
    "lgbm0": lgb.LGBMRegressor(),
    "lgbm3": lgb.LGBMRegressor(lgbm_params3),
    "lgbm2": lgb.LGBMRegressor(
        learning_rate=0.05,
        max_depth=15,
        num_leaves=11,
        feature_fraction=0.3,
        subsample=0.1,
        n_jobs=-1,
    ),
    "lgbm3": lgb.LGBMRegressor(**lgbm_params),
    "lgbm_best_params": lgb.LGBMRegressor(**best_lgbm_params),


    "lin_reg": linear_model.LinearRegression(),
    "lasso": linear_model.Lasso(),
    "ridge": linear_model.Ridge(max_iter=7000),
    "ridge_25": linear_model.Ridge(fit_intercept=True, solver='auto', alpha=0.25, max_iter=7000),
    "ridge_50": linear_model.Ridge(fit_intercept=True, solver='auto', alpha=0.5, max_iter=7000),
}

In [34]:
all_cv_scores = pd.DataFrame(
    {
        "Model": pd.Series(dtype="str"),
        "Score": pd.Series(dtype="float"),
        "StdDev": pd.Series(dtype="float"),
        "RunTime": pd.Series(dtype="float"),
        "n_estimators": pd.Series(dtype="int"),
        "n_folds": pd.Series(dtype="int"),
        "comments": pd.Series(dtype="str"),
    }
)



## Tree Models

In [35]:
%%time

# model_lst = ["xgb3","xgb_best_params", "lgbm_best_params", "cat_best_params", "xgb1", "xgb2", "lgbm1", "lgbm2", "cat1", "cat2"]
# model_lst = = []

def run_tree_models(X_tr, test, n_folds, model_lst, all_cv_scores):
        all_cv_scores = run_models4features(X_tr, test, model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=False)    

        all_cv_scores.sort_values(by=["Score"], ascending=False)
        return all_cv_scores

CPU times: user 8 µs, sys: 0 ns, total: 8 µs
Wall time: 12.6 µs


## Linear Models

In [36]:
def run_linear_models(X_tr, test, n_folds, model_lst, all_cv_scores):
    for training in [train, train]:

    #     all_cv_scores = run_models4features(train, test, model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=False)    
        all_cv_scores = run_models4features(X_tr, test, model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=True)    
        return all_cv_scores  

In [37]:
%%time


tree_model_lst = ["xgb_best_params", "lgbm_best_params", "cat_best_params","xgb3", "xgb1", "xgb2", "lgbm0", "lgbm1", "lgbm2", "lgbm3", "cat1", "cat2"]
linear_model_lst = ["lin_reg", "lasso", "ridge", "ridge_25", "ridge_50"]
linear_model_lst = ["lasso", "ridge",  "ridge_50"]

# Config.N_FOLDS = 10

for training in [train]:
    X_tr = create_folds(train, Config.N_FOLDS)
    
    oof = train[[ID, TARGET, "fold"]].copy().reset_index(drop=True).copy()
    oof.set_index(ID, inplace=True)
    oof.head()

    all_cv_scores = run_tree_models(X_tr, test, Config.N_FOLDS, tree_model_lst, all_cv_scores)
    all_cv_scores = run_linear_models(X_tr, test, Config.N_FOLDS, linear_model_lst, all_cv_scores)
#     Config.N_FOLDS = 5



n_folds=5, seed=42
Model=xgb_best_params
{}
GT Type: <class 'numpy.ndarray'>
Preds Type: <class 'numpy.ndarray'>
         GT:[42.14 13.18 33.8  38.7  45.3  19.01 28.99 17.54  8.37 57.22 25.12 16.88
 24.24 33.94 15.36 37.43 45.9  14.31 31.84 21.26]
Preds Class:[37.290733 17.446136 46.75457  46.99444  49.377964 38.348175 18.791159
 20.722895 17.127117 60.69019  13.919984 33.58592  31.581871 43.638187
 20.709562 39.83987  47.32892  11.991572 42.45572  38.31584 ]
Preds Prob:[37.290733 17.446136 46.75457  46.99444  49.377964 38.348175 18.791159
 20.722895 17.127117 60.69019  13.919984 33.58592  31.581871 43.638187
 20.709562 39.83987  47.32892  11.991572 42.45572  38.31584 ]
fold: 1, Score: 8.928546607923805, Run Time: 0.88
GT Type: <class 'numpy.ndarray'>
Preds Type: <class 'numpy.ndarray'>
         GT:[23.52 17.96 15.09 52.52 29.07  9.69 15.09 11.39 28.24 35.36 12.55 39.6
 35.86 40.15 21.92 64.9  41.93 37.42 11.17 21.48]
Preds Class:[27.66401  24.66494  18.784597 55.12869  30.421904 12.60

Unnamed: 0_level_0,pred_xgb_best_params
id,Unnamed: 1_level_1
0.0,25.26416
1.0,30.24642
2.0,38.82778
3.0,43.63323
4.0,44.63012


Mode
=== Target Value Counts ===
Model Run Time: 4.51
Model=lgbm_best_params
{}
GT Type: <class 'numpy.ndarray'>
Preds Type: <class 'numpy.ndarray'>
         GT:[42.14 13.18 33.8  38.7  45.3  19.01 28.99 17.54  8.37 57.22 25.12 16.88
 24.24 33.94 15.36 37.43 45.9  14.31 31.84 21.26]
Preds Class:[37.48204222 18.44791179 49.75628814 46.67716463 49.84707567 37.66515732
 19.24091425 22.94024614 17.84140429 59.66023385 16.1157631  36.0829894
 32.67188009 40.22440007 20.22857864 39.93096297 45.34589932 14.38156882
 40.86247044 36.97599878]
Preds Prob:[37.48204222 18.44791179 49.75628814 46.67716463 49.84707567 37.66515732
 19.24091425 22.94024614 17.84140429 59.66023385 16.1157631  36.0829894
 32.67188009 40.22440007 20.22857864 39.93096297 45.34589932 14.38156882
 40.86247044 36.97599878]
fold: 1, Score: 8.883048414100395, Run Time: 0.76
GT Type: <class 'numpy.ndarray'>
Preds Type: <class 'numpy.ndarray'>
         GT:[23.52 17.96 15.09 52.52 29.07  9.69 15.09 11.39 28.24 35.36 12.55 39.6
 3

Unnamed: 0_level_0,pred_lgbm_best_params
id,Unnamed: 1_level_1
0.0,24.72608
1.0,32.88806
2.0,39.82932
3.0,44.72587
4.0,46.22369


Mode
=== Target Value Counts ===
Model Run Time: 3.83
Model=cat_best_params
{}
GT Type: <class 'numpy.ndarray'>
Preds Type: <class 'numpy.ndarray'>
         GT:[42.14 13.18 33.8  38.7  45.3  19.01 28.99 17.54  8.37 57.22 25.12 16.88
 24.24 33.94 15.36 37.43 45.9  14.31 31.84 21.26]
Preds Class:[38.52598572 13.76837741 46.30307483 47.29183163 50.01663535 35.51647455
 19.90312348 21.23911932 19.04485395 62.64636208 12.81665641 36.89536167
 32.53778123 43.10080093 19.07256925 39.31630821 47.65803942 11.74819666
 39.16841351 34.34387225]
Preds Prob:[38.52598572 13.76837741 46.30307483 47.29183163 50.01663535 35.51647455
 19.90312348 21.23911932 19.04485395 62.64636208 12.81665641 36.89536167
 32.53778123 43.10080093 19.07256925 39.31630821 47.65803942 11.74819666
 39.16841351 34.34387225]
fold: 1, Score: 8.822272830446046, Run Time: 1.08
GT Type: <class 'numpy.ndarray'>
Preds Type: <class 'numpy.ndarray'>
         GT:[23.52 17.96 15.09 52.52 29.07  9.69 15.09 11.39 28.24 35.36 12.55 39.6
 

Unnamed: 0_level_0,pred_cat_best_params
id,Unnamed: 1_level_1
0.0,21.84307
1.0,33.18813
2.0,38.51692
3.0,46.40614
4.0,44.52899


Mode
=== Target Value Counts ===
Model Run Time: 4.80
Model=xgb3
{}
GT Type: <class 'numpy.ndarray'>
Preds Type: <class 'numpy.ndarray'>
         GT:[42.14 13.18 33.8  38.7  45.3  19.01 28.99 17.54  8.37 57.22 25.12 16.88
 24.24 33.94 15.36 37.43 45.9  14.31 31.84 21.26]
Preds Class:[35.681656 18.155376 42.44488  47.412125 50.646862 39.42715  18.557667
 22.116959 16.71818  60.89937  12.747934 36.396523 31.352144 35.571068
 18.199402 40.6724   46.48277   9.625642 43.013416 36.91306 ]
Preds Prob:[35.681656 18.155376 42.44488  47.412125 50.646862 39.42715  18.557667
 22.116959 16.71818  60.89937  12.747934 36.396523 31.352144 35.571068
 18.199402 40.6724   46.48277   9.625642 43.013416 36.91306 ]
fold: 1, Score: 9.088879551221126, Run Time: 3.79
GT Type: <class 'numpy.ndarray'>
Preds Type: <class 'numpy.ndarray'>
         GT:[23.52 17.96 15.09 52.52 29.07  9.69 15.09 11.39 28.24 35.36 12.55 39.6
 35.86 40.15 21.92 64.9  41.93 37.42 11.17 21.48]
Preds Class:[20.914536 18.517403 18.94853  5

Unnamed: 0_level_0,pred_xgb3
id,Unnamed: 1_level_1
0.0,24.48314
1.0,31.66614
2.0,37.49663
3.0,43.4591
4.0,44.41072


Mode
=== Target Value Counts ===
Model Run Time: 19.53
Model=xgb1
{}
GT Type: <class 'numpy.ndarray'>
Preds Type: <class 'numpy.ndarray'>
         GT:[42.14 13.18 33.8  38.7  45.3  19.01 28.99 17.54  8.37 57.22 25.12 16.88
 24.24 33.94 15.36 37.43 45.9  14.31 31.84 21.26]
Preds Class:[37.290733 17.446136 46.75457  46.99444  49.377964 38.348175 18.791159
 20.722895 17.127117 60.69019  13.919984 33.58592  31.581871 43.638187
 20.709562 39.83987  47.32892  11.991572 42.45572  38.31584 ]
Preds Prob:[37.290733 17.446136 46.75457  46.99444  49.377964 38.348175 18.791159
 20.722895 17.127117 60.69019  13.919984 33.58592  31.581871 43.638187
 20.709562 39.83987  47.32892  11.991572 42.45572  38.31584 ]
fold: 1, Score: 8.928546607923805, Run Time: 1.29
GT Type: <class 'numpy.ndarray'>
Preds Type: <class 'numpy.ndarray'>
         GT:[23.52 17.96 15.09 52.52 29.07  9.69 15.09 11.39 28.24 35.36 12.55 39.6
 35.86 40.15 21.92 64.9  41.93 37.42 11.17 21.48]
Preds Class:[27.66401  24.66494  18.784597 

Unnamed: 0_level_0,pred_xgb1
id,Unnamed: 1_level_1
0.0,25.26416
1.0,30.24642
2.0,38.82778
3.0,43.63323
4.0,44.63012


Mode
=== Target Value Counts ===
Model Run Time: 6.68
Model=xgb2
{}
GT Type: <class 'numpy.ndarray'>
Preds Type: <class 'numpy.ndarray'>
         GT:[42.14 13.18 33.8  38.7  45.3  19.01 28.99 17.54  8.37 57.22 25.12 16.88
 24.24 33.94 15.36 37.43 45.9  14.31 31.84 21.26]
Preds Class:[35.765453 16.86398  41.816795 47.03218  50.911423 38.610622 18.313627
 20.86037  17.904041 60.989487 12.935455 36.796585 36.172276 39.023567
 18.480381 40.956165 45.829     9.53947  43.07937  39.048344]
Preds Prob:[35.765453 16.86398  41.816795 47.03218  50.911423 38.610622 18.313627
 20.86037  17.904041 60.989487 12.935455 36.796585 36.172276 39.023567
 18.480381 40.956165 45.829     9.53947  43.07937  39.048344]
fold: 1, Score: 9.126492900063532, Run Time: 3.66
GT Type: <class 'numpy.ndarray'>
Preds Type: <class 'numpy.ndarray'>
         GT:[23.52 17.96 15.09 52.52 29.07  9.69 15.09 11.39 28.24 35.36 12.55 39.6
 35.86 40.15 21.92 64.9  41.93 37.42 11.17 21.48]
Preds Class:[23.924007 22.69475  19.368305 5

Unnamed: 0_level_0,pred_xgb2
id,Unnamed: 1_level_1
0.0,25.92812
1.0,31.73649
2.0,40.52326
3.0,44.10012
4.0,45.09132


Mode
=== Target Value Counts ===
Model Run Time: 18.53
Model=lgbm0
{}
GT Type: <class 'numpy.ndarray'>
Preds Type: <class 'numpy.ndarray'>
         GT:[42.14 13.18 33.8  38.7  45.3  19.01 28.99 17.54  8.37 57.22 25.12 16.88
 24.24 33.94 15.36 37.43 45.9  14.31 31.84 21.26]
Preds Class:[37.48204222 18.44791179 49.75628814 46.67716463 49.84707567 37.66515732
 19.24091425 22.94024614 17.84140429 59.66023385 16.1157631  36.0829894
 32.67188009 40.22440007 20.22857864 39.93096297 45.34589932 14.38156882
 40.86247044 36.97599878]
Preds Prob:[37.48204222 18.44791179 49.75628814 46.67716463 49.84707567 37.66515732
 19.24091425 22.94024614 17.84140429 59.66023385 16.1157631  36.0829894
 32.67188009 40.22440007 20.22857864 39.93096297 45.34589932 14.38156882
 40.86247044 36.97599878]
fold: 1, Score: 8.883048414100395, Run Time: 0.63
GT Type: <class 'numpy.ndarray'>
Preds Type: <class 'numpy.ndarray'>
         GT:[23.52 17.96 15.09 52.52 29.07  9.69 15.09 11.39 28.24 35.36 12.55 39.6
 35.86 40.15

Unnamed: 0_level_0,pred_lgbm0
id,Unnamed: 1_level_1
0.0,24.72608
1.0,32.88806
2.0,39.82932
3.0,44.72587
4.0,46.22369


Mode
=== Target Value Counts ===
Model Run Time: 3.72
Model=lgbm1
{}
GT Type: <class 'numpy.ndarray'>
Preds Type: <class 'numpy.ndarray'>
         GT:[42.14 13.18 33.8  38.7  45.3  19.01 28.99 17.54  8.37 57.22 25.12 16.88
 24.24 33.94 15.36 37.43 45.9  14.31 31.84 21.26]
Preds Class:[37.96149043 19.05160591 48.29448034 47.06499368 48.26988394 36.24650627
 19.76855986 22.06184663 18.03116026 60.4545058  16.09996667 35.77822445
 33.30717209 41.89914121 20.92955075 40.99815666 44.39435647 15.45408975
 41.10928917 35.88984804]
Preds Prob:[37.96149043 19.05160591 48.29448034 47.06499368 48.26988394 36.24650627
 19.76855986 22.06184663 18.03116026 60.4545058  16.09996667 35.77822445
 33.30717209 41.89914121 20.92955075 40.99815666 44.39435647 15.45408975
 41.10928917 35.88984804]
fold: 1, Score: 8.930993237037617, Run Time: 0.81
GT Type: <class 'numpy.ndarray'>
Preds Type: <class 'numpy.ndarray'>
         GT:[23.52 17.96 15.09 52.52 29.07  9.69 15.09 11.39 28.24 35.36 12.55 39.6
 35.86 40.1

Unnamed: 0_level_0,pred_lgbm1
id,Unnamed: 1_level_1
0.0,22.35015
1.0,36.20098
2.0,38.86616
3.0,44.03752
4.0,46.99747


Mode
=== Target Value Counts ===
Model Run Time: 4.24
Model=lgbm2
{}
GT Type: <class 'numpy.ndarray'>
Preds Type: <class 'numpy.ndarray'>
         GT:[42.14 13.18 33.8  38.7  45.3  19.01 28.99 17.54  8.37 57.22 25.12 16.88
 24.24 33.94 15.36 37.43 45.9  14.31 31.84 21.26]
Preds Class:[38.39296193 18.73345165 47.77118871 46.71540619 48.57743843 35.91762485
 19.09713451 22.6580198  20.07032439 60.27277427 15.35374329 35.79039056
 32.85861485 41.86033963 19.99522386 39.28068752 45.43929031 14.81940399
 41.10633302 35.69400165]
Preds Prob:[38.39296193 18.73345165 47.77118871 46.71540619 48.57743843 35.91762485
 19.09713451 22.6580198  20.07032439 60.27277427 15.35374329 35.79039056
 32.85861485 41.86033963 19.99522386 39.28068752 45.43929031 14.81940399
 41.10633302 35.69400165]
fold: 1, Score: 8.817536679420314, Run Time: 0.32
GT Type: <class 'numpy.ndarray'>
Preds Type: <class 'numpy.ndarray'>
         GT:[23.52 17.96 15.09 52.52 29.07  9.69 15.09 11.39 28.24 35.36 12.55 39.6
 35.86 40.1

Unnamed: 0_level_0,pred_lgbm2
id,Unnamed: 1_level_1
0.0,21.20956
1.0,34.53058
2.0,37.42513
3.0,44.77962
4.0,44.82637


Mode
=== Target Value Counts ===
Model Run Time: 1.72
Model=lgbm3
{}
GT Type: <class 'numpy.ndarray'>
Preds Type: <class 'numpy.ndarray'>
         GT:[42.14 13.18 33.8  38.7  45.3  19.01 28.99 17.54  8.37 57.22 25.12 16.88
 24.24 33.94 15.36 37.43 45.9  14.31 31.84 21.26]
Preds Class:[37.96149043 19.05160591 48.29448034 47.06499368 48.26988394 36.24650627
 19.76855986 22.06184663 18.03116026 60.4545058  16.09996667 35.77822445
 33.30717209 41.89914121 20.92955075 40.99815666 44.39435647 15.45408975
 41.10928917 35.88984804]
Preds Prob:[37.96149043 19.05160591 48.29448034 47.06499368 48.26988394 36.24650627
 19.76855986 22.06184663 18.03116026 60.4545058  16.09996667 35.77822445
 33.30717209 41.89914121 20.92955075 40.99815666 44.39435647 15.45408975
 41.10928917 35.88984804]
fold: 1, Score: 8.930993237037617, Run Time: 0.81
GT Type: <class 'numpy.ndarray'>
Preds Type: <class 'numpy.ndarray'>
         GT:[23.52 17.96 15.09 52.52 29.07  9.69 15.09 11.39 28.24 35.36 12.55 39.6
 35.86 40.1

Unnamed: 0_level_0,pred_lgbm3
id,Unnamed: 1_level_1
0.0,22.35015
1.0,36.20098
2.0,38.86616
3.0,44.03752
4.0,46.99747


Mode
=== Target Value Counts ===
Model Run Time: 4.30
Model=cat1
{}
GT Type: <class 'numpy.ndarray'>
Preds Type: <class 'numpy.ndarray'>
         GT:[42.14 13.18 33.8  38.7  45.3  19.01 28.99 17.54  8.37 57.22 25.12 16.88
 24.24 33.94 15.36 37.43 45.9  14.31 31.84 21.26]
Preds Class:[38.58926497 17.29687491 48.67647061 46.99469627 49.66594314 34.59005119
 18.91365798 22.72324008 15.8336231  61.39735208 14.65955953 33.67143228
 34.55424333 41.54890264 19.8755371  38.65401281 46.14619861 11.56770398
 42.91310622 35.82160499]
Preds Prob:[38.58926497 17.29687491 48.67647061 46.99469627 49.66594314 34.59005119
 18.91365798 22.72324008 15.8336231  61.39735208 14.65955953 33.67143228
 34.55424333 41.54890264 19.8755371  38.65401281 46.14619861 11.56770398
 42.91310622 35.82160499]
fold: 1, Score: 8.75598934058948, Run Time: 1.93
GT Type: <class 'numpy.ndarray'>
Preds Type: <class 'numpy.ndarray'>
         GT:[23.52 17.96 15.09 52.52 29.07  9.69 15.09 11.39 28.24 35.36 12.55 39.6
 35.86 40.15 

Unnamed: 0_level_0,pred_cat1
id,Unnamed: 1_level_1
0.0,23.36917
1.0,31.13019
2.0,36.83854
3.0,45.07084
4.0,45.22495


Mode
=== Target Value Counts ===
Model Run Time: 10.39
Model=cat2
{}
GT Type: <class 'numpy.ndarray'>
Preds Type: <class 'numpy.ndarray'>
         GT:[42.14 13.18 33.8  38.7  45.3  19.01 28.99 17.54  8.37 57.22 25.12 16.88
 24.24 33.94 15.36 37.43 45.9  14.31 31.84 21.26]
Preds Class:[38.73376977 19.14419108 46.77445823 47.09246906 49.48357059 34.84519179
 17.92465158 22.86653217 17.20141398 60.79149043 15.73509515 35.02286691
 35.35521734 41.99715364 20.39172335 39.258505   46.11155621 13.12025513
 42.25236754 36.55760733]
Preds Prob:[38.73376977 19.14419108 46.77445823 47.09246906 49.48357059 34.84519179
 17.92465158 22.86653217 17.20141398 60.79149043 15.73509515 35.02286691
 35.35521734 41.99715364 20.39172335 39.258505   46.11155621 13.12025513
 42.25236754 36.55760733]
fold: 1, Score: 8.783210254474545, Run Time: 4.37
GT Type: <class 'numpy.ndarray'>
Preds Type: <class 'numpy.ndarray'>
         GT:[23.52 17.96 15.09 52.52 29.07  9.69 15.09 11.39 28.24 35.36 12.55 39.6
 35.86 40.1

Unnamed: 0_level_0,pred_cat2
id,Unnamed: 1_level_1
0.0,23.11491
1.0,31.77336
2.0,37.77528
3.0,44.24992
4.0,44.79392


Mode
=== Target Value Counts ===
Model Run Time: 22.43
Model=lasso
fold: 1, Score: 11.115643340722372, Run Time: 0.04
fold: 2, Score: 11.001168870318764, Run Time: 0.07
fold: 3, Score: 11.747670633136062, Run Time: 0.06
fold: 4, Score: 11.555320367018686, Run Time: 0.09
fold: 5, Score: 11.227621805637167, Run Time: 0.09
Scores -> Adjusted: 11.05028017 , mean: 11.32948500, std: 0.27920483


Unnamed: 0_level_0,pred_lasso
id,Unnamed: 1_level_1
0.0,33.14186
1.0,33.82038
2.0,31.61445
3.0,53.17541
4.0,35.01962


Mode
=== Target Value Counts ===
Model Run Time: 0.58
Model=ridge
fold: 1, Score: 10.770998025719994, Run Time: 0.05
fold: 2, Score: 10.654217451144538, Run Time: 0.07
fold: 3, Score: 11.39513807254392, Run Time: 0.08
fold: 4, Score: 11.048466221614829, Run Time: 0.08
fold: 5, Score: 10.741750226650721, Run Time: 0.09
Scores -> Adjusted: 10.65124416 , mean: 10.92211400, std: 0.27086984


Unnamed: 0_level_0,pred_ridge
id,Unnamed: 1_level_1
0.0,31.22234
1.0,31.45842
2.0,31.82823
3.0,47.00402
4.0,36.99448


Mode
=== Target Value Counts ===
Model Run Time: 0.60
Model=ridge_50
fold: 1, Score: 10.771532659095, Run Time: 0.04
fold: 2, Score: 10.652348889835322, Run Time: 0.07
fold: 3, Score: 11.394785928566236, Run Time: 0.06
fold: 4, Score: 11.047541477895903, Run Time: 0.10
fold: 5, Score: 10.739797290566475, Run Time: 0.08
Scores -> Adjusted: 10.64996925 , mean: 10.92120125, std: 0.27123200


Unnamed: 0_level_0,pred_ridge_50
id,Unnamed: 1_level_1
0.0,31.18599
1.0,31.4294
2.0,31.82143
3.0,46.79551
4.0,37.00087


Mode
=== Target Value Counts ===
Model Run Time: 0.59
CPU times: user 3min 29s, sys: 23.5 s, total: 3min 52s
Wall time: 1min 46s


In [38]:
sample_submission.head(20)

Unnamed: 0,id,Strength,target_xgb_best_params,target_lgbm_best_params,target_cat_best_params,target_xgb3,target_xgb1,target_xgb2,target_lgbm0,target_lgbm1,target_lgbm2,target_lgbm3,target_cat1,target_cat2,target_lasso,target_ridge,target_ridge_50
0,5407,35.452,46.90178,47.15996,44.97202,45.05516,46.90178,40.78519,47.15996,46.96822,47.47991,46.96822,46.26873,46.33394,33.95512,34.33324,34.34525
1,5408,35.452,18.19524,18.65615,18.09163,19.84968,18.19524,17.66659,18.65615,18.95696,19.12712,18.95696,17.98938,18.70558,28.83874,26.34597,26.34301
2,5409,35.452,31.33314,31.50406,31.4321,30.05576,31.33314,29.81802,31.50406,31.87039,32.33526,31.87039,31.18649,32.31088,28.56092,25.86969,25.87289
3,5410,35.452,44.80167,45.3459,47.09857,45.05301,44.80167,45.26397,45.3459,44.39436,45.43929,44.39436,46.11544,46.09568,38.79214,39.6157,39.63171
4,5411,35.452,20.05029,20.36055,23.86701,16.22913,20.05029,16.30435,20.36055,24.77768,24.84882,24.77768,20.91831,25.65119,30.7729,27.72353,27.72285
5,5412,35.452,34.02267,43.78796,40.44463,37.65022,34.02267,34.28682,43.78796,41.06641,41.01557,41.06641,39.20178,39.67758,34.44334,35.21317,35.21997
6,5413,35.452,24.0489,31.01874,31.1069,28.74008,24.0489,30.60802,31.01874,29.56967,32.08405,29.56967,33.36652,33.24956,29.04961,27.88248,27.89806
7,5414,35.452,21.72809,21.86013,20.67879,19.86624,21.72809,19.58776,21.86013,20.8414,21.5005,20.8414,20.5516,21.78558,30.28427,31.7795,31.77644
8,5415,35.452,40.13879,50.40358,44.92774,40.91749,40.13879,41.61526,50.40358,48.86537,46.80123,48.86537,47.62136,44.10945,36.90388,38.64076,38.59915
9,5416,35.452,35.65934,36.8334,37.58743,33.4528,35.65934,33.11555,36.8334,36.20682,38.80319,36.20682,38.26219,37.64488,31.53369,32.75968,32.78027


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Blend Models</h1>
</div>

In [39]:
all_blend_scores = pd.DataFrame(
    {
        "Model": pd.Series(dtype="str"),
        "Score": pd.Series(dtype="float"),
        "StdDev": pd.Series(dtype="float"),
    }
)

In [40]:
model_lst = ["xgb3", "xgb2", "cat1", "lgbm0", "lgbm1","lgbm2", "cat_best_params", "lgbm_best_params"]

In [41]:
len(model_lst)

8

In [42]:
target_names = [f"target_{model}" for model in model_lst]
target_names

['target_xgb3',
 'target_xgb2',
 'target_cat1',
 'target_lgbm0',
 'target_lgbm1',
 'target_lgbm2',
 'target_cat_best_params',
 'target_lgbm_best_params']

In [43]:
sample_submission[TARGET] = sample_submission[target_names].sum(axis=1) / len(model_lst)

In [44]:
sample_submission[[ID, TARGET]].to_csv("submission_equal_wt_avg.csv", index=False)
sample_submission[[ID, TARGET]].tail(8)

Unnamed: 0,id,Strength
3597,9004,18.09348
3598,9005,39.39181
3599,9006,17.24996
3600,9007,30.98815
3601,9008,31.12086
3602,9009,40.37673
3603,9010,31.65528
3604,9011,22.20564


In [45]:
sample_submission[TARGET] = (
#     (sample_submission["target_xgb_bp"] * 2 )
#     + (sample_submission["target_lgbm_bp"]  )
    (sample_submission["target_cat_best_params"] * 3 )
    + (sample_submission["target_lgbm1"])
#     + (sample_submission["target_lgbm2"])    
#     + (sample_submission["target_lgbm2"])
    + (sample_submission["target_cat1"] )
    + (sample_submission["target_cat2"] )    
#     + (sample_submission["target_cat_bp"] )
#     + (sample_submission["target_svc"] )
#     + (sample_submission["target_log_reg3"] )
#     + (sample_submission["target_cat2"] )
)/6

# sample_submission[TARGET] = sample_submission[TARGET].astype(int)

In [46]:
sample_submission[[ID, TARGET]].to_csv("submission_wt_avg.csv", index=False)
sample_submission[[ID, TARGET]].tail(8)

Unnamed: 0,id,Strength
3597,9004,17.90331
3598,9005,40.03501
3599,9006,14.51591
3600,9007,30.41383
3601,9008,33.40695
3602,9009,39.31757
3603,9010,34.60671
3604,9011,20.10319


In [47]:
all_cv_scores.sort_values(by=["Score"], ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime,n_estimators,n_folds,comments
12,lasso,11.32949,0.2792,0.58455,200,5,
13,ridge,10.92211,0.27087,0.59578,200,5,
14,ridge_50,10.9212,0.27123,0.59407,200,5,
5,xgb2,9.12337,0.11768,18.52806,200,5,
3,xgb3,9.12175,0.12251,19.53371,200,5,
0,xgb_best_params,9.02542,0.13785,4.51308,200,5,
4,xgb1,9.02542,0.13785,6.68058,200,5,
7,lgbm1,8.99138,0.08434,4.24044,200,5,
9,lgbm3,8.99138,0.08434,4.30166,200,5,
1,lgbm_best_params,8.99094,0.12269,3.83065,200,5,


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Level 1 Stack Models</h1>
</div>

In [48]:
## TODO: Generate these dictionaries from model names

train_oof_dict = {
    "train_pred_cat1": "train_pred_cat1.csv",
    "train_pred_cat2": "train_pred_cat2.csv",
    "train_pred_lgbm1": "train_pred_lgbm1.csv",    
    "train_pred_lgbm2": "train_pred_lgbm2.csv",    
    "train_pred_xgb1": "train_pred_xgb1.csv"
}

test_pred_dict = {
    "submission_cat1": "submission_cat1.csv",
    "submission_cat2": "submission_cat2.csv",
    "submission_lgbm1": "submission_lgbm1.csv",
    "submission_lgbm2": "submission_lgbm2.csv",
    "submission_xgb1": "submission_xgb1.csv",
}

In [49]:
def blend_results(train_oof_dict, test_pred_dict):
    oof_df = pd.DataFrame()
    test_preds_df = pd.DataFrame()

    for name, train_oof_fname in train_oof_dict.items():
        fname = "../working/" + train_oof_fname
        print(f"Processing {name}, {train_oof_fname}")
        df = pd.read_csv(fname)
        print(df.head())
#         print(df.iloc[:,1])
        preds = pd.Series(df.iloc[:,1], name=name)
#         print(preds[:5])
        oof_df = pd.concat([oof_df, preds], axis=1)
    #     oof_df = pd.concat([oof_df, pd.Series(np.load(TRAIN_PATH / train_oof), name=name)], axis=1)

    for name, test_pred_fname in test_pred_dict.items():
        fname = "../working/" + test_pred_fname
        print(f"{name}, {test_pred_fname}")
        df = pd.read_csv(fname)
        print(df.head())
        preds = pd.Series(df.iloc[:,1], name=name)
        test_preds_df = pd.concat([test_preds_df, preds], axis=1)

    print("=== oof ===")
    print(oof_df.head())
    print("=== test_preds ===")
    print(test_preds_df.head())
    return oof_df, test_preds_df
    
# (oof_df, preds_df) = blend_results(train_oof_dict, test_pred_dict)    

In [50]:
def load_oof_results(train_oof_dict, test_pred_dict):
    oof_df = pd.DataFrame()
    test_preds_df = pd.DataFrame()

    for name, train_oof_fname in train_oof_dict.items():
        fname = "../working/" + train_oof_fname
        print(f"Processing {name}, {train_oof_fname}")
        df = pd.read_csv(fname)
        print(df.head())
#         print(df.iloc[:,1])
        preds = pd.Series(df.iloc[:,1], name=name)
#         print(preds[:5])
        oof_df = pd.concat([oof_df, preds], axis=1)
    #     oof_df = pd.concat([oof_df, pd.Series(np.load(TRAIN_PATH / train_oof), name=name)], axis=1)

    for name, test_pred_fname in test_pred_dict.items():
        fname = "../working/" + test_pred_fname
        print(f"{name}, {test_pred_fname}")
        df = pd.read_csv(fname)
        print(df.head())
        preds = pd.Series(df.iloc[:,1], name=name)
        test_preds_df = pd.concat([test_preds_df, preds], axis=1)

    print("=== oof ===")
    print(oof_df.head())
    print("=== test_preds ===")
    print(test_preds_df.head())
    return oof_df, test_preds_df
    
(oof_df, preds_df) = load_oof_results(train_oof_dict, test_pred_dict) 

Processing train_pred_cat1, train_pred_cat1.csv
    id  pred_cat1
0  0.0   23.36917
1  1.0   31.13019
2  2.0   36.83854
3  3.0   45.07084
4  4.0   45.22495
Processing train_pred_cat2, train_pred_cat2.csv
    id  pred_cat2
0  0.0   23.11491
1  1.0   31.77336
2  2.0   37.77528
3  3.0   44.24992
4  4.0   44.79392
Processing train_pred_lgbm1, train_pred_lgbm1.csv
    id  pred_lgbm1
0  0.0    22.35015
1  1.0    36.20098
2  2.0    38.86616
3  3.0    44.03752
4  4.0    46.99747
Processing train_pred_lgbm2, train_pred_lgbm2.csv
    id  pred_lgbm2
0  0.0    21.20956
1  1.0    34.53058
2  2.0    37.42513
3  3.0    44.77962
4  4.0    44.82637
Processing train_pred_xgb1, train_pred_xgb1.csv
    id  pred_xgb1
0  0.0   25.26416
1  1.0   30.24642
2  2.0   38.82778
3  3.0   43.63323
4  4.0   44.63012
submission_cat1, submission_cat1.csv
     id  Strength
0  5407  46.26873
1  5408  17.98938
2  5409  31.18649
3  5410  46.11544
4  5411  20.91831
submission_cat2, submission_cat2.csv
     id  Strength
0  5

In [51]:
oof_df.head()

Unnamed: 0,train_pred_cat1,train_pred_cat2,train_pred_lgbm1,train_pred_lgbm2,train_pred_xgb1
0,23.36917,23.11491,22.35015,21.20956,25.26416
1,31.13019,31.77336,36.20098,34.53058,30.24642
2,36.83854,37.77528,38.86616,37.42513,38.82778
3,45.07084,44.24992,44.03752,44.77962,43.63323
4,45.22495,44.79392,46.99747,44.82637,44.63012


In [52]:
preds_df.head()

Unnamed: 0,submission_cat1,submission_cat2,submission_lgbm1,submission_lgbm2,submission_xgb1
0,46.26873,46.33394,46.96822,47.47991,46.90178
1,17.98938,18.70558,18.95696,19.12712,18.19524
2,31.18649,32.31088,31.87039,32.33526,31.33314
3,46.11544,46.09568,44.39436,45.43929,44.80167
4,20.91831,25.65119,24.77768,24.84882,20.05029


In [53]:
type(preds_df)

pandas.core.frame.DataFrame

In [54]:
def run_lr(useful_features:List[str], TARGET:str, train_df:pd.DataFrame, test_df:pd.DataFrame) -> (List[float],List[float]):
    final_predictions = []
    scores = []

    kfold = model_selection.KFold(n_splits=Config.N_FOLDS, shuffle=True, random_state=Config.seed)

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(train_df)):
        xtrain = train_df.iloc[train_idx].reset_index(drop=True)
        xvalid = train_df.iloc[valid_idx].reset_index(drop=True)

        xtest = test_df[useful_features].copy()

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]

#         model = LogisticRegression()
        model = linear_model.LinearRegression()
        # Smaller C means more regularization; default=1.0
        # 2947.0517025518097
#         model = LogisticRegression(max_iter=500, C=2947.0517025518097, penalty='l2',solver='newton-cg')
#         model = LogisticRegression(C = 2947.0517025518097,
#                         max_iter = 500,
#                         penalty = 'l2',
#                         solver = 'liblinear')
        model.fit(xtrain, ytrain)

        preds_valid = model.predict_proba(xvalid)[:,-1]
        test_preds = model.predict_proba(xtest)[:,-1]

        final_predictions.append(test_preds)
#         score = metrics.roc_auc_score(yvalid, preds_valid)
        score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        print(f"Fold={fold}, Score={score}")
        scores.append(score)
    return scores, final_predictions


In [55]:
# useful_features = ["pred_lda", "pred_gbc","pred_gbc2", "pred_cat_bp", "pred_cat1", "pred_lgbm1", "pred_lgbm2", "pred_lgbm_bp", "pred_xgb1", "pred_xgb_bp"]
useful_features = [ "train_pred_cat1", "train_pred_cat2", "train_pred_lgbm1", "train_pred_lgbm2", "train_pred_xgb1"]

In [56]:
oof_df[useful_features].head()

Unnamed: 0,train_pred_cat1,train_pred_cat2,train_pred_lgbm1,train_pred_lgbm2,train_pred_xgb1
0,23.36917,23.11491,22.35015,21.20956,25.26416
1,31.13019,31.77336,36.20098,34.53058,30.24642
2,36.83854,37.77528,38.86616,37.42513,38.82778
3,45.07084,44.24992,44.03752,44.77962,43.63323
4,45.22495,44.79392,46.99747,44.82637,44.63012


In [57]:
# preds_df[useful_features].head()

In [58]:
# fold_scores, final_predictions = run_lr(useful_features, TARGET, oof_df, preds_df)
# test_preds = np.mean(np.column_stack(final_predictions), axis=1)
# cv_score, std_dev = show_fold_scores(fold_scores)
# create_submission("level1_lr", TARGET, test_preds)

In [59]:
pd.options.display.max_colwidth = 100
pd.set_option("display.max_rows", 999)
pd.set_option("display.precision", 5)
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.max_colwidth

100

In [60]:
all_cv_scores.sort_values(by=["Score"], ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime,n_estimators,n_folds,comments
12,lasso,11.33,0.28,0.58,200,5,
13,ridge,10.92,0.27,0.6,200,5,
14,ridge_50,10.92,0.27,0.59,200,5,
5,xgb2,9.12,0.12,18.53,200,5,
3,xgb3,9.12,0.12,19.53,200,5,
0,xgb_best_params,9.03,0.14,4.51,200,5,
4,xgb1,9.03,0.14,6.68,200,5,
7,lgbm1,8.99,0.08,4.24,200,5,
9,lgbm3,8.99,0.08,4.3,200,5,
1,lgbm_best_params,8.99,0.12,3.83,200,5,


In [61]:
if Config.debug:
    print("*** Don't submit debug mode.  Only using partial training data ***")

*** Don't submit debug mode.  Only using partial training data ***
