<a href="https://www.kaggle.com/code/mmellinger66/ps3e6-paris-housing-models?scriptVersionId=119774256" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

 <div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Playground Season 3: Episode 6 - Paris Housing Prices</h1>
</div>

## Problem Type

Regression

## Evaluation Metric


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Import Libraries</h1>
</div>

In [1]:
from typing import List, Set, Dict, Tuple, Optional

import os
import time
from pathlib import Path
import glob
import gc

import pandas as pd
import numpy as np

from sklearn import impute
from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import cluster
from sklearn import model_selection
from sklearn import ensemble
from sklearn import datasets

import xgboost as xgb
import catboost as cb
import lightgbm as lgb

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Visualization Libraries
import matplotlib as mpl
import matplotlib.pylab as plt
import seaborn as sns
import missingno as msno
from folium import Map
from folium.plugins import HeatMap
from IPython.display import display_html, display_markdown, display_latex
from colorama import Fore, Style

import warnings
warnings.filterwarnings('ignore')

pd.set_option("display.max_rows", 999)
pd.set_option("display.precision", 5)

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Configuration</h1>
</div>

In [2]:
TARGET="price"
ID="id"

In [3]:
class Config:
    path:str = "../input/playground-series-s3e6/"
    gpu:bool = True
    optimize:bool = True
    n_optuna_trials:int = 30
    fast_render:bool = False
    calc_probability:bool = False
    debug:bool = False
    seed:int = 42
    N_ESTIMATORS:int = 2000  # 100, 300, 1000, 2000, 5000, 15_000, 20_000 GBDT
    GPU_N_ESTIMATORS:int = 2000 # Want models to run fast during dev
    N_FOLDS:int = 5

In [4]:
class clr:
    S = Style.BRIGHT + Fore.LIGHTRED_EX
    E = Style.RESET_ALL

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Library</h1>
</div>

In [5]:
def read_data(path: str, analyze:bool=True) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    data_dir = Path(path)

    train = pd.read_csv(data_dir / "train.csv")
    test = pd.read_csv(data_dir / "test.csv")
    submission_df = pd.read_csv(data_dir / "sample_submission.csv")

    if analyze:
        print(clr.S + "=== Shape of Data ==="+clr.E)
        print(f" train data: Rows={train.shape[0]}, Columns={train.shape[1]}")
        print(f" test data : Rows={test.shape[0]}, Columns={test.shape[1]}")

        print(clr.S + "\n=== Train Data: First 5 Rows ===\n"+clr.E)
        display(train.head())
        print(f"\n{clr.S}=== Train Column Names ==={clr.E}\n")
        display(train.columns)
        print(f"\n{clr.S}=== Features/Explanatory Variables ==={clr.E}\n")
        eval_features(train)
        print(f"\n{clr.S}=== Skewness ==={clr.E}\n")
        check_skew(train)
    return train, test, submission_df

def create_submission(model_name: str, target, preds, seed:int=42, nfolds:int=5) -> pd.DataFrame:
    sample_submission[target] = preds #.astype(int)

    if len(model_name) > 0:
        fname = f"submission_{model_name}_k{nfolds}_s{seed}.csv"
    else:
        fname = "submission.csv"

    sample_submission.to_csv(fname, index=False)

    return sample_submission

def show_classification_scores(ground_truth:List[int], yhat:List[int]) -> None:
    accuracy = metrics.accuracy_score(ground_truth, yhat)
    precision = metrics.precision_score(ground_truth, yhat)
    recall = metrics.recall_score(ground_truth, yhat)
    roc = metrics.roc_auc_score(ground_truth, yhat)
    f1 = metrics.f1_score(ground_truth, yhat)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"ROC: {roc:.4f}")
    print(f"f1: {f1:.4f}")
    

def label_encoder(train:pd.DataFrame, test:pd.DataFrame, columns:List[str]) -> (pd.DataFrame, pd.DataFrame) :
    for col in columns:
        train[col] = train[col].astype(str)
        test[col] = test[col].astype(str)
        train[col] = preprocessing.LabelEncoder().fit_transform(train[col])
        test[col] = preprocessing.LabelEncoder().fit_transform(test[col])
    return train, test   

def create_strat_folds(df:pd.DataFrame, TARGET, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    print(f"TARGET={TARGET}, n_folds={n_folds}, seed={seed}")
    df["fold"] = -1

    kf = model_selection.StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    # kf = GroupKFold(n_splits=Config.N_FOLDS)
    for fold, (train_idx, valid_idx) in enumerate(kf.split(df, df[TARGET])):
        df.loc[valid_idx, "fold"] = fold

    # df.to_csv(f"train_fold{num_folds}.csv", index=False)
    return df


def create_folds(df:pd.DataFrame, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    print(f"n_folds={n_folds}, seed={seed}")
    df["fold"] = -1

    kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=seed)

    for fold, (train_idx, valid_idx) in enumerate(kf.split(df)):
        df.loc[valid_idx, "fold"] = fold

    # df.to_csv(f"train_fold{num_folds}.csv", index=False)
    return df

def show_fold_scores(scores: List[float]) -> (float, float):
    cv_score = np.mean(scores)  # Used in filename
    std_dev = np.std(scores)
    print(
        f"Scores -> Adjusted: {np.mean(scores) - np.std(scores):.8f} , mean: {np.mean(scores):.8f}, std: {np.std(scores):.8f}"
    )
    return cv_score, std_dev


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    continuous_features = list(df.select_dtypes(include=['int64', 'float64', 'uint8']).columns)
    categorical_features = list(df.select_dtypes(include=['object', 'bool']).columns)
    if display:
        print(f"{clr.S}Continuous Features={continuous_features}{clr.E}\n")
        print(f"{clr.S}Categorical Features={categorical_features}{clr.E}")
    return continuous_features, categorical_features   

def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    print("=== Cardinality ===")
    print(df[features].nunique())

## === Model Support ===    

from scipy.stats import mode


def merge_test_predictions(final_test_predictions:List[float], calc_probability:bool=True) -> List[float]:

    if calc_probability:
        print("Mean")
        result = np.mean(np.column_stack(final_test_predictions), axis=1)
    else:
        print("Mode")
        mode_result = mode(np.column_stack(final_test_predictions), axis=1)
        result = mode_result[0].ravel()

    return result

def summary_statistics(X:pd.DataFrame, enhanced=True) -> None:
    desc = X.describe()
    if enhanced:
        desc.loc["var"] = X.var(numeric_only=True).tolist()
        desc.loc["skew"] = X.skew(numeric_only=True).tolist()
        desc.loc["kurt"] = X.kurtosis(numeric_only=True).tolist()

    with pd.option_context("display.precision", 2):
        style = desc.transpose().style.background_gradient(
            cmap="coolwarm"
        )  # .set_precision(4)
    display(style)
    
def show_missing_features(df:pd.DataFrame) -> None:
    missing_vals = df.isna().sum().sort_values(ascending=False)
    print(missing_vals[missing_vals > 0])


def show_duplicate_records(df:pd.DataFrame) -> None:
    dups = df.duplicated()
    print(dups.sum())


def eval_features(df:pd.DataFrame) -> (List[str], List[str], List[str]):
    ## Separate Categorical and Numerical Features
    categorical_features = list(
        df.select_dtypes(include=["category", "object"]).columns
    )
    continuous_features = list(df.select_dtypes(include=["number"]).columns)

    print(f"{clr.S}Continuous features:{clr.E} {continuous_features}")
    print(f"{clr.S}Categorical features:{clr.E} {categorical_features}")
    print("\n --- Cardinality of Categorical Features ---\n")

    for feature in categorical_features:
        cardinality = df[feature].nunique()
        if cardinality < 10:
            print(f"{clr.S}{feature}{clr.E}: cardinality={cardinality}, {df[feature].unique()}")
        else:
            print(f"{clr.S}{feature}{clr.E}: cardinality={cardinality}")
    all_features = categorical_features + continuous_features
    return all_features, categorical_features, continuous_features


def show_feature_importance(feature_importance_lst:List[str]) -> None:
    fis_df = pd.concat(feature_importance_lst, axis=1)

    fis_df.sort_values("0_importance", ascending=True).head(40).plot(
        kind="barh", figsize=(12, 12), title="Feature Importance Across Folds"
    )
    plt.show()


def show_feature_target_crosstab(df:pd.DataFrame, feature_lst:List[str], target:str) -> None:
    for feature in feature_lst:
        print(f"\n=== {feature} vs {target} ===\n")
        display(
            pd.crosstab(df[feature], df[target], margins=True)
        )  # display keeps bold formatting


def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    print(f"{clr.S}=== Cardinality ==={clr.E}")
    print(df[features].nunique())


def show_unique_features(df:pd.DataFrame, features:List[str]) -> None:
    for col in features:
        print(col, sorted(df[col].dropna().unique()))


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    continuous_features = list(
        df.select_dtypes(include=["int64", "float64", "uint8"]).columns
    )
    categorical_features = list(df.select_dtypes(include=["object", "bool"]).columns)
    if display:
        print(f"{clr.S}Continuous Features={clr.E}{continuous_features}\n")
        print(f"{clr.S}Categorical Features={clr.E}{categorical_features}")
    return continuous_features, categorical_features


def describe(X:pd.DataFrame) -> None:
    "Deprecated: Use summary_statistics()"
    desc = X.describe()
    desc.loc['var'] = X.var(numeric_only=True).tolist()
    desc.loc['skew'] = X.skew(numeric_only=True).tolist()
    desc.loc['kurt'] = X.kurtosis(numeric_only=True).tolist()

    with pd.option_context('display.precision', 2):
        style = desc.transpose().style.background_gradient(cmap='coolwarm') #.set_precision(4)
    display(style)
  

def check_skew(df:pd.DataFrame) -> None:
    skew = df.skew(skipna=True,numeric_only=True).sort_values(ascending=False)
    print(skew)
    
def gpu_ify_lgbm(lgbm_dict):
    lgbm_dict["device"] = "gpu"
    lgbm_dict["boosting_type"] = "gbdt"
    lgbm_dict["gpu_platform_id"] = 0
    lgbm_dict["gpu_device_id"] = 0
    return lgbm_dict

def gpu_ify_cb(params):
    params["task_type"] = "GPU"
    return params    


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Optuna Hyperparameter Optimization Library</h1>
</div>

In [6]:
def objective_xgb(trial, X_train, X_valid, y_train, y_valid):

    xgb_params = {
        #         "objective": trial.suggest_categorical("objective", ["multi:softmax"]),
        #         "eval_metric": "mlogloss",
        #         "objective": "multi:softmax",
        "eval_metric": "rmse",  # auc, rmse, mae
        "objective": "reg:squarederror",
        #         "enable_categorical": trial.suggest_categorical("use_label_encoder", [True]),
        "use_label_encoder": trial.suggest_categorical("use_label_encoder", [False]),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 5000, 100),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "max_depth": trial.suggest_int("max_depth", 1, 20),  # 10
        "gamma": trial.suggest_float("gamma", 0, 100, step=0.1),
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "tree_method": trial.suggest_categorical(
            "tree_method", ["gpu_hist"]
        ),  # hist, gpu_hist
        "predictor": "gpu_predictor",
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 100),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }

    # Model loading and training
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=5000,
        verbose=0,
    )

    print(f"Number of boosting rounds: {model.best_iteration}")
    #     oof = model.predict_proba(X_valid)[:, 1] # Probability
    oof = model.predict(X_valid)  # Classification: 0,1

    return metrics.mean_squared_error(y_valid, oof, squared=False)


def objective_lgbm(trial, X_train, X_valid, y_train, y_valid):

    lgbm_params = {
        "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [1_000]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [5000]),
        "n_estimators": trial.suggest_int("n_estimators", 700, 5000),
        "importance_type": "gain",
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 300),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        #         'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }

    # Model loading and training
    model = lgb.LGBMRegressor(**lgbm_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric="mae",
        callbacks=[
            lgb.log_evaluation(500),
            lgb.early_stopping(500, False, True),
        ],
    )

    #     print(f"Number of boosting rounds: {model.best_iteration}")
    oof = model.predict(X_valid)

    return metrics.mean_squared_error(y_valid, oof, squared=False)
#     return metrics.mean_absolute_error(y_valid, oof)


def objective_clf_lgbm(trial, X_train, X_valid, y_train, y_valid):

    params = {
        "boosting_type": "gbdt",
        # "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),
        #         "objective": trial.suggest_categorical("objective", ["multi:softprob"]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [1_000]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [5000]),
        "n_estimators": trial.suggest_int("n_estimators", 700, 1000),
        "importance_type": "gain",
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 300),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        #         'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }
    if Config.gpu:
        params["device_type"] = "gpu"

    # Model loading and training
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="mae",
        callbacks=[
            lgb.log_evaluation(500),
            lgb.early_stopping(500, False, True),
        ],
    )

    #     print(f"Number of boosting rounds: {model.best_iteration}")
    oof = model.predict(X_valid)

    #     return accuracy_score(y_valid, oof)
    return metrics.roc_auc_score(y_valid, oof)


def objective_cb(trial, X_train, X_valid, y_train, y_valid):

    cb_params = {
        "iterations": 100,
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.1, 1.0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 100),
        "bagging_temperature": trial.suggest_loguniform(
            "bagging_temperature", 0.1, 20.0
        ),
        "random_strength": trial.suggest_float("random_strength", 1.0, 2.0),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
          "use_best_model": True,
#         "task_type": "GPU",
        "random_seed": 42,
    }

    # Model loading and training
    #  model = CatBoostClassifier(**cb_params)
    model = cb.CatBoostRegressor(**cb_params)

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="accuracy",
        early_stopping_rounds=500,
        verbose=False,
    )

#     print(f"Number of boosting rounds: {model.best_iteration}")
    # oof = model.predict_proba(X_valid)[:, 1]
    oof = model.predict(X_valid)  # Classification
    return metrics.mean_squared_error(y_valid, oof, squared=False)
#     return metrics.mean_absolute_error(y_valid, oof)
# 
#     return accuracy_score(y_valid, oof)

def objective_clf_cb(trial, X_train, X_valid, y_train, y_valid):

    cb_params = {
        "iterations": 10,  # 1000
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.1, 1.0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 100),
        "bagging_temperature": trial.suggest_loguniform(
            "bagging_temperature", 0.1, 20.0
        ),
        "random_strength": trial.suggest_float("random_strength", 1.0, 2.0),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
        "use_best_model": True,
#             "task_type": "GPU",
        "random_seed": 42,
    }

    # Model loading and training
    model = cb.CatBoostClassifier(**cb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="accuracy",
        early_stopping_rounds=500,
        verbose=False,
    )

    # print(f"Number of boosting rounds: {model.best_iteration}")
    # oof = model.predict_proba(X_valid)[:, 1]
    oof = model.predict(X_valid)  # Classification

    return metrics.accuracy_score(y_valid, oof)

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Load Train/Test Data and Analyze</h1>
</div>

## Load the following files

 - train.csv - Data used to build our machine learning model
 - test.csv - Data used to build our machine learning model. Does not contain the target variable
 - sample_submission.csv - A file in the proper format to submit test predictions

In [7]:
%%time
train, test, sample_submission = read_data(Config.path, analyze=True)                                

[1m[91m=== Shape of Data ===[0m
 train data: Rows=22730, Columns=18
 test data : Rows=15154, Columns=17
[1m[91m
=== Train Data: First 5 Rows ===
[0m


Unnamed: 0,id,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
0,0,34291,24,1,0,47,35693,2,1,2000,0,1,8,5196,369,0,3,3436795.2
1,1,95145,60,0,1,60,34773,1,4,2000,0,1,729,4496,277,0,6,9519958.0
2,2,92661,45,1,1,62,45457,4,8,2020,1,1,7473,8953,245,1,9,9276448.1
3,3,97184,99,0,0,59,15113,1,1,2000,0,1,6424,8522,256,1,9,9725732.2
4,4,61752,100,0,0,57,64245,8,4,2018,1,0,7151,2786,863,0,7,6181908.8



[1m[91m=== Train Column Names ===[0m



Index(['id', 'squareMeters', 'numberOfRooms', 'hasYard', 'hasPool', 'floors',
       'cityCode', 'cityPartRange', 'numPrevOwners', 'made', 'isNewBuilt',
       'hasStormProtector', 'basement', 'attic', 'garage', 'hasStorageRoom',
       'hasGuestRoom', 'price'],
      dtype='object')


[1m[91m=== Features/Explanatory Variables ===[0m

[1m[91mContinuous features:[0m ['id', 'squareMeters', 'numberOfRooms', 'hasYard', 'hasPool', 'floors', 'cityCode', 'cityPartRange', 'numPrevOwners', 'made', 'isNewBuilt', 'hasStormProtector', 'basement', 'attic', 'garage', 'hasStorageRoom', 'hasGuestRoom', 'price']
[1m[91mCategorical features:[0m []

 --- Cardinality of Categorical Features ---


[1m[91m=== Skewness ===[0m

floors               85.12233
squareMeters         79.25331
made                 66.93441
basement              3.33664
attic                 2.80996
garage                1.37876
cityCode              0.24580
hasPool               0.18986
hasStormProtector     0.16049
hasStorageRoom        0.15374
price                 0.13140
isNewBuilt            0.12838
numberOfRooms         0.11508
hasYard               0.09656
id                    0.00000
numPrevOwners        -0.07711
cityPartRange        -0.09306
hasGuestRoom         -0.11777
dtype: float64
CPU t

In [8]:
train.head()

Unnamed: 0,id,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
0,0,34291,24,1,0,47,35693,2,1,2000,0,1,8,5196,369,0,3,3436795.2
1,1,95145,60,0,1,60,34773,1,4,2000,0,1,729,4496,277,0,6,9519958.0
2,2,92661,45,1,1,62,45457,4,8,2020,1,1,7473,8953,245,1,9,9276448.1
3,3,97184,99,0,0,59,15113,1,1,2000,0,1,6424,8522,256,1,9,9725732.2
4,4,61752,100,0,0,57,64245,8,4,2018,1,0,7151,2786,863,0,7,6181908.8


In [9]:
original = pd.read_csv("../input/paris-housing-price-prediction/ParisHousing.csv")

original.head()

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
0,75523,3,0,1,63,9373,3,8,2005,0,1,4313,9005,956,0,7,7559081.5
1,80771,39,1,1,98,39381,8,6,2015,1,0,3653,2436,128,1,2,8085989.5
2,55712,58,0,1,19,34457,6,8,2021,0,0,2937,8852,135,1,9,5574642.1
3,32316,47,0,0,6,27939,10,4,2012,0,1,659,7141,359,0,3,3232561.2
4,70429,19,1,1,90,38045,3,7,1990,1,0,8435,2429,292,1,4,7055052.0


In [10]:
original = original.reset_index()
original['id'] = original['index'] + 1000000
original = original.drop(columns = ['index'])
original.head()

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price,id
0,75523,3,0,1,63,9373,3,8,2005,0,1,4313,9005,956,0,7,7559081.5,1000000
1,80771,39,1,1,98,39381,8,6,2015,1,0,3653,2436,128,1,2,8085989.5,1000001
2,55712,58,0,1,19,34457,6,8,2021,0,0,2937,8852,135,1,9,5574642.1,1000002
3,32316,47,0,0,6,27939,10,4,2012,0,1,659,7141,359,0,3,3232561.2,1000003
4,70429,19,1,1,90,38045,3,7,1990,1,0,8435,2429,292,1,4,7055052.0,1000004


In [11]:
train['is_original']    = 0
test['is_original']     = 0
original['is_original'] = 1
combined = pd.concat([train, original], ignore_index=True)
train = combined

In [12]:
combined.head()

Unnamed: 0,id,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price,is_original
0,0,34291,24,1,0,47,35693,2,1,2000,0,1,8,5196,369,0,3,3436795.2,0
1,1,95145,60,0,1,60,34773,1,4,2000,0,1,729,4496,277,0,6,9519958.0,0
2,2,92661,45,1,1,62,45457,4,8,2020,1,1,7473,8953,245,1,9,9276448.1,0
3,3,97184,99,0,0,59,15113,1,1,2000,0,1,6424,8522,256,1,9,9725732.2,0
4,4,61752,100,0,0,57,64245,8,4,2018,1,0,7151,2786,863,0,7,6181908.8,0


In [13]:
summary_statistics(train.drop(columns=[ID], axis=1), enhanced=True)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,var,skew,kurt
squareMeters,32730.0,47589.55,44252.7,89.0,21581.0,46132.0,72545.0,6071330.0,1958301742.77,77.09,10489.2
numberOfRooms,32730.0,48.89,28.42,1.0,25.0,48.0,75.0,100.0,807.94,0.09,-1.16
hasYard,32730.0,0.49,0.5,0.0,0.0,0.0,1.0,1.0,0.25,0.06,-2.0
hasPool,32730.0,0.47,0.5,0.0,0.0,0.0,1.0,1.0,0.25,0.14,-1.98
floors,32730.0,48.21,42.92,1.0,25.0,46.0,72.0,6000.0,1842.06,81.51,11298.59
cityCode,32730.0,50078.47,29704.41,3.0,23446.0,50452.0,76229.0,491100.0,882351901.11,0.17,1.43
cityPartRange,32730.0,5.56,2.78,1.0,3.0,6.0,8.0,10.0,7.73,-0.06,-1.17
numPrevOwners,32730.0,5.59,2.76,1.0,3.0,6.0,8.0,10.0,7.61,-0.05,-1.14
made,32730.0,2007.24,99.16,1990.0,2000.0,2006.0,2014.0,10000.0,9833.46,80.0,6446.0
isNewBuilt,32730.0,0.48,0.5,0.0,0.0,0.0,1.0,1.0,0.25,0.09,-1.99


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Feature Engineering</h1>
</div>

## Categorical/Numerical Variables

In [14]:
train.drop(['cityCode'], axis=1, inplace=True)
test.drop(['cityCode'], axis=1, inplace=True)


In [15]:
cont_features, cat_features = feature_distribution_types(train, display=True)
show_cardinality(train, cat_features)

cont_features.remove(TARGET)
cont_features.remove(ID)
FEATURES = cont_features + cat_features
FEATURES

[1m[91mContinuous Features=[0m['id', 'squareMeters', 'numberOfRooms', 'hasYard', 'hasPool', 'floors', 'cityPartRange', 'numPrevOwners', 'made', 'isNewBuilt', 'hasStormProtector', 'basement', 'attic', 'garage', 'hasStorageRoom', 'hasGuestRoom', 'price', 'is_original']

[1m[91mCategorical Features=[0m[]
[1m[91m=== Cardinality ===[0m
Series([], dtype: float64)


['squareMeters',
 'numberOfRooms',
 'hasYard',
 'hasPool',
 'floors',
 'cityPartRange',
 'numPrevOwners',
 'made',
 'isNewBuilt',
 'hasStormProtector',
 'basement',
 'attic',
 'garage',
 'hasStorageRoom',
 'hasGuestRoom',
 'is_original']

In [16]:
excluded_features = [TARGET, ID, "fold"]

In [17]:
cont_features, cat_features = feature_distribution_types(train, display=True)
show_cardinality(train, cat_features)

cont_features = [feature for feature in cont_features if feature not in excluded_features]
cat_features = [feature for feature in cat_features if feature not in excluded_features]

FEATURES = cont_features + cat_features
FEATURES

[1m[91mContinuous Features=[0m['id', 'squareMeters', 'numberOfRooms', 'hasYard', 'hasPool', 'floors', 'cityPartRange', 'numPrevOwners', 'made', 'isNewBuilt', 'hasStormProtector', 'basement', 'attic', 'garage', 'hasStorageRoom', 'hasGuestRoom', 'price', 'is_original']

[1m[91mCategorical Features=[0m[]
[1m[91m=== Cardinality ===[0m
Series([], dtype: float64)


['squareMeters',
 'numberOfRooms',
 'hasYard',
 'hasPool',
 'floors',
 'cityPartRange',
 'numPrevOwners',
 'made',
 'isNewBuilt',
 'hasStormProtector',
 'basement',
 'attic',
 'garage',
 'hasStorageRoom',
 'hasGuestRoom',
 'is_original']

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Optuna Hyperparameter Optimization</h1>
</div>

In [18]:
%%time

if Config.optimize:
    y = train[TARGET]
    X = train[FEATURES].copy()

    X_test = test[FEATURES].copy()
    X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
        X, y, test_size=0.2, random_state=Config.seed
    )

# === XGB ===

time_limit = 3600 * 3
best_xgb_params = {}

if Config.optimize:
    study = optuna.create_study(direction="maximize")
    study.optimize(
        lambda trial: objective_xgb(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_xgb_params = study.best_trial.params

## === LGBM ===

time_limit = 3600 * 3
best_lgbm_params = {}

if Config.optimize:
    study = optuna.create_study(direction="maximize")
    study.optimize(
        lambda trial: objective_lgbm(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_lgbm_params = study.best_trial.params

## === CatBoost

time_limit = 3600 * 3
best_cb_params = {}

if Config.optimize:
    study = optuna.create_study(direction="maximize")
    study.optimize(
        lambda trial: objective_cb(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_cb_params = study.best_trial.params

[32m[I 2023-02-20 17:56:09,804][0m A new study created in memory with name: no-name-16ce9555-c8a0-4132-abf0-4f380086f776[0m
[32m[I 2023-02-20 17:57:08,218][0m Trial 0 finished with value: 215330.73989893027 and parameters: {'use_label_encoder': False, 'n_estimators': 3200, 'learning_rate': 0.07502172100141696, 'subsample': 0.36, 'colsample_bytree': 0.8500000000000001, 'max_depth': 12, 'gamma': 94.5, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.10631323726463023, 'reg_alpha': 4.267395112269372, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 9.321034714762156}. Best is trial 0 with value: 215330.73989893027.[0m


Number of boosting rounds: 324


[32m[I 2023-02-20 17:57:14,278][0m Trial 1 finished with value: 184553.72935743703 and parameters: {'use_label_encoder': False, 'n_estimators': 4700, 'learning_rate': 0.018802691772853444, 'subsample': 0.45000000000000007, 'colsample_bytree': 0.05, 'max_depth': 3, 'gamma': 14.0, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.0006052691569581149, 'reg_alpha': 6.472908437565326e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.1129203944253971}. Best is trial 0 with value: 215330.73989893027.[0m


Number of boosting rounds: 4699


[32m[I 2023-02-20 17:59:41,732][0m Trial 2 finished with value: 675995.5290580511 and parameters: {'use_label_encoder': False, 'n_estimators': 2600, 'learning_rate': 0.1571476590255056, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.7500000000000001, 'max_depth': 20, 'gamma': 11.5, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.054906656486128, 'reg_alpha': 2.135308819258508e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 1.4106216621070111}. Best is trial 2 with value: 675995.5290580511.[0m


Number of boosting rounds: 49


[32m[I 2023-02-20 17:59:46,816][0m Trial 3 finished with value: 173278.14002114433 and parameters: {'use_label_encoder': False, 'n_estimators': 4200, 'learning_rate': 0.01658929035149611, 'subsample': 0.95, 'colsample_bytree': 0.5900000000000001, 'max_depth': 2, 'gamma': 31.0, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.00021422883282107573, 'reg_alpha': 0.12190934663752785, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 156.23971288286614}. Best is trial 2 with value: 675995.5290580511.[0m


Number of boosting rounds: 3962


[32m[I 2023-02-20 17:59:49,124][0m Trial 4 finished with value: 171880.7917191851 and parameters: {'use_label_encoder': False, 'n_estimators': 1600, 'learning_rate': 0.10330029504156692, 'subsample': 0.87, 'colsample_bytree': 0.6900000000000001, 'max_depth': 3, 'gamma': 48.300000000000004, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 2.787121754466103e-06, 'reg_alpha': 0.6245507775881187, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 2.9234336276245387}. Best is trial 2 with value: 675995.5290580511.[0m


Number of boosting rounds: 1595


[32m[I 2023-02-20 17:59:57,216][0m Trial 5 finished with value: 172660.11530749904 and parameters: {'use_label_encoder': False, 'n_estimators': 4800, 'learning_rate': 0.020274995655244696, 'subsample': 0.84, 'colsample_bytree': 0.91, 'max_depth': 4, 'gamma': 23.6, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 1.0288708464656183e-05, 'reg_alpha': 9.520848769226495e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 221.13243838541203}. Best is trial 2 with value: 675995.5290580511.[0m


Number of boosting rounds: 1609


[32m[I 2023-02-20 18:00:28,607][0m Trial 6 finished with value: 174016.16517433972 and parameters: {'use_label_encoder': False, 'n_estimators': 4300, 'learning_rate': 0.028299518544844953, 'subsample': 0.39, 'colsample_bytree': 0.7500000000000001, 'max_depth': 10, 'gamma': 31.8, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 1.8281454727846378, 'reg_alpha': 47.27392973116712, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 32.360614039308174}. Best is trial 2 with value: 675995.5290580511.[0m


Number of boosting rounds: 2805


[32m[I 2023-02-20 18:00:32,883][0m Trial 7 finished with value: 188409.76970888898 and parameters: {'use_label_encoder': False, 'n_estimators': 3000, 'learning_rate': 0.17056210808777478, 'subsample': 0.13, 'colsample_bytree': 0.11, 'max_depth': 7, 'gamma': 78.7, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 2.247288283769076e-08, 'reg_alpha': 3.966587871754978e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 139.99931961046923}. Best is trial 2 with value: 675995.5290580511.[0m


Number of boosting rounds: 2678


[32m[I 2023-02-20 18:01:45,451][0m Trial 8 finished with value: 224879.72610197926 and parameters: {'use_label_encoder': False, 'n_estimators': 2900, 'learning_rate': 0.037820196205351965, 'subsample': 0.52, 'colsample_bytree': 0.65, 'max_depth': 14, 'gamma': 21.400000000000002, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 1.4007434509380636, 'reg_alpha': 2.8608324632913932e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 17.216659787381783}. Best is trial 2 with value: 675995.5290580511.[0m


Number of boosting rounds: 982


[32m[I 2023-02-20 18:01:51,294][0m Trial 9 finished with value: 181002.6057389543 and parameters: {'use_label_encoder': False, 'n_estimators': 4800, 'learning_rate': 0.021361043493784412, 'subsample': 0.13, 'colsample_bytree': 0.7500000000000001, 'max_depth': 2, 'gamma': 16.3, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 1.8925135817530905e-08, 'reg_alpha': 1.1385047232021919, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 122.97844168941063}. Best is trial 2 with value: 675995.5290580511.[0m


Number of boosting rounds: 4657


[32m[I 2023-02-20 18:08:17,885][0m Trial 10 finished with value: 514082.07622921636 and parameters: {'use_label_encoder': False, 'n_estimators': 1400, 'learning_rate': 0.24142300932662095, 'subsample': 0.69, 'colsample_bytree': 0.37, 'max_depth': 20, 'gamma': 0.5, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 25.834828425039532, 'reg_alpha': 1.336143342896779e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.45957348060610775}. Best is trial 2 with value: 675995.5290580511.[0m


Number of boosting rounds: 450


[32m[I 2023-02-20 18:12:54,810][0m Trial 11 finished with value: 688933.2985280649 and parameters: {'use_label_encoder': False, 'n_estimators': 1100, 'learning_rate': 0.2353073124553338, 'subsample': 0.66, 'colsample_bytree': 0.35, 'max_depth': 20, 'gamma': 6.4, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 14.662688799477486, 'reg_alpha': 1.8539392339069933e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.6832642038576354}. Best is trial 11 with value: 688933.2985280649.[0m


Number of boosting rounds: 517


[32m[I 2023-02-20 18:15:40,125][0m Trial 12 finished with value: 1876603.787486542 and parameters: {'use_label_encoder': False, 'n_estimators': 2100, 'learning_rate': 0.15282433238630894, 'subsample': 0.7, 'colsample_bytree': 0.4, 'max_depth': 19, 'gamma': 2.0, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.03225966397805394, 'reg_alpha': 6.973864756231836e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 1.1922077239517475}. Best is trial 12 with value: 1876603.787486542.[0m


Number of boosting rounds: 210


[32m[I 2023-02-20 18:18:19,771][0m Trial 13 finished with value: 361484.23283992964 and parameters: {'use_label_encoder': False, 'n_estimators': 1000, 'learning_rate': 0.24709264864651675, 'subsample': 0.67, 'colsample_bytree': 0.38, 'max_depth': 16, 'gamma': 53.7, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 53.06100276964114, 'reg_alpha': 1.8705104098834182e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.7976819747172147}. Best is trial 12 with value: 1876603.787486542.[0m


Number of boosting rounds: 481


[32m[I 2023-02-20 18:27:46,978][0m Trial 14 finished with value: 1342719.0569601997 and parameters: {'use_label_encoder': False, 'n_estimators': 2100, 'learning_rate': 0.06597176551311118, 'subsample': 0.66, 'colsample_bytree': 0.39, 'max_depth': 17, 'gamma': 2.2, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.06762653418887472, 'reg_alpha': 2.7780300200511238e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.31289232807600886}. Best is trial 12 with value: 1876603.787486542.[0m


Number of boosting rounds: 670
Number of boosting rounds: 1493


[32m[I 2023-02-20 18:34:02,393][0m Trial 15 finished with value: 1257896.506042855 and parameters: {'use_label_encoder': False, 'n_estimators': 2100, 'learning_rate': 0.056549788653477415, 'subsample': 0.8, 'colsample_bytree': 0.22999999999999998, 'max_depth': 17, 'gamma': 45.6, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.01168279297643752, 'reg_alpha': 0.004856038301208466, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.1298092477108786}. Best is trial 12 with value: 1876603.787486542.[0m


Number of boosting rounds: 2091


[32m[I 2023-02-20 18:39:37,300][0m Trial 16 finished with value: 657608.5581896893 and parameters: {'use_label_encoder': False, 'n_estimators': 2100, 'learning_rate': 0.010489877413474269, 'subsample': 0.5700000000000001, 'colsample_bytree': 0.49, 'max_depth': 17, 'gamma': 65.5, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.006923663268454306, 'reg_alpha': 4.464972563905349e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 3.6624862074317632}. Best is trial 12 with value: 1876603.787486542.[0m
[32m[I 2023-02-20 18:39:42,542][0m Trial 17 finished with value: 228215.45825810157 and parameters: {'use_label_encoder': False, 'n_estimators': 2100, 'learning_rate': 0.09365439925169201, 'subsample': 0.74, 'colsample_bytree': 0.5, 'max_depth': 14, 'gamma': 0.7000000000000001, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.8376255940124964, 'reg_alpha': 3.2506734067398845e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 940.2492156244732}.

Number of boosting rounds: 1358


[32m[I 2023-02-20 18:41:03,059][0m Trial 18 finished with value: 466823.1408874633 and parameters: {'use_label_encoder': False, 'n_estimators': 2500, 'learning_rate': 0.05933333089111779, 'subsample': 0.61, 'colsample_bytree': 0.29, 'max_depth': 10, 'gamma': 36.9, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.13507819154239814, 'reg_alpha': 1.0856821401021651e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.30521794411537145}. Best is trial 12 with value: 1876603.787486542.[0m


Number of boosting rounds: 2499
Number of boosting rounds: 3799


[32m[I 2023-02-20 18:48:42,270][0m Trial 19 finished with value: 884026.6126183475 and parameters: {'use_label_encoder': False, 'n_estimators': 3800, 'learning_rate': 0.04363739174578454, 'subsample': 1.0, 'colsample_bytree': 0.16, 'max_depth': 18, 'gamma': 60.5, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.0048281518349195095, 'reg_alpha': 1.0678361187904702e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.26677960990684485}. Best is trial 12 with value: 1876603.787486542.[0m
[32m[I 2023-02-20 18:50:30,165][0m Trial 20 finished with value: 1174839.9932451244 and parameters: {'use_label_encoder': False, 'n_estimators': 3500, 'learning_rate': 0.09533513912007333, 'subsample': 0.77, 'colsample_bytree': 0.44, 'max_depth': 14, 'gamma': 24.8, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.0012136122456765759, 'reg_alpha': 4.71800637077768e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 1.1207119546380333}. Best is trial 12 with v

Number of boosting rounds: 587
Number of boosting rounds: 1669


[32m[I 2023-02-20 18:56:56,353][0m Trial 21 finished with value: 1358263.4945121133 and parameters: {'use_label_encoder': False, 'n_estimators': 1900, 'learning_rate': 0.05774465826729088, 'subsample': 0.82, 'colsample_bytree': 0.24, 'max_depth': 18, 'gamma': 38.800000000000004, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.02618341306637573, 'reg_alpha': 0.006401246615729677, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.22965508226224202}. Best is trial 12 with value: 1876603.787486542.[0m
[32m[I 2023-02-20 19:01:43,173][0m Trial 22 finished with value: 1207057.6931651635 and parameters: {'use_label_encoder': False, 'n_estimators': 1800, 'learning_rate': 0.06456257538145885, 'subsample': 0.9, 'colsample_bytree': 0.24, 'max_depth': 16, 'gamma': 41.1, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.04400571623858099, 'reg_alpha': 0.0010395236487808007, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.23667395058545768}. Best is tria

Number of boosting rounds: 1677
Number of boosting rounds: 1297


[32m[I 2023-02-20 19:10:19,313][0m Trial 23 finished with value: 1264857.0102369813 and parameters: {'use_label_encoder': False, 'n_estimators': 2500, 'learning_rate': 0.0398501330196002, 'subsample': 0.74, 'colsample_bytree': 0.28, 'max_depth': 18, 'gamma': 75.10000000000001, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.46107221665271364, 'reg_alpha': 0.00960130684036523, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.10049639121702138}. Best is trial 12 with value: 1876603.787486542.[0m
[32m[I 2023-02-20 19:11:05,677][0m Trial 24 finished with value: 569053.5072613142 and parameters: {'use_label_encoder': False, 'n_estimators': 1800, 'learning_rate': 0.12579038805272424, 'subsample': 0.51, 'colsample_bytree': 0.16999999999999998, 'max_depth': 12, 'gamma': 9.200000000000001, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.02144739676447968, 'reg_alpha': 8.404948959692317e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.4298485

Number of boosting rounds: 1795


[32m[I 2023-02-20 19:13:36,439][0m Trial 25 finished with value: 861127.6121734089 and parameters: {'use_label_encoder': False, 'n_estimators': 1500, 'learning_rate': 0.07522507285031747, 'subsample': 0.6, 'colsample_bytree': 0.5800000000000001, 'max_depth': 18, 'gamma': 95.60000000000001, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.17524149163554306, 'reg_alpha': 5.151549029512093e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 1.652725654106694}. Best is trial 12 with value: 1876603.787486542.[0m


Number of boosting rounds: 264
Number of boosting rounds: 1641


[32m[I 2023-02-20 19:20:17,129][0m Trial 26 finished with value: 458035.09077317355 and parameters: {'use_label_encoder': False, 'n_estimators': 2300, 'learning_rate': 0.05143551147356255, 'subsample': 0.83, 'colsample_bytree': 0.42, 'max_depth': 15, 'gamma': 18.3, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 4.218010209314752, 'reg_alpha': 1.5017933185426808e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.5250693408089812}. Best is trial 12 with value: 1876603.787486542.[0m
[32m[I 2023-02-20 19:28:23,663][0m Trial 27 finished with value: 1492938.1579700015 and parameters: {'use_label_encoder': False, 'n_estimators': 2800, 'learning_rate': 0.07444185090851187, 'subsample': 0.71, 'colsample_bytree': 0.33, 'max_depth': 19, 'gamma': 6.0, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.1777960864144413, 'reg_alpha': 1.349936421856308e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.19328362669868349}. Best is trial 12 with value:

Number of boosting rounds: 558


[32m[I 2023-02-20 19:32:51,010][0m Trial 28 finished with value: 827734.1495153314 and parameters: {'use_label_encoder': False, 'n_estimators': 2800, 'learning_rate': 0.14157492867740157, 'subsample': 0.9, 'colsample_bytree': 0.31, 'max_depth': 19, 'gamma': 28.200000000000003, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 6.179085975337225, 'reg_alpha': 0.00028639370109509617, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.17253226581276992}. Best is trial 12 with value: 1876603.787486542.[0m


Number of boosting rounds: 807


[32m[I 2023-02-20 19:34:08,638][0m Trial 29 finished with value: 221221.62714673227 and parameters: {'use_label_encoder': False, 'n_estimators': 3400, 'learning_rate': 0.07883826196334841, 'subsample': 0.72, 'colsample_bytree': 0.9900000000000001, 'max_depth': 12, 'gamma': 80.7, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.3643605771681833, 'reg_alpha': 2.001163733370578e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.19273283868185553}. Best is trial 12 with value: 1876603.787486542.[0m
[32m[I 2023-02-20 19:34:08,648][0m A new study created in memory with name: no-name-0761ff86-36d9-4e9d-a25a-f94853671495[0m


Number of boosting rounds: 626
Number of finished trials: 30
Best trial parameters: {'use_label_encoder': False, 'n_estimators': 2100, 'learning_rate': 0.15282433238630894, 'subsample': 0.7, 'colsample_bytree': 0.4, 'max_depth': 19, 'gamma': 2.0, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.03225966397805394, 'reg_alpha': 6.973864756231836e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 1.1922077239517475}
Best score: 1876603.787486542
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 631901	training's rmse: 752189	valid_1's l1: 705318	valid_1's rmse: 838070
[1000]	training's l1: 220854	training's rmse: 298361	valid_1's l1: 303025	valid_1's rmse: 396015
Did not meet early stopping. Best iteration is:
[1492]	training's l1: 120771	training's rmse: 195547	valid_1's l1: 194252	valid_1's rmse: 284025


[32m[I 2023-02-20 19:34:51,847][0m Trial 0 finished with value: 284025.4441964572 and parameters: {'objective': 'rmse', 'n_estimators': 1492, 'reg_alpha': 3.350021546872966, 'reg_lambda': 5.992574012267941e-07, 'colsample_bytree': 0.27, 'num_leaves': 936, 'feature_fraction': 0.14169383575845282, 'bagging_fraction': 0.9802823778180653, 'bagging_freq': 2, 'min_child_samples': 67, 'subsample': 0.62, 'learning_rate': 0.01434295101889302, 'max_depth': 85, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 284025.4441964572.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 25396	training's rmse: 154165	valid_1's l1: 28403.2	valid_1's rmse: 172409
[1000]	training's l1: 23466.4	training's rmse: 148506	valid_1's l1: 28388.7	valid_1's rmse: 171500
Early stopping, best iteration is:
[627]	training's l1: 24247	training's rmse: 152532	valid_1's l1: 27763.9	valid_1's rmse: 172380


[32m[I 2023-02-20 19:35:08,355][0m Trial 1 finished with value: 172379.60347853668 and parameters: {'objective': 'rmse', 'n_estimators': 4231, 'reg_alpha': 1.303825184819966, 'reg_lambda': 1.7320642388249332e-06, 'colsample_bytree': 0.5700000000000001, 'num_leaves': 613, 'feature_fraction': 0.9210456184630645, 'bagging_fraction': 0.4769903788155304, 'bagging_freq': 3, 'min_child_samples': 102, 'subsample': 0.69, 'learning_rate': 0.017434151980318766, 'max_depth': 76, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 284025.4441964572.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 100969	training's rmse: 194994	valid_1's l1: 105839	valid_1's rmse: 208722
[1000]	training's l1: 46999.8	training's rmse: 163306	valid_1's l1: 50175.1	valid_1's rmse: 179104
[1500]	training's l1: 38648.2	training's rmse: 159289	valid_1's l1: 41750.3	valid_1's rmse: 176228
[2000]	training's l1: 34818.7	training's rmse: 157139	valid_1's l1: 38220.7	valid_1's rmse: 174992
[2500]	training's l1: 32799.5	training's rmse: 155870	valid_1's l1: 36287.4	valid_1's rmse: 174343
[3000]	training's l1: 31749.7	training's rmse: 154609	valid_1's l1: 35529.1	valid_1's rmse: 173842
Did not meet early stopping. Best iteration is:
[3312]	training's l1: 31378.9	training's rmse: 153903	valid_1's l1: 35222.1	valid_1's rmse: 173738


[32m[I 2023-02-20 19:35:28,174][0m Trial 2 finished with value: 173738.09519262728 and parameters: {'objective': 'rmse', 'n_estimators': 3330, 'reg_alpha': 5.877449132980082e-06, 'reg_lambda': 1.2183569607924702e-07, 'colsample_bytree': 0.81, 'num_leaves': 369, 'feature_fraction': 0.272579585595473, 'bagging_fraction': 0.2391595646705794, 'bagging_freq': 6, 'min_child_samples': 164, 'subsample': 0.98, 'learning_rate': 0.01632969664410445, 'max_depth': 56, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 284025.4441964572.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 76473.9	valid_1's l1: 104769
[1000]	training's l1: 65244.6	valid_1's l1: 100427
[1500]	training's l1: 60204.1	valid_1's l1: 98758.4
[2000]	training's l1: 57275.1	valid_1's l1: 97816.2
[2500]	training's l1: 55338.7	valid_1's l1: 97254.3
[3000]	training's l1: 53927.4	valid_1's l1: 96814
[3500]	training's l1: 52866.7	valid_1's l1: 96531.4
[4000]	training's l1: 52097.9	valid_1's l1: 96302.9
Did not meet early stopping. Best iteration is:
[4468]	training's l1: 51492.4	valid_1's l1: 96125


[32m[I 2023-02-20 19:36:09,838][0m Trial 3 finished with value: 206439.35870094216 and parameters: {'objective': 'mae', 'n_estimators': 4468, 'reg_alpha': 2.298279595539354, 'reg_lambda': 0.18836425601854226, 'colsample_bytree': 0.76, 'num_leaves': 471, 'feature_fraction': 0.4923876959164555, 'bagging_fraction': 0.5597048829944596, 'bagging_freq': 10, 'min_child_samples': 233, 'subsample': 0.42000000000000004, 'learning_rate': 0.17982925847964665, 'max_depth': 43, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 284025.4441964572.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 317778	valid_1's l1: 334582
Did not meet early stopping. Best iteration is:
[991]	training's l1: 93731.5	valid_1's l1: 115373


[32m[I 2023-02-20 19:36:21,073][0m Trial 4 finished with value: 223580.64002772488 and parameters: {'objective': 'mae', 'n_estimators': 991, 'reg_alpha': 0.005335750379376089, 'reg_lambda': 9.518967436451007, 'colsample_bytree': 0.8, 'num_leaves': 323, 'feature_fraction': 0.29930343993471886, 'bagging_fraction': 0.18746127175549865, 'bagging_freq': 7, 'min_child_samples': 32, 'subsample': 0.75, 'learning_rate': 0.014687244750754959, 'max_depth': 47, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 284025.4441964572.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 204551	training's rmse: 287101	valid_1's l1: 212645	valid_1's rmse: 301389
[1000]	training's l1: 70433.3	training's rmse: 174453	valid_1's l1: 75539.6	valid_1's rmse: 191218
[1500]	training's l1: 50484.4	training's rmse: 163993	valid_1's l1: 54357.9	valid_1's rmse: 181279
[2000]	training's l1: 44587.2	training's rmse: 160721	valid_1's l1: 48140.4	valid_1's rmse: 178627
[2500]	training's l1: 42236.2	training's rmse: 158598	valid_1's l1: 45930.6	valid_1's rmse: 177320
[3000]	training's l1: 39938.3	training's rmse: 157294	valid_1's l1: 43594.4	valid_1's rmse: 176773
[3500]	training's l1: 38498.9	training's rmse: 156213	valid_1's l1: 42274.2	valid_1's rmse: 176208
[4000]	training's l1: 37184.7	training's rmse: 155388	valid_1's l1: 41051	valid_1's rmse: 175879
[4500]	training's l1: 36762	training's rmse: 154419	valid_1's l1: 40837.2	valid_1's rmse: 175497
Did not meet early stopping. Best iteration is:
[4961]

[32m[I 2023-02-20 19:36:45,952][0m Trial 5 finished with value: 175465.09018486863 and parameters: {'objective': 'rmse', 'n_estimators': 4961, 'reg_alpha': 1.0842469550195409e-08, 'reg_lambda': 1.0078019981468055e-08, 'colsample_bytree': 0.44, 'num_leaves': 364, 'feature_fraction': 0.1179908979861403, 'bagging_fraction': 0.25207952842914705, 'bagging_freq': 9, 'min_child_samples': 195, 'subsample': 0.29000000000000004, 'learning_rate': 0.028858916220201845, 'max_depth': 21, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 284025.4441964572.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 69227.7	valid_1's l1: 85403.8
[1000]	training's l1: 58669.3	valid_1's l1: 78927.9
[1500]	training's l1: 52867	valid_1's l1: 76376.7
[2000]	training's l1: 49225.1	valid_1's l1: 75031.7
[2500]	training's l1: 46430.4	valid_1's l1: 74145
[3000]	training's l1: 44314.5	valid_1's l1: 73475.5
Did not meet early stopping. Best iteration is:
[3231]	training's l1: 43368.7	valid_1's l1: 73141.6


[32m[I 2023-02-20 19:37:04,797][0m Trial 6 finished with value: 192524.2259870729 and parameters: {'objective': 'mae', 'n_estimators': 3231, 'reg_alpha': 1.0689806179637239e-07, 'reg_lambda': 0.0028256853031310997, 'colsample_bytree': 0.67, 'num_leaves': 219, 'feature_fraction': 0.6656455671052418, 'bagging_fraction': 0.27159236129166175, 'bagging_freq': 12, 'min_child_samples': 245, 'subsample': 0.8099999999999999, 'learning_rate': 0.22110791700880128, 'max_depth': 29, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 284025.4441964572.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 67596.5	valid_1's l1: 174777
[1000]	training's l1: 57486.5	valid_1's l1: 172585
[1500]	training's l1: 53275.8	valid_1's l1: 171824
[2000]	training's l1: 50721.8	valid_1's l1: 171358
[2500]	training's l1: 49010.3	valid_1's l1: 170970
[3000]	training's l1: 47759.6	valid_1's l1: 170758
Did not meet early stopping. Best iteration is:
[3033]	training's l1: 47687.8	valid_1's l1: 170732


[32m[I 2023-02-20 19:38:05,250][0m Trial 7 finished with value: 274233.28938419814 and parameters: {'objective': 'mae', 'n_estimators': 3033, 'reg_alpha': 2.7105904010989545e-08, 'reg_lambda': 0.0005311874428199831, 'colsample_bytree': 0.8, 'num_leaves': 146, 'feature_fraction': 0.660446824911452, 'bagging_fraction': 0.930677234683695, 'bagging_freq': 15, 'min_child_samples': 29, 'subsample': 0.55, 'learning_rate': 0.1689453371299225, 'max_depth': 31, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 284025.4441964572.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 85399.4	training's rmse: 140777	valid_1's l1: 256848	valid_1's rmse: 355655
[1000]	training's l1: 41447.3	training's rmse: 86097.6	valid_1's l1: 258072	valid_1's rmse: 356489
Early stopping, best iteration is:
[604]	training's l1: 72329.4	training's rmse: 124563	valid_1's l1: 256286	valid_1's rmse: 355169


[32m[I 2023-02-20 19:38:29,073][0m Trial 8 finished with value: 355168.55904455617 and parameters: {'objective': 'rmse', 'n_estimators': 1963, 'reg_alpha': 0.0015005587283589538, 'reg_lambda': 7.280813762466515e-07, 'colsample_bytree': 0.9, 'num_leaves': 640, 'feature_fraction': 0.4910185677102551, 'bagging_fraction': 0.2831020631583079, 'bagging_freq': 2, 'min_child_samples': 30, 'subsample': 0.27, 'learning_rate': 0.10960996800916917, 'max_depth': 78, 'random_state': 42, 'n_jobs': 4}. Best is trial 8 with value: 355168.55904455617.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 53425.5	training's rmse: 159253	valid_1's l1: 65347.5	valid_1's rmse: 185117
[1000]	training's l1: 40076.2	training's rmse: 147448	valid_1's l1: 53927.7	valid_1's rmse: 178577
[1500]	training's l1: 35332.5	training's rmse: 140668	valid_1's l1: 52103.6	valid_1's rmse: 177027
[2000]	training's l1: 32440.3	training's rmse: 135338	valid_1's l1: 51713.4	valid_1's rmse: 176270
[2500]	training's l1: 30251.1	training's rmse: 130534	valid_1's l1: 51945.2	valid_1's rmse: 176082
Early stopping, best iteration is:
[2089]	training's l1: 31997.8	training's rmse: 134520	valid_1's l1: 51616.9	valid_1's rmse: 176196


[32m[I 2023-02-20 19:39:10,433][0m Trial 9 finished with value: 176196.35616039266 and parameters: {'objective': 'rmse', 'n_estimators': 2973, 'reg_alpha': 9.446425718178307e-07, 'reg_lambda': 0.00013848876882846468, 'colsample_bytree': 0.4, 'num_leaves': 997, 'feature_fraction': 0.7740243847112528, 'bagging_fraction': 0.586206117667554, 'bagging_freq': 6, 'min_child_samples': 116, 'subsample': 0.42000000000000004, 'learning_rate': 0.01746699019046584, 'max_depth': 78, 'random_state': 42, 'n_jobs': 4}. Best is trial 8 with value: 355168.55904455617.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 43142.9	training's rmse: 164225	valid_1's l1: 43865.9	valid_1's rmse: 177977
[1000]	training's l1: 37325.3	training's rmse: 161304	valid_1's l1: 38185.6	valid_1's rmse: 175754
[1500]	training's l1: 36274.3	training's rmse: 160115	valid_1's l1: 37326.8	valid_1's rmse: 175247
Did not meet early stopping. Best iteration is:
[1849]	training's l1: 35980.7	training's rmse: 159535	valid_1's l1: 37054.7	valid_1's rmse: 174999


[32m[I 2023-02-20 19:39:14,401][0m Trial 10 finished with value: 174999.00047231375 and parameters: {'objective': 'rmse', 'n_estimators': 1850, 'reg_alpha': 0.0003000675317724044, 'reg_lambda': 9.175911894129963e-06, 'colsample_bytree': 0.9700000000000001, 'num_leaves': 709, 'feature_fraction': 0.4605944747842526, 'bagging_fraction': 0.11166435596358282, 'bagging_freq': 0, 'min_child_samples': 291, 'subsample': 0.11, 'learning_rate': 0.07368919450561545, 'max_depth': 2, 'random_state': 42, 'n_jobs': 4}. Best is trial 8 with value: 355168.55904455617.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 91107.4	training's rmse: 165990	valid_1's l1: 146997	valid_1's rmse: 239759
[1000]	training's l1: 63647.6	training's rmse: 132027	valid_1's l1: 121976	valid_1's rmse: 217352
[1500]	training's l1: 52026.9	training's rmse: 113136	valid_1's l1: 115466	valid_1's rmse: 211485
Did not meet early stopping. Best iteration is:
[1830]	training's l1: 48145.4	training's rmse: 104975	valid_1's l1: 114128	valid_1's rmse: 210087


[32m[I 2023-02-20 19:40:00,592][0m Trial 11 finished with value: 210087.25988724804 and parameters: {'objective': 'rmse', 'n_estimators': 1831, 'reg_alpha': 0.05349068393276881, 'reg_lambda': 6.737852072052246e-07, 'colsample_bytree': 0.15000000000000002, 'num_leaves': 908, 'feature_fraction': 0.14534920597583684, 'bagging_fraction': 0.997658259997261, 'bagging_freq': 0, 'min_child_samples': 72, 'subsample': 0.12000000000000001, 'learning_rate': 0.06075731450891817, 'max_depth': 100, 'random_state': 42, 'n_jobs': 4}. Best is trial 8 with value: 355168.55904455617.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 12180.7	training's rmse: 16493.7	valid_1's l1: 227004	valid_1's rmse: 312579
[1000]	training's l1: 1731.22	training's rmse: 2355.76	valid_1's l1: 226451	valid_1's rmse: 312054
[1500]	training's l1: 289.974	training's rmse: 404.361	valid_1's l1: 226415	valid_1's rmse: 312020
Did not meet early stopping. Best iteration is:
[1810]	training's l1: 97.9853	training's rmse: 140.297	valid_1's l1: 226411	valid_1's rmse: 312016


[32m[I 2023-02-20 19:42:34,257][0m Trial 12 finished with value: 312016.0129607269 and parameters: {'objective': 'rmse', 'n_estimators': 1810, 'reg_alpha': 7.4020153858728035, 'reg_lambda': 2.7711136416922725e-08, 'colsample_bytree': 0.19, 'num_leaves': 795, 'feature_fraction': 0.3706961442687041, 'bagging_fraction': 0.8163650391734913, 'bagging_freq': 3, 'min_child_samples': 4, 'subsample': 0.5700000000000001, 'learning_rate': 0.03998177495554658, 'max_depth': 95, 'random_state': 42, 'n_jobs': 4}. Best is trial 8 with value: 355168.55904455617.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 1988.49	training's rmse: 4874.48	valid_1's l1: 245560	valid_1's rmse: 337087
Early stopping, best iteration is:
[244]	training's l1: 15883.6	training's rmse: 28817.5	valid_1's l1: 245298	valid_1's rmse: 336763


[32m[I 2023-02-20 19:43:30,950][0m Trial 13 finished with value: 336763.06554615067 and parameters: {'objective': 'rmse', 'n_estimators': 2378, 'reg_alpha': 0.00020454757720503338, 'reg_lambda': 1.3165983878151718e-08, 'colsample_bytree': 0.05, 'num_leaves': 741, 'feature_fraction': 0.35020783075919626, 'bagging_fraction': 0.7777436455031658, 'bagging_freq': 4, 'min_child_samples': 9, 'subsample': 0.37, 'learning_rate': 0.10555772529106239, 'max_depth': 96, 'random_state': 42, 'n_jobs': 4}. Best is trial 8 with value: 355168.55904455617.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 2672.83	training's rmse: 3559.24	valid_1's l1: 393910	valid_1's rmse: 507126
[1000]	training's l1: 46.2907	training's rmse: 64.621	valid_1's l1: 393864	valid_1's rmse: 507078
[1500]	training's l1: 0.867987	training's rmse: 1.31696	valid_1's l1: 393864	valid_1's rmse: 507077
Early stopping, best iteration is:
[1385]	training's l1: 2.15071	training's rmse: 3.1629	valid_1's l1: 393864	valid_1's rmse: 507077


[32m[I 2023-02-20 19:45:32,121][0m Trial 14 finished with value: 507077.01586175483 and parameters: {'objective': 'rmse', 'n_estimators': 2313, 'reg_alpha': 0.00017186350590354265, 'reg_lambda': 1.4187489646146999e-08, 'colsample_bytree': 1.0, 'num_leaves': 592, 'feature_fraction': 0.5693036321575009, 'bagging_fraction': 0.7415859653314874, 'bagging_freq': 4, 'min_child_samples': 4, 'subsample': 0.31, 'learning_rate': 0.0989621135954855, 'max_depth': 66, 'random_state': 42, 'n_jobs': 4}. Best is trial 14 with value: 507077.01586175483.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 65618.5	training's rmse: 134762	valid_1's l1: 148893	valid_1's rmse: 242184
[1000]	training's l1: 39217	training's rmse: 99186.4	valid_1's l1: 150438	valid_1's rmse: 243014
Early stopping, best iteration is:
[507]	training's l1: 65014.2	training's rmse: 134219	valid_1's l1: 148723	valid_1's rmse: 242085


[32m[I 2023-02-20 19:45:50,134][0m Trial 15 finished with value: 242085.3805869858 and parameters: {'objective': 'rmse', 'n_estimators': 2398, 'reg_alpha': 3.704492502754765e-05, 'reg_lambda': 1.4934392226902266e-05, 'colsample_bytree': 1.0, 'num_leaves': 570, 'feature_fraction': 0.5656985483054746, 'bagging_fraction': 0.40792069909598366, 'bagging_freq': 4, 'min_child_samples': 60, 'subsample': 0.22, 'learning_rate': 0.10418790127082259, 'max_depth': 63, 'random_state': 42, 'n_jobs': 4}. Best is trial 14 with value: 507077.01586175483.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 53098.7	training's rmse: 125720	valid_1's l1: 117908	valid_1's rmse: 217622
Did not meet early stopping. Best iteration is:
[726]	training's l1: 42405.3	training's rmse: 109151	valid_1's l1: 117116	valid_1's rmse: 215956


[32m[I 2023-02-20 19:46:02,288][0m Trial 16 finished with value: 215956.34811412083 and parameters: {'objective': 'rmse', 'n_estimators': 726, 'reg_alpha': 0.006814936440408787, 'reg_lambda': 1.4556209135589585e-07, 'colsample_bytree': 0.92, 'num_leaves': 541, 'feature_fraction': 0.5669806029096377, 'bagging_fraction': 0.6588949283625413, 'bagging_freq': 1, 'min_child_samples': 110, 'subsample': 0.26, 'learning_rate': 0.10632422697690683, 'max_depth': 70, 'random_state': 42, 'n_jobs': 4}. Best is trial 14 with value: 507077.01586175483.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 66807.7	training's rmse: 161923	valid_1's l1: 83575.8	valid_1's rmse: 194958
[1000]	training's l1: 54758.7	training's rmse: 146987	valid_1's l1: 78597.4	valid_1's rmse: 191548
[1500]	training's l1: 48442.7	training's rmse: 136592	valid_1's l1: 78019.2	valid_1's rmse: 190260
[2000]	training's l1: 44203.7	training's rmse: 128699	valid_1's l1: 78045.6	valid_1's rmse: 189443
Early stopping, best iteration is:
[1755]	training's l1: 46117.2	training's rmse: 132631	valid_1's l1: 77569.4	valid_1's rmse: 189860


[32m[I 2023-02-20 19:46:22,489][0m Trial 17 finished with value: 189859.7873851047 and parameters: {'objective': 'rmse', 'n_estimators': 2439, 'reg_alpha': 4.761210099605011e-05, 'reg_lambda': 8.715542093280212e-08, 'colsample_bytree': 0.63, 'num_leaves': 638, 'feature_fraction': 0.44811660544480286, 'bagging_fraction': 0.3940287212947585, 'bagging_freq': 5, 'min_child_samples': 157, 'subsample': 0.36, 'learning_rate': 0.07579243025492952, 'max_depth': 62, 'random_state': 42, 'n_jobs': 4}. Best is trial 14 with value: 507077.01586175483.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 71465.9	valid_1's l1: 163730
[1000]	training's l1: 58761	valid_1's l1: 159782
[1500]	training's l1: 53987.2	valid_1's l1: 158340
[2000]	training's l1: 51252.2	valid_1's l1: 157644
[2500]	training's l1: 49496.4	valid_1's l1: 157172
[3000]	training's l1: 48229.1	valid_1's l1: 156823
[3500]	training's l1: 47293.9	valid_1's l1: 156589
Did not meet early stopping. Best iteration is:
[3736]	training's l1: 46919.3	valid_1's l1: 156497


[32m[I 2023-02-20 19:48:34,077][0m Trial 18 finished with value: 253304.3147706665 and parameters: {'objective': 'mae', 'n_estimators': 3736, 'reg_alpha': 0.002904296207016766, 'reg_lambda': 4.190236055819124e-06, 'colsample_bytree': 0.89, 'num_leaves': 832, 'feature_fraction': 0.6520518317401037, 'bagging_fraction': 0.6420553393946675, 'bagging_freq': 8, 'min_child_samples': 43, 'subsample': 0.48, 'learning_rate': 0.05075260930473391, 'max_depth': 86, 'random_state': 42, 'n_jobs': 4}. Best is trial 14 with value: 507077.01586175483.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 42215.2	training's rmse: 54750.1	valid_1's l1: 97835.5	valid_1's rmse: 215269


[32m[I 2023-02-20 19:48:41,531][0m Trial 19 finished with value: 215006.39522735777 and parameters: {'objective': 'rmse', 'n_estimators': 1300, 'reg_alpha': 4.413002909406273e-06, 'reg_lambda': 2.93280868357267e-07, 'colsample_bytree': 0.68, 'num_leaves': 59, 'feature_fraction': 0.7936040119900135, 'bagging_fraction': 0.3585549831428172, 'bagging_freq': 2, 'min_child_samples': 2, 'subsample': 0.17, 'learning_rate': 0.13851749491364757, 'max_depth': 64, 'random_state': 42, 'n_jobs': 4}. Best is trial 14 with value: 507077.01586175483.[0m


Early stopping, best iteration is:
[312]	training's l1: 54298	training's rmse: 72577.4	valid_1's l1: 97112.3	valid_1's rmse: 215006
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 76025.5	training's rmse: 121110	valid_1's l1: 230569	valid_1's rmse: 326951
Early stopping, best iteration is:
[312]	training's l1: 105725	training's rmse: 159858	valid_1's l1: 227721	valid_1's rmse: 324159


[32m[I 2023-02-20 19:48:53,264][0m Trial 20 finished with value: 324159.38644392544 and parameters: {'objective': 'rmse', 'n_estimators': 2526, 'reg_alpha': 0.001107683515103244, 'reg_lambda': 5.590090608121368e-08, 'colsample_bytree': 0.8800000000000001, 'num_leaves': 472, 'feature_fraction': 0.5195904737419548, 'bagging_fraction': 0.47617150660353774, 'bagging_freq': 11, 'min_child_samples': 87, 'subsample': 0.33, 'learning_rate': 0.2474562577998072, 'max_depth': 84, 'random_state': 42, 'n_jobs': 4}. Best is trial 14 with value: 507077.01586175483.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 1134.14	training's rmse: 1475.62	valid_1's l1: 251438	valid_1's rmse: 346955
[1000]	training's l1: 12.6371	training's rmse: 17.4936	valid_1's l1: 251432	valid_1's rmse: 346949
Early stopping, best iteration is:
[629]	training's l1: 355.454	training's rmse: 467.165	valid_1's l1: 251430	valid_1's rmse: 346948


[32m[I 2023-02-20 19:50:17,351][0m Trial 21 finished with value: 346947.950637561 and parameters: {'objective': 'rmse', 'n_estimators': 2199, 'reg_alpha': 0.00016748987379039293, 'reg_lambda': 2.5959847397001927e-08, 'colsample_bytree': 0.33999999999999997, 'num_leaves': 725, 'feature_fraction': 0.3958316252123791, 'bagging_fraction': 0.7719383173295585, 'bagging_freq': 4, 'min_child_samples': 3, 'subsample': 0.41000000000000003, 'learning_rate': 0.10959669160753137, 'max_depth': 91, 'random_state': 42, 'n_jobs': 4}. Best is trial 14 with value: 507077.01586175483.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 18813.2	training's rmse: 62905.2	valid_1's l1: 143919	valid_1's rmse: 233840
Early stopping, best iteration is:
[195]	training's l1: 49761.5	training's rmse: 114391	valid_1's l1: 141209	valid_1's rmse: 234725


[32m[I 2023-02-20 19:50:43,009][0m Trial 22 finished with value: 234724.5031284261 and parameters: {'objective': 'rmse', 'n_estimators': 1891, 'reg_alpha': 0.00012228998750229796, 'reg_lambda': 1.1050577092215977e-08, 'colsample_bytree': 0.38, 'num_leaves': 692, 'feature_fraction': 0.4011722356558293, 'bagging_fraction': 0.7357100347037555, 'bagging_freq': 5, 'min_child_samples': 41, 'subsample': 0.45999999999999996, 'learning_rate': 0.13351752229509395, 'max_depth': 74, 'random_state': 42, 'n_jobs': 4}. Best is trial 14 with value: 507077.01586175483.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 10996.1	training's rmse: 36904.6	valid_1's l1: 399209	valid_1's rmse: 509445
[1000]	training's l1: 1935.84	training's rmse: 11770.1	valid_1's l1: 399473	valid_1's rmse: 509850
Early stopping, best iteration is:
[510]	training's l1: 10498.6	training's rmse: 36004.2	valid_1's l1: 399169	valid_1's rmse: 509364


[32m[I 2023-02-20 19:51:53,542][0m Trial 23 finished with value: 509363.70658724464 and parameters: {'objective': 'rmse', 'n_estimators': 2700, 'reg_alpha': 0.000599103298257731, 'reg_lambda': 3.0040174328056107e-07, 'colsample_bytree': 0.5, 'num_leaves': 806, 'feature_fraction': 0.4151251951012078, 'bagging_fraction': 0.8556709748326068, 'bagging_freq': 2, 'min_child_samples': 24, 'subsample': 0.22, 'learning_rate': 0.08346471947257707, 'max_depth': 90, 'random_state': 42, 'n_jobs': 4}. Best is trial 23 with value: 509363.70658724464.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 29431.1	training's rmse: 81274.7	valid_1's l1: 193716	valid_1's rmse: 284554
[1000]	training's l1: 10158.6	training's rmse: 47619.6	valid_1's l1: 193890	valid_1's rmse: 283419
Early stopping, best iteration is:
[775]	training's l1: 15420.7	training's rmse: 59034.9	valid_1's l1: 193376	valid_1's rmse: 283115


[32m[I 2023-02-20 19:52:42,218][0m Trial 24 finished with value: 283114.8014683144 and parameters: {'objective': 'rmse', 'n_estimators': 2819, 'reg_alpha': 0.024618960698888415, 'reg_lambda': 1.0314596905089786e-06, 'colsample_bytree': 0.5700000000000001, 'num_leaves': 844, 'feature_fraction': 0.5376998282349992, 'bagging_fraction': 0.8389859406383403, 'bagging_freq': 2, 'min_child_samples': 51, 'subsample': 0.23, 'learning_rate': 0.08414372067118772, 'max_depth': 56, 'random_state': 42, 'n_jobs': 4}. Best is trial 23 with value: 509363.70658724464.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 58009.7	training's rmse: 132791	valid_1's l1: 116868	valid_1's rmse: 215232
[1000]	training's l1: 37940.2	training's rmse: 101312	valid_1's l1: 112391	valid_1's rmse: 209161
[1500]	training's l1: 27422.6	training's rmse: 81120.7	valid_1's l1: 112156	valid_1's rmse: 207952
Early stopping, best iteration is:
[1274]	training's l1: 31454.3	training's rmse: 89320.2	valid_1's l1: 111888	valid_1's rmse: 208137


[32m[I 2023-02-20 19:53:14,145][0m Trial 25 finished with value: 208137.13813501372 and parameters: {'objective': 'rmse', 'n_estimators': 2714, 'reg_alpha': 0.0012364348660758632, 'reg_lambda': 1.6202461870270135e-07, 'colsample_bytree': 0.47, 'num_leaves': 631, 'feature_fraction': 0.45289700184134674, 'bagging_fraction': 0.8696188166848876, 'bagging_freq': 1, 'min_child_samples': 124, 'subsample': 0.21000000000000002, 'learning_rate': 0.07034492693517404, 'max_depth': 70, 'random_state': 42, 'n_jobs': 4}. Best is trial 23 with value: 509363.70658724464.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 64601.1	valid_1's l1: 117531
[1000]	training's l1: 53594.3	valid_1's l1: 113098
[1500]	training's l1: 49427.8	valid_1's l1: 111597
[2000]	training's l1: 47281.6	valid_1's l1: 110895
[2500]	training's l1: 45864.6	valid_1's l1: 110420
[3000]	training's l1: 44869.4	valid_1's l1: 110112
[3500]	training's l1: 44095.2	valid_1's l1: 109867
Did not meet early stopping. Best iteration is:
[3603]	training's l1: 43959.7	valid_1's l1: 109824


[32m[I 2023-02-20 19:54:38,238][0m Trial 26 finished with value: 216020.0720348867 and parameters: {'objective': 'mae', 'n_estimators': 3603, 'reg_alpha': 2.570938183044617e-05, 'reg_lambda': 3.1410160574500114e-06, 'colsample_bytree': 0.7300000000000001, 'num_leaves': 486, 'feature_fraction': 0.5927244579893687, 'bagging_fraction': 0.7176329687112303, 'bagging_freq': 3, 'min_child_samples': 82, 'subsample': 0.30000000000000004, 'learning_rate': 0.05533769130164752, 'max_depth': 81, 'random_state': 42, 'n_jobs': 4}. Best is trial 23 with value: 509363.70658724464.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 36315.7	training's rmse: 88916.5	valid_1's l1: 187899	valid_1's rmse: 272688
[1000]	training's l1: 14857.2	training's rmse: 49523.2	valid_1's l1: 185081	valid_1's rmse: 268997
Early stopping, best iteration is:
[880]	training's l1: 18058.5	training's rmse: 56508.9	valid_1's l1: 184996	valid_1's rmse: 269077


[32m[I 2023-02-20 19:56:07,864][0m Trial 27 finished with value: 269076.8183138115 and parameters: {'objective': 'rmse', 'n_estimators': 1474, 'reg_alpha': 0.0006696783291243255, 'reg_lambda': 2.728592202748393e-05, 'colsample_bytree': 0.51, 'num_leaves': 792, 'feature_fraction': 0.2524806998147312, 'bagging_fraction': 0.9052201227351543, 'bagging_freq': 1, 'min_child_samples': 25, 'subsample': 0.16, 'learning_rate': 0.04167187899245906, 'max_depth': 87, 'random_state': 42, 'n_jobs': 4}. Best is trial 23 with value: 509363.70658724464.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 65140.4	training's rmse: 142648	valid_1's l1: 111392	valid_1's rmse: 211891
[1000]	training's l1: 46394.5	training's rmse: 113984	valid_1's l1: 109166	valid_1's rmse: 208619
[1500]	training's l1: 35683.6	training's rmse: 95261.2	valid_1's l1: 109390	valid_1's rmse: 208408
Early stopping, best iteration is:
[1112]	training's l1: 43338	training's rmse: 109468	valid_1's l1: 108720	valid_1's rmse: 208073


[32m[I 2023-02-20 19:56:36,442][0m Trial 28 finished with value: 208072.53391657802 and parameters: {'objective': 'rmse', 'n_estimators': 2151, 'reg_alpha': 0.02292178510051244, 'reg_lambda': 4.87752745836031e-07, 'colsample_bytree': 0.9600000000000001, 'num_leaves': 889, 'feature_fraction': 0.49277580776752605, 'bagging_fraction': 0.6388233808276048, 'bagging_freq': 6, 'min_child_samples': 133, 'subsample': 0.48, 'learning_rate': 0.08874147168316503, 'max_depth': 68, 'random_state': 42, 'n_jobs': 4}. Best is trial 23 with value: 509363.70658724464.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 27363.9	training's rmse: 64923.2	valid_1's l1: 234829	valid_1's rmse: 326730
[1000]	training's l1: 7617.73	training's rmse: 30902.7	valid_1's l1: 235224	valid_1's rmse: 326782
Early stopping, best iteration is:
[656]	training's l1: 17347	training's rmse: 49470.9	valid_1's l1: 234758	valid_1's rmse: 326457


[32m[I 2023-02-20 19:57:16,100][0m Trial 29 finished with value: 326457.49010323547 and parameters: {'objective': 'rmse', 'n_estimators': 1313, 'reg_alpha': 0.0006564666842286576, 'reg_lambda': 5.238298684302513e-07, 'colsample_bytree': 0.3, 'num_leaves': 965, 'feature_fraction': 0.4321217137609482, 'bagging_fraction': 0.9482635290957784, 'bagging_freq': 2, 'min_child_samples': 64, 'subsample': 0.29000000000000004, 'learning_rate': 0.1297702792026085, 'max_depth': 54, 'random_state': 42, 'n_jobs': 4}. Best is trial 23 with value: 509363.70658724464.[0m
[32m[I 2023-02-20 19:57:16,113][0m A new study created in memory with name: no-name-20e4ec17-0a94-493d-90e2-a6a3137cf00b[0m


Number of finished trials: 30
Best trial parameters: {'objective': 'rmse', 'n_estimators': 2700, 'reg_alpha': 0.000599103298257731, 'reg_lambda': 3.0040174328056107e-07, 'colsample_bytree': 0.5, 'num_leaves': 806, 'feature_fraction': 0.4151251951012078, 'bagging_fraction': 0.8556709748326068, 'bagging_freq': 2, 'min_child_samples': 24, 'subsample': 0.22, 'learning_rate': 0.08346471947257707, 'max_depth': 90, 'random_state': 42, 'n_jobs': 4}
Best score: 509363.70658724464


[32m[I 2023-02-20 19:57:17,027][0m Trial 0 finished with value: 171094.22462679425 and parameters: {'learning_rate': 0.1495954440852017, 'l2_leaf_reg': 5.159377510044407, 'bagging_temperature': 0.1980944390067433, 'random_strength': 1.3386686714307992, 'depth': 7, 'min_data_in_leaf': 59}. Best is trial 0 with value: 171094.22462679425.[0m
[32m[I 2023-02-20 19:57:17,503][0m Trial 1 finished with value: 182479.94886388283 and parameters: {'learning_rate': 0.102639358268506, 'l2_leaf_reg': 5.159112337281242, 'bagging_temperature': 3.2491106381007233, 'random_strength': 1.4828935750672931, 'depth': 4, 'min_data_in_leaf': 157}. Best is trial 1 with value: 182479.94886388283.[0m
[32m[I 2023-02-20 19:57:18,101][0m Trial 2 finished with value: 183571.03345247198 and parameters: {'learning_rate': 0.13444555448984127, 'l2_leaf_reg': 97.29713937917167, 'bagging_temperature': 10.180495132362122, 'random_strength': 1.1648084842435833, 'depth': 6, 'min_data_in_leaf': 124}. Best is trial 2 wi

Number of finished trials: 30
Best trial parameters: {'learning_rate': 0.45743264601999495, 'l2_leaf_reg': 41.338946049390074, 'bagging_temperature': 0.3472567739474319, 'random_strength': 1.7332249677756242, 'depth': 1, 'min_data_in_leaf': 6}
Best score: 296435.6534212222
CPU times: user 2h 12min 35s, sys: 8min 14s, total: 2h 20min 50s
Wall time: 2h 1min 24s


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Train Models with Cross Validation</h1>
</div>

In [19]:
train = create_folds(train, Config.N_FOLDS)
# train = create_strat_folds(train, TARGET, Config.N_FOLDS)

n_folds=5, seed=42


In [20]:
all_cv_scores = pd.DataFrame(
    {
        "Model": pd.Series(dtype="str"),
        "Score": pd.Series(dtype="float"),
        "StdDev": pd.Series(dtype="float"),
        "RunTime": pd.Series(dtype="float"),
    }
)

oof = train[[ID, TARGET, "fold"]].copy().reset_index(drop=True).copy()
oof.set_index(ID, inplace=True)
oof.head()

Unnamed: 0_level_0,price,fold
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3436795.2,2
1,9519958.0,3
2,9276448.1,3
3,9725732.2,1
4,6181908.8,1


In [21]:
def show_tree_model_fi(model, features:List[str]) -> None:
    print("\n=== Model Feature Importance ===")
    for i in model.feature_importances_.argsort()[::-1]:
        print(features[i], model.feature_importances_[i]/model.feature_importances_.sum())

def save_oof_predictions(model_name:str, final_valid_predictions, oof:pd.DataFrame) -> pd.DataFrame:
    final_valid_predictions_df = process_valid_predictions(
        final_valid_predictions, ID, model_name
    )
    display(final_valid_predictions_df.head())
    oof[f"pred_{model_name}"] = final_valid_predictions_df[f"pred_{model_name}"]

    return oof

def save_test_predictions(model_name:str, final_test_predictions, submission_df:pd.DataFrame, result_field:str=TARGET) -> None:
    result = merge_test_predictions(final_test_predictions, Config.calc_probability)
    # result[:20]
    submission_df[f"target_{model_name}"] = result #.astype(int)
    #     submission_df.head(10)
    ss = submission_df[[ID, f"target_{model_name}"]].copy().reset_index(drop=True)
    ss.rename(columns={f"target_{model_name}": result_field}, inplace=True)
    ss.to_csv(
        f"submission_{model_name}.csv", index=False
    )  # Can submit the individual model
    print("=== Target Value Counts ===")
#     display(ss[TARGET].value_counts())
    ss.head(10)

def process_valid_predictions(final_valid_predictions, train_id, model_name:str) -> pd.DataFrame:
    model = f"pred_{model_name}"
    final_valid_predictions_df = pd.DataFrame.from_dict(
        final_valid_predictions, orient="index"
    ).reset_index()
    final_valid_predictions_df.columns = [train_id, model]
    final_valid_predictions_df.set_index(train_id, inplace=True)
    final_valid_predictions_df.sort_index(inplace=True)
    final_valid_predictions_df.to_csv(f"train_pred_{model_name}.csv", index=True)

    return final_valid_predictions_df

def add_score(score_df:pd.DataFrame, model_name:str, score:float, std:float):
    dict1 = {"Model": model_name, "Score": cv_score, "StdDev": std_dev}
    score_df = score_df.append(dict1, ignore_index=True)
    return score_df

In [22]:
def train_cv_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid,
    params,
    n_folds:int=5,
    seed:int=42,
):

    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(
            drop=True
        )  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)
        xtest = test.copy()

        valid_ids = xvalid.id.values.tolist()  # Id's of everything in validation fold

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[FEATURES]
        xvalid = xvalid[FEATURES]

        scaler = preprocessing.StandardScaler()
#         scaler = preprocessing.MinMaxScaler()
        xtrain = scaler.fit(xtrain).transform(xtrain)
        xvalid = scaler.transform(xvalid)
        xtest = scaler.transform(xtest)

        model = get_model_fn # ()

        model.fit(
            xtrain,
            ytrain,
        )
        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        preds_valid_class = model.predict(xvalid)
        
        final_test_predictions.append(test_preds)
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

#         fold_score = metrics.accuracy_score(yvalid, preds_valid_class)  # Validation Set Score
        fold_score = metrics.mean_absolute_error(
            yvalid, preds_valid
        ) 
#         fold_score = metrics.roc_auc_score(yvalid.values, preds_valid)  # Validation Set Score
#         show_classification_scores(yvalid.values, preds_valid_class)

#         fold_score = metrics.roc_auc_score(yvalid, preds_valid)  # Validation Set Score
#         fold_score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        fold_scores.append(fold_score)
        #         importance_list.append(model.coef_.ravel())

        fi = []
        # Feature importance
#         fi = pd.DataFrame(
#             index=FEATURES,
#             data=model.coef_.ravel(),
#             columns=[f"{fold}_importance"],
#         )
        
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )


def train_xgb_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid:str,
    params,
    n_folds:int=5,
    seed:int=42,
):

    print(params)
    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(
            drop=True
        )  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)
        xtest = test.copy()

        valid_ids = xvalid.id.values.tolist()  # Id's of everything in validation fold

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[FEATURES]
        xvalid = xvalid[FEATURES]

        model = get_model_fn # (params)

        model.fit(
            xtrain,
            ytrain,
            eval_set=[(xvalid, yvalid)],
            #             eval_metric="acc",  # auc
            verbose=0,
            #             early_stopping_rounds=3000,
            #             callbacks=[
            #                 xgb.log_evaluation(0),
            #                 xgb.early_stopping(500, False, True),
            #             ],
        )

        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        preds_valid_class = model.predict(xvalid)
        
        final_test_predictions.append(test_preds)
        if Config.debug:
            print(f"GT Type: {type(yvalid.values)}")
            print(f"Preds Type: {type(preds_valid_class)}")
            print(f"         GT:{yvalid.values[:20]}")
            print(f"Preds Class:{preds_valid_class[:20]}")
            print(f"Preds Prob:{preds_valid[:20]}")
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid_class)))

#         fold_score = metrics.cohen_kappa_score(yvalid,  preds_valid_class, weights = "quadratic")
#         fold_score = metrics.roc_auc_score(yvalid.values, preds_valid)  # Validation Set Score
#         show_classification_scores(yvalid.values, preds_valid_class)
        fold_score = metrics.mean_absolute_error(
            yvalid, preds_valid
        )  # Validation Set Score
#         fold_score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        fold_scores.append(fold_score)

        # Feature importance
        fi = pd.DataFrame(
            index=FEATURES,
            data=model.feature_importances_,
            columns=[f"{fold}_importance"],
        )
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )        

In [23]:
def run_linear_model(model_dict, model_name:str, features:List[str], oof:pd.DataFrame) -> (float, float, pd.DataFrame):
    (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_cv_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        False, #Config.calc_probability,
        ID,
        {},
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    oof.head()
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof


def run_tree_model(model_dict, model_name:str, features:List[str], params, oof:pd.DataFrame) -> (float, float, pd.DataFrame):
    (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_xgb_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        Config.calc_probability,
        ID,
        params,
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)
    show_tree_model_fi(model, features)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    oof.head()
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof

In [24]:
%%time

def run_models4features(model_dict, model_lst:List[str], target:str, feature_lst:List[str], all_cv_scores:pd.DataFrame, linear_models:bool=True) -> pd.DataFrame:

    oof = train[[ID, target, "fold"]].copy().reset_index(drop=True).copy()
    oof.set_index(ID, inplace=True)

    for idx, m in enumerate(model_lst):
        model = model_lst[idx]
        start_time = time.time()

        print(f"Model={model}")

        params = {}
        if linear_models:
                cv_score, std_dev, oof = run_linear_model(model_dict, model, feature_lst, oof)

        else:
            cv_score, std_dev, oof = run_tree_model(model_dict, model, feature_lst, params, oof)

        run_time = time.time() - start_time

        score_dict = {"Model": model, "Score": cv_score, "StdDev": std_dev, "RunTime": run_time}
        all_cv_scores = all_cv_scores.append(score_dict, ignore_index=True)
        print(f"Model Run Time: {run_time:.2f}")

    return all_cv_scores




CPU times: user 11 µs, sys: 0 ns, total: 11 µs
Wall time: 14.8 µs


In [25]:
lgbm_params = {'n_estimators': Config.N_ESTIMATORS,
                 'num_rounds': 404,
                 'learning_rate': 0.19,
                 'num_leaves': 17,
                 'max_depth': 8,
                 'min_data_in_leaf': 36,
                 'lambda_l1': 0.96,
                 'lambda_l2': 0.01,
                 'min_gain_to_split': 11.32,
                 'bagging_fraction': 0.6,
                 'feature_fraction': 0.9}


lgbm_params = gpu_ify_lgbm(lgbm_params)
# if Config.gpu:
#     lgbm_params["device"] = "gpu"
#     lgbm_params["boosting_type"] = "gbdt"
#     lgbm_params["gpu_platform_id"] = 0
#     lgbm_params["gpu_device_id"] = 0

In [26]:
xgb_params = {
    "n_estimators": Config.N_ESTIMATORS,  # 10_000,
    "max_depth": 10,  # 10
    "objective": "reg:squarederror",
    #     "enable_categorical": True,  # Only works with gpu_hist
    #     "eval_metric": "mae",
    #     "metric": "mae",
    #     "enable_categorical": True,
    "n_jobs": 8,  # 4
    "seed": Config.seed,
    "tree_method": "hist",
    #         "gpu_id": 0,
    "subsample": 0.9,  # 0.7
    "colsample_bytree": 0.7,
    "use_label_encoder": False,
    "learning_rate": 0.05,  # 0.01
}

if Config.gpu:
    xgb_params["tree_method"] = "gpu_hist"
else:
    xgb_params["tree_method"] = "hist"

In [27]:
cb_params = {
    #     "learning_rate": 0.3277295792305584,
    "learning_rate": 0.05,
    "l2_leaf_reg": 3.1572972266001518,
    "bagging_temperature": 0.6799604234141348,
    "random_strength": 1.99590400593318,
    "depth": 10,
    "min_data_in_leaf": 93,
    # "iterations": 100,  # 10000
    "n_estimators": Config.N_ESTIMATORS,  # 10000
    "use_best_model": True,
    #     "task_type": "GPU",
    "random_seed": Config.seed,
}

cb_params = gpu_ify_cb(cb_params)
# if Config.gpu:
#     cb_params["task_type"] = "GPU"

In [28]:
lgbm_params = {
    "n_estimators": Config.GPU_N_ESTIMATORS,
    'max_depth': 9,
    'learning_rate': 0.01,
    'min_data_in_leaf': 36, 
    'num_leaves': 100, 
    'feature_fraction': 0.8, 
    'bagging_fraction': 0.89, 
    'bagging_freq': 5, 
    'lambda_l2': 28,
    
    'seed': Config.seed,
    'objective': 'regression',
#     'boosting_type': 'gbdt',
#     'device': 'gpu', 
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0,
    'n_jobs': -1,
    'metric': 'rmse',
    'verbose': -1
}

if Config.gpu:
    lgbm_params["device"] = "gpu"
    lgbm_params["boosting_type"] = "gbdt"
    lgbm_params["gpu_platform_id"] = 0
    lgbm_params["gpu_device_id"] = 0

In [29]:
model_estimator_dict = {
    "xgb2": xgb.XGBRegressor(**xgb_params),
    "xgb_best_params": xgb.XGBRegressor(**best_xgb_params),

#     "lgbm1": lgb.LGBMRegressor(**lgbm_params),

    "cat1": cb.CatBoostRegressor(),
    "cat2": cb.CatBoostRegressor(**cb_params),
    "cat_best_params": cb.CatBoostRegressor(**best_cb_params),

    "xgb1": xgb.XGBRegressor(),
    "lgbm1": lgb.LGBMRegressor(),
    "lgbm1": lgb.LGBMRegressor(),
    "lgbm2": lgb.LGBMRegressor(
        learning_rate=0.05,
        max_depth=15,
        num_leaves=11,
        feature_fraction=0.3,
        subsample=0.1,
        n_jobs=-1,
    ),
    "lgbm3": lgb.LGBMRegressor(**lgbm_params),
    "lgbm_best_params": lgb.LGBMRegressor(**best_lgbm_params),


    "lin_reg": linear_model.LinearRegression(),
    "lasso": linear_model.Lasso(),
    "ridge": linear_model.Ridge(max_iter=7000),
    "ridge_25": linear_model.Ridge(fit_intercept=True, solver='auto', alpha=0.25, max_iter=7000),
    "ridge_50": linear_model.Ridge(fit_intercept=True, solver='auto', alpha=0.5, max_iter=7000),

}

## Tree Models

In [30]:
%%time

model_lst = ["xgb_best_params", "lgbm_best_params", "cat_best_params", "xgb1", "xgb2", "lgbm1", "lgbm2", "cat1", "cat2"]
# model_lst = ["lgbm1"]
# model_lst = = []
all_cv_scores = run_models4features(model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=False)    

all_cv_scores.sort_values(by=["Score"], ascending=False)

Model=xgb_best_params
{}
fold: 1, Score: 1553391.0526323328, Run Time: 168.42
fold: 2, Score: 1580669.853882261, Run Time: 206.01
fold: 3, Score: 1565235.1726674687, Run Time: 182.47
fold: 4, Score: 1582651.659557184, Run Time: 176.16
fold: 5, Score: 1570138.1392809958, Run Time: 211.29
Scores -> Adjusted: 1559724.93646878 , mean: 1570417.17560405, std: 10692.23913526

=== Model Feature Importance ===
attic 0.096934184
hasPool 0.09449263
squareMeters 0.08531257
basement 0.07558604
made 0.07291393
hasGuestRoom 0.07136008
is_original 0.06956717
isNewBuilt 0.069519065
hasStorageRoom 0.060691174
garage 0.054430082
hasYard 0.052781004
hasStormProtector 0.050565645
numPrevOwners 0.045684114
numberOfRooms 0.040322904
floors 0.030530512
cityPartRange 0.029308869


Unnamed: 0_level_0,pred_xgb_best_params
id,Unnamed: 1_level_1
0,4934937.0
1,6801636.0
2,5106223.5
3,7831515.0
4,5337987.0


Mode
=== Target Value Counts ===
Model Run Time: 947.14
Model=lgbm_best_params
{}
fold: 1, Score: 398158.56215760467, Run Time: 216.52
fold: 2, Score: 402911.5477302575, Run Time: 215.31
fold: 3, Score: 400633.8492429547, Run Time: 222.15
fold: 4, Score: 401075.8052344325, Run Time: 220.47
fold: 5, Score: 409016.9415879417, Run Time: 221.54
Scores -> Adjusted: 398701.47551215 , mean: 402359.34119064, std: 3657.86567849

=== Model Feature Importance ===
basement 0.12474603163020329
garage 0.1211214117357624
attic 0.117932538495298
squareMeters 0.11468111788823411
numberOfRooms 0.1005115332131729
floors 0.09645168789281049
made 0.07138322458778679
hasGuestRoom 0.05182726919239883
cityPartRange 0.050369394322575534
numPrevOwners 0.04865403279358431
isNewBuilt 0.01991039047278512
hasPool 0.018494735075416487
hasStormProtector 0.018273213152043267
hasYard 0.018116323507489526
hasStorageRoom 0.017426217562675067
is_original 0.01010087847776389


Unnamed: 0_level_0,pred_lgbm_best_params
id,Unnamed: 1_level_1
0,3474020.0
1,8901820.0
2,8513740.0
3,9176260.0
4,5856750.0


Mode
=== Target Value Counts ===
Model Run Time: 1096.76
Model=cat_best_params
{}
fold: 1, Score: 50448.25070081555, Run Time: 2.56
fold: 2, Score: 49976.05468304077, Run Time: 2.63
fold: 3, Score: 51220.50570192212, Run Time: 2.62
fold: 4, Score: 53042.29652157516, Run Time: 2.88
fold: 5, Score: 51315.14577593608, Run Time: 2.62
Scores -> Adjusted: 50154.41973647 , mean: 51200.45067666, std: 1046.03094018

=== Model Feature Importance ===
squareMeters 0.9999980029746096
garage 1.997025390334676e-06
is_original 0.0
hasGuestRoom 0.0
hasStorageRoom 0.0
attic 0.0
basement 0.0
hasStormProtector 0.0
isNewBuilt 0.0
made 0.0
numPrevOwners 0.0
cityPartRange 0.0
floors 0.0
hasPool 0.0
hasYard 0.0
numberOfRooms 0.0


Unnamed: 0_level_0,pred_cat_best_params
id,Unnamed: 1_level_1
0,3602740.0
1,9543730.0
2,9256350.0
3,9699200.0
4,6159700.0


Mode
=== Target Value Counts ===
Model Run Time: 13.83
Model=xgb1
{}
fold: 1, Score: 17577.262720405353, Run Time: 2.57
fold: 2, Score: 17193.8903175734, Run Time: 2.60
fold: 3, Score: 16674.26346570186, Run Time: 3.75
fold: 4, Score: 16629.915791579544, Run Time: 2.59
fold: 5, Score: 18430.308658984697, Run Time: 2.55
Scores -> Adjusted: 16636.85652382 , mean: 17301.12819085, std: 664.27166703

=== Model Feature Importance ===
squareMeters 0.98882294
numPrevOwners 0.004286385
numberOfRooms 0.0022135295
hasYard 0.0012227069
basement 0.0007427436
attic 0.00073237886
garage 0.00073111686
floors 0.00043429388
hasGuestRoom 0.00029682546
cityPartRange 0.00015362339
made 0.00014366602
hasPool 9.519807e-05
isNewBuilt 7.3293064e-05
hasStorageRoom 2.269525e-05
is_original 1.7690812e-05
hasStormProtector 1.0897483e-05


Unnamed: 0_level_0,pred_xgb1
id,Unnamed: 1_level_1
0,3429240.0
1,9518231.0
2,9268216.0
3,9764527.0
4,6179530.0


Mode
=== Target Value Counts ===
Model Run Time: 14.54
Model=xgb2
{}
fold: 1, Score: 143651.8151258208, Run Time: 72.05
fold: 2, Score: 146554.61114808155, Run Time: 74.43
fold: 3, Score: 146672.21880060717, Run Time: 74.72
fold: 4, Score: 145377.12701100862, Run Time: 74.44
fold: 5, Score: 147195.7679548516, Run Time: 75.07
Scores -> Adjusted: 144622.93955797 , mean: 145890.30800807, std: 1267.36845010

=== Model Feature Importance ===
squareMeters 0.8554658
is_original 0.028601384
made 0.022200009
floors 0.011429052
numberOfRooms 0.011045114
basement 0.010650473
hasGuestRoom 0.010544001
garage 0.009904542
attic 0.009711989
isNewBuilt 0.0048977644
cityPartRange 0.004784067
hasStormProtector 0.0047019855
hasPool 0.0046980544
numPrevOwners 0.0046805004
hasStorageRoom 0.0041002263
hasYard 0.0025850271


Unnamed: 0_level_0,pred_xgb2
id,Unnamed: 1_level_1
0,3501905.75
1,9326754.0
2,9134723.0
3,9436320.0
4,6268954.0


Mode
=== Target Value Counts ===
Model Run Time: 372.37
Model=lgbm1
{}
fold: 1, Score: 23557.021301756893, Run Time: 0.89
fold: 2, Score: 22927.351267612314, Run Time: 0.88
fold: 3, Score: 24380.1547534273, Run Time: 0.92
fold: 4, Score: 24043.87862298305, Run Time: 0.93
fold: 5, Score: 26139.38098780234, Run Time: 1.04
Scores -> Adjusted: 23127.83395444 , mean: 24209.55738672, std: 1081.72343227

=== Model Feature Importance ===
squareMeters 0.5136666666666667
garage 0.105
basement 0.08366666666666667
attic 0.06366666666666666
floors 0.056666666666666664
numberOfRooms 0.043666666666666666
numPrevOwners 0.03933333333333333
made 0.034
hasGuestRoom 0.022333333333333334
cityPartRange 0.017
hasStorageRoom 0.006
hasPool 0.004
hasStormProtector 0.0036666666666666666
isNewBuilt 0.0036666666666666666
hasYard 0.0023333333333333335
is_original 0.0013333333333333333


Unnamed: 0_level_0,pred_lgbm1
id,Unnamed: 1_level_1
0,3455080.0
1,9529130.0
2,9290590.0
3,9745980.0
4,6179610.0


Mode
=== Target Value Counts ===
Model Run Time: 5.30
Model=lgbm2
{}
fold: 1, Score: 888525.0002754522, Run Time: 0.46
fold: 2, Score: 901793.4857514688, Run Time: 0.45
fold: 3, Score: 899351.9110854829, Run Time: 0.50
fold: 4, Score: 894175.8862485467, Run Time: 0.46
fold: 5, Score: 905186.0213285397, Run Time: 0.53
Scores -> Adjusted: 891940.12853707 , mean: 897806.46093790, std: 5866.33240083

=== Model Feature Importance ===
squareMeters 0.197
made 0.104
numberOfRooms 0.101
garage 0.1
basement 0.096
attic 0.092
floors 0.089
is_original 0.085
numPrevOwners 0.037
hasGuestRoom 0.035
cityPartRange 0.028
hasStormProtector 0.016
hasYard 0.007
hasPool 0.005
hasStorageRoom 0.004
isNewBuilt 0.004


Unnamed: 0_level_0,pred_lgbm2
id,Unnamed: 1_level_1
0,3815410.0
1,7602610.0
2,7711020.0
3,7880600.0
4,5627590.0


Mode
=== Target Value Counts ===
Model Run Time: 2.96
Model=cat1
{}
fold: 1, Score: 30484.07902015963, Run Time: 5.37
fold: 2, Score: 28842.31460952947, Run Time: 5.73
fold: 3, Score: 35862.65781742862, Run Time: 5.35
fold: 4, Score: 27648.643057464924, Run Time: 6.53
fold: 5, Score: 33776.75252158439, Run Time: 5.44
Scores -> Adjusted: 28257.34273838 , mean: 31322.88940523, std: 3065.54666686

=== Model Feature Importance ===
squareMeters 0.9958598446136466
made 0.0007300467794965642
garage 0.0006798975653166463
basement 0.0005897760660228577
numberOfRooms 0.0004976590497504062
numPrevOwners 0.00042791358611680676
attic 0.00038972243189940085
floors 0.0002123419882391154
hasStormProtector 0.00018459250476685467
cityPartRange 0.000181817685942552
hasGuestRoom 0.0001419370936794741
hasStorageRoom 3.44907722972048e-05
hasPool 3.3933422254825555e-05
hasYard 1.569740050087824e-05
is_original 1.3572601526945236e-05
isNewBuilt 6.756438542821153e-06


Unnamed: 0_level_0,pred_cat1
id,Unnamed: 1_level_1
0,3432650.0
1,9510990.0
2,9281030.0
3,9681020.0
4,6125380.0


Mode
=== Target Value Counts ===
Model Run Time: 28.94
Model=cat2
{}
fold: 1, Score: 29719.48382289648, Run Time: 118.07
fold: 2, Score: 28814.88524723147, Run Time: 29.91
fold: 3, Score: 33430.968999055076, Run Time: 43.76
fold: 4, Score: 30758.985682419716, Run Time: 28.62
fold: 5, Score: 29323.46001391201, Run Time: 31.82
Scores -> Adjusted: 28769.44599318 , mean: 30409.55675310, std: 1640.11075993

=== Model Feature Importance ===
squareMeters 0.9956500359043124
garage 0.0011290328618585898
attic 0.0006914489379681839
hasGuestRoom 0.0005290232589550614
numPrevOwners 0.00034320451988401686
numberOfRooms 0.0003203043483924089
hasYard 0.0002998660921453093
cityPartRange 0.00024559957968358406
basement 0.00024417471537288034
hasStormProtector 0.00013837574627767373
hasStorageRoom 0.00011882039629183303
is_original 9.827314765083981e-05
hasPool 8.073860942421907e-05
isNewBuilt 4.662416577492762e-05
made 3.533268034258756e-05
floors 2.9145035665426853e-05


Unnamed: 0_level_0,pred_cat2
id,Unnamed: 1_level_1
0,3473550.0
1,9504870.0
2,9269390.0
3,9688800.0
4,6168380.0


Mode
=== Target Value Counts ===
Model Run Time: 252.92
CPU times: user 56min 30s, sys: 6min 52s, total: 1h 3min 22s
Wall time: 45min 34s


Unnamed: 0,Model,Score,StdDev,RunTime
0,xgb_best_params,1570420.0,10692.23914,947.14424
6,lgbm2,897806.0,5866.3324,2.96038
1,lgbm_best_params,402359.0,3657.86568,1096.76371
4,xgb2,145890.0,1267.36845,372.36899
2,cat_best_params,51200.5,1046.03094,13.83051
7,cat1,31322.9,3065.54667,28.9358
8,cat2,30409.6,1640.11076,252.92044
5,lgbm1,24209.6,1081.72343,5.30491
3,xgb1,17301.1,664.27167,14.54139


## Linear Models

In [31]:
model_lst = ["lin_reg", "lasso", "ridge", "ridge_25", "ridge_50"]
model_lst = ["lasso", "ridge",  "ridge_50"]
# model_lst = []
# all_cv_scores = run_models4features(model_lst, TARGET, FEATURES, all_cv_scores, linear_models=True)    
all_cv_scores = run_models4features(model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=True)    

all_cv_scores.head()

Model=lasso
fold: 1, Score: 101625.26299581607, Run Time: 0.03
fold: 2, Score: 1556084.7606959804, Run Time: 0.04
fold: 3, Score: 1545878.3858512293, Run Time: 0.04
fold: 4, Score: 1543038.2759907155, Run Time: 0.04
fold: 5, Score: 1564073.158003011, Run Time: 0.04
Scores -> Adjusted: 681834.37298968 , mean: 1262139.96870735, std: 580305.59571767


Unnamed: 0_level_0,pred_lasso
id,Unnamed: 1_level_1
0,4335780.0
1,6881170.0
2,6546510.0
3,6984780.0
4,5121890.0


Mode
=== Target Value Counts ===
Model Run Time: 0.93
Model=ridge
fold: 1, Score: 101683.15383446707, Run Time: 0.04
fold: 2, Score: 1556120.0506844728, Run Time: 0.04
fold: 3, Score: 1545913.716666436, Run Time: 0.04
fold: 4, Score: 1543073.64435113, Run Time: 0.04
fold: 5, Score: 1564108.9079791966, Run Time: 0.04
Scores -> Adjusted: 681883.27912793 , mean: 1262179.89470314, std: 580296.61557521


Unnamed: 0_level_0,pred_ridge
id,Unnamed: 1_level_1
0,4335800.0
1,6881110.0
2,6546450.0
3,6984710.0
4,5121880.0


Mode
=== Target Value Counts ===
Model Run Time: 0.80
Model=ridge_50
fold: 1, Score: 101654.73414884658, Run Time: 0.02
fold: 2, Score: 1556102.1862014532, Run Time: 0.04
fold: 3, Score: 1545895.8076678624, Run Time: 0.04
fold: 4, Score: 1543055.7028033254, Run Time: 0.04
fold: 5, Score: 1564090.8047879785, Run Time: 0.04
Scores -> Adjusted: 681859.04647050 , mean: 1262159.84712189, std: 580300.80065140


Unnamed: 0_level_0,pred_ridge_50
id,Unnamed: 1_level_1
0,4335800.0
1,6881150.0
2,6546480.0
3,6984750.0
4,5121880.0


Mode
=== Target Value Counts ===
Model Run Time: 0.79


Unnamed: 0,Model,Score,StdDev,RunTime
0,xgb_best_params,1570420.0,10692.23914,947.14424
1,lgbm_best_params,402359.0,3657.86568,1096.76371
2,cat_best_params,51200.5,1046.03094,13.83051
3,xgb1,17301.1,664.27167,14.54139
4,xgb2,145890.0,1267.36845,372.36899


In [32]:
sample_submission.head(20)

Unnamed: 0,id,price,target_xgb_best_params,target_lgbm_best_params,target_cat_best_params,target_xgb1,target_xgb2,target_lgbm1,target_lgbm2,target_cat1,target_cat2,target_lasso,target_ridge,target_ridge_50
0,22730,4634460.0,4291870.0,4204390.0,4758100.0,4739610.0,4680680.0,4750850.0,4324720.0,4758470.0,4715280.0,4760890.0,4760890.0,4760890.0
1,22731,4634460.0,5267520.0,5677610.0,6177280.0,6196200.0,6196950.0,6190860.0,5563370.0,6145450.0,6180160.0,5279980.0,5279950.0,5279960.0
2,22732,4634460.0,5624820.0,8336290.0,9035790.0,9057930.0,8960760.0,9038840.0,7318470.0,9048330.0,9067030.0,6519690.0,6519630.0,6519670.0
3,22733,4634460.0,1409210.0,1308340.0,1503040.0,1621520.0,1338770.0,1601900.0,2779530.0,1609460.0,1631770.0,1648940.0,1649070.0,1649010.0
4,22734,4634460.0,3657110.0,5874010.0,6733780.0,6764480.0,6562500.0,6745890.0,6037570.0,6687050.0,6703740.0,5350530.0,5350510.0,5350520.0
5,22735,4634460.0,2576360.0,603520.0,99502.4,108245.0,242312.0,103186.0,2185090.0,103126.0,83025.4,113674.0,113860.0,113769.0
6,22736,4634460.0,5874740.0,8606100.0,9867940.0,9910640.0,9476010.0,9869150.0,7405540.0,9893830.0,9834400.0,6441010.0,6440940.0,6440980.0
7,22737,4634460.0,5315430.0,4949980.0,5195330.0,5250570.0,5188100.0,5244460.0,5061120.0,5233410.0,5248370.0,4730450.0,4730450.0,4730450.0
8,22738,4634460.0,4107510.0,5648890.0,5634480.0,5554960.0,5421800.0,5566250.0,5136580.0,5581620.0,5533730.0,4835170.0,4835160.0,4835160.0
9,22739,4634460.0,4956550.0,8871030.0,9733080.0,9812260.0,9568630.0,9789900.0,7446870.0,9790730.0,9739780.0,6239970.0,6239910.0,6239940.0


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Blend Models</h1>
</div>

In [33]:
all_blend_scores = pd.DataFrame(
    {
        "Model": pd.Series(dtype="str"),
        "Score": pd.Series(dtype="float"),
        "StdDev": pd.Series(dtype="float"),
    }
)

In [34]:
sample_submission[TARGET] = (
#     (sample_submission["target_xgb_bp"] * 2 )
#     + (sample_submission["target_lgbm_bp"]  )
    (sample_submission["target_xgb1"] * 3 )
    + (sample_submission["target_lgbm1"])
#     + (sample_submission["target_lgbm2"])    
#     + (sample_submission["target_lgbm2"])
    + (sample_submission["target_cat1"] )
    + (sample_submission["target_cat2"] )    
#     + (sample_submission["target_cat_bp"] )
#     + (sample_submission["target_svc"] )
#     + (sample_submission["target_log_reg3"] )
#     + (sample_submission["target_cat2"] )
)/6

sample_submission[TARGET] = sample_submission[TARGET] #.astype(int)

In [35]:
sample_submission[[ID, TARGET]].to_csv("submission_wt_avg.csv", index=False)
sample_submission[[ID, TARGET]].tail(8)

Unnamed: 0,id,price
15146,37876,6833230.0
15147,37877,2175430.0
15148,37878,3076950.0
15149,37879,8620070.0
15150,37880,4535790.0
15151,37881,7337840.0
15152,37882,7752340.0
15153,37883,1778300.0


In [36]:
all_cv_scores.sort_values(by=["Score"], ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime
0,xgb_best_params,1570420.0,10692.23914,947.14424
10,ridge,1262180.0,580296.61558,0.79642
11,ridge_50,1262160.0,580300.80065,0.79171
9,lasso,1262140.0,580305.59572,0.92917
6,lgbm2,897806.0,5866.3324,2.96038
1,lgbm_best_params,402359.0,3657.86568,1096.76371
4,xgb2,145890.0,1267.36845,372.36899
2,cat_best_params,51200.5,1046.03094,13.83051
7,cat1,31322.9,3065.54667,28.9358
8,cat2,30409.6,1640.11076,252.92044


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Level 1 Stack Models</h1>
</div>

In [37]:
## TODO: Generate these dictionaries from model names

train_oof_dict = {
    "train_pred_cat1": "train_pred_cat1.csv",
    "train_pred_cat2": "train_pred_cat2.csv",
    "train_pred_lgbm1": "train_pred_lgbm1.csv",    
    "train_pred_lgbm2": "train_pred_lgbm2.csv",    
    "train_pred_xgb1": "train_pred_xgb1.csv"
}

test_pred_dict = {
    "submission_cat1": "submission_cat1.csv",
    "submission_cat2": "submission_cat2.csv",
    "submission_lgbm1": "submission_lgbm1.csv",
    "submission_lgbm2": "submission_lgbm2.csv",
    "submission_xgb1": "submission_xgb1.csv",
}

In [38]:
def blend_results(train_oof_dict, test_pred_dict):
    oof_df = pd.DataFrame()
    test_preds_df = pd.DataFrame()

    for name, train_oof_fname in train_oof_dict.items():
        fname = "../working/" + train_oof_fname
        print(f"Processing {name}, {train_oof_fname}")
        df = pd.read_csv(fname)
        print(df.head())
#         print(df.iloc[:,1])
        preds = pd.Series(df.iloc[:,1], name=name)
#         print(preds[:5])
        oof_df = pd.concat([oof_df, preds], axis=1)
    #     oof_df = pd.concat([oof_df, pd.Series(np.load(TRAIN_PATH / train_oof), name=name)], axis=1)

    for name, test_pred_fname in test_pred_dict.items():
        fname = "../working/" + test_pred_fname
        print(f"{name}, {test_pred_fname}")
        df = pd.read_csv(fname)
        print(df.head())
        preds = pd.Series(df.iloc[:,1], name=name)
        test_preds_df = pd.concat([test_preds_df, preds], axis=1)

    print("=== oof ===")
    print(oof_df.head())
    print("=== test_preds ===")
    print(test_preds_df.head())
    return oof_df, test_preds_df
    
# (oof_df, preds_df) = blend_results(train_oof_dict, test_pred_dict)    

In [39]:
def load_oof_results(train_oof_dict, test_pred_dict):
    oof_df = pd.DataFrame()
    test_preds_df = pd.DataFrame()

    for name, train_oof_fname in train_oof_dict.items():
        fname = "../working/" + train_oof_fname
        print(f"Processing {name}, {train_oof_fname}")
        df = pd.read_csv(fname)
        print(df.head())
#         print(df.iloc[:,1])
        preds = pd.Series(df.iloc[:,1], name=name)
#         print(preds[:5])
        oof_df = pd.concat([oof_df, preds], axis=1)
    #     oof_df = pd.concat([oof_df, pd.Series(np.load(TRAIN_PATH / train_oof), name=name)], axis=1)

    for name, test_pred_fname in test_pred_dict.items():
        fname = "../working/" + test_pred_fname
        print(f"{name}, {test_pred_fname}")
        df = pd.read_csv(fname)
        print(df.head())
        preds = pd.Series(df.iloc[:,1], name=name)
        test_preds_df = pd.concat([test_preds_df, preds], axis=1)

    print("=== oof ===")
    print(oof_df.head())
    print("=== test_preds ===")
    print(test_preds_df.head())
    return oof_df, test_preds_df
    
(oof_df, preds_df) = load_oof_results(train_oof_dict, test_pred_dict) 

Processing train_pred_cat1, train_pred_cat1.csv
   id    pred_cat1
0   0  3.43265e+06
1   1  9.51099e+06
2   2  9.28103e+06
3   3  9.68102e+06
4   4  6.12538e+06
Processing train_pred_cat2, train_pred_cat2.csv
   id    pred_cat2
0   0  3.47355e+06
1   1  9.50487e+06
2   2  9.26939e+06
3   3  9.68880e+06
4   4  6.16838e+06
Processing train_pred_lgbm1, train_pred_lgbm1.csv
   id   pred_lgbm1
0   0  3.45508e+06
1   1  9.52913e+06
2   2  9.29059e+06
3   3  9.74598e+06
4   4  6.17961e+06
Processing train_pred_lgbm2, train_pred_lgbm2.csv
   id   pred_lgbm2
0   0  3.81541e+06
1   1  7.60261e+06
2   2  7.71102e+06
3   3  7.88060e+06
4   4  5.62759e+06
Processing train_pred_xgb1, train_pred_xgb1.csv
   id  pred_xgb1
0   0  3429240.0
1   1  9518231.0
2   2  9268216.0
3   3  9764527.0
4   4  6179530.0
submission_cat1, submission_cat1.csv
      id        price
0  22730  4.75847e+06
1  22731  6.14545e+06
2  22732  9.04833e+06
3  22733  1.60946e+06
4  22734  6.68705e+06
submission_cat2, submission_c

In [40]:
oof_df.head()

Unnamed: 0,train_pred_cat1,train_pred_cat2,train_pred_lgbm1,train_pred_lgbm2,train_pred_xgb1
0,3432650.0,3473550.0,3455080.0,3815410.0,3429240.0
1,9510990.0,9504870.0,9529130.0,7602610.0,9518231.0
2,9281030.0,9269390.0,9290590.0,7711020.0,9268216.0
3,9681020.0,9688800.0,9745980.0,7880600.0,9764527.0
4,6125380.0,6168380.0,6179610.0,5627590.0,6179530.0


In [41]:
preds_df.head()

Unnamed: 0,submission_cat1,submission_cat2,submission_lgbm1,submission_lgbm2,submission_xgb1
0,4758470.0,4715280.0,4750850.0,4324720.0,4739607.5
1,6145450.0,6180160.0,6190860.0,5563370.0,6196195.5
2,9048330.0,9067030.0,9038840.0,7318470.0,9057930.0
3,1609460.0,1631770.0,1601900.0,2779530.0,1621524.5
4,6687050.0,6703740.0,6745890.0,6037570.0,6764482.0


In [42]:
type(preds_df)

pandas.core.frame.DataFrame

In [43]:
def run_lr(useful_features:List[str], TARGET:str, train_df:pd.DataFrame, test_df:pd.DataFrame) -> (List[float],List[float]):
    final_predictions = []
    scores = []

    kfold = model_selection.KFold(n_splits=Config.N_FOLDS, shuffle=True, random_state=Config.seed)

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(train_df)):
        xtrain = train_df.iloc[train_idx].reset_index(drop=True)
        xvalid = train_df.iloc[valid_idx].reset_index(drop=True)

        xtest = test_df[useful_features].copy()

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]

#         model = LogisticRegression()
        model = linear_model.LinearRegression()
        # Smaller C means more regularization; default=1.0
        # 2947.0517025518097
#         model = LogisticRegression(max_iter=500, C=2947.0517025518097, penalty='l2',solver='newton-cg')
#         model = LogisticRegression(C = 2947.0517025518097,
#                         max_iter = 500,
#                         penalty = 'l2',
#                         solver = 'liblinear')
        model.fit(xtrain, ytrain)

        preds_valid = model.predict_proba(xvalid)[:,-1]
        test_preds = model.predict_proba(xtest)[:,-1]

        final_predictions.append(test_preds)
#         score = metrics.roc_auc_score(yvalid, preds_valid)
        score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        print(f"Fold={fold}, Score={score}")
        scores.append(score)
    return scores, final_predictions


In [44]:
# useful_features = ["pred_lda", "pred_gbc","pred_gbc2", "pred_cat_bp", "pred_cat1", "pred_lgbm1", "pred_lgbm2", "pred_lgbm_bp", "pred_xgb1", "pred_xgb_bp"]
useful_features = [ "train_pred_cat1", "train_pred_cat2", "train_pred_lgbm1", "train_pred_lgbm2", "train_pred_xgb1"]

In [45]:
oof_df[useful_features].head()

Unnamed: 0,train_pred_cat1,train_pred_cat2,train_pred_lgbm1,train_pred_lgbm2,train_pred_xgb1
0,3432650.0,3473550.0,3455080.0,3815410.0,3429240.0
1,9510990.0,9504870.0,9529130.0,7602610.0,9518231.0
2,9281030.0,9269390.0,9290590.0,7711020.0,9268216.0
3,9681020.0,9688800.0,9745980.0,7880600.0,9764527.0
4,6125380.0,6168380.0,6179610.0,5627590.0,6179530.0


In [46]:
# preds_df[useful_features].head()

In [47]:
# fold_scores, final_predictions = run_lr(useful_features, TARGET, oof_df, preds_df)
# test_preds = np.mean(np.column_stack(final_predictions), axis=1)
# cv_score, std_dev = show_fold_scores(fold_scores)
# create_submission("level1_lr", TARGET, test_preds)

In [48]:
pd.options.display.max_colwidth = 100
pd.set_option("display.max_rows", 999)
pd.set_option("display.precision", 5)
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.max_colwidth

100

In [49]:
all_cv_scores.sort_values(by=["Score"], ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime
0,xgb_best_params,1570417.18,10692.24,947.14
10,ridge,1262179.89,580296.62,0.8
11,ridge_50,1262159.85,580300.8,0.79
9,lasso,1262139.97,580305.6,0.93
6,lgbm2,897806.46,5866.33,2.96
1,lgbm_best_params,402359.34,3657.87,1096.76
4,xgb2,145890.31,1267.37,372.37
2,cat_best_params,51200.45,1046.03,13.83
7,cat1,31322.89,3065.55,28.94
8,cat2,30409.56,1640.11,252.92
