<a href="https://www.kaggle.com/code/mmellinger66/s3e8-gemstone-pricing-models?scriptVersionId=121067390" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

 <div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Playground Season 3: Episode 8 - Gemstone Pricing Models</h1>
</div>

## Problem Type

Regression

## Evaluation Metric

$$RMSE = \sqrt{\frac{1}{N} \sum_{i=1}^N (y_i - \hat{y_i})^2}$$

```python
score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
```

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Import Libraries</h1>
</div>

In [1]:
from typing import List, Set, Dict, Tuple, Optional

import os
import time
from pathlib import Path
import glob
import gc

import pandas as pd
import numpy as np

from sklearn import impute
from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import cluster
from sklearn import model_selection
from sklearn import ensemble
from sklearn import datasets

import xgboost as xgb
import catboost as cb
import lightgbm as lgb

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Visualization Libraries
import matplotlib as mpl
import matplotlib.pylab as plt
import seaborn as sns
import missingno as msno
from folium import Map
from folium.plugins import HeatMap
from IPython.display import display_html, display_markdown, display_latex
from colorama import Fore, Style

import warnings
warnings.filterwarnings('ignore')

pd.set_option("display.max_rows", 999)
pd.set_option("display.precision", 5)

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Configuration</h1>
</div>

In [2]:
TARGET="price"
ID="id"

# Optuna
objective_direction = "minimize"  # minimize, maximize

In [3]:
class Config:
    path:str = "../input/playground-series-s3e8/"
    gpu:bool = True
    optimize:bool = True
    n_optuna_trials:int = 30 # 5, 10, 30
    fast_render:bool = False
    calc_probability:bool = False
    debug:bool = False
    seed:int = 42
    N_ESTIMATORS:int = 100  # 100, 300, 1000, 2000, 5000, 15_000, 20_000 GBDT
    GPU_N_ESTIMATORS:int = 2000 # Want models to run fast during dev
    N_FOLDS:int = 5

In [4]:
class clr:
    S = Style.BRIGHT + Fore.LIGHTRED_EX
    E = Style.RESET_ALL

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Library</h1>
</div>

In [5]:
def read_data(path: str, analyze:bool=True) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    data_dir = Path(path)

    train = pd.read_csv(data_dir / "train.csv")
    test = pd.read_csv(data_dir / "test.csv")
    submission_df = pd.read_csv(data_dir / "sample_submission.csv")

    if analyze:
        print(clr.S + "=== Shape of Data ==="+clr.E)
        print(f" train data: Rows={train.shape[0]}, Columns={train.shape[1]}")
        print(f" test data : Rows={test.shape[0]}, Columns={test.shape[1]}")

        print(clr.S + "\n=== Train Data: First 5 Rows ===\n"+clr.E)
        display(train.head())
        print(f"\n{clr.S}=== Train Column Names ==={clr.E}\n")
        display(train.columns)
        print(f"\n{clr.S}=== Features/Explanatory Variables ==={clr.E}\n")
        eval_features(train)
        print(f"\n{clr.S}=== Skewness ==={clr.E}\n")
        check_skew(train)
    return train, test, submission_df

def create_submission(model_name: str, target, preds, seed:int=42, nfolds:int=5) -> pd.DataFrame:
    sample_submission[target] = preds #.astype(int)

    if len(model_name) > 0:
        fname = f"submission_{model_name}_k{nfolds}_s{seed}.csv"
    else:
        fname = "submission.csv"

    sample_submission.to_csv(fname, index=False)

    return sample_submission

def show_classification_scores(ground_truth:List[int], yhat:List[int]) -> None:
    accuracy = metrics.accuracy_score(ground_truth, yhat)
    precision = metrics.precision_score(ground_truth, yhat)
    recall = metrics.recall_score(ground_truth, yhat)
    roc = metrics.roc_auc_score(ground_truth, yhat)
    f1 = metrics.f1_score(ground_truth, yhat)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"ROC: {roc:.4f}")
    print(f"f1: {f1:.4f}")
    

def label_encoder(train:pd.DataFrame, test:pd.DataFrame, columns:List[str]) -> (pd.DataFrame, pd.DataFrame) :
    for col in columns:
        train[col] = train[col].astype(str)
        test[col] = test[col].astype(str)
        train[col] = preprocessing.LabelEncoder().fit_transform(train[col])
        test[col] = preprocessing.LabelEncoder().fit_transform(test[col])
    return train, test   

def create_strat_folds(df:pd.DataFrame, TARGET, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    print(f"TARGET={TARGET}, n_folds={n_folds}, seed={seed}")
    df["fold"] = -1

    kf = model_selection.StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    # kf = GroupKFold(n_splits=Config.N_FOLDS)
    for fold, (train_idx, valid_idx) in enumerate(kf.split(df, df[TARGET])):
        df.loc[valid_idx, "fold"] = fold

    # df.to_csv(f"train_fold{num_folds}.csv", index=False)
    return df


def create_folds(df:pd.DataFrame, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    print(f"n_folds={n_folds}, seed={seed}")
    df["fold"] = -1

    kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=seed)

    for fold, (train_idx, valid_idx) in enumerate(kf.split(df)):
        df.loc[valid_idx, "fold"] = fold

    # df.to_csv(f"train_fold{num_folds}.csv", index=False)
    return df

def show_fold_scores(scores: List[float]) -> (float, float):
    cv_score = np.mean(scores)  # Used in filename
    std_dev = np.std(scores)
    print(
        f"Scores -> Adjusted: {np.mean(scores) - np.std(scores):.8f} , mean: {np.mean(scores):.8f}, std: {np.std(scores):.8f}"
    )
    return cv_score, std_dev


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    continuous_features = list(df.select_dtypes(include=['int64', 'float64', 'uint8']).columns)
    categorical_features = list(df.select_dtypes(include=['object', 'bool']).columns)
    if display:
        print(f"{clr.S}Continuous Features={continuous_features}{clr.E}\n")
        print(f"{clr.S}Categorical Features={categorical_features}{clr.E}")
    return continuous_features, categorical_features   

def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    print("=== Cardinality ===")
    print(df[features].nunique())

## === Model Support ===    

from scipy.stats import mode


def merge_test_predictions(final_test_predictions:List[float], calc_probability:bool=True) -> List[float]:

    if calc_probability:
        print("Mean")
        result = np.mean(np.column_stack(final_test_predictions), axis=1)
    else:
        print("Mode")
        mode_result = mode(np.column_stack(final_test_predictions), axis=1)
        result = mode_result[0].ravel()

    return result

def summary_statistics(X:pd.DataFrame, enhanced=True) -> None:
    desc = X.describe()
    if enhanced:
        desc.loc["var"] = X.var(numeric_only=True).tolist()
        desc.loc["skew"] = X.skew(numeric_only=True).tolist()
        desc.loc["kurt"] = X.kurtosis(numeric_only=True).tolist()

    with pd.option_context("display.precision", 2):
        style = desc.transpose().style.background_gradient(
            cmap="coolwarm"
        )  # .set_precision(4)
    display(style)
    
def show_missing_features(df:pd.DataFrame) -> None:
    missing_vals = df.isna().sum().sort_values(ascending=False)
    print(missing_vals[missing_vals > 0])


def show_duplicate_records(df:pd.DataFrame) -> None:
    dups = df.duplicated()
    print(dups.sum())


def eval_features(df:pd.DataFrame) -> (List[str], List[str], List[str]):
    ## Separate Categorical and Numerical Features
    categorical_features = list(
        df.select_dtypes(include=["category", "object"]).columns
    )
    continuous_features = list(df.select_dtypes(include=["number"]).columns)

    print(f"{clr.S}Continuous features:{clr.E} {continuous_features}")
    print(f"{clr.S}Categorical features:{clr.E} {categorical_features}")
    print("\n --- Cardinality of Categorical Features ---\n")

    for feature in categorical_features:
        cardinality = df[feature].nunique()
        if cardinality < 10:
            print(f"{clr.S}{feature}{clr.E}: cardinality={cardinality}, {df[feature].unique()}")
        else:
            print(f"{clr.S}{feature}{clr.E}: cardinality={cardinality}")
    all_features = categorical_features + continuous_features
    return all_features, categorical_features, continuous_features


def show_feature_importance(feature_importance_lst:List[str]) -> None:
    fis_df = pd.concat(feature_importance_lst, axis=1)

    fis_df.sort_values("0_importance", ascending=True).head(40).plot(
        kind="barh", figsize=(12, 12), title="Feature Importance Across Folds"
    )
    plt.show()


def show_feature_target_crosstab(df:pd.DataFrame, feature_lst:List[str], target:str) -> None:
    for feature in feature_lst:
        print(f"\n=== {feature} vs {target} ===\n")
        display(
            pd.crosstab(df[feature], df[target], margins=True)
        )  # display keeps bold formatting


def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    print(f"{clr.S}=== Cardinality ==={clr.E}")
    print(df[features].nunique())


def show_unique_features(df:pd.DataFrame, features:List[str]) -> None:
    for col in features:
        print(col, sorted(df[col].dropna().unique()))


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    continuous_features = list(
        df.select_dtypes(include=["int64", "float64", "uint8"]).columns
    )
    categorical_features = list(df.select_dtypes(include=["object", "bool"]).columns)
    if display:
        print(f"{clr.S}Continuous Features={clr.E}{continuous_features}\n")
        print(f"{clr.S}Categorical Features={clr.E}{categorical_features}")
    return continuous_features, categorical_features


def describe(X:pd.DataFrame) -> None:
    """Deprecated: Use summary_statistics()"""
    desc = X.describe()
    desc.loc['var'] = X.var(numeric_only=True).tolist()
    desc.loc['skew'] = X.skew(numeric_only=True).tolist()
    desc.loc['kurt'] = X.kurtosis(numeric_only=True).tolist()

    with pd.option_context('display.precision', 2):
        style = desc.transpose().style.background_gradient(cmap='coolwarm') #.set_precision(4)
    display(style)
  

def check_skew(df:pd.DataFrame) -> None:
    skew = df.skew(skipna=True,numeric_only=True).sort_values(ascending=False)
    print(skew)
    
def gpu_ify_lgbm(lgbm_dict):
    if Config.gpu:
        lgbm_dict["device"] = "gpu"
        lgbm_dict["boosting_type"] = "gbdt"
        lgbm_dict["gpu_platform_id"] = 0
        lgbm_dict["gpu_device_id"] = 0
    return lgbm_dict

def gpu_ify_cb(params):
    if Config.gpu:
        params["task_type"] = "GPU"
    return params    


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Optuna Hyperparameter Optimization Library</h1>
</div>

In [6]:
def objective_xgb(trial, X_train, X_valid, y_train, y_valid):

    xgb_params = {
        #         "objective": trial.suggest_categorical("objective", ["multi:softmax"]),
        #         "eval_metric": "mlogloss",
        #         "objective": "multi:softmax",
#         "eval_metric": "rmse",  # auc, rmse, mae
        "eval_metric": trial.suggest_categorical("eval_metric", ["rmse", "mae"]),
        "objective": trial.suggest_categorical("objective", ["reg:squarederror"]), # "reg:squarederror",
        #         "enable_categorical": trial.suggest_categorical("use_label_encoder", [True]),
        "use_label_encoder": trial.suggest_categorical("use_label_encoder", [False]),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 5000, 100),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "max_depth": trial.suggest_int("max_depth", 1, 20),  # 10
        "gamma": trial.suggest_float("gamma", 0, 100, step=0.1),
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "tree_method": trial.suggest_categorical(
            "tree_method", ["gpu_hist"]
        ),  # hist, gpu_hist
        "predictor": "gpu_predictor",
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 100),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }

    # Model loading and training
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=5000,
        verbose=0,
    )

    print(f"Number of boosting rounds: {model.best_iteration}")
    #     oof = model.predict_proba(X_valid)[:, 1] # Probability
    oof = model.predict(X_valid)  # Classification: 0,1

    return metrics.mean_squared_error(y_valid, oof, squared=False)


def objective_lgbm(trial, X_train, X_valid, y_train, y_valid):

    lgbm_params = {
        "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [1_000]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [5000]),
        "n_estimators": trial.suggest_int("n_estimators", 700, 5000),
        "importance_type": "gain",
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 300),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        #         'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }

    # Model loading and training
    model = lgb.LGBMRegressor(**lgbm_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric="mae",
        callbacks=[
            lgb.log_evaluation(500),
            lgb.early_stopping(500, False, True),
        ],
    )

    #     print(f"Number of boosting rounds: {model.best_iteration}")
    oof = model.predict(X_valid)

    return metrics.mean_squared_error(y_valid, oof, squared=False)
#     return metrics.mean_absolute_error(y_valid, oof)


def objective_clf_lgbm(trial, X_train, X_valid, y_train, y_valid):

    params = {
        "boosting_type": "gbdt",
        # "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),
        #         "objective": trial.suggest_categorical("objective", ["multi:softprob"]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [1_000]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [5000]),
        "n_estimators": trial.suggest_int("n_estimators", 700, 1000),
        "importance_type": "gain",
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 300),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        #         'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }
    if Config.gpu:
        params["device_type"] = "gpu"

    # Model loading and training
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="mae",
        callbacks=[
            lgb.log_evaluation(500),
            lgb.early_stopping(500, False, True),
        ],
    )

    #     print(f"Number of boosting rounds: {model.best_iteration}")
    oof = model.predict(X_valid)

    #     return accuracy_score(y_valid, oof)
    return metrics.roc_auc_score(y_valid, oof)


def objective_cb(trial, X_train, X_valid, y_train, y_valid):

    cb_params = {
        "iterations": 100,
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.1, 1.0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 100),
        "bagging_temperature": trial.suggest_loguniform(
            "bagging_temperature", 0.1, 20.0
        ),
        "random_strength": trial.suggest_float("random_strength", 1.0, 2.0),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
          "use_best_model": True,
#         "task_type": "GPU",
        "random_seed": 42,
    }

    # Model loading and training
    model = cb.CatBoostRegressor(**cb_params)

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="accuracy",
        early_stopping_rounds=500,
        verbose=False,
    )

#     print(f"Number of boosting rounds: {model.best_iteration}")
    # oof = model.predict_proba(X_valid)[:, 1]
    oof = model.predict(X_valid)  # Classification
    return metrics.mean_squared_error(y_valid, oof, squared=False)
#     return metrics.mean_absolute_error(y_valid, oof)
# 
#     return accuracy_score(y_valid, oof)

def objective_clf_cb(trial, X_train, X_valid, y_train, y_valid):

    cb_params = {
        "iterations": 10,  # 1000
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.1, 1.0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 100),
        "bagging_temperature": trial.suggest_loguniform(
            "bagging_temperature", 0.1, 20.0
        ),
        "random_strength": trial.suggest_float("random_strength", 1.0, 2.0),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
        "use_best_model": True,
#             "task_type": "GPU",
        "random_seed": 42,
    }

    # Model loading and training
    model = cb.CatBoostClassifier(**cb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="accuracy",
        early_stopping_rounds=500,
        verbose=False,
    )

    # print(f"Number of boosting rounds: {model.best_iteration}")
    # oof = model.predict_proba(X_valid)[:, 1]
    oof = model.predict(X_valid)  # Classification

    return metrics.accuracy_score(y_valid, oof)

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Load Train/Test Data and Analyze</h1>
</div>

## Load the following files

 - train.csv - Data used to build our machine learning model
 - test.csv - Data used to build our machine learning model. Does not contain the target variable
 - sample_submission.csv - A file in the proper format to submit test predictions

In [7]:
%%time
train, test, sample_submission = read_data(Config.path, analyze=True)                                

[1m[91m=== Shape of Data ===[0m
 train data: Rows=193573, Columns=11
 test data : Rows=129050, Columns=10
[1m[91m
=== Train Data: First 5 Rows ===
[0m


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453



[1m[91m=== Train Column Names ===[0m



Index(['id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y',
       'z', 'price'],
      dtype='object')


[1m[91m=== Features/Explanatory Variables ===[0m

[1m[91mContinuous features:[0m ['id', 'carat', 'depth', 'table', 'x', 'y', 'z', 'price']
[1m[91mCategorical features:[0m ['cut', 'color', 'clarity']

 --- Cardinality of Categorical Features ---

[1m[91mcut[0m: cardinality=5, ['Premium' 'Very Good' 'Ideal' 'Good' 'Fair']
[1m[91mcolor[0m: cardinality=7, ['F' 'J' 'G' 'E' 'D' 'H' 'I']
[1m[91mclarity[0m: cardinality=8, ['VS2' 'SI2' 'VS1' 'SI1' 'IF' 'VVS2' 'VVS1' 'I1']

[1m[91m=== Skewness ===[0m

price    1.60558
carat    0.99513
z        0.68567
table    0.61906
x        0.36105
y        0.35676
id       0.00000
depth   -0.27638
dtype: float64
CPU times: user 324 ms, sys: 80.5 ms, total: 404 ms
Wall time: 680 ms


In [8]:
train.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [9]:
original = pd.read_csv("../input/gemstone-price-prediction/cubic_zirconia.csv", index_col=[0])
original = original[-original.depth.isna()]
original.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
1,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
2,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7,984
3,0.9,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289
4,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.8,2.96,1082
5,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65,779


In [10]:
original.shape

(26270, 10)

In [11]:
train['is_original']    = 0
test['is_original']     = 0
original['is_original'] = 1
combined = pd.concat([train, original], ignore_index=True).drop_duplicates()
train = combined

In [12]:
combined.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price,is_original
0,0.0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619,0
1,1.0,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387,0
2,2.0,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772,0
3,3.0,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666,0
4,4.0,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453,0


In [13]:
summary_statistics(train.drop(columns=[ID], axis=1), enhanced=True)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,var,skew,kurt
carat,219809.0,0.79,0.46,0.2,0.4,0.7,1.03,4.5,0.22,1.01,0.63
depth,219809.0,61.81,1.13,50.8,61.2,61.9,62.4,73.6,1.27,-0.24,3.07
table,219809.0,57.25,1.96,49.0,56.0,57.0,58.0,79.0,3.84,0.66,1.04
x,219809.0,5.72,1.11,0.0,4.7,5.7,6.52,10.23,1.24,0.36,-0.78
y,219809.0,5.72,1.11,0.0,4.71,5.72,6.51,58.9,1.23,0.85,23.12
z,219809.0,3.53,0.69,0.0,2.9,3.53,4.03,31.3,0.48,0.65,11.15
price,219809.0,3965.19,4032.64,326.0,949.0,2398.0,5405.0,18818.0,16262215.44,1.61,2.11
is_original,219809.0,0.12,0.32,0.0,0.0,0.0,0.0,1.0,0.11,2.35,3.51


## Outlier Detection

In [14]:
# https://www.kaggle.com/code/lyasdemir/best-algorithm-for-prediction-xgboost
    
def iqr(data:pd.DataFrame, var:str):# outliers detecion .
    q1 = np.quantile(data[var], 0.25)
    q3 = np.quantile(data[var], 0.75)
    diff = q3 - q1
    lower_t = q1 - (1.5 * diff)
    upper_t = q3 + (1.5 * diff)
    return data[(data[var] < lower_t) | (data[var] > upper_t)]

# iqr(train, "squareMeters")

In [15]:
# https://www.kaggle.com/code/sujithmandala/playground-s3-e8-ensemble-model-98-accuracy

def detect_outliers(data:pd.DataFrame) -> pd.DataFrame:
    outlier_percents = {}
    for column in data.columns:
        if data[column].dtype != object:
            q1 = np.quantile(data[column], 0.25)
            q3 = np.quantile(data[column], 0.75)
            iqr = q3 - q1
            upper_bound = q3 + (1.5 * iqr)
            lower_bound = q1 - (1.5 * iqr)
            outliers = data[(data[column] > upper_bound) | (data[column] < lower_bound)][column]
            outlier_percentage = len(outliers) / len(data[column]) * 100
            outlier_percents[column] = outlier_percentage
            outlier_dataframe = pd.DataFrame(data = outlier_percents.values() ,index=outlier_percents.keys() ,columns=['Outlier_percentage'])
    
    return outlier_dataframe.sort_values(by = 'Outlier_percentage', ascending = False)

detect_outliers(train)


Unnamed: 0,Outlier_percentage
is_original,11.93582
price,6.53067
depth,4.58989
carat,3.96799
table,2.54721
z,0.0182
x,0.01456
y,0.01319
id,0.0


In [16]:
# https://www.kaggle.com/code/sujithmandala/playground-s3-e8-ensemble-model-98-accuracy
    
def detect_outliers(data:pd.DataFrame) -> pd.DataFrame:
    outlier_percents = {}
    for column in data.columns:
        if data[column].dtype != object:
            q1 = np.quantile(data[column], 0.25)
            q3 = np.quantile(data[column], 0.75)
            iqr = q3 - q1
            upper_bound = q3 + (1.5 * iqr)
            lower_bound = q1 - (1.5 * iqr)
            outliers = data[(data[column] > upper_bound) | (data[column] < lower_bound)][column]
            outlier_percentage = len(outliers) / len(data[column]) * 100
            outlier_percents[column] = outlier_percentage
            outlier_dataframe = pd.DataFrame(data = outlier_percents.values() ,index=outlier_percents.keys() ,columns=['Outlier_percentage'])
    
    return outlier_dataframe.sort_values(by = 'Outlier_percentage', ascending = False)

detect_outliers(test)


Unnamed: 0,Outlier_percentage
depth,5.06083
carat,3.92096
table,2.30918
z,0.01937
x,0.00697
y,0.00697
id,0.0
is_original,0.0


In [17]:
# iqr(train,"floors")

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Feature Engineering</h1>
</div>

## Categorical/Numerical Variables

In [18]:
# train.drop(['cityCode'], axis=1, inplace=True)
# test.drop(['cityCode'], axis=1, inplace=True)

## Handle Outliers
- https://www.kaggle.com/code/lyasdemir/best-algorithm-for-prediction-xgboost
- https://www.kaggle.com/code/mnokno/paris-housing-price-prediction-using-xgboost

In [19]:
# features_with_outliers = ['attic', 'garage', 'made', 'basement', 'floors', 'cityCode', 'squareMeters']
# features_with_outliers = ['attic', 'garage', 'made', 'basement', 'floors',  'squareMeters']

In [20]:
# https://www.kaggle.com/code/mnokno/paris-housing-price-prediction-using-xgboost

def remove_outliers(df:pd.DataFrame) -> pd.DataFrame:
    for c in features_with_outliers:
        if c == 'garage':
            first_percentile = df[c].quantile(0.001)
            df = df[df[c] > first_percentile]

        ninety_ninth_percentile = df[c].quantile(0.999)
        df = df[df[c] < ninety_ninth_percentile]
        #df_t = df_t[(df_t[c] > first_percentile) & (df_t[c] < ninety_ninth_percentile)]
    return df


In [21]:
# print(f'Before: {len(train)}')
# train = remove_outliers(train)
# print(f'After: {len(train)}')

In [22]:
train.head(10)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price,is_original
0,0.0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619,0
1,1.0,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387,0
2,2.0,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772,0
3,3.0,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666,0
4,4.0,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453,0
5,5.0,1.51,Very Good,J,SI1,62.8,58.0,7.34,7.29,4.59,7506,0
6,6.0,0.74,Ideal,E,VS2,61.8,57.0,5.76,5.79,3.57,3229,0
7,7.0,1.34,Premium,G,SI2,62.5,57.0,7.0,7.05,4.38,6224,0
8,8.0,0.3,Ideal,F,IF,62.0,56.0,4.35,4.37,2.7,886,0
9,9.0,0.3,Good,J,VS1,63.6,57.0,4.26,4.28,2.72,421,0


In [23]:
train = train.reset_index(drop=True).copy()
train.head(10)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price,is_original
0,0.0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619,0
1,1.0,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387,0
2,2.0,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772,0
3,3.0,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666,0
4,4.0,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453,0
5,5.0,1.51,Very Good,J,SI1,62.8,58.0,7.34,7.29,4.59,7506,0
6,6.0,0.74,Ideal,E,VS2,61.8,57.0,5.76,5.79,3.57,3229,0
7,7.0,1.34,Premium,G,SI2,62.5,57.0,7.0,7.05,4.38,6224,0
8,8.0,0.3,Ideal,F,IF,62.0,56.0,4.35,4.37,2.7,886,0
9,9.0,0.3,Good,J,VS1,63.6,57.0,4.26,4.28,2.72,421,0


In [24]:
excluded_features = [TARGET, ID, "fold"]

In [25]:
cont_features, cat_features = feature_distribution_types(train, display=True)
show_cardinality(train, cat_features)

cont_features = [feature for feature in cont_features if feature not in excluded_features]
cat_features = [feature for feature in cat_features if feature not in excluded_features]

FEATURES = cont_features + cat_features
FEATURES

[1m[91mContinuous Features=[0m['id', 'carat', 'depth', 'table', 'x', 'y', 'z', 'price', 'is_original']

[1m[91mCategorical Features=[0m['cut', 'color', 'clarity']
[1m[91m=== Cardinality ===[0m
cut        5
color      7
clarity    8
dtype: int64


['carat',
 'depth',
 'table',
 'x',
 'y',
 'z',
 'is_original',
 'cut',
 'color',
 'clarity']

In [26]:
# train, test = label_encoder(train, test, cat_features)
train = pd.get_dummies(train,columns=['cut','color','clarity']) # Will remove original feature names
test = pd.get_dummies(test,columns=['cut','color','clarity'])

In [27]:
train.head()

Unnamed: 0,id,carat,depth,table,x,y,z,price,is_original,cut_Fair,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.0,1.52,62.2,58.0,7.27,7.33,4.55,13619,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1.0,2.03,62.0,58.0,8.06,8.12,5.05,13387,0,0,...,0,1,0,0,0,1,0,0,0,0
2,2.0,0.7,61.2,57.0,5.69,5.73,3.5,2772,0,0,...,0,0,0,0,0,0,1,0,0,0
3,3.0,0.32,61.6,56.0,4.38,4.41,2.71,666,0,0,...,0,0,0,0,0,0,1,0,0,0
4,4.0,1.7,62.6,59.0,7.65,7.61,4.77,14453,0,0,...,0,0,0,0,0,0,0,1,0,0


In [28]:
cont_features, cat_features = feature_distribution_types(train, display=True)
show_cardinality(train, cat_features)

cont_features = [feature for feature in cont_features if feature not in excluded_features]
cat_features = [feature for feature in cat_features if feature not in excluded_features]

FEATURES = cont_features + cat_features
FEATURES

[1m[91mContinuous Features=[0m['id', 'carat', 'depth', 'table', 'x', 'y', 'z', 'price', 'is_original', 'cut_Fair', 'cut_Good', 'cut_Ideal', 'cut_Premium', 'cut_Very Good', 'color_D', 'color_E', 'color_F', 'color_G', 'color_H', 'color_I', 'color_J', 'clarity_I1', 'clarity_IF', 'clarity_SI1', 'clarity_SI2', 'clarity_VS1', 'clarity_VS2', 'clarity_VVS1', 'clarity_VVS2']

[1m[91mCategorical Features=[0m[]
[1m[91m=== Cardinality ===[0m
Series([], dtype: float64)


['carat',
 'depth',
 'table',
 'x',
 'y',
 'z',
 'is_original',
 'cut_Fair',
 'cut_Good',
 'cut_Ideal',
 'cut_Premium',
 'cut_Very Good',
 'color_D',
 'color_E',
 'color_F',
 'color_G',
 'color_H',
 'color_I',
 'color_J',
 'clarity_I1',
 'clarity_IF',
 'clarity_SI1',
 'clarity_SI2',
 'clarity_VS1',
 'clarity_VS2',
 'clarity_VVS1',
 'clarity_VVS2']

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Optuna Hyperparameter Optimization</h1>
</div>

In [29]:
%%time

if Config.optimize:
    y = train[TARGET]
    X = train[FEATURES].copy()

    X_test = test[FEATURES].copy()
    X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
        X, y, test_size=0.2, random_state=Config.seed
    )

# === XGB ===

time_limit = 3600 * 3
best_xgb_params = {}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction)
    study.optimize(
        lambda trial: objective_xgb(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best XGB trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_xgb_params = study.best_trial.params

## === LGBM ===

time_limit = 3600 * 3
best_lgbm_params = {}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction) # minimize, maximize
    study.optimize(
        lambda trial: objective_lgbm(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best LGBM trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_lgbm_params = study.best_trial.params

## === CatBoost

time_limit = 3600 * 3
# best_cb_params = {}
best_cb_params = {'learning_rate': 0.45743264601999495,
                  'l2_leaf_reg': 41.338946049390074,
                  'bagging_temperature': 0.3472567739474319,
                  'random_strength': 1.7332249677756242, 
                  'depth': 1,
                  'min_data_in_leaf': 6}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction) # minimize, maximize
    study.optimize(
        lambda trial: objective_cb(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best Cat trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_cb_params = study.best_trial.params

[32m[I 2023-03-04 23:08:04,048][0m A new study created in memory with name: no-name-cedef46b-0129-4282-b62f-549708498d10[0m
[32m[I 2023-03-04 23:08:08,635][0m Trial 0 finished with value: 679.6489985497684 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 1000, 'learning_rate': 0.08490355241191462, 'subsample': 0.64, 'colsample_bytree': 0.09, 'max_depth': 7, 'gamma': 37.800000000000004, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 2.1079316128970733, 'reg_alpha': 4.047678508882151e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.21610910309493594}. Best is trial 0 with value: 679.6489985497684.[0m


Number of boosting rounds: 999


[32m[I 2023-03-04 23:09:07,038][0m Trial 1 finished with value: 581.1566641261364 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 2500, 'learning_rate': 0.027060856262623972, 'subsample': 0.8, 'colsample_bytree': 0.31, 'max_depth': 18, 'gamma': 43.0, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 2.319955145395528e-08, 'reg_alpha': 4.556601327732554e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 93.53070869971388}. Best is trial 1 with value: 581.1566641261364.[0m


Number of boosting rounds: 1599


[32m[I 2023-03-04 23:09:10,923][0m Trial 2 finished with value: 906.5618848794065 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 2500, 'learning_rate': 0.05356435582552165, 'subsample': 0.35, 'colsample_bytree': 0.9800000000000001, 'max_depth': 1, 'gamma': 3.5, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.00043753942867570737, 'reg_alpha': 5.199418416826376, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 3.2939722897296533}. Best is trial 1 with value: 581.1566641261364.[0m


Number of boosting rounds: 2492


[32m[I 2023-03-04 23:09:20,939][0m Trial 3 finished with value: 604.932017257285 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 5000, 'learning_rate': 0.028979591832437594, 'subsample': 0.8099999999999999, 'colsample_bytree': 0.13, 'max_depth': 3, 'gamma': 1.0, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 3.5342310487476336e-06, 'reg_alpha': 1.94189141241345e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 3.3120289129419382}. Best is trial 1 with value: 581.1566641261364.[0m


Number of boosting rounds: 4999


[32m[I 2023-03-04 23:11:48,452][0m Trial 4 finished with value: 615.3847712590878 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 4500, 'learning_rate': 0.11616873946205342, 'subsample': 0.91, 'colsample_bytree': 0.15000000000000002, 'max_depth': 13, 'gamma': 11.200000000000001, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 5.025642501950745, 'reg_alpha': 8.076393777726718e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.20677891839190254}. Best is trial 1 with value: 581.1566641261364.[0m


Number of boosting rounds: 1019


[32m[I 2023-03-04 23:11:54,013][0m Trial 5 finished with value: 924.9889143942663 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 2500, 'learning_rate': 0.23766752913748068, 'subsample': 0.52, 'colsample_bytree': 0.060000000000000005, 'max_depth': 15, 'gamma': 97.4, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.3895525364803435, 'reg_alpha': 3.0825685190374794e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 573.54039694935}. Best is trial 1 with value: 581.1566641261364.[0m


Number of boosting rounds: 2457


[32m[I 2023-03-04 23:12:47,971][0m Trial 6 finished with value: 657.1460648371859 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 3500, 'learning_rate': 0.09868261901675937, 'subsample': 0.35, 'colsample_bytree': 0.11, 'max_depth': 18, 'gamma': 93.2, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 5.830736596827993e-07, 'reg_alpha': 0.10023823268047036, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 2.0867438289458216}. Best is trial 1 with value: 581.1566641261364.[0m


Number of boosting rounds: 3423


[32m[I 2023-03-04 23:13:23,007][0m Trial 7 finished with value: 576.6268775575742 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 4100, 'learning_rate': 0.02548747612474775, 'subsample': 0.2, 'colsample_bytree': 0.68, 'max_depth': 10, 'gamma': 45.900000000000006, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 1.3773465454209988e-06, 'reg_alpha': 20.982275396908076, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 24.473511196982756}. Best is trial 7 with value: 576.6268775575742.[0m


Number of boosting rounds: 839


[32m[I 2023-03-04 23:13:43,029][0m Trial 8 finished with value: 579.8302073679071 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 1500, 'learning_rate': 0.1685614364839735, 'subsample': 0.69, 'colsample_bytree': 0.9900000000000001, 'max_depth': 8, 'gamma': 3.2, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.17355046133936552, 'reg_alpha': 1.0351175262733169e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.5632731887539123}. Best is trial 7 with value: 576.6268775575742.[0m


Number of boosting rounds: 35


[32m[I 2023-03-04 23:13:49,976][0m Trial 9 finished with value: 581.6981768325348 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 1400, 'learning_rate': 0.04852171274295565, 'subsample': 0.17, 'colsample_bytree': 0.77, 'max_depth': 8, 'gamma': 49.400000000000006, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 1.5211850621591303e-06, 'reg_alpha': 2.7985903039426107e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 45.562876533740955}. Best is trial 7 with value: 576.6268775575742.[0m


Number of boosting rounds: 1231


[32m[I 2023-03-04 23:14:45,891][0m Trial 10 finished with value: 576.16447681545 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 3800, 'learning_rate': 0.011236763751464105, 'subsample': 0.17, 'colsample_bytree': 0.5700000000000001, 'max_depth': 12, 'gamma': 72.2, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.00012529219540999208, 'reg_alpha': 79.37238918995058, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 16.58571642709189}. Best is trial 10 with value: 576.16447681545.[0m


Number of boosting rounds: 998


[32m[I 2023-03-04 23:15:30,029][0m Trial 11 finished with value: 580.622519127852 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 3800, 'learning_rate': 0.01035031653425949, 'subsample': 0.11, 'colsample_bytree': 0.5900000000000001, 'max_depth': 12, 'gamma': 72.9, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.00040198890821522296, 'reg_alpha': 58.95810446332887, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 20.439201548018783}. Best is trial 10 with value: 576.16447681545.[0m


Number of boosting rounds: 3222


[32m[I 2023-03-04 23:16:31,688][0m Trial 12 finished with value: 573.2321580179918 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 4000, 'learning_rate': 0.010282136059357808, 'subsample': 0.28, 'colsample_bytree': 0.5700000000000001, 'max_depth': 11, 'gamma': 67.60000000000001, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 3.673445449746742e-05, 'reg_alpha': 94.61689023645926, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 13.723259591413413}. Best is trial 12 with value: 573.2321580179918.[0m


Number of boosting rounds: 886


[32m[I 2023-03-04 23:20:06,362][0m Trial 13 finished with value: 585.1512528059601 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 3200, 'learning_rate': 0.010212639726670225, 'subsample': 0.33999999999999997, 'colsample_bytree': 0.44, 'max_depth': 15, 'gamma': 72.5, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.007138683169815972, 'reg_alpha': 0.4901626655932329, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 5.2220283133799965}. Best is trial 12 with value: 573.2321580179918.[0m


Number of boosting rounds: 1103


[32m[I 2023-03-04 23:20:19,477][0m Trial 14 finished with value: 573.9647631698928 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 4300, 'learning_rate': 0.014907448432415558, 'subsample': 0.5, 'colsample_bytree': 0.44, 'max_depth': 5, 'gamma': 68.8, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.0001230540067323624, 'reg_alpha': 0.00977917739941642, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 15.968597964474597}. Best is trial 12 with value: 573.2321580179918.[0m


Number of boosting rounds: 4086


[32m[I 2023-03-04 23:20:33,121][0m Trial 15 finished with value: 587.1260134083714 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 4700, 'learning_rate': 0.015998996741384407, 'subsample': 0.45999999999999996, 'colsample_bytree': 0.32, 'max_depth': 5, 'gamma': 60.2, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.012446930417691644, 'reg_alpha': 0.008842528431084172, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 108.36690802290367}. Best is trial 12 with value: 573.2321580179918.[0m


Number of boosting rounds: 4694


[32m[I 2023-03-04 23:20:46,280][0m Trial 16 finished with value: 578.8358407929198 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 4300, 'learning_rate': 0.016817236791263103, 'subsample': 0.44000000000000006, 'colsample_bytree': 0.43, 'max_depth': 5, 'gamma': 29.6, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 60.19347907487866, 'reg_alpha': 0.0013004712848555366, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 7.977241548293591}. Best is trial 12 with value: 573.2321580179918.[0m


Number of boosting rounds: 4295


[32m[I 2023-03-04 23:21:23,691][0m Trial 17 finished with value: 571.0769414066709 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 3000, 'learning_rate': 0.017215600566501548, 'subsample': 0.27, 'colsample_bytree': 0.8, 'max_depth': 10, 'gamma': 63.2, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 5.226033575184813e-05, 'reg_alpha': 1.4475859923089087, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 9.667017443653867}. Best is trial 17 with value: 571.0769414066709.[0m


Number of boosting rounds: 449


[32m[I 2023-03-04 23:22:38,830][0m Trial 18 finished with value: 570.3453795804684 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 3000, 'learning_rate': 0.016841746742841453, 'subsample': 0.27, 'colsample_bytree': 0.8400000000000001, 'max_depth': 10, 'gamma': 82.5, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 2.4429526667772036e-05, 'reg_alpha': 3.6941792871710204, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 1.311116851190873}. Best is trial 18 with value: 570.3453795804684.[0m


Number of boosting rounds: 416


[32m[I 2023-03-04 23:30:27,521][0m Trial 19 finished with value: 585.5077228523377 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 2900, 'learning_rate': 0.020956456246950056, 'subsample': 0.28, 'colsample_bytree': 0.8500000000000001, 'max_depth': 15, 'gamma': 84.60000000000001, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 1.4499332446538783e-05, 'reg_alpha': 1.7921740881013852, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 1.1825577594035213}. Best is trial 18 with value: 570.3453795804684.[0m


Number of boosting rounds: 260
Number of boosting rounds: 121


[32m[I 2023-03-04 23:39:45,011][0m Trial 20 finished with value: 644.7738685873061 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 2100, 'learning_rate': 0.04051834639458643, 'subsample': 1.0, 'colsample_bytree': 0.8600000000000001, 'max_depth': 20, 'gamma': 86.5, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.003266721270438584, 'reg_alpha': 0.268631055203354, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.7699313010839491}. Best is trial 18 with value: 570.3453795804684.[0m
[32m[I 2023-03-04 23:40:40,714][0m Trial 21 finished with value: 571.2283398300225 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 3200, 'learning_rate': 0.013778755509349804, 'subsample': 0.25, 'colsample_bytree': 0.7100000000000001, 'max_depth': 11, 'gamma': 61.1, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 3.331979241943637e-05, '

Number of boosting rounds: 496


[32m[I 2023-03-04 23:41:12,703][0m Trial 22 finished with value: 572.5211940695095 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 3000, 'learning_rate': 0.02056873623435118, 'subsample': 0.25, 'colsample_bytree': 0.7300000000000001, 'max_depth': 9, 'gamma': 60.900000000000006, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 1.3188280083877965e-05, 'reg_alpha': 4.258260149590548, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 5.406596640520866}. Best is trial 18 with value: 570.3453795804684.[0m


Number of boosting rounds: 497


[32m[I 2023-03-04 23:43:19,125][0m Trial 23 finished with value: 577.5024936630049 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 3400, 'learning_rate': 0.014140092657686978, 'subsample': 0.1, 'colsample_bytree': 0.8700000000000001, 'max_depth': 13, 'gamma': 81.7, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.0006051595035461962, 'reg_alpha': 6.125870207535751, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 1.7006913902819638}. Best is trial 18 with value: 570.3453795804684.[0m


Number of boosting rounds: 435


[32m[I 2023-03-04 23:43:51,484][0m Trial 24 finished with value: 571.6712704133697 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 2100, 'learning_rate': 0.019912739135573267, 'subsample': 0.4, 'colsample_bytree': 0.67, 'max_depth': 10, 'gamma': 57.2, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 3.1569085487266577e-07, 'reg_alpha': 0.6866991094985365, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 7.375806589594363}. Best is trial 18 with value: 570.3453795804684.[0m


Number of boosting rounds: 455


[32m[I 2023-03-04 23:44:04,957][0m Trial 25 finished with value: 575.5439942590482 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 2800, 'learning_rate': 0.014530827663359284, 'subsample': 0.59, 'colsample_bytree': 0.79, 'max_depth': 6, 'gamma': 80.7, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 3.4409898708284944e-05, 'reg_alpha': 0.08428175897698345, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.10427240348922825}. Best is trial 18 with value: 570.3453795804684.[0m


Number of boosting rounds: 2409


[32m[I 2023-03-04 23:44:55,610][0m Trial 26 finished with value: 573.7402656045493 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 3400, 'learning_rate': 0.034118239752381305, 'subsample': 0.23, 'colsample_bytree': 0.92, 'max_depth': 9, 'gamma': 29.900000000000002, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 9.57835456541791e-06, 'reg_alpha': 11.421873506140795, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 2.400529481391768}. Best is trial 18 with value: 570.3453795804684.[0m


Number of boosting rounds: 283


[32m[I 2023-03-04 23:53:07,762][0m Trial 27 finished with value: 585.2322951136798 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 2700, 'learning_rate': 0.013142089350040253, 'subsample': 0.32, 'colsample_bytree': 0.66, 'max_depth': 14, 'gamma': 54.1, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 6.604707268838289e-05, 'reg_alpha': 1.3811233992289917, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.9652200852619094}. Best is trial 18 with value: 570.3453795804684.[0m


Number of boosting rounds: 456


[32m[I 2023-03-04 23:54:01,466][0m Trial 28 finished with value: 572.794874646003 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 2000, 'learning_rate': 0.018571895906672697, 'subsample': 0.41000000000000003, 'colsample_bytree': 0.78, 'max_depth': 11, 'gamma': 65.2, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 1.608613411483066e-07, 'reg_alpha': 18.688425395843538, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 4.9287195091803495}. Best is trial 18 with value: 570.3453795804684.[0m


Number of boosting rounds: 382


[32m[I 2023-03-04 23:54:25,874][0m Trial 29 finished with value: 572.2244179272259 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 3200, 'learning_rate': 0.022309081957772904, 'subsample': 0.63, 'colsample_bytree': 0.7200000000000001, 'max_depth': 7, 'gamma': 32.5, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 5.679105514418115e-06, 'reg_alpha': 1.9157622021398637, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.3330512049411097}. Best is trial 18 with value: 570.3453795804684.[0m
[32m[I 2023-03-04 23:54:25,888][0m A new study created in memory with name: no-name-4f7a6ff2-dd83-4e47-b0b0-1aee39e70cae[0m


Number of boosting rounds: 827
Number of finished trials: 30
Best XGB trial parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 3000, 'learning_rate': 0.016841746742841453, 'subsample': 0.27, 'colsample_bytree': 0.8400000000000001, 'max_depth': 10, 'gamma': 82.5, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 2.4429526667772036e-05, 'reg_alpha': 3.6941792871710204, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 1.311116851190873}
Best score: 570.3453795804684
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 130.952	training's rmse: 193.33	valid_1's l1: 338.9	valid_1's rmse: 665.365
Early stopping, best iteration is:
[33]	training's l1: 267.998	training's rmse: 478.842	valid_1's l1: 311.303	valid_1's rmse: 600.178


[32m[I 2023-03-04 23:55:18,918][0m Trial 0 finished with value: 600.177776291661 and parameters: {'objective': 'rmse', 'n_estimators': 3916, 'reg_alpha': 5.308018497458027e-06, 'reg_lambda': 1.0692465820859229e-08, 'colsample_bytree': 0.8200000000000001, 'num_leaves': 710, 'feature_fraction': 0.6875101064366234, 'bagging_fraction': 0.3441644794446945, 'bagging_freq': 2, 'min_child_samples': 5, 'subsample': 0.69, 'learning_rate': 0.12955140949300928, 'max_depth': 69, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 600.177776291661.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 297.38	valid_1's l1: 312.596
[1000]	training's l1: 289.074	valid_1's l1: 310.635
[1500]	training's l1: 283.268	valid_1's l1: 309.766
[2000]	training's l1: 279.02	valid_1's l1: 309.816
Early stopping, best iteration is:
[1718]	training's l1: 281.208	valid_1's l1: 309.401


[32m[I 2023-03-04 23:56:51,287][0m Trial 1 finished with value: 649.3341774511673 and parameters: {'objective': 'mae', 'n_estimators': 4032, 'reg_alpha': 1.8572048496275245e-07, 'reg_lambda': 6.884340958044906, 'colsample_bytree': 0.6200000000000001, 'num_leaves': 384, 'feature_fraction': 0.6090497628985202, 'bagging_fraction': 0.17982732347338398, 'bagging_freq': 9, 'min_child_samples': 199, 'subsample': 0.4, 'learning_rate': 0.08905681664556965, 'max_depth': 52, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 600.177776291661.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 302.428	valid_1's l1: 320.54
[1000]	training's l1: 292.487	valid_1's l1: 317.476
[1500]	training's l1: 286.145	valid_1's l1: 316.906
[2000]	training's l1: 281.969	valid_1's l1: 315.952
Early stopping, best iteration is:
[1894]	training's l1: 282.758	valid_1's l1: 315.643


[32m[I 2023-03-04 23:58:18,544][0m Trial 2 finished with value: 656.0045429589331 and parameters: {'objective': 'mae', 'n_estimators': 3956, 'reg_alpha': 3.7887250792427193e-06, 'reg_lambda': 1.2624770042152177e-07, 'colsample_bytree': 0.07, 'num_leaves': 101, 'feature_fraction': 0.39735233919608226, 'bagging_fraction': 0.19136568981823748, 'bagging_freq': 10, 'min_child_samples': 276, 'subsample': 0.7, 'learning_rate': 0.23976487094808946, 'max_depth': 64, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 600.177776291661.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 297.293	training's rmse: 603.349	valid_1's l1: 310.026	valid_1's rmse: 632.181
[1000]	training's l1: 289.386	training's rmse: 585.26	valid_1's l1: 306.749	valid_1's rmse: 621.377
[1500]	training's l1: 284.998	training's rmse: 575.358	valid_1's l1: 306.081	valid_1's rmse: 618.379
Did not meet early stopping. Best iteration is:
[1502]	training's l1: 284.977	training's rmse: 575.28	valid_1's l1: 306.066	valid_1's rmse: 618.333


[32m[I 2023-03-04 23:59:41,669][0m Trial 3 finished with value: 618.3332873069843 and parameters: {'objective': 'rmse', 'n_estimators': 1502, 'reg_alpha': 1.8049712963390899e-06, 'reg_lambda': 0.024729702204623533, 'colsample_bytree': 0.43, 'num_leaves': 437, 'feature_fraction': 0.6601265949160193, 'bagging_fraction': 0.2794256229791412, 'bagging_freq': 12, 'min_child_samples': 288, 'subsample': 0.8099999999999999, 'learning_rate': 0.05401273592791739, 'max_depth': 85, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 600.177776291661.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 282.452	training's rmse: 525.386	valid_1's l1: 318.656	valid_1's rmse: 614.773
[1000]	training's l1: 251.385	training's rmse: 458.334	valid_1's l1: 305.863	valid_1's rmse: 593.698
[1500]	training's l1: 235.638	training's rmse: 422.739	valid_1's l1: 304.915	valid_1's rmse: 594.77
Early stopping, best iteration is:
[1032]	training's l1: 250.221	training's rmse: 455.596	valid_1's l1: 305.729	valid_1's rmse: 593.613


[32m[I 2023-03-05 00:01:17,676][0m Trial 4 finished with value: 593.6132462871464 and parameters: {'objective': 'rmse', 'n_estimators': 2821, 'reg_alpha': 0.5703844759580038, 'reg_lambda': 1.0026845940195503e-08, 'colsample_bytree': 0.51, 'num_leaves': 469, 'feature_fraction': 0.2900281425248901, 'bagging_fraction': 0.5371270237864021, 'bagging_freq': 0, 'min_child_samples': 29, 'subsample': 0.53, 'learning_rate': 0.0220475434958851, 'max_depth': 95, 'random_state': 42, 'n_jobs': 4}. Best is trial 4 with value: 593.6132462871464.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 278.217	valid_1's l1: 297.161
Did not meet early stopping. Best iteration is:
[759]	training's l1: 272.993	valid_1's l1: 296.605


[32m[I 2023-03-05 00:01:49,109][0m Trial 5 finished with value: 588.2463958200883 and parameters: {'objective': 'mae', 'n_estimators': 759, 'reg_alpha': 8.605806161457829e-06, 'reg_lambda': 4.757737059857357e-06, 'colsample_bytree': 0.33, 'num_leaves': 673, 'feature_fraction': 0.4240069417231551, 'bagging_fraction': 0.2707291229339935, 'bagging_freq': 8, 'min_child_samples': 126, 'subsample': 0.95, 'learning_rate': 0.11173091998401367, 'max_depth': 12, 'random_state': 42, 'n_jobs': 4}. Best is trial 5 with value: 588.2463958200883.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 283.264	valid_1's l1: 297.57
[1000]	training's l1: 269.986	valid_1's l1: 295.253
[1500]	training's l1: 262.518	valid_1's l1: 295.569
Early stopping, best iteration is:
[1084]	training's l1: 268.524	valid_1's l1: 294.998


[32m[I 2023-03-05 00:04:05,503][0m Trial 6 finished with value: 595.8958874237519 and parameters: {'objective': 'mae', 'n_estimators': 2134, 'reg_alpha': 1.710751083056307, 'reg_lambda': 0.12849943982116782, 'colsample_bytree': 0.9500000000000001, 'num_leaves': 576, 'feature_fraction': 0.9562615903459775, 'bagging_fraction': 0.15551649204809667, 'bagging_freq': 4, 'min_child_samples': 37, 'subsample': 0.65, 'learning_rate': 0.013924218805291528, 'max_depth': 96, 'random_state': 42, 'n_jobs': 4}. Best is trial 5 with value: 588.2463958200883.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 296.714	valid_1's l1: 301.76
[1000]	training's l1: 281.725	valid_1's l1: 289.855
[1500]	training's l1: 276.806	valid_1's l1: 288.506
[2000]	training's l1: 274.456	valid_1's l1: 288.094
Did not meet early stopping. Best iteration is:
[2167]	training's l1: 273.127	valid_1's l1: 287.96


[32m[I 2023-03-05 00:05:26,696][0m Trial 7 finished with value: 575.8949301619286 and parameters: {'objective': 'mae', 'n_estimators': 2167, 'reg_alpha': 0.008310376062450083, 'reg_lambda': 6.640740121910005e-05, 'colsample_bytree': 0.16, 'num_leaves': 96, 'feature_fraction': 0.8781847883427457, 'bagging_fraction': 0.7916580879226861, 'bagging_freq': 0, 'min_child_samples': 129, 'subsample': 0.43000000000000005, 'learning_rate': 0.01076559811300973, 'max_depth': 67, 'random_state': 42, 'n_jobs': 4}. Best is trial 7 with value: 575.8949301619286.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 298.212	valid_1's l1: 304.348
[1000]	training's l1: 289.31	valid_1's l1: 296.97
[1500]	training's l1: 286.015	valid_1's l1: 295.053
[2000]	training's l1: 283.628	valid_1's l1: 293.855
Did not meet early stopping. Best iteration is:
[2470]	training's l1: 281.608	valid_1's l1: 293.082


[32m[I 2023-03-05 00:06:34,578][0m Trial 8 finished with value: 588.4360993053872 and parameters: {'objective': 'mae', 'n_estimators': 2470, 'reg_alpha': 3.7065424533158904e-07, 'reg_lambda': 4.507223035960322, 'colsample_bytree': 0.27, 'num_leaves': 26, 'feature_fraction': 0.8090981540783784, 'bagging_fraction': 0.9617869127102279, 'bagging_freq': 7, 'min_child_samples': 272, 'subsample': 0.11, 'learning_rate': 0.02399630315281099, 'max_depth': 62, 'random_state': 42, 'n_jobs': 4}. Best is trial 7 with value: 575.8949301619286.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 245.704	training's rmse: 477.872	valid_1's l1: 297.433	valid_1's rmse: 580.911
Early stopping, best iteration is:
[182]	training's l1: 267.857	training's rmse: 526.135	valid_1's l1: 293.456	valid_1's rmse: 575.55


[32m[I 2023-03-05 00:08:21,331][0m Trial 9 finished with value: 575.5499676490751 and parameters: {'objective': 'rmse', 'n_estimators': 4129, 'reg_alpha': 0.00503807080207381, 'reg_lambda': 1.1156128267242456e-07, 'colsample_bytree': 0.16999999999999998, 'num_leaves': 542, 'feature_fraction': 0.9513926268869537, 'bagging_fraction': 0.8974576330290169, 'bagging_freq': 5, 'min_child_samples': 118, 'subsample': 0.64, 'learning_rate': 0.04171625789819098, 'max_depth': 95, 'random_state': 42, 'n_jobs': 4}. Best is trial 9 with value: 575.5499676490751.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 391.92	training's rmse: 715.689	valid_1's l1: 408.221	valid_1's rmse: 748.946
[1000]	training's l1: 340.162	training's rmse: 615.414	valid_1's l1: 362.594	valid_1's rmse: 662.241
[1500]	training's l1: 311.511	training's rmse: 573.103	valid_1's l1: 340.283	valid_1's rmse: 632.545
[2000]	training's l1: 297.406	training's rmse: 552.907	valid_1's l1: 331.189	valid_1's rmse: 622.487
[2500]	training's l1: 289.097	training's rmse: 539.977	valid_1's l1: 327.03	valid_1's rmse: 618.052
[3000]	training's l1: 283.007	training's rmse: 529.939	valid_1's l1: 324.424	valid_1's rmse: 615.476
[3500]	training's l1: 278.017	training's rmse: 520.743	valid_1's l1: 322.743	valid_1's rmse: 613.142
[4000]	training's l1: 274.282	training's rmse: 513.173	valid_1's l1: 321.927	valid_1's rmse: 611.923
Early stopping, best iteration is:
[3978]	training's l1: 274.384	training's rmse: 513.411	valid_1's l1: 321.913	valid_1's rmse: 611.8

[32m[I 2023-03-05 00:12:30,238][0m Trial 10 finished with value: 611.8508352456467 and parameters: {'objective': 'rmse', 'n_estimators': 4751, 'reg_alpha': 0.0016263921171846035, 'reg_lambda': 1.7254752411099283e-06, 'colsample_bytree': 0.63, 'num_leaves': 882, 'feature_fraction': 0.11357831981534694, 'bagging_fraction': 0.9804808508175299, 'bagging_freq': 14, 'min_child_samples': 87, 'subsample': 0.21000000000000002, 'learning_rate': 0.0375023400277572, 'max_depth': 31, 'random_state': 42, 'n_jobs': 4}. Best is trial 9 with value: 575.5499676490751.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 288.893	training's rmse: 564.506	valid_1's l1: 300.218	valid_1's rmse: 587.126
[1000]	training's l1: 276.347	training's rmse: 540.503	valid_1's l1: 295.025	valid_1's rmse: 578.216
[1500]	training's l1: 270.715	training's rmse: 527.369	valid_1's l1: 295.167	valid_1's rmse: 577.995
Early stopping, best iteration is:
[1190]	training's l1: 274.037	training's rmse: 535.105	valid_1's l1: 294.87	valid_1's rmse: 577.849


[32m[I 2023-03-05 00:14:58,883][0m Trial 11 finished with value: 577.8485334900114 and parameters: {'objective': 'rmse', 'n_estimators': 3280, 'reg_alpha': 0.004697612287742526, 'reg_lambda': 0.00013222493807699479, 'colsample_bytree': 0.07, 'num_leaves': 228, 'feature_fraction': 0.9954420638146322, 'bagging_fraction': 0.7828641215315894, 'bagging_freq': 5, 'min_child_samples': 175, 'subsample': 0.42000000000000004, 'learning_rate': 0.010657750590894698, 'max_depth': 75, 'random_state': 42, 'n_jobs': 4}. Best is trial 9 with value: 575.5499676490751.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 286.58	valid_1's l1: 294.398
[1000]	training's l1: 272.95	valid_1's l1: 286.896
[1500]	training's l1: 265.792	valid_1's l1: 286.414
[2000]	training's l1: 261.532	valid_1's l1: 286.386
Early stopping, best iteration is:
[1722]	training's l1: 263.705	valid_1's l1: 286.344


[32m[I 2023-03-05 00:17:12,055][0m Trial 12 finished with value: 572.7598715653478 and parameters: {'objective': 'mae', 'n_estimators': 4872, 'reg_alpha': 0.030600377570559004, 'reg_lambda': 7.264506788233598e-05, 'colsample_bytree': 0.22999999999999998, 'num_leaves': 244, 'feature_fraction': 0.8433032552693948, 'bagging_fraction': 0.779926408365267, 'bagging_freq': 0, 'min_child_samples': 111, 'subsample': 0.36, 'learning_rate': 0.010076292721863871, 'max_depth': 33, 'random_state': 42, 'n_jobs': 4}. Best is trial 12 with value: 572.7598715653478.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 268.657	training's rmse: 516.147	valid_1's l1: 293.474	valid_1's rmse: 573.265
Early stopping, best iteration is:
[448]	training's l1: 270.484	training's rmse: 520.567	valid_1's l1: 293.478	valid_1's rmse: 572.92


[32m[I 2023-03-05 00:18:28,041][0m Trial 13 finished with value: 572.9204038911184 and parameters: {'objective': 'rmse', 'n_estimators': 4838, 'reg_alpha': 0.06210044641990863, 'reg_lambda': 3.1216856855002824e-06, 'colsample_bytree': 0.27, 'num_leaves': 287, 'feature_fraction': 0.8302455176550051, 'bagging_fraction': 0.7791603100663065, 'bagging_freq': 4, 'min_child_samples': 86, 'subsample': 0.31, 'learning_rate': 0.01939898181052071, 'max_depth': 35, 'random_state': 42, 'n_jobs': 4}. Best is trial 12 with value: 572.7598715653478.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 272.996	valid_1's l1: 287.203
[1000]	training's l1: 260.636	valid_1's l1: 286.455
Early stopping, best iteration is:
[807]	training's l1: 264.688	valid_1's l1: 286.366


[32m[I 2023-03-05 00:20:10,011][0m Trial 14 finished with value: 573.3757819784141 and parameters: {'objective': 'mae', 'n_estimators': 4953, 'reg_alpha': 0.11249586444783723, 'reg_lambda': 0.0006507796021817144, 'colsample_bytree': 0.33, 'num_leaves': 293, 'feature_fraction': 0.7831204693776906, 'bagging_fraction': 0.6765666046465664, 'bagging_freq': 2, 'min_child_samples': 70, 'subsample': 0.28, 'learning_rate': 0.017476614217082822, 'max_depth': 32, 'random_state': 42, 'n_jobs': 4}. Best is trial 12 with value: 572.7598715653478.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 277.042	training's rmse: 534.196	valid_1's l1: 294.836	valid_1's rmse: 573.903
[1000]	training's l1: 266.047	training's rmse: 507.555	valid_1's l1: 295.034	valid_1's rmse: 575.95
Early stopping, best iteration is:
[579]	training's l1: 274.763	training's rmse: 528.956	valid_1's l1: 294.464	valid_1's rmse: 573.694


[32m[I 2023-03-05 00:21:20,245][0m Trial 15 finished with value: 573.6942633809696 and parameters: {'objective': 'rmse', 'n_estimators': 4631, 'reg_alpha': 5.2514710154226085, 'reg_lambda': 9.221738089739831e-06, 'colsample_bytree': 0.22000000000000003, 'num_leaves': 214, 'feature_fraction': 0.7923506803413136, 'bagging_fraction': 0.6032356175666443, 'bagging_freq': 3, 'min_child_samples': 81, 'subsample': 0.29000000000000004, 'learning_rate': 0.016222206204549405, 'max_depth': 33, 'random_state': 42, 'n_jobs': 4}. Best is trial 12 with value: 572.7598715653478.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 458.7	training's rmse: 822.658	valid_1's l1: 461.584	valid_1's rmse: 827.392
[1000]	training's l1: 374.839	training's rmse: 692.516	valid_1's l1: 379.053	valid_1's rmse: 699.382
[1500]	training's l1: 352.807	training's rmse: 652.758	valid_1's l1: 357.33	valid_1's rmse: 659.506
[2000]	training's l1: 342.508	training's rmse: 634.904	valid_1's l1: 346.999	valid_1's rmse: 641.093
[2500]	training's l1: 334.42	training's rmse: 622.287	valid_1's l1: 338.987	valid_1's rmse: 628.326
[3000]	training's l1: 330.046	training's rmse: 615.451	valid_1's l1: 334.756	valid_1's rmse: 621.634
Did not meet early stopping. Best iteration is:
[3402]	training's l1: 327.637	training's rmse: 611.376	valid_1's l1: 332.404	valid_1's rmse: 617.666


[32m[I 2023-03-05 00:21:59,370][0m Trial 16 finished with value: 617.6661412547181 and parameters: {'objective': 'rmse', 'n_estimators': 3402, 'reg_alpha': 0.12964759620347552, 'reg_lambda': 0.0014489127977546485, 'colsample_bytree': 0.42, 'num_leaves': 306, 'feature_fraction': 0.6980346458717677, 'bagging_fraction': 0.8380593991410137, 'bagging_freq': 0, 'min_child_samples': 222, 'subsample': 0.11, 'learning_rate': 0.0105534407895498, 'max_depth': 3, 'random_state': 42, 'n_jobs': 4}. Best is trial 12 with value: 572.7598715653478.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 276.934	valid_1's l1: 289.902
[1000]	training's l1: 267.718	valid_1's l1: 289.182
Early stopping, best iteration is:
[885]	training's l1: 269.296	valid_1's l1: 289.115


[32m[I 2023-03-05 00:23:24,559][0m Trial 17 finished with value: 579.1525671552995 and parameters: {'objective': 'mae', 'n_estimators': 4458, 'reg_alpha': 0.0002097190367514703, 'reg_lambda': 1.8297307426625018e-05, 'colsample_bytree': 0.41, 'num_leaves': 168, 'feature_fraction': 0.8740426937178718, 'bagging_fraction': 0.7135464914345645, 'bagging_freq': 6, 'min_child_samples': 159, 'subsample': 0.52, 'learning_rate': 0.02595095627027696, 'max_depth': 44, 'random_state': 42, 'n_jobs': 4}. Best is trial 12 with value: 572.7598715653478.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 278.262	valid_1's l1: 291.93
[1000]	training's l1: 264.346	valid_1's l1: 287.557
[1500]	training's l1: 256.544	valid_1's l1: 287.586
Early stopping, best iteration is:
[1370]	training's l1: 258.332	valid_1's l1: 287.501


[32m[I 2023-03-05 00:25:55,237][0m Trial 18 finished with value: 574.7903216323375 and parameters: {'objective': 'mae', 'n_estimators': 3348, 'reg_alpha': 0.037094887707903146, 'reg_lambda': 1.198351845614934e-06, 'colsample_bytree': 0.51, 'num_leaves': 344, 'feature_fraction': 0.5137504603767113, 'bagging_fraction': 0.5285195988125967, 'bagging_freq': 2, 'min_child_samples': 57, 'subsample': 0.31, 'learning_rate': 0.01537739635088264, 'max_depth': 22, 'random_state': 42, 'n_jobs': 4}. Best is trial 12 with value: 572.7598715653478.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 256.166	training's rmse: 503.777	valid_1's l1: 293.718	valid_1's rmse: 576.054
Early stopping, best iteration is:
[337]	training's l1: 263.04	training's rmse: 520.285	valid_1's l1: 292.864	valid_1's rmse: 574.9


[32m[I 2023-03-05 00:27:56,471][0m Trial 19 finished with value: 574.899709550684 and parameters: {'objective': 'rmse', 'n_estimators': 4384, 'reg_alpha': 9.597957091564414, 'reg_lambda': 7.076542248156365e-05, 'colsample_bytree': 0.6900000000000001, 'num_leaves': 868, 'feature_fraction': 0.7454196550354396, 'bagging_fraction': 0.8616699944213052, 'bagging_freq': 3, 'min_child_samples': 106, 'subsample': 0.22, 'learning_rate': 0.024837510251875385, 'max_depth': 49, 'random_state': 42, 'n_jobs': 4}. Best is trial 12 with value: 572.7598715653478.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 284.471	training's rmse: 567.065	valid_1's l1: 297.234	valid_1's rmse: 590.905
[1000]	training's l1: 276.146	training's rmse: 548.676	valid_1's l1: 295.759	valid_1's rmse: 585.025
[1500]	training's l1: 270.784	training's rmse: 536.614	valid_1's l1: 295.869	valid_1's rmse: 583.413
Early stopping, best iteration is:
[1177]	training's l1: 274.11	training's rmse: 544.033	valid_1's l1: 295.693	valid_1's rmse: 584.214


[32m[I 2023-03-05 00:30:15,048][0m Trial 20 finished with value: 584.2139705298174 and parameters: {'objective': 'rmse', 'n_estimators': 4963, 'reg_alpha': 1.4824214093411521e-08, 'reg_lambda': 0.0028961295882454943, 'colsample_bytree': 0.31, 'num_leaves': 995, 'feature_fraction': 0.8716637291147444, 'bagging_fraction': 0.7259032807971514, 'bagging_freq': 1, 'min_child_samples': 233, 'subsample': 0.38, 'learning_rate': 0.014134307941456106, 'max_depth': 41, 'random_state': 42, 'n_jobs': 4}. Best is trial 12 with value: 572.7598715653478.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 271.406	valid_1's l1: 286.874
[1000]	training's l1: 259.277	valid_1's l1: 286.657
Early stopping, best iteration is:
[733]	training's l1: 264.861	valid_1's l1: 286.463


[32m[I 2023-03-05 00:31:49,805][0m Trial 21 finished with value: 573.7393765050814 and parameters: {'objective': 'mae', 'n_estimators': 4851, 'reg_alpha': 0.12474505880848165, 'reg_lambda': 0.0003413913951370532, 'colsample_bytree': 0.37, 'num_leaves': 288, 'feature_fraction': 0.7782337388626219, 'bagging_fraction': 0.6403806790686004, 'bagging_freq': 2, 'min_child_samples': 69, 'subsample': 0.26, 'learning_rate': 0.019539170999764034, 'max_depth': 23, 'random_state': 42, 'n_jobs': 4}. Best is trial 12 with value: 572.7598715653478.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 272.009	valid_1's l1: 286.733
[1000]	training's l1: 258.222	valid_1's l1: 286.161
Early stopping, best iteration is:
[888]	training's l1: 260.923	valid_1's l1: 286.077


[32m[I 2023-03-05 00:33:26,051][0m Trial 22 finished with value: 573.2585397536797 and parameters: {'objective': 'mae', 'n_estimators': 3678, 'reg_alpha': 0.14187741615846444, 'reg_lambda': 0.000596939569463244, 'colsample_bytree': 0.18, 'num_leaves': 259, 'feature_fraction': 0.8311644525667387, 'bagging_fraction': 0.6850896387723007, 'bagging_freq': 4, 'min_child_samples': 45, 'subsample': 0.33, 'learning_rate': 0.018902256664623595, 'max_depth': 31, 'random_state': 42, 'n_jobs': 4}. Best is trial 12 with value: 572.7598715653478.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 268.134	valid_1's l1: 287.128
[1000]	training's l1: 255.221	valid_1's l1: 287.363
Early stopping, best iteration is:
[601]	training's l1: 265.171	valid_1's l1: 286.925


[32m[I 2023-03-05 00:34:25,432][0m Trial 23 finished with value: 574.2382814870634 and parameters: {'objective': 'mae', 'n_estimators': 3688, 'reg_alpha': 0.804268475525177, 'reg_lambda': 4.910028641609535e-05, 'colsample_bytree': 0.15000000000000002, 'num_leaves': 173, 'feature_fraction': 0.8912905036776333, 'bagging_fraction': 0.7777756728053387, 'bagging_freq': 4, 'min_child_samples': 42, 'subsample': 0.35, 'learning_rate': 0.03169323876324414, 'max_depth': 20, 'random_state': 42, 'n_jobs': 4}. Best is trial 12 with value: 572.7598715653478.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 271.461	valid_1's l1: 287.695
[1000]	training's l1: 253.326	valid_1's l1: 285.564
[1500]	training's l1: 240.412	valid_1's l1: 285.647
Early stopping, best iteration is:
[1117]	training's l1: 249.627	valid_1's l1: 285.494


[32m[I 2023-03-05 00:36:40,824][0m Trial 24 finished with value: 570.946755354518 and parameters: {'objective': 'mae', 'n_estimators': 4312, 'reg_alpha': 0.024376653629946066, 'reg_lambda': 1.736470355842095e-05, 'colsample_bytree': 0.24, 'num_leaves': 394, 'feature_fraction': 0.9958886341457805, 'bagging_fraction': 0.5975995341377236, 'bagging_freq': 6, 'min_child_samples': 5, 'subsample': 0.45999999999999996, 'learning_rate': 0.01284141000442891, 'max_depth': 38, 'random_state': 42, 'n_jobs': 4}. Best is trial 24 with value: 570.946755354518.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 271.326	valid_1's l1: 287.871
[1000]	training's l1: 254.084	valid_1's l1: 285.868
[1500]	training's l1: 241.392	valid_1's l1: 285.724
Early stopping, best iteration is:
[1378]	training's l1: 244.104	valid_1's l1: 285.69


[32m[I 2023-03-05 00:39:14,743][0m Trial 25 finished with value: 570.495649385278 and parameters: {'objective': 'mae', 'n_estimators': 4349, 'reg_alpha': 0.00041131936806103716, 'reg_lambda': 1.425272666034261e-05, 'colsample_bytree': 0.25, 'num_leaves': 400, 'feature_fraction': 0.9459647318157444, 'bagging_fraction': 0.470299698897729, 'bagging_freq': 10, 'min_child_samples': 5, 'subsample': 0.51, 'learning_rate': 0.013009439144167953, 'max_depth': 55, 'random_state': 42, 'n_jobs': 4}. Best is trial 25 with value: 570.495649385278.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 272.94	valid_1's l1: 288.929
[1000]	training's l1: 256.835	valid_1's l1: 287.482
Early stopping, best iteration is:
[782]	training's l1: 262.98	valid_1's l1: 287.244


[32m[I 2023-03-05 00:41:05,438][0m Trial 26 finished with value: 575.6737329012897 and parameters: {'objective': 'mae', 'n_estimators': 4282, 'reg_alpha': 0.0003195475035479226, 'reg_lambda': 1.9649084410973868e-05, 'colsample_bytree': 0.060000000000000005, 'num_leaves': 407, 'feature_fraction': 0.9856898718212199, 'bagging_fraction': 0.4356012064408895, 'bagging_freq': 11, 'min_child_samples': 19, 'subsample': 0.45999999999999996, 'learning_rate': 0.012964071993826785, 'max_depth': 57, 'random_state': 42, 'n_jobs': 4}. Best is trial 25 with value: 570.495649385278.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 273.108	valid_1's l1: 291.572
[1000]	training's l1: 251.524	valid_1's l1: 286.012
[1500]	training's l1: 236.838	valid_1's l1: 286.143
Early stopping, best iteration is:
[1014]	training's l1: 251.017	valid_1's l1: 285.992


[32m[I 2023-03-05 00:43:54,219][0m Trial 27 finished with value: 571.0203702431185 and parameters: {'objective': 'mae', 'n_estimators': 4490, 'reg_alpha': 0.0004872035747233162, 'reg_lambda': 0.00021703382563172414, 'colsample_bytree': 0.22999999999999998, 'num_leaves': 613, 'feature_fraction': 0.9241804892479284, 'bagging_fraction': 0.47580959500759384, 'bagging_freq': 13, 'min_child_samples': 5, 'subsample': 0.58, 'learning_rate': 0.010014736816025405, 'max_depth': 41, 'random_state': 42, 'n_jobs': 4}. Best is trial 25 with value: 570.495649385278.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 261.94	valid_1's l1: 288.088
[1000]	training's l1: 239.411	valid_1's l1: 286.822
Early stopping, best iteration is:
[889]	training's l1: 243.417	valid_1's l1: 286.691


[32m[I 2023-03-05 00:46:29,777][0m Trial 28 finished with value: 573.1930591777265 and parameters: {'objective': 'mae', 'n_estimators': 2956, 'reg_alpha': 0.00012370338063579143, 'reg_lambda': 0.003471494421564408, 'colsample_bytree': 0.47, 'num_leaves': 631, 'feature_fraction': 0.9208147664047083, 'bagging_fraction': 0.48148929016437425, 'bagging_freq': 15, 'min_child_samples': 2, 'subsample': 0.61, 'learning_rate': 0.01360096669861019, 'max_depth': 43, 'random_state': 42, 'n_jobs': 4}. Best is trial 25 with value: 570.495649385278.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 262.159	valid_1's l1: 289.306
[1000]	training's l1: 237.471	valid_1's l1: 287.464
Early stopping, best iteration is:
[903]	training's l1: 241.721	valid_1's l1: 287.349


[32m[I 2023-03-05 00:49:30,250][0m Trial 29 finished with value: 573.3962944785804 and parameters: {'objective': 'mae', 'n_estimators': 3753, 'reg_alpha': 0.0008704488465295522, 'reg_lambda': 7.113767995595767e-07, 'colsample_bytree': 0.78, 'num_leaves': 755, 'feature_fraction': 0.9250761974698254, 'bagging_fraction': 0.41368242739304373, 'bagging_freq': 13, 'min_child_samples': 1, 'subsample': 0.73, 'learning_rate': 0.012012797446875894, 'max_depth': 75, 'random_state': 42, 'n_jobs': 4}. Best is trial 25 with value: 570.495649385278.[0m
[32m[I 2023-03-05 00:49:30,262][0m A new study created in memory with name: no-name-b4970925-a539-4032-96c8-4e048bff0ea3[0m


Number of finished trials: 30
Best LGBM trial parameters: {'objective': 'mae', 'n_estimators': 4349, 'reg_alpha': 0.00041131936806103716, 'reg_lambda': 1.425272666034261e-05, 'colsample_bytree': 0.25, 'num_leaves': 400, 'feature_fraction': 0.9459647318157444, 'bagging_fraction': 0.470299698897729, 'bagging_freq': 10, 'min_child_samples': 5, 'subsample': 0.51, 'learning_rate': 0.013009439144167953, 'max_depth': 55, 'random_state': 42, 'n_jobs': 4}
Best score: 570.495649385278


[32m[I 2023-03-05 00:49:34,188][0m Trial 0 finished with value: 599.8868755998188 and parameters: {'learning_rate': 0.4558747436214443, 'l2_leaf_reg': 94.01729293406162, 'bagging_temperature': 0.34248145907727195, 'random_strength': 1.4422212308786215, 'depth': 9, 'min_data_in_leaf': 211}. Best is trial 0 with value: 599.8868755998188.[0m
[32m[I 2023-03-05 00:49:37,316][0m Trial 1 finished with value: 588.139895767251 and parameters: {'learning_rate': 0.8833966895941529, 'l2_leaf_reg': 55.4639505687652, 'bagging_temperature': 0.188222614466836, 'random_strength': 1.0341297958000213, 'depth': 8, 'min_data_in_leaf': 210}. Best is trial 1 with value: 588.139895767251.[0m
[32m[I 2023-03-05 00:49:40,218][0m Trial 2 finished with value: 621.1761071137706 and parameters: {'learning_rate': 0.45891233799361997, 'l2_leaf_reg': 1.4211571046837503, 'bagging_temperature': 0.41749244613674386, 'random_strength': 1.246694973993343, 'depth': 3, 'min_data_in_leaf': 168}. Best is trial 1 with va

Number of finished trials: 30
Best Cat trial parameters: {'learning_rate': 0.1813228775566588, 'l2_leaf_reg': 2.072026315516037, 'bagging_temperature': 0.11313052821890009, 'random_strength': 1.5946785238544283, 'depth': 10, 'min_data_in_leaf': 36}
Best score: 572.8319992808339
CPU times: user 2h 18min 42s, sys: 11min 58s, total: 2h 30min 40s
Wall time: 1h 43min 42s


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Train Models with Cross Validation</h1>
</div>

In [30]:
train = create_folds(train, Config.N_FOLDS)
# train = create_strat_folds(train, TARGET, Config.N_FOLDS)

n_folds=5, seed=42


In [31]:
all_cv_scores = pd.DataFrame(
    {
        "Model": pd.Series(dtype="str"),
        "Score": pd.Series(dtype="float"),
        "StdDev": pd.Series(dtype="float"),
        "RunTime": pd.Series(dtype="float"),
    }
)

oof = train[[ID, TARGET, "fold"]].copy().reset_index(drop=True).copy()
oof.set_index(ID, inplace=True)
oof.head()

Unnamed: 0_level_0,price,fold
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,13619,1
1.0,13387,2
2.0,2772,3
3.0,666,2
4.0,14453,0


In [32]:
def show_tree_model_fi(model, features:List[str]) -> None:
    print("\n=== Model Feature Importance ===")
    for i in model.feature_importances_.argsort()[::-1]:
        print(features[i], model.feature_importances_[i]/model.feature_importances_.sum())

def save_oof_predictions(model_name:str, final_valid_predictions, oof:pd.DataFrame) -> pd.DataFrame:
    final_valid_predictions_df = process_valid_predictions(
        final_valid_predictions, ID, model_name
    )
    display(final_valid_predictions_df.head())
    oof[f"pred_{model_name}"] = final_valid_predictions_df[f"pred_{model_name}"]

    return oof

def save_test_predictions(model_name:str, final_test_predictions, submission_df:pd.DataFrame, result_field:str=TARGET) -> None:
    result = merge_test_predictions(final_test_predictions, Config.calc_probability)
    # result[:20]
    submission_df[f"target_{model_name}"] = result #.astype(int)
    #     submission_df.head(10)
    ss = submission_df[[ID, f"target_{model_name}"]].copy().reset_index(drop=True)
    ss.rename(columns={f"target_{model_name}": result_field}, inplace=True)
    ss.to_csv(
        f"submission_{model_name}.csv", index=False
    )  # Can submit the individual model
    print("=== Target Value Counts ===")
#     display(ss[TARGET].value_counts())
    ss.head(10)

def process_valid_predictions(final_valid_predictions, train_id, model_name:str) -> pd.DataFrame:
    model = f"pred_{model_name}"
    final_valid_predictions_df = pd.DataFrame.from_dict(
        final_valid_predictions, orient="index"
    ).reset_index()
    final_valid_predictions_df.columns = [train_id, model]
    final_valid_predictions_df.set_index(train_id, inplace=True)
    final_valid_predictions_df.sort_index(inplace=True)
    final_valid_predictions_df.to_csv(f"train_pred_{model_name}.csv", index=True)

    return final_valid_predictions_df

def add_score(score_df:pd.DataFrame, model_name:str, score:float, std:float):
    dict1 = {"Model": model_name, "Score": cv_score, "StdDev": std_dev}
    score_df = score_df.append(dict1, ignore_index=True)
    return score_df

In [33]:
def train_cv_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid,
    params,
    n_folds:int=5,
    seed:int=42,
):

    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(
            drop=True
        )  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)
        xtest = test.copy()

        valid_ids = xvalid.id.values.tolist()  # Id's of everything in validation fold

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[FEATURES]
        xvalid = xvalid[FEATURES]

        scaler = preprocessing.StandardScaler()
#         scaler = preprocessing.MinMaxScaler()
        xtrain = scaler.fit(xtrain).transform(xtrain)
        xvalid = scaler.transform(xvalid)
        xtest = scaler.transform(xtest)

        model = get_model_fn # ()

        model.fit(
            xtrain,
            ytrain,
        )
        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        preds_valid_class = model.predict(xvalid)
        
        final_test_predictions.append(test_preds)
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

#         fold_score = metrics.accuracy_score(yvalid, preds_valid_class)  # Validation Set Score
        fold_score = metrics.mean_absolute_error(
            yvalid, preds_valid
        ) 
#         fold_score = metrics.roc_auc_score(yvalid.values, preds_valid)  # Validation Set Score
#         show_classification_scores(yvalid.values, preds_valid_class)

#         fold_score = metrics.roc_auc_score(yvalid, preds_valid)  # Validation Set Score
#         fold_score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        fold_scores.append(fold_score)
        #         importance_list.append(model.coef_.ravel())

        fi = []
        # Feature importance
#         fi = pd.DataFrame(
#             index=FEATURES,
#             data=model.coef_.ravel(),
#             columns=[f"{fold}_importance"],
#         )
        
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )


def train_xgb_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid:str,
    params,
    n_folds:int=5,
    seed:int=42,
):

    print(params)
    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(
            drop=True
        )  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)
        xtest = test.copy()

        valid_ids = xvalid.id.values.tolist()  # Id's of everything in validation fold

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[FEATURES]
        xvalid = xvalid[FEATURES]

        model = get_model_fn # (params)

        model.fit(
            xtrain,
            ytrain,
            eval_set=[(xvalid, yvalid)],
            #             eval_metric="acc",  # auc
            verbose=0,
            #             early_stopping_rounds=3000,
            #             callbacks=[
            #                 xgb.log_evaluation(0),
            #                 xgb.early_stopping(500, False, True),
            #             ],
        )

        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        preds_valid_class = model.predict(xvalid)
        
        final_test_predictions.append(test_preds)
        if Config.debug:
            print(f"GT Type: {type(yvalid.values)}")
            print(f"Preds Type: {type(preds_valid_class)}")
            print(f"         GT:{yvalid.values[:20]}")
            print(f"Preds Class:{preds_valid_class[:20]}")
            print(f"Preds Prob:{preds_valid[:20]}")
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid_class)))

#         fold_score = metrics.cohen_kappa_score(yvalid,  preds_valid_class, weights = "quadratic")
#         fold_score = metrics.roc_auc_score(yvalid.values, preds_valid)  # Validation Set Score
#         show_classification_scores(yvalid.values, preds_valid_class)
        fold_score = metrics.mean_absolute_error(
            yvalid, preds_valid
        )  # Validation Set Score
#         fold_score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        fold_scores.append(fold_score)

        # Feature importance
        fi = pd.DataFrame(
            index=FEATURES,
            data=model.feature_importances_,
            columns=[f"{fold}_importance"],
        )
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )        

In [34]:
def run_linear_model(model_dict, model_name:str, features:List[str], oof:pd.DataFrame) -> (float, float, pd.DataFrame):
    (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_cv_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        False, #Config.calc_probability,
        ID,
        {},
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    oof.head()
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof


def run_tree_model(model_dict, model_name:str, features:List[str], params, oof:pd.DataFrame) -> (float, float, pd.DataFrame):
    (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_xgb_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        Config.calc_probability,
        ID,
        params,
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)
    show_tree_model_fi(model, features)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    oof.head()
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof

In [35]:
%%time

def run_models4features(model_dict, model_lst:List[str], target:str, feature_lst:List[str], all_cv_scores:pd.DataFrame, linear_models:bool=True) -> pd.DataFrame:

    oof = train[[ID, target, "fold"]].copy().reset_index(drop=True).copy()
    oof.set_index(ID, inplace=True)

    for idx, m in enumerate(model_lst):
        model = model_lst[idx]
        start_time = time.time()

        print(f"Model={model}")

        params = {}
        if linear_models:
                cv_score, std_dev, oof = run_linear_model(model_dict, model, feature_lst, oof)

        else:
            cv_score, std_dev, oof = run_tree_model(model_dict, model, feature_lst, params, oof)

        run_time = time.time() - start_time

        score_dict = {"Model": model, "Score": cv_score, "StdDev": std_dev, "RunTime": run_time}
        all_cv_scores = all_cv_scores.append(score_dict, ignore_index=True)
        print(f"Model Run Time: {run_time:.2f}")

    return all_cv_scores




CPU times: user 12 µs, sys: 1e+03 ns, total: 13 µs
Wall time: 16.7 µs


In [36]:
lgbm_params = {'n_estimators': Config.N_ESTIMATORS,
                 'num_rounds': 404,
                 'learning_rate': 0.19,
                 'num_leaves': 17,
                 'max_depth': 8,
                 'min_data_in_leaf': 36,
                 'lambda_l1': 0.96,
                 'lambda_l2': 0.01,
                 'min_gain_to_split': 11.32,
                 'bagging_fraction': 0.6,
                 'feature_fraction': 0.9}


lgbm_params3 = {
    "n_estimators": Config.N_ESTIMATORS,
    'max_depth': 9,
    'learning_rate': 0.01,
    'min_data_in_leaf': 36, 
    'num_leaves': 100, 
    'feature_fraction': 0.8, 
    'bagging_fraction': 0.89, 
    'bagging_freq': 5, 
    'lambda_l2': 28,
    
    'seed': Config.seed,
    'objective': 'regression',
#     'boosting_type': 'gbdt',
#     'device': 'gpu', 
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0,
    'n_jobs': -1,
    'metric': 'rmse',
    'verbose': -1
}
    
lgbm_params = gpu_ify_lgbm(lgbm_params)

In [37]:
xgb_params = {
    "n_estimators": Config.N_ESTIMATORS,  # 10_000,
    "max_depth": 10,  # 10
    "objective": "reg:squarederror", # Normal dist
#     "objective": "reg:gamma", # Gamma dist
    #     "enable_categorical": True,  # Only works with gpu_hist
    #     "eval_metric": "mae",
    #     "metric": "mae",
    #     "enable_categorical": True,
    "n_jobs": 8,  # 4
    "seed": Config.seed,
    "tree_method": "hist",
    #         "gpu_id": 0,
    "subsample": 0.9,  # 0.7
    "colsample_bytree": 0.7,
    "use_label_encoder": False,
    "learning_rate": 0.05,  # 0.01
}

xgb_params3 = {
    'n_estimators': Config.N_ESTIMATORS,
    'learning_rate': 0.05,
    'max_depth': 10,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:squarederror'
}

xgb_params_gamma = {
    "n_estimators": Config.N_ESTIMATORS,  # 10_000,
    "max_depth": 10,  # 10
    "objective": "reg:gamma", # "reg:gamma", "reg:squarederror"
    #     "enable_categorical": True,  # Only works with gpu_hist
    #     "eval_metric": "mae",
    #     "metric": "mae",
    #     "enable_categorical": True,
    "n_jobs": 8,  # 4
    "seed": Config.seed,
    "tree_method": "hist",
    #         "gpu_id": 0,
    "subsample": 0.9,  # 0.7
    "colsample_bytree": 0.7,
    "use_label_encoder": False,
    "learning_rate": 0.05,  # 0.01
}

xgb_params_gpu1 = {'objective': 'reg:squarederror',
              'booster': 'gbtree',
              'eval_metric': 'rmse',
              'n_estimators': 50000,
              'learning_rate': 0.1,
              'max_depth': 8,
              'colsample_bytree': 0.4,
              'subsample': 0.6,
              'alpha': 8,
              'lambda': 2,
              'random_state': Config.seed,
              'tree_method': 'gpu_hist'
              }

if Config.gpu:
    xgb_params["tree_method"] = "gpu_hist"
else:
    xgb_params["tree_method"] = "hist"

In [38]:
cb_params = {
    #     "learning_rate": 0.3277295792305584,
    "learning_rate": 0.05,
    "l2_leaf_reg": 3.1572972266001518,
    "bagging_temperature": 0.6799604234141348,
    "random_strength": 1.99590400593318,
    "depth": 10,
    "min_data_in_leaf": 93,
    # "iterations": 100,  # 10000
    "n_estimators": Config.N_ESTIMATORS,  # 10000
    "use_best_model": True,
    #     "task_type": "GPU",
    "random_seed": Config.seed,
}

cb_params = gpu_ify_cb(cb_params)

In [39]:
model_estimator_dict = {
    "xgb2": xgb.XGBRegressor(**xgb_params),
    "xgb_best_params": xgb.XGBRegressor(**best_xgb_params),
    "xgb3": xgb.XGBRegressor(**xgb_params3),
    "xgb_params_gamma": xgb.XGBRegressor(**xgb_params_gamma),
    "xgb_params_gpu1": xgb.XGBRegressor(**xgb_params_gpu1),
    
    
    "lgbm1": lgb.LGBMRegressor(**lgbm_params),

    "cat1": cb.CatBoostRegressor(),
    "cat2": cb.CatBoostRegressor(**cb_params),
    "cat_best_params": cb.CatBoostRegressor(**best_cb_params),

    "xgb1": xgb.XGBRegressor(),
    "lgbm0": lgb.LGBMRegressor(),
    "lgbm3": lgb.LGBMRegressor(lgbm_params3),
    "lgbm2": lgb.LGBMRegressor(
        learning_rate=0.05,
        max_depth=15,
        num_leaves=11,
        feature_fraction=0.3,
        subsample=0.1,
        n_jobs=-1,
    ),
    "lgbm3": lgb.LGBMRegressor(**lgbm_params),
    "lgbm_best_params": lgb.LGBMRegressor(**best_lgbm_params),


    "lin_reg": linear_model.LinearRegression(),
    "lasso": linear_model.Lasso(),
    "ridge": linear_model.Ridge(max_iter=7000),
    "ridge_25": linear_model.Ridge(fit_intercept=True, solver='auto', alpha=0.25, max_iter=7000),
    "ridge_50": linear_model.Ridge(fit_intercept=True, solver='auto', alpha=0.5, max_iter=7000),
}

## Tree Models

In [40]:
%%time

# model_lst = ["xgb3","xgb_best_params", "lgbm_best_params", "cat_best_params", "xgb1", "xgb2", "lgbm1", "lgbm2", "cat1", "cat2"]
model_lst = ["xgb_params_gpu1","xgb_best_params", "lgbm_best_params", "cat_best_params", "xgb_params_gamma", "xgb3", "xgb1", "xgb2", "lgbm0", "lgbm1", "lgbm2", "lgbm3", "cat1", "cat2"]
# model_lst = = []
all_cv_scores = run_models4features(model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=False)    

all_cv_scores.sort_values(by=["Score"], ascending=False)

Model=xgb_params_gpu1
{}
fold: 1, Score: 360.07250236791333, Run Time: 1028.32
fold: 2, Score: 352.91830060506, Run Time: 1012.68
fold: 3, Score: 358.2259034320409, Run Time: 1052.13
fold: 4, Score: 358.05251136842867, Run Time: 1021.22
fold: 5, Score: 363.9803710706332, Run Time: 1049.12
Scores -> Adjusted: 355.07708704 , mean: 358.64991777, std: 3.57283073

=== Model Feature Importance ===
y 0.38309133
carat 0.11801322
clarity_I1 0.056569744
x 0.05373859
clarity_SI2 0.04941646
clarity_VVS2 0.04025037
clarity_VVS1 0.034987006
clarity_IF 0.034489233
z 0.034245804
color_J 0.031759277
color_I 0.018446693
color_D 0.017484032
color_E 0.015181355
clarity_SI1 0.014912222
color_G 0.014085329
clarity_VS1 0.013662337
color_F 0.012268059
clarity_VS2 0.01185334
color_H 0.010111348
cut_Ideal 0.009596819
depth 0.005509454
cut_Premium 0.004466726
table 0.004144292
cut_Very Good 0.0031346104
cut_Good 0.0028686465
cut_Fair 0.0028624104
is_original 0.002851176


Unnamed: 0_level_0,pred_xgb_params_gpu1
id,Unnamed: 1_level_1
0.0,13516.91602
1.0,15323.47559
2.0,2965.5896
3.0,645.9751
4.0,15880.12109


Mode
=== Target Value Counts ===
Model Run Time: 5186.11
Model=xgb_best_params
{}
fold: 1, Score: 303.79919716472443, Run Time: 105.70
fold: 2, Score: 298.8002514435463, Run Time: 104.54
fold: 3, Score: 301.17373691900895, Run Time: 105.35
fold: 4, Score: 303.09000735939986, Run Time: 107.54
fold: 5, Score: 307.29453843618467, Run Time: 108.03
Scores -> Adjusted: 300.00590937 , mean: 302.83154626, std: 2.82563689

=== Model Feature Importance ===
y 0.38397956
carat 0.12993799
clarity_I1 0.080062106
clarity_SI2 0.045217704
x 0.041859336
color_J 0.03966889
clarity_VVS2 0.028225865
clarity_SI1 0.027990952
clarity_VVS1 0.026175508
clarity_IF 0.026116787
color_I 0.02203726
clarity_VS1 0.01710142
color_D 0.016552128
color_H 0.013896314
color_E 0.013007715
clarity_VS2 0.01262452
color_F 0.012410733
z 0.011724577
color_G 0.010166348
cut_Fair 0.0067926506
cut_Ideal 0.006193924
depth 0.005258007
table 0.0048787785
cut_Good 0.004740221
cut_Premium 0.0046582795
cut_Very Good 0.004412863
is_origina

Unnamed: 0_level_0,pred_xgb_best_params
id,Unnamed: 1_level_1
0.0,13657.47266
1.0,13259.16992
2.0,2841.05469
3.0,682.47125
4.0,15176.80859


Mode
=== Target Value Counts ===
Model Run Time: 536.95
Model=lgbm_best_params
{}
fold: 1, Score: 288.37527302088336, Run Time: 473.28
fold: 2, Score: 283.36385777380286, Run Time: 487.68
fold: 3, Score: 286.73312922361174, Run Time: 497.77
fold: 4, Score: 286.3229895029449, Run Time: 484.42
fold: 5, Score: 290.97319097961275, Run Time: 487.23
Scores -> Adjusted: 284.65126464 , mean: 287.15368810, std: 2.50242346

=== Model Feature Importance ===
depth 0.16673668535560562
x 0.14681968199413226
y 0.1437993696589139
z 0.11986508003741246
carat 0.0950675147284168
table 0.09105138103940007
clarity_SI1 0.017426297405966053
clarity_VS2 0.01649847774183677
color_G 0.015806935135032337
color_F 0.01480707978269426
color_H 0.014452087911201318
color_E 0.014431917918502856
clarity_SI2 0.01411899488892385
is_original 0.013426875996613746
cut_Very Good 0.01328914376075853
clarity_VS1 0.013249380060867275
color_D 0.012545735458443764
cut_Premium 0.011786191161970229
color_I 0.011747580033090314
cut_

Unnamed: 0_level_0,pred_lgbm_best_params
id,Unnamed: 1_level_1
0.0,13657.25564
1.0,12344.37458
2.0,2811.02951
3.0,648.80975
4.0,15039.45708


Mode
=== Target Value Counts ===
Model Run Time: 2434.81
Model=cat_best_params
{}
fold: 1, Score: 297.85991742594564, Run Time: 74.34
fold: 2, Score: 295.23716657542013, Run Time: 75.64
fold: 3, Score: 295.4808257931766, Run Time: 77.98
fold: 4, Score: 297.6289836187544, Run Time: 79.20
fold: 5, Score: 300.4141965233584, Run Time: 80.80
Scores -> Adjusted: 295.44370201 , mean: 297.32421799, std: 1.88051598

=== Model Feature Importance ===
x 0.1893819796467538
carat 0.15484082957221634
z 0.1250853791315762
y 0.1227103113685988
clarity_SI2 0.11611765833956687
color_J 0.05504783425126655
clarity_SI1 0.050234758412407425
color_I 0.04691467421460124
color_H 0.024343713000149324
clarity_VVS2 0.01932444212251238
clarity_I1 0.017261254123560266
color_D 0.011450924381603415
clarity_VVS1 0.010014629488261477
color_E 0.009694172485178643
color_F 0.0078028600285727934
clarity_VS1 0.007028598997148073
clarity_IF 0.006961318513469805
table 0.005843480817444696
depth 0.0055307835459341035
clarity_VS

Unnamed: 0_level_0,pred_cat_best_params
id,Unnamed: 1_level_1
0.0,13767.2117
1.0,12389.43877
2.0,2829.64674
3.0,714.2666
4.0,14997.21632


Mode
=== Target Value Counts ===
Model Run Time: 391.92
Model=xgb_params_gamma
{}
fold: 1, Score: 3898.0606449563315, Run Time: 6.32
fold: 2, Score: 3849.514816166863, Run Time: 8.48
fold: 3, Score: 3899.194363449622, Run Time: 9.22
fold: 4, Score: 3901.365350579137, Run Time: 11.27
fold: 5, Score: 3922.379299632488, Run Time: 12.37
Scores -> Adjusted: 3870.09463834 , mean: 3894.10289496, std: 24.00825662

=== Model Feature Importance ===
carat 0.33110264
y 0.29750657
x 0.24287269
z 0.06549757
clarity_SI1 0.009474104
clarity_SI2 0.007550494
clarity_IF 0.00572446
color_I 0.005653258
color_H 0.0055321204
clarity_VVS1 0.0054904507
color_G 0.003685114
color_J 0.0036021525
clarity_VVS2 0.0025136047
clarity_VS1 0.0023223537
color_E 0.0023187047
color_D 0.0022153168
clarity_VS2 0.0021623692
cut_Very Good 0.0012833503
cut_Good 0.0011209853
clarity_I1 0.00085408404
depth 0.0008469944
color_F 0.0006705894
cut_Premium 0.0
cut_Ideal 0.0
cut_Fair 0.0
is_original 0.0
table 0.0


Unnamed: 0_level_0,pred_xgb_params_gamma
id,Unnamed: 1_level_1
0.0,73.71144
1.0,73.78519
2.0,72.22537
3.0,67.37559
4.0,73.77097


Mode
=== Target Value Counts ===
Model Run Time: 51.58
Model=xgb3
{}
fold: 1, Score: 293.9553247403797, Run Time: 41.54
fold: 2, Score: 290.14835049553847, Run Time: 42.55
fold: 3, Score: 291.14316505212423, Run Time: 45.34
fold: 4, Score: 292.3973887933678, Run Time: 46.50
fold: 5, Score: 297.8757010282207, Run Time: 47.85
Scores -> Adjusted: 290.39940023 , mean: 293.10398602, std: 2.70458579

=== Model Feature Importance ===
y 0.43240333
carat 0.18689513
z 0.08496361
clarity_SI2 0.057773497
clarity_SI1 0.035919834
x 0.026572527
color_J 0.022003427
clarity_VVS2 0.019483283
color_I 0.017445471
clarity_I1 0.017142376
clarity_VS1 0.015768565
clarity_VS2 0.013745308
color_H 0.011543305
clarity_VVS1 0.011440151
clarity_IF 0.010974195
color_G 0.008150793
color_F 0.008114331
color_D 0.0061891032
color_E 0.0060649826
cut_Ideal 0.0035599978
cut_Premium 0.0006901788
cut_Fair 0.0006765477
depth 0.0005626726
table 0.00054979615
is_original 0.0005249471
cut_Good 0.0004448577
cut_Very Good 0.000397

Unnamed: 0_level_0,pred_xgb3
id,Unnamed: 1_level_1
0.0,13612.75195
1.0,12763.50098
2.0,2853.98657
3.0,679.80695
4.0,14959.10059


Mode
=== Target Value Counts ===
Model Run Time: 227.80
Model=xgb1
{}
fold: 1, Score: 300.261136499225, Run Time: 24.00
fold: 2, Score: 297.09857987114526, Run Time: 25.19
fold: 3, Score: 297.1217139446997, Run Time: 26.02
fold: 4, Score: 300.4263153978607, Run Time: 28.74
fold: 5, Score: 305.8266371373839, Run Time: 29.92
Scores -> Adjusted: 296.95957228 , mean: 300.14687657, std: 3.18730429

=== Model Feature Importance ===
y 0.44432917
carat 0.189247
clarity_SI2 0.060984682
clarity_VVS2 0.04688364
clarity_SI1 0.043726966
color_J 0.040509053
clarity_I1 0.031042345
color_I 0.02868953
clarity_VVS1 0.020526055
color_H 0.012832618
clarity_IF 0.011939833
color_E 0.009981424
clarity_VS2 0.00914228
z 0.008043429
color_D 0.008023813
clarity_VS1 0.008016802
color_F 0.006049277
cut_Ideal 0.0052890666
x 0.0049167485
color_G 0.00472682
cut_Fair 0.0011309641
cut_Good 0.0007778143
cut_Premium 0.00071043696
depth 0.00068980316
is_original 0.00064850535
cut_Very Good 0.00057741825
table 0.0005644214

Unnamed: 0_level_0,pred_xgb1
id,Unnamed: 1_level_1
0.0,13991.81445
1.0,12913.0957
2.0,2835.02319
3.0,707.76874
4.0,14808.0752


Mode
=== Target Value Counts ===
Model Run Time: 137.69
Model=xgb2
{}
fold: 1, Score: 294.15562369963266, Run Time: 5.81
fold: 2, Score: 291.277911813454, Run Time: 7.01
fold: 3, Score: 292.01654974398275, Run Time: 8.57
fold: 4, Score: 293.13299444271587, Run Time: 10.51
fold: 5, Score: 298.64995591906927, Run Time: 11.34
Scores -> Adjusted: 291.25337749 , mean: 293.84660712, std: 2.59322963

=== Model Feature Importance ===
y 0.45479876
carat 0.18637744
clarity_SI2 0.06844441
x 0.044280816
clarity_SI1 0.03406336
clarity_VVS2 0.02556466
color_J 0.023295663
clarity_I1 0.019355468
color_I 0.017065585
clarity_VS2 0.01701653
clarity_VS1 0.015333255
clarity_VVS1 0.0143781165
z 0.012738794
clarity_IF 0.011985934
color_F 0.0109004965
color_H 0.010716295
color_G 0.009897274
color_E 0.0096010985
color_D 0.007891184
cut_Ideal 0.0029659434
cut_Premium 0.00057467626
cut_Fair 0.00050951843
is_original 0.0004969559
table 0.0004886386
depth 0.00047096738
cut_Good 0.000426837
cut_Very Good 0.00036131

Unnamed: 0_level_0,pred_xgb2
id,Unnamed: 1_level_1
0.0,13773.1084
1.0,12376.01758
2.0,2831.91333
3.0,687.37469
4.0,14881.08301


Mode
=== Target Value Counts ===
Model Run Time: 47.18
Model=lgbm0
{}
fold: 1, Score: 301.81822778522013, Run Time: 4.52
fold: 2, Score: 297.9886424990666, Run Time: 5.43
fold: 3, Score: 299.19160103771475, Run Time: 7.25
fold: 4, Score: 299.76294967145043, Run Time: 9.73
fold: 5, Score: 305.60606462705243, Run Time: 9.93
Scores -> Adjusted: 298.20238546 , mean: 300.87349712, std: 2.67111166

=== Model Feature Importance ===
carat 0.11066666666666666
y 0.09866666666666667
z 0.07966666666666666
x 0.06966666666666667
clarity_SI2 0.060333333333333336
depth 0.058333333333333334
color_J 0.051
clarity_SI1 0.044
color_I 0.04033333333333333
color_D 0.037
clarity_I1 0.035333333333333335
color_H 0.031
color_E 0.03
clarity_VVS1 0.028666666666666667
clarity_IF 0.028333333333333332
clarity_VS2 0.02666666666666667
table 0.026
clarity_VS1 0.025333333333333333
color_F 0.024666666666666667
color_G 0.023
is_original 0.022
clarity_VVS2 0.021
cut_Ideal 0.011333333333333334
cut_Fair 0.006
cut_Premium 0.005

Unnamed: 0_level_0,pred_lgbm0
id,Unnamed: 1_level_1
0.0,13878.21112
1.0,12348.72859
2.0,2815.0948
3.0,687.40082
4.0,14898.97732


Mode
=== Target Value Counts ===
Model Run Time: 40.91
Model=lgbm1
{}
fold: 1, Score: 297.31491194502684, Run Time: 15.12
fold: 2, Score: 294.63524130375646, Run Time: 9.89
fold: 3, Score: 294.7546159689502, Run Time: 12.63
fold: 4, Score: 296.19037962999926, Run Time: 14.03
fold: 5, Score: 301.842807222985, Run Time: 14.91
Scores -> Adjusted: 294.30836647 , mean: 296.94759121, std: 2.63922474

=== Model Feature Importance ===
carat 0.1528125
y 0.1471875
z 0.131875
depth 0.116875
x 0.116875
table 0.0559375
clarity_SI2 0.03078125
color_J 0.02359375
clarity_SI1 0.0215625
color_I 0.02140625
color_H 0.018125
color_G 0.01390625
color_D 0.01390625
is_original 0.0134375
color_F 0.0134375
clarity_VS2 0.01265625
color_E 0.0125
clarity_VS1 0.01234375
cut_Premium 0.011875
cut_Ideal 0.01015625
clarity_IF 0.0096875
clarity_VVS2 0.009375
clarity_VVS1 0.00890625
cut_Very Good 0.00828125
clarity_I1 0.00828125
cut_Good 0.00234375
cut_Fair 0.001875


Unnamed: 0_level_0,pred_lgbm1
id,Unnamed: 1_level_1
0.0,13620.89038
1.0,12412.74693
2.0,2767.30542
3.0,679.72866
4.0,14789.44281


Mode
=== Target Value Counts ===
Model Run Time: 70.68
Model=lgbm2
{}
fold: 1, Score: 481.1366234502266, Run Time: 3.86
fold: 2, Score: 470.6337274541548, Run Time: 4.39
fold: 3, Score: 480.0217206321727, Run Time: 6.03
fold: 4, Score: 478.23042049303217, Run Time: 7.98
fold: 5, Score: 482.3914465108882, Run Time: 8.94
Scores -> Adjusted: 474.32682312 , mean: 478.48278771, std: 4.15596459

=== Model Feature Importance ===
z 0.164
x 0.111
color_I 0.084
y 0.075
cut_Ideal 0.058
clarity_SI2 0.048
clarity_VS1 0.047
clarity_VVS2 0.045
clarity_VVS1 0.041
color_E 0.04
carat 0.038
clarity_SI1 0.036
depth 0.034
clarity_I1 0.027
clarity_VS2 0.026
clarity_IF 0.02
table 0.019
color_J 0.018
color_H 0.017
color_G 0.015
color_D 0.014
color_F 0.012
cut_Very Good 0.004
is_original 0.003
cut_Fair 0.002
cut_Premium 0.002
cut_Good 0.0


Unnamed: 0_level_0,pred_lgbm2
id,Unnamed: 1_level_1
0.0,11709.42353
1.0,14352.70518
2.0,2740.57168
3.0,895.77281
4.0,13894.80972


Mode
=== Target Value Counts ===
Model Run Time: 36.01
Model=lgbm3
{}
fold: 1, Score: 297.31697593482227, Run Time: 8.39
fold: 2, Score: 294.63474763340673, Run Time: 9.65
fold: 3, Score: 294.75024140046804, Run Time: 11.85
fold: 4, Score: 296.19321771518696, Run Time: 13.01
fold: 5, Score: 301.841663775693, Run Time: 14.12
Scores -> Adjusted: 294.30785941 , mean: 296.94736929, std: 2.63950988

=== Model Feature Importance ===
carat 0.15296875
y 0.14703125
z 0.131875
depth 0.116875
x 0.116875
table 0.0559375
clarity_SI2 0.03078125
color_J 0.02359375
clarity_SI1 0.0215625
color_I 0.02140625
color_H 0.018125
color_G 0.01390625
color_D 0.01390625
is_original 0.0134375
color_F 0.0134375
clarity_VS2 0.01265625
color_E 0.0125
clarity_VS1 0.01234375
cut_Premium 0.011875
cut_Ideal 0.01015625
clarity_IF 0.0096875
clarity_VVS2 0.009375
clarity_VVS1 0.00890625
cut_Very Good 0.00828125
clarity_I1 0.00828125
cut_Good 0.00234375
cut_Fair 0.001875


Unnamed: 0_level_0,pred_lgbm3
id,Unnamed: 1_level_1
0.0,13620.89039
1.0,12412.74696
2.0,2767.30542
3.0,679.72866
4.0,14789.44289


Mode
=== Target Value Counts ===
Model Run Time: 61.42
Model=cat1
{}
fold: 1, Score: 297.72296419318485, Run Time: 25.48
fold: 2, Score: 291.6784022678925, Run Time: 27.52
fold: 3, Score: 291.6128145710852, Run Time: 28.84
fold: 4, Score: 295.7730765160127, Run Time: 31.13
fold: 5, Score: 300.35632376270496, Run Time: 31.64
Scores -> Adjusted: 292.01435149 , mean: 295.42871626, std: 3.41436477

=== Model Feature Importance ===
x 0.2530733423065527
y 0.17857548223168243
carat 0.17761405168248673
z 0.12402791876514688
clarity_SI2 0.07761358476545545
color_J 0.03499208068243589
clarity_SI1 0.032191704053607495
color_I 0.02955532674473758
clarity_VVS2 0.015774120262406384
color_H 0.015611113786326365
clarity_I1 0.012255643031152844
clarity_VVS1 0.008365661793805232
color_D 0.007170403591097322
color_E 0.006425951555025823
clarity_VS1 0.00618564692780601
clarity_IF 0.0057802683485366005
color_F 0.004895962636701908
depth 0.002275949668450936
cut_Ideal 0.0017033031362601223
clarity_VS2 0.001

Unnamed: 0_level_0,pred_cat1
id,Unnamed: 1_level_1
0.0,13674.64029
1.0,12508.3531
2.0,2873.14491
3.0,703.9258
4.0,14802.73695


Mode
=== Target Value Counts ===
Model Run Time: 148.72
Model=cat2
{}
fold: 1, Score: 319.5716960088878, Run Time: 83.41
fold: 2, Score: 315.53009823735727, Run Time: 4.61
fold: 3, Score: 316.54124577095706, Run Time: 5.67
fold: 4, Score: 319.8212963767485, Run Time: 8.01
fold: 5, Score: 322.9397193116316, Run Time: 8.66
Scores -> Adjusted: 316.25260213 , mean: 318.88081114, std: 2.62820901

=== Model Feature Importance ===
y 0.2396729137081466
carat 0.14630771264688647
z 0.13836182323958687
clarity_SI2 0.11513913709879933
x 0.085144322541499
color_J 0.0550207968375544
clarity_SI1 0.04729547287581312
color_I 0.04581085554042531
color_H 0.02189404762716318
clarity_VVS2 0.02049723109739917
clarity_I1 0.0175743588024976
clarity_VVS1 0.011564945146463682
color_D 0.011314144390975157
color_E 0.010379479637307776
clarity_VS1 0.00866224540139778
color_F 0.00840503672897158
clarity_IF 0.007715691449493301
color_G 0.0034420462976239396
cut_Ideal 0.0024900439916045753
depth 0.001713099554623829


Unnamed: 0_level_0,pred_cat2
id,Unnamed: 1_level_1
0.0,13386.04856
1.0,12542.78712
2.0,2834.98344
3.0,761.96224
4.0,14372.34794


Mode
=== Target Value Counts ===
Model Run Time: 114.58
CPU times: user 3h 49min 55s, sys: 8min 28s, total: 3h 58min 23s
Wall time: 2h 38min 6s


Unnamed: 0,Model,Score,StdDev,RunTime
4,xgb_params_gamma,3894.10289,24.00826,51.57628
10,lgbm2,478.48279,4.15596,36.01104
0,xgb_params_gpu1,358.64992,3.57283,5186.11192
13,cat2,318.88081,2.62821,114.57599
1,xgb_best_params,302.83155,2.82564,536.95284
8,lgbm0,300.8735,2.67111,40.90974
6,xgb1,300.14688,3.1873,137.68856
3,cat_best_params,297.32422,1.88052,391.91516
9,lgbm1,296.94759,2.63922,70.67996
11,lgbm3,296.94737,2.63951,61.42468


## Linear Models

In [41]:
model_lst = ["lin_reg", "lasso", "ridge", "ridge_25", "ridge_50"]
model_lst = ["lasso", "ridge",  "ridge_50"]
# model_lst = []
# all_cv_scores = run_models4features(model_lst, TARGET, FEATURES, all_cv_scores, linear_models=True)    
all_cv_scores = run_models4features(model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=True)    

all_cv_scores.head()

Model=lasso
fold: 1, Score: 643.7632608971782, Run Time: 4.58
fold: 2, Score: 638.5058185741259, Run Time: 5.39
fold: 3, Score: 645.5589170948598, Run Time: 7.13
fold: 4, Score: 644.6071047344354, Run Time: 9.53
fold: 5, Score: 640.2022445297636, Run Time: 10.87
Scores -> Adjusted: 639.82110315 , mean: 642.52746917, std: 2.70636601


Unnamed: 0_level_0,pred_lasso
id,Unnamed: 1_level_1
0.0,11371.10932
1.0,13266.83948
2.0,3577.55091
3.0,960.97536
4.0,12896.54654


Mode
=== Target Value Counts ===
Model Run Time: 41.41
Model=ridge
fold: 1, Score: 643.5668346413521, Run Time: 1.25
fold: 2, Score: 638.4624429630502, Run Time: 2.55
fold: 3, Score: 645.3704922937586, Run Time: 4.21
fold: 4, Score: 644.4580188631362, Run Time: 6.30
fold: 5, Score: 640.2129918663342, Run Time: 7.04
Scores -> Adjusted: 639.77949762 , mean: 642.41415613, std: 2.63465851


Unnamed: 0_level_0,pred_ridge
id,Unnamed: 1_level_1
0.0,11367.43003
1.0,13284.75178
2.0,3572.58221
3.0,972.38309
4.0,12908.35637


Mode
=== Target Value Counts ===
Model Run Time: 25.28
Model=ridge_50
fold: 1, Score: 643.5559094214277, Run Time: 1.25
fold: 2, Score: 638.4514138275706, Run Time: 2.69
fold: 3, Score: 645.3596516273961, Run Time: 4.05
fold: 4, Score: 644.4472570894835, Run Time: 6.28
fold: 5, Score: 640.2023042180139, Run Time: 6.99
Scores -> Adjusted: 639.76861292 , mean: 642.40330724, std: 2.63469432


Unnamed: 0_level_0,pred_ridge_50
id,Unnamed: 1_level_1
0.0,11367.73494
1.0,13285.11925
2.0,3572.51496
3.0,972.53727
4.0,12908.4995


Mode
=== Target Value Counts ===
Model Run Time: 25.50


Unnamed: 0,Model,Score,StdDev,RunTime
0,xgb_params_gpu1,358.64992,3.57283,5186.11192
1,xgb_best_params,302.83155,2.82564,536.95284
2,lgbm_best_params,287.15369,2.50242,2434.80513
3,cat_best_params,297.32422,1.88052,391.91516
4,xgb_params_gamma,3894.10289,24.00826,51.57628


In [42]:
sample_submission.head(20)

Unnamed: 0,id,price,target_xgb_params_gpu1,target_xgb_best_params,target_lgbm_best_params,target_cat_best_params,target_xgb_params_gamma,target_xgb3,target_xgb1,target_xgb2,target_lgbm0,target_lgbm1,target_lgbm2,target_lgbm3,target_cat1,target_cat2,target_lasso,target_ridge,target_ridge_50
0,193573,3969.155,861.21167,875.50262,834.65107,866.86962,67.889,863.24261,854.5777,862.88324,844.24004,828.44949,829.53492,828.44948,868.52349,897.55326,1124.47124,1128.32029,1128.42433
1,193574,3969.155,2437.18628,2456.29346,2374.99327,2588.80995,72.39225,2446.89233,2429.01392,2451.67285,2517.56192,2516.4441,2642.13246,2516.44414,2510.52047,2537.17215,2338.33009,2328.24324,2328.14518
2,193575,3969.155,2058.91797,2195.38818,2245.00602,2337.97173,72.2181,2283.91919,2290.60376,2314.21143,2312.94226,2247.34923,2463.50194,2247.34925,2242.43196,2321.20291,2276.93179,2260.22677,2260.00429
3,193576,3969.155,892.7536,837.19519,912.25762,818.37558,68.04408,823.10901,837.1922,822.96832,830.24297,838.36443,872.48554,838.36446,834.8684,847.65865,1288.785,1298.04216,1298.11253
4,193577,3969.155,5707.84717,5683.90137,5662.78817,5733.19437,73.38731,5757.1499,5573.97803,5763.28418,5695.66514,5535.94468,5719.79569,5535.94469,5652.01644,5806.48923,6795.60765,6784.09533,6784.00964
5,193578,3969.155,684.26581,711.7713,764.86091,704.11572,67.27662,684.28058,727.43109,684.68817,648.79182,702.79588,992.67523,702.79588,720.89036,717.35216,600.64509,611.00995,611.10975
6,193579,3969.155,12085.6416,12396.06445,12286.77968,12334.28384,73.70206,12292.58398,12324.79297,12210.69043,12226.73699,12317.65153,11618.073,12317.65152,12253.93612,12241.94859,11053.27722,11058.11016,11058.20624
7,193580,3969.155,2914.88696,2963.25879,2991.66558,2909.18258,72.27367,2892.91821,2878.87354,2924.43726,2894.70938,2925.28725,2728.58745,2925.28725,2944.63318,2915.9979,3524.39489,3516.87527,3516.75697
8,193581,3969.155,15082.43066,15036.63574,15010.54553,15424.07813,73.78304,14952.72168,14781.29004,15585.53809,14917.44616,14873.12448,14525.17098,14873.12451,15511.43232,15339.23239,15537.79086,15545.98746,15546.49174
9,193582,3969.155,1868.16455,1843.547,1860.25192,1778.45994,71.18585,1843.74231,1848.28992,1846.95813,1959.34844,1869.86673,1900.15767,1869.86676,1773.80944,1908.17055,2245.41956,2239.28853,2239.40512


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Blend Models</h1>
</div>

In [43]:
all_blend_scores = pd.DataFrame(
    {
        "Model": pd.Series(dtype="str"),
        "Score": pd.Series(dtype="float"),
        "StdDev": pd.Series(dtype="float"),
    }
)

In [44]:
model_lst

['lasso', 'ridge', 'ridge_50']

In [45]:
model_lst = ["xgb_params_gpu1", "xgb_best_params","xgb1", "xgb2","xgb3", "cat1", "cat_best_params","lgbm0", "lgbm1", "lgbm3"]

In [46]:
len(model_lst)

10

In [47]:
target_names = [f"target_{model}" for model in model_lst]
target_names

['target_xgb_params_gpu1',
 'target_xgb_best_params',
 'target_xgb1',
 'target_xgb2',
 'target_xgb3',
 'target_cat1',
 'target_cat_best_params',
 'target_lgbm0',
 'target_lgbm1',
 'target_lgbm3']

In [48]:
sample_submission[TARGET] = sample_submission[target_names].sum(axis=1) / len(model_lst)

In [49]:
sample_submission[[ID, TARGET]].to_csv("submission_models_wt_avg.csv", index=False)
sample_submission[[ID, TARGET]].tail(8)

Unnamed: 0,id,price
129042,322615,2920.57636
129043,322616,650.17063
129044,322617,4089.06531
129045,322618,3851.9277
129046,322619,2524.2921
129047,322620,7413.10229
129048,322621,5334.05858
129049,322622,4140.36987


In [50]:
sample_submission[TARGET] = (
#     (sample_submission["target_xgb_bp"] * 2 )
#     + (sample_submission["target_lgbm_bp"]  )
    (sample_submission["target_xgb1"] * 3 )
    + (sample_submission["target_lgbm1"])
#     + (sample_submission["target_lgbm2"])    
#     + (sample_submission["target_lgbm2"])
    + (sample_submission["target_cat1"] )
    + (sample_submission["target_cat2"] )    
#     + (sample_submission["target_cat_bp"] )
#     + (sample_submission["target_svc"] )
#     + (sample_submission["target_log_reg3"] )
#     + (sample_submission["target_cat2"] )
)/6

# sample_submission[TARGET] = sample_submission[TARGET].astype(int)

In [51]:
sample_submission[[ID, TARGET]].to_csv("submission_wt_avg.csv", index=False)
sample_submission[[ID, TARGET]].tail(8)

Unnamed: 0,id,price
129042,322615,2841.6476
129043,322616,663.57607
129044,322617,4100.38074
129045,322618,3830.50236
129046,322619,2491.78892
129047,322620,7633.52295
129048,322621,5484.56149
129049,322622,4118.39727


In [52]:
all_cv_scores.sort_values(by=["Score"], ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime
4,xgb_params_gamma,3894.10289,24.00826,51.57628
14,lasso,642.52747,2.70637,41.41418
15,ridge,642.41416,2.63466,25.27658
16,ridge_50,642.40331,2.63469,25.49828
10,lgbm2,478.48279,4.15596,36.01104
0,xgb_params_gpu1,358.64992,3.57283,5186.11192
13,cat2,318.88081,2.62821,114.57599
1,xgb_best_params,302.83155,2.82564,536.95284
8,lgbm0,300.8735,2.67111,40.90974
6,xgb1,300.14688,3.1873,137.68856


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Level 1 Stack Models</h1>
</div>

In [53]:
## TODO: Generate these dictionaries from model names

train_oof_dict = {
    "train_pred_cat1": "train_pred_cat1.csv",
    "train_pred_cat2": "train_pred_cat2.csv",
    "train_pred_lgbm1": "train_pred_lgbm1.csv",    
    "train_pred_lgbm2": "train_pred_lgbm2.csv",    
    "train_pred_xgb1": "train_pred_xgb1.csv"
}

test_pred_dict = {
    "submission_cat1": "submission_cat1.csv",
    "submission_cat2": "submission_cat2.csv",
    "submission_lgbm1": "submission_lgbm1.csv",
    "submission_lgbm2": "submission_lgbm2.csv",
    "submission_xgb1": "submission_xgb1.csv",
}

In [54]:
def blend_results(train_oof_dict, test_pred_dict):
    oof_df = pd.DataFrame()
    test_preds_df = pd.DataFrame()

    for name, train_oof_fname in train_oof_dict.items():
        fname = "../working/" + train_oof_fname
        print(f"Processing {name}, {train_oof_fname}")
        df = pd.read_csv(fname)
        print(df.head())
#         print(df.iloc[:,1])
        preds = pd.Series(df.iloc[:,1], name=name)
#         print(preds[:5])
        oof_df = pd.concat([oof_df, preds], axis=1)
    #     oof_df = pd.concat([oof_df, pd.Series(np.load(TRAIN_PATH / train_oof), name=name)], axis=1)

    for name, test_pred_fname in test_pred_dict.items():
        fname = "../working/" + test_pred_fname
        print(f"{name}, {test_pred_fname}")
        df = pd.read_csv(fname)
        print(df.head())
        preds = pd.Series(df.iloc[:,1], name=name)
        test_preds_df = pd.concat([test_preds_df, preds], axis=1)

    print("=== oof ===")
    print(oof_df.head())
    print("=== test_preds ===")
    print(test_preds_df.head())
    return oof_df, test_preds_df
    
# (oof_df, preds_df) = blend_results(train_oof_dict, test_pred_dict)    

In [55]:
def load_oof_results(train_oof_dict, test_pred_dict):
    oof_df = pd.DataFrame()
    test_preds_df = pd.DataFrame()

    for name, train_oof_fname in train_oof_dict.items():
        fname = "../working/" + train_oof_fname
        print(f"Processing {name}, {train_oof_fname}")
        df = pd.read_csv(fname)
        print(df.head())
#         print(df.iloc[:,1])
        preds = pd.Series(df.iloc[:,1], name=name)
#         print(preds[:5])
        oof_df = pd.concat([oof_df, preds], axis=1)
    #     oof_df = pd.concat([oof_df, pd.Series(np.load(TRAIN_PATH / train_oof), name=name)], axis=1)

    for name, test_pred_fname in test_pred_dict.items():
        fname = "../working/" + test_pred_fname
        print(f"{name}, {test_pred_fname}")
        df = pd.read_csv(fname)
        print(df.head())
        preds = pd.Series(df.iloc[:,1], name=name)
        test_preds_df = pd.concat([test_preds_df, preds], axis=1)

    print("=== oof ===")
    print(oof_df.head())
    print("=== test_preds ===")
    print(test_preds_df.head())
    return oof_df, test_preds_df
    
(oof_df, preds_df) = load_oof_results(train_oof_dict, test_pred_dict) 

Processing train_pred_cat1, train_pred_cat1.csv
    id    pred_cat1
0  0.0  13674.64029
1  1.0  12508.35310
2  2.0   2873.14491
3  3.0    703.92580
4  4.0  14802.73695
Processing train_pred_cat2, train_pred_cat2.csv
    id    pred_cat2
0  0.0  13386.04856
1  1.0  12542.78712
2  2.0   2834.98344
3  3.0    761.96224
4  4.0  14372.34794
Processing train_pred_lgbm1, train_pred_lgbm1.csv
    id   pred_lgbm1
0  0.0  13620.89038
1  1.0  12412.74693
2  2.0   2767.30542
3  3.0    679.72866
4  4.0  14789.44281
Processing train_pred_lgbm2, train_pred_lgbm2.csv
    id   pred_lgbm2
0  0.0  11709.42353
1  1.0  14352.70518
2  2.0   2740.57168
3  3.0    895.77281
4  4.0  13894.80972
Processing train_pred_xgb1, train_pred_xgb1.csv
    id    pred_xgb1
0  0.0  13991.81400
1  1.0  12913.09600
2  2.0   2835.02320
3  3.0    707.76874
4  4.0  14808.07500
submission_cat1, submission_cat1.csv
       id       price
0  193573   868.52349
1  193574  2510.52047
2  193575  2242.43196
3  193576   834.86840
4  193577

In [56]:
oof_df.head()

Unnamed: 0,train_pred_cat1,train_pred_cat2,train_pred_lgbm1,train_pred_lgbm2,train_pred_xgb1
0,13674.64029,13386.04856,13620.89038,11709.42353,13991.814
1,12508.3531,12542.78712,12412.74693,14352.70518,12913.096
2,2873.14491,2834.98344,2767.30542,2740.57168,2835.0232
3,703.9258,761.96224,679.72866,895.77281,707.76874
4,14802.73695,14372.34794,14789.44281,13894.80972,14808.075


In [57]:
preds_df.head()

Unnamed: 0,submission_cat1,submission_cat2,submission_lgbm1,submission_lgbm2,submission_xgb1
0,868.52349,897.55326,828.44949,829.53492,854.5777
1,2510.52047,2537.17215,2516.4441,2642.13246,2429.014
2,2242.43196,2321.20291,2247.34923,2463.50194,2290.6038
3,834.8684,847.65865,838.36443,872.48554,837.1922
4,5652.01644,5806.48923,5535.94468,5719.79569,5573.978


In [58]:
type(preds_df)

pandas.core.frame.DataFrame

In [59]:
def run_lr(useful_features:List[str], TARGET:str, train_df:pd.DataFrame, test_df:pd.DataFrame) -> (List[float],List[float]):
    final_predictions = []
    scores = []

    kfold = model_selection.KFold(n_splits=Config.N_FOLDS, shuffle=True, random_state=Config.seed)

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(train_df)):
        xtrain = train_df.iloc[train_idx].reset_index(drop=True)
        xvalid = train_df.iloc[valid_idx].reset_index(drop=True)

        xtest = test_df[useful_features].copy()

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]

#         model = LogisticRegression()
        model = linear_model.LinearRegression()
        # Smaller C means more regularization; default=1.0
        # 2947.0517025518097
#         model = LogisticRegression(max_iter=500, C=2947.0517025518097, penalty='l2',solver='newton-cg')
#         model = LogisticRegression(C = 2947.0517025518097,
#                         max_iter = 500,
#                         penalty = 'l2',
#                         solver = 'liblinear')
        model.fit(xtrain, ytrain)

        preds_valid = model.predict_proba(xvalid)[:,-1]
        test_preds = model.predict_proba(xtest)[:,-1]

        final_predictions.append(test_preds)
#         score = metrics.roc_auc_score(yvalid, preds_valid)
        score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        print(f"Fold={fold}, Score={score}")
        scores.append(score)
    return scores, final_predictions


In [60]:
# useful_features = ["pred_lda", "pred_gbc","pred_gbc2", "pred_cat_bp", "pred_cat1", "pred_lgbm1", "pred_lgbm2", "pred_lgbm_bp", "pred_xgb1", "pred_xgb_bp"]
useful_features = [ "train_pred_cat1", "train_pred_cat2", "train_pred_lgbm1", "train_pred_lgbm2", "train_pred_xgb1"]

In [61]:
oof_df[useful_features].head()

Unnamed: 0,train_pred_cat1,train_pred_cat2,train_pred_lgbm1,train_pred_lgbm2,train_pred_xgb1
0,13674.64029,13386.04856,13620.89038,11709.42353,13991.814
1,12508.3531,12542.78712,12412.74693,14352.70518,12913.096
2,2873.14491,2834.98344,2767.30542,2740.57168,2835.0232
3,703.9258,761.96224,679.72866,895.77281,707.76874
4,14802.73695,14372.34794,14789.44281,13894.80972,14808.075


In [62]:
# preds_df[useful_features].head()

In [63]:
# fold_scores, final_predictions = run_lr(useful_features, TARGET, oof_df, preds_df)
# test_preds = np.mean(np.column_stack(final_predictions), axis=1)
# cv_score, std_dev = show_fold_scores(fold_scores)
# create_submission("level1_lr", TARGET, test_preds)

In [64]:
pd.options.display.max_colwidth = 100
pd.set_option("display.max_rows", 999)
pd.set_option("display.precision", 5)
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.max_colwidth

100

In [65]:
all_cv_scores.sort_values(by=["Score"], ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime
4,xgb_params_gamma,3894.1,24.01,51.58
14,lasso,642.53,2.71,41.41
15,ridge,642.41,2.63,25.28
16,ridge_50,642.4,2.63,25.5
10,lgbm2,478.48,4.16,36.01
0,xgb_params_gpu1,358.65,3.57,5186.11
13,cat2,318.88,2.63,114.58
1,xgb_best_params,302.83,2.83,536.95
8,lgbm0,300.87,2.67,40.91
6,xgb1,300.15,3.19,137.69
