<a href="https://www.kaggle.com/code/mmellinger66/s3e8-gemstone-pricing-models?scriptVersionId=120985170" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

 <div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Playground Season 3: Episode 8 - Gemstone Pricing Models</h1>
</div>

## Problem Type

Regression

## Evaluation Metric

$$RMSE = \sqrt{\frac{1}{N} \sum_{i=1}^N (y_i - \hat{y_i})^2}$$

```python
score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
```

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Import Libraries</h1>
</div>

In [1]:
from typing import List, Set, Dict, Tuple, Optional

import os
import time
from pathlib import Path
import glob
import gc

import pandas as pd
import numpy as np

from sklearn import impute
from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import cluster
from sklearn import model_selection
from sklearn import ensemble
from sklearn import datasets

import xgboost as xgb
import catboost as cb
import lightgbm as lgb

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Visualization Libraries
import matplotlib as mpl
import matplotlib.pylab as plt
import seaborn as sns
import missingno as msno
from folium import Map
from folium.plugins import HeatMap
from IPython.display import display_html, display_markdown, display_latex
from colorama import Fore, Style

import warnings
warnings.filterwarnings('ignore')

pd.set_option("display.max_rows", 999)
pd.set_option("display.precision", 5)

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Configuration</h1>
</div>

In [2]:
TARGET="price"
ID="id"

# Optuna
objective_direction = "minimize"  # minimize, maximize

In [3]:
class Config:
    path:str = "../input/playground-series-s3e8/"
    gpu:bool = True
    optimize:bool = True
    n_optuna_trials:int = 30 # 5, 10, 30
    fast_render:bool = False
    calc_probability:bool = False
    debug:bool = False
    seed:int = 42
    N_ESTIMATORS:int = 100  # 100, 300, 1000, 2000, 5000, 15_000, 20_000 GBDT
    GPU_N_ESTIMATORS:int = 2000 # Want models to run fast during dev
    N_FOLDS:int = 5

In [4]:
class clr:
    S = Style.BRIGHT + Fore.LIGHTRED_EX
    E = Style.RESET_ALL

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Library</h1>
</div>

In [5]:
def read_data(path: str, analyze:bool=True) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    data_dir = Path(path)

    train = pd.read_csv(data_dir / "train.csv")
    test = pd.read_csv(data_dir / "test.csv")
    submission_df = pd.read_csv(data_dir / "sample_submission.csv")

    if analyze:
        print(clr.S + "=== Shape of Data ==="+clr.E)
        print(f" train data: Rows={train.shape[0]}, Columns={train.shape[1]}")
        print(f" test data : Rows={test.shape[0]}, Columns={test.shape[1]}")

        print(clr.S + "\n=== Train Data: First 5 Rows ===\n"+clr.E)
        display(train.head())
        print(f"\n{clr.S}=== Train Column Names ==={clr.E}\n")
        display(train.columns)
        print(f"\n{clr.S}=== Features/Explanatory Variables ==={clr.E}\n")
        eval_features(train)
        print(f"\n{clr.S}=== Skewness ==={clr.E}\n")
        check_skew(train)
    return train, test, submission_df

def create_submission(model_name: str, target, preds, seed:int=42, nfolds:int=5) -> pd.DataFrame:
    sample_submission[target] = preds #.astype(int)

    if len(model_name) > 0:
        fname = f"submission_{model_name}_k{nfolds}_s{seed}.csv"
    else:
        fname = "submission.csv"

    sample_submission.to_csv(fname, index=False)

    return sample_submission

def show_classification_scores(ground_truth:List[int], yhat:List[int]) -> None:
    accuracy = metrics.accuracy_score(ground_truth, yhat)
    precision = metrics.precision_score(ground_truth, yhat)
    recall = metrics.recall_score(ground_truth, yhat)
    roc = metrics.roc_auc_score(ground_truth, yhat)
    f1 = metrics.f1_score(ground_truth, yhat)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"ROC: {roc:.4f}")
    print(f"f1: {f1:.4f}")
    

def label_encoder(train:pd.DataFrame, test:pd.DataFrame, columns:List[str]) -> (pd.DataFrame, pd.DataFrame) :
    for col in columns:
        train[col] = train[col].astype(str)
        test[col] = test[col].astype(str)
        train[col] = preprocessing.LabelEncoder().fit_transform(train[col])
        test[col] = preprocessing.LabelEncoder().fit_transform(test[col])
    return train, test   

def create_strat_folds(df:pd.DataFrame, TARGET, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    print(f"TARGET={TARGET}, n_folds={n_folds}, seed={seed}")
    df["fold"] = -1

    kf = model_selection.StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    # kf = GroupKFold(n_splits=Config.N_FOLDS)
    for fold, (train_idx, valid_idx) in enumerate(kf.split(df, df[TARGET])):
        df.loc[valid_idx, "fold"] = fold

    # df.to_csv(f"train_fold{num_folds}.csv", index=False)
    return df


def create_folds(df:pd.DataFrame, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    print(f"n_folds={n_folds}, seed={seed}")
    df["fold"] = -1

    kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=seed)

    for fold, (train_idx, valid_idx) in enumerate(kf.split(df)):
        df.loc[valid_idx, "fold"] = fold

    # df.to_csv(f"train_fold{num_folds}.csv", index=False)
    return df

def show_fold_scores(scores: List[float]) -> (float, float):
    cv_score = np.mean(scores)  # Used in filename
    std_dev = np.std(scores)
    print(
        f"Scores -> Adjusted: {np.mean(scores) - np.std(scores):.8f} , mean: {np.mean(scores):.8f}, std: {np.std(scores):.8f}"
    )
    return cv_score, std_dev


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    continuous_features = list(df.select_dtypes(include=['int64', 'float64', 'uint8']).columns)
    categorical_features = list(df.select_dtypes(include=['object', 'bool']).columns)
    if display:
        print(f"{clr.S}Continuous Features={continuous_features}{clr.E}\n")
        print(f"{clr.S}Categorical Features={categorical_features}{clr.E}")
    return continuous_features, categorical_features   

def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    print("=== Cardinality ===")
    print(df[features].nunique())

## === Model Support ===    

from scipy.stats import mode


def merge_test_predictions(final_test_predictions:List[float], calc_probability:bool=True) -> List[float]:

    if calc_probability:
        print("Mean")
        result = np.mean(np.column_stack(final_test_predictions), axis=1)
    else:
        print("Mode")
        mode_result = mode(np.column_stack(final_test_predictions), axis=1)
        result = mode_result[0].ravel()

    return result

def summary_statistics(X:pd.DataFrame, enhanced=True) -> None:
    desc = X.describe()
    if enhanced:
        desc.loc["var"] = X.var(numeric_only=True).tolist()
        desc.loc["skew"] = X.skew(numeric_only=True).tolist()
        desc.loc["kurt"] = X.kurtosis(numeric_only=True).tolist()

    with pd.option_context("display.precision", 2):
        style = desc.transpose().style.background_gradient(
            cmap="coolwarm"
        )  # .set_precision(4)
    display(style)
    
def show_missing_features(df:pd.DataFrame) -> None:
    missing_vals = df.isna().sum().sort_values(ascending=False)
    print(missing_vals[missing_vals > 0])


def show_duplicate_records(df:pd.DataFrame) -> None:
    dups = df.duplicated()
    print(dups.sum())


def eval_features(df:pd.DataFrame) -> (List[str], List[str], List[str]):
    ## Separate Categorical and Numerical Features
    categorical_features = list(
        df.select_dtypes(include=["category", "object"]).columns
    )
    continuous_features = list(df.select_dtypes(include=["number"]).columns)

    print(f"{clr.S}Continuous features:{clr.E} {continuous_features}")
    print(f"{clr.S}Categorical features:{clr.E} {categorical_features}")
    print("\n --- Cardinality of Categorical Features ---\n")

    for feature in categorical_features:
        cardinality = df[feature].nunique()
        if cardinality < 10:
            print(f"{clr.S}{feature}{clr.E}: cardinality={cardinality}, {df[feature].unique()}")
        else:
            print(f"{clr.S}{feature}{clr.E}: cardinality={cardinality}")
    all_features = categorical_features + continuous_features
    return all_features, categorical_features, continuous_features


def show_feature_importance(feature_importance_lst:List[str]) -> None:
    fis_df = pd.concat(feature_importance_lst, axis=1)

    fis_df.sort_values("0_importance", ascending=True).head(40).plot(
        kind="barh", figsize=(12, 12), title="Feature Importance Across Folds"
    )
    plt.show()


def show_feature_target_crosstab(df:pd.DataFrame, feature_lst:List[str], target:str) -> None:
    for feature in feature_lst:
        print(f"\n=== {feature} vs {target} ===\n")
        display(
            pd.crosstab(df[feature], df[target], margins=True)
        )  # display keeps bold formatting


def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    print(f"{clr.S}=== Cardinality ==={clr.E}")
    print(df[features].nunique())


def show_unique_features(df:pd.DataFrame, features:List[str]) -> None:
    for col in features:
        print(col, sorted(df[col].dropna().unique()))


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    continuous_features = list(
        df.select_dtypes(include=["int64", "float64", "uint8"]).columns
    )
    categorical_features = list(df.select_dtypes(include=["object", "bool"]).columns)
    if display:
        print(f"{clr.S}Continuous Features={clr.E}{continuous_features}\n")
        print(f"{clr.S}Categorical Features={clr.E}{categorical_features}")
    return continuous_features, categorical_features


def describe(X:pd.DataFrame) -> None:
    """Deprecated: Use summary_statistics()"""
    desc = X.describe()
    desc.loc['var'] = X.var(numeric_only=True).tolist()
    desc.loc['skew'] = X.skew(numeric_only=True).tolist()
    desc.loc['kurt'] = X.kurtosis(numeric_only=True).tolist()

    with pd.option_context('display.precision', 2):
        style = desc.transpose().style.background_gradient(cmap='coolwarm') #.set_precision(4)
    display(style)
  

def check_skew(df:pd.DataFrame) -> None:
    skew = df.skew(skipna=True,numeric_only=True).sort_values(ascending=False)
    print(skew)
    
def gpu_ify_lgbm(lgbm_dict):
    if Config.gpu:
        lgbm_dict["device"] = "gpu"
        lgbm_dict["boosting_type"] = "gbdt"
        lgbm_dict["gpu_platform_id"] = 0
        lgbm_dict["gpu_device_id"] = 0
    return lgbm_dict

def gpu_ify_cb(params):
    if Config.gpu:
        params["task_type"] = "GPU"
    return params    


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Optuna Hyperparameter Optimization Library</h1>
</div>

In [6]:
def objective_xgb(trial, X_train, X_valid, y_train, y_valid):

    xgb_params = {
        #         "objective": trial.suggest_categorical("objective", ["multi:softmax"]),
        #         "eval_metric": "mlogloss",
        #         "objective": "multi:softmax",
#         "eval_metric": "rmse",  # auc, rmse, mae
        "eval_metric": trial.suggest_categorical("eval_metric", ["rmse", "mae"]),
        "objective": trial.suggest_categorical("objective", ["reg:squarederror"]), # "reg:squarederror",
        #         "enable_categorical": trial.suggest_categorical("use_label_encoder", [True]),
        "use_label_encoder": trial.suggest_categorical("use_label_encoder", [False]),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 5000, 100),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "max_depth": trial.suggest_int("max_depth", 1, 20),  # 10
        "gamma": trial.suggest_float("gamma", 0, 100, step=0.1),
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "tree_method": trial.suggest_categorical(
            "tree_method", ["gpu_hist"]
        ),  # hist, gpu_hist
        "predictor": "gpu_predictor",
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 100),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }

    # Model loading and training
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=5000,
        verbose=0,
    )

    print(f"Number of boosting rounds: {model.best_iteration}")
    #     oof = model.predict_proba(X_valid)[:, 1] # Probability
    oof = model.predict(X_valid)  # Classification: 0,1

    return metrics.mean_squared_error(y_valid, oof, squared=False)


def objective_lgbm(trial, X_train, X_valid, y_train, y_valid):

    lgbm_params = {
        "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [1_000]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [5000]),
        "n_estimators": trial.suggest_int("n_estimators", 700, 5000),
        "importance_type": "gain",
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 300),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        #         'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }

    # Model loading and training
    model = lgb.LGBMRegressor(**lgbm_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric="mae",
        callbacks=[
            lgb.log_evaluation(500),
            lgb.early_stopping(500, False, True),
        ],
    )

    #     print(f"Number of boosting rounds: {model.best_iteration}")
    oof = model.predict(X_valid)

    return metrics.mean_squared_error(y_valid, oof, squared=False)
#     return metrics.mean_absolute_error(y_valid, oof)


def objective_clf_lgbm(trial, X_train, X_valid, y_train, y_valid):

    params = {
        "boosting_type": "gbdt",
        # "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),
        #         "objective": trial.suggest_categorical("objective", ["multi:softprob"]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [1_000]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [5000]),
        "n_estimators": trial.suggest_int("n_estimators", 700, 1000),
        "importance_type": "gain",
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 300),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        #         'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }
    if Config.gpu:
        params["device_type"] = "gpu"

    # Model loading and training
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="mae",
        callbacks=[
            lgb.log_evaluation(500),
            lgb.early_stopping(500, False, True),
        ],
    )

    #     print(f"Number of boosting rounds: {model.best_iteration}")
    oof = model.predict(X_valid)

    #     return accuracy_score(y_valid, oof)
    return metrics.roc_auc_score(y_valid, oof)


def objective_cb(trial, X_train, X_valid, y_train, y_valid):

    cb_params = {
        "iterations": 100,
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.1, 1.0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 100),
        "bagging_temperature": trial.suggest_loguniform(
            "bagging_temperature", 0.1, 20.0
        ),
        "random_strength": trial.suggest_float("random_strength", 1.0, 2.0),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
          "use_best_model": True,
#         "task_type": "GPU",
        "random_seed": 42,
    }

    # Model loading and training
    #  model = CatBoostClassifier(**cb_params)
    model = cb.CatBoostRegressor(**cb_params)

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="accuracy",
        early_stopping_rounds=500,
        verbose=False,
    )

#     print(f"Number of boosting rounds: {model.best_iteration}")
    # oof = model.predict_proba(X_valid)[:, 1]
    oof = model.predict(X_valid)  # Classification
    return metrics.mean_squared_error(y_valid, oof, squared=False)
#     return metrics.mean_absolute_error(y_valid, oof)
# 
#     return accuracy_score(y_valid, oof)

def objective_clf_cb(trial, X_train, X_valid, y_train, y_valid):

    cb_params = {
        "iterations": 10,  # 1000
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.1, 1.0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 100),
        "bagging_temperature": trial.suggest_loguniform(
            "bagging_temperature", 0.1, 20.0
        ),
        "random_strength": trial.suggest_float("random_strength", 1.0, 2.0),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
        "use_best_model": True,
#             "task_type": "GPU",
        "random_seed": 42,
    }

    # Model loading and training
    model = cb.CatBoostClassifier(**cb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="accuracy",
        early_stopping_rounds=500,
        verbose=False,
    )

    # print(f"Number of boosting rounds: {model.best_iteration}")
    # oof = model.predict_proba(X_valid)[:, 1]
    oof = model.predict(X_valid)  # Classification

    return metrics.accuracy_score(y_valid, oof)

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Load Train/Test Data and Analyze</h1>
</div>

## Load the following files

 - train.csv - Data used to build our machine learning model
 - test.csv - Data used to build our machine learning model. Does not contain the target variable
 - sample_submission.csv - A file in the proper format to submit test predictions

In [7]:
%%time
train, test, sample_submission = read_data(Config.path, analyze=True)                                

[1m[91m=== Shape of Data ===[0m
 train data: Rows=193573, Columns=11
 test data : Rows=129050, Columns=10
[1m[91m
=== Train Data: First 5 Rows ===
[0m


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453



[1m[91m=== Train Column Names ===[0m



Index(['id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y',
       'z', 'price'],
      dtype='object')


[1m[91m=== Features/Explanatory Variables ===[0m

[1m[91mContinuous features:[0m ['id', 'carat', 'depth', 'table', 'x', 'y', 'z', 'price']
[1m[91mCategorical features:[0m ['cut', 'color', 'clarity']

 --- Cardinality of Categorical Features ---

[1m[91mcut[0m: cardinality=5, ['Premium' 'Very Good' 'Ideal' 'Good' 'Fair']
[1m[91mcolor[0m: cardinality=7, ['F' 'J' 'G' 'E' 'D' 'H' 'I']
[1m[91mclarity[0m: cardinality=8, ['VS2' 'SI2' 'VS1' 'SI1' 'IF' 'VVS2' 'VVS1' 'I1']

[1m[91m=== Skewness ===[0m

price    1.60558
carat    0.99513
z        0.68567
table    0.61906
x        0.36105
y        0.35676
id       0.00000
depth   -0.27638
dtype: float64
CPU times: user 324 ms, sys: 87.3 ms, total: 412 ms
Wall time: 643 ms


In [8]:
train.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [9]:
original = pd.read_csv("../input/gemstone-price-prediction/cubic_zirconia.csv", index_col=[0])
original = original[-original.depth.isna()]
original.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
1,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
2,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7,984
3,0.9,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289
4,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.8,2.96,1082
5,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65,779


In [10]:
original.shape

(26270, 10)

In [11]:
train['is_original']    = 0
test['is_original']     = 0
original['is_original'] = 1
combined = pd.concat([train, original], ignore_index=True).drop_duplicates()
train = combined

In [12]:
combined.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price,is_original
0,0.0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619,0
1,1.0,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387,0
2,2.0,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772,0
3,3.0,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666,0
4,4.0,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453,0


In [13]:
summary_statistics(train.drop(columns=[ID], axis=1), enhanced=True)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,var,skew,kurt
carat,219809.0,0.79,0.46,0.2,0.4,0.7,1.03,4.5,0.22,1.01,0.63
depth,219809.0,61.81,1.13,50.8,61.2,61.9,62.4,73.6,1.27,-0.24,3.07
table,219809.0,57.25,1.96,49.0,56.0,57.0,58.0,79.0,3.84,0.66,1.04
x,219809.0,5.72,1.11,0.0,4.7,5.7,6.52,10.23,1.24,0.36,-0.78
y,219809.0,5.72,1.11,0.0,4.71,5.72,6.51,58.9,1.23,0.85,23.12
z,219809.0,3.53,0.69,0.0,2.9,3.53,4.03,31.3,0.48,0.65,11.15
price,219809.0,3965.19,4032.64,326.0,949.0,2398.0,5405.0,18818.0,16262215.44,1.61,2.11
is_original,219809.0,0.12,0.32,0.0,0.0,0.0,0.0,1.0,0.11,2.35,3.51


## Outlier Detection

In [14]:
# https://www.kaggle.com/code/lyasdemir/best-algorithm-for-prediction-xgboost
    
def iqr(data:pd.DataFrame, var:str):# outliers detecion .
    q1 = np.quantile(data[var], 0.25)
    q3 = np.quantile(data[var], 0.75)
    diff = q3 - q1
    lower_t = q1 - (1.5 * diff)
    upper_t = q3 + (1.5 * diff)
    return data[(data[var] < lower_t) | (data[var] > upper_t)]

# iqr(train, "squareMeters")

In [15]:
# https://www.kaggle.com/code/sujithmandala/playground-s3-e8-ensemble-model-98-accuracy

def detect_outliers(data:pd.DataFrame) -> pd.DataFrame:
    outlier_percents = {}
    for column in data.columns:
        if data[column].dtype != object:
            q1 = np.quantile(data[column], 0.25)
            q3 = np.quantile(data[column], 0.75)
            iqr = q3 - q1
            upper_bound = q3 + (1.5 * iqr)
            lower_bound = q1 - (1.5 * iqr)
            outliers = data[(data[column] > upper_bound) | (data[column] < lower_bound)][column]
            outlier_percentage = len(outliers) / len(data[column]) * 100
            outlier_percents[column] = outlier_percentage
            outlier_dataframe = pd.DataFrame(data = outlier_percents.values() ,index=outlier_percents.keys() ,columns=['Outlier_percentage'])
    
    return outlier_dataframe.sort_values(by = 'Outlier_percentage', ascending = False)

detect_outliers(train)


Unnamed: 0,Outlier_percentage
is_original,11.93582
price,6.53067
depth,4.58989
carat,3.96799
table,2.54721
z,0.0182
x,0.01456
y,0.01319
id,0.0


In [16]:
# https://www.kaggle.com/code/sujithmandala/playground-s3-e8-ensemble-model-98-accuracy
    
def detect_outliers(data:pd.DataFrame) -> pd.DataFrame:
    outlier_percents = {}
    for column in data.columns:
        if data[column].dtype != object:
            q1 = np.quantile(data[column], 0.25)
            q3 = np.quantile(data[column], 0.75)
            iqr = q3 - q1
            upper_bound = q3 + (1.5 * iqr)
            lower_bound = q1 - (1.5 * iqr)
            outliers = data[(data[column] > upper_bound) | (data[column] < lower_bound)][column]
            outlier_percentage = len(outliers) / len(data[column]) * 100
            outlier_percents[column] = outlier_percentage
            outlier_dataframe = pd.DataFrame(data = outlier_percents.values() ,index=outlier_percents.keys() ,columns=['Outlier_percentage'])
    
    return outlier_dataframe.sort_values(by = 'Outlier_percentage', ascending = False)

detect_outliers(test)


Unnamed: 0,Outlier_percentage
depth,5.06083
carat,3.92096
table,2.30918
z,0.01937
x,0.00697
y,0.00697
id,0.0
is_original,0.0


In [17]:
# iqr(train,"floors")

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Feature Engineering</h1>
</div>

## Categorical/Numerical Variables

In [18]:
# train.drop(['cityCode'], axis=1, inplace=True)
# test.drop(['cityCode'], axis=1, inplace=True)

## Handle Outliers
- https://www.kaggle.com/code/lyasdemir/best-algorithm-for-prediction-xgboost
- https://www.kaggle.com/code/mnokno/paris-housing-price-prediction-using-xgboost

In [19]:
# features_with_outliers = ['attic', 'garage', 'made', 'basement', 'floors', 'cityCode', 'squareMeters']
# features_with_outliers = ['attic', 'garage', 'made', 'basement', 'floors',  'squareMeters']

In [20]:
# https://www.kaggle.com/code/mnokno/paris-housing-price-prediction-using-xgboost

def remove_outliers(df:pd.DataFrame) -> pd.DataFrame:
    for c in features_with_outliers:
        if c == 'garage':
            first_percentile = df[c].quantile(0.001)
            df = df[df[c] > first_percentile]

        ninety_ninth_percentile = df[c].quantile(0.999)
        df = df[df[c] < ninety_ninth_percentile]
        #df_t = df_t[(df_t[c] > first_percentile) & (df_t[c] < ninety_ninth_percentile)]
    return df


In [21]:
# print(f'Before: {len(train)}')
# train = remove_outliers(train)
# print(f'After: {len(train)}')

In [22]:
train.head(10)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price,is_original
0,0.0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619,0
1,1.0,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387,0
2,2.0,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772,0
3,3.0,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666,0
4,4.0,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453,0
5,5.0,1.51,Very Good,J,SI1,62.8,58.0,7.34,7.29,4.59,7506,0
6,6.0,0.74,Ideal,E,VS2,61.8,57.0,5.76,5.79,3.57,3229,0
7,7.0,1.34,Premium,G,SI2,62.5,57.0,7.0,7.05,4.38,6224,0
8,8.0,0.3,Ideal,F,IF,62.0,56.0,4.35,4.37,2.7,886,0
9,9.0,0.3,Good,J,VS1,63.6,57.0,4.26,4.28,2.72,421,0


In [23]:
train = train.reset_index(drop=True).copy()
train.head(10)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price,is_original
0,0.0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619,0
1,1.0,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387,0
2,2.0,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772,0
3,3.0,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666,0
4,4.0,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453,0
5,5.0,1.51,Very Good,J,SI1,62.8,58.0,7.34,7.29,4.59,7506,0
6,6.0,0.74,Ideal,E,VS2,61.8,57.0,5.76,5.79,3.57,3229,0
7,7.0,1.34,Premium,G,SI2,62.5,57.0,7.0,7.05,4.38,6224,0
8,8.0,0.3,Ideal,F,IF,62.0,56.0,4.35,4.37,2.7,886,0
9,9.0,0.3,Good,J,VS1,63.6,57.0,4.26,4.28,2.72,421,0


In [24]:
excluded_features = [TARGET, ID, "fold"]

In [25]:
cont_features, cat_features = feature_distribution_types(train, display=True)
show_cardinality(train, cat_features)

cont_features = [feature for feature in cont_features if feature not in excluded_features]
cat_features = [feature for feature in cat_features if feature not in excluded_features]

FEATURES = cont_features + cat_features
FEATURES

[1m[91mContinuous Features=[0m['id', 'carat', 'depth', 'table', 'x', 'y', 'z', 'price', 'is_original']

[1m[91mCategorical Features=[0m['cut', 'color', 'clarity']
[1m[91m=== Cardinality ===[0m
cut        5
color      7
clarity    8
dtype: int64


['carat',
 'depth',
 'table',
 'x',
 'y',
 'z',
 'is_original',
 'cut',
 'color',
 'clarity']

In [26]:
# train, test = label_encoder(train, test, cat_features)
train = pd.get_dummies(train,columns=['cut','color','clarity']) # Will remove original feature names
test = pd.get_dummies(test,columns=['cut','color','clarity'])

In [27]:
train.head()

Unnamed: 0,id,carat,depth,table,x,y,z,price,is_original,cut_Fair,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.0,1.52,62.2,58.0,7.27,7.33,4.55,13619,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1.0,2.03,62.0,58.0,8.06,8.12,5.05,13387,0,0,...,0,1,0,0,0,1,0,0,0,0
2,2.0,0.7,61.2,57.0,5.69,5.73,3.5,2772,0,0,...,0,0,0,0,0,0,1,0,0,0
3,3.0,0.32,61.6,56.0,4.38,4.41,2.71,666,0,0,...,0,0,0,0,0,0,1,0,0,0
4,4.0,1.7,62.6,59.0,7.65,7.61,4.77,14453,0,0,...,0,0,0,0,0,0,0,1,0,0


In [28]:
cont_features, cat_features = feature_distribution_types(train, display=True)
show_cardinality(train, cat_features)

cont_features = [feature for feature in cont_features if feature not in excluded_features]
cat_features = [feature for feature in cat_features if feature not in excluded_features]

FEATURES = cont_features + cat_features
FEATURES

[1m[91mContinuous Features=[0m['id', 'carat', 'depth', 'table', 'x', 'y', 'z', 'price', 'is_original', 'cut_Fair', 'cut_Good', 'cut_Ideal', 'cut_Premium', 'cut_Very Good', 'color_D', 'color_E', 'color_F', 'color_G', 'color_H', 'color_I', 'color_J', 'clarity_I1', 'clarity_IF', 'clarity_SI1', 'clarity_SI2', 'clarity_VS1', 'clarity_VS2', 'clarity_VVS1', 'clarity_VVS2']

[1m[91mCategorical Features=[0m[]
[1m[91m=== Cardinality ===[0m
Series([], dtype: float64)


['carat',
 'depth',
 'table',
 'x',
 'y',
 'z',
 'is_original',
 'cut_Fair',
 'cut_Good',
 'cut_Ideal',
 'cut_Premium',
 'cut_Very Good',
 'color_D',
 'color_E',
 'color_F',
 'color_G',
 'color_H',
 'color_I',
 'color_J',
 'clarity_I1',
 'clarity_IF',
 'clarity_SI1',
 'clarity_SI2',
 'clarity_VS1',
 'clarity_VS2',
 'clarity_VVS1',
 'clarity_VVS2']

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Optuna Hyperparameter Optimization</h1>
</div>

In [29]:
%%time

if Config.optimize:
    y = train[TARGET]
    X = train[FEATURES].copy()

    X_test = test[FEATURES].copy()
    X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
        X, y, test_size=0.2, random_state=Config.seed
    )

# === XGB ===

time_limit = 3600 * 3
best_xgb_params = {}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction)
    study.optimize(
        lambda trial: objective_xgb(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best XGB trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_xgb_params = study.best_trial.params

## === LGBM ===

time_limit = 3600 * 3
best_lgbm_params = {}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction) # minimize, maximize
    study.optimize(
        lambda trial: objective_lgbm(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best LGBM trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_lgbm_params = study.best_trial.params

## === CatBoost

time_limit = 3600 * 3
# best_cb_params = {}
best_cb_params = {'learning_rate': 0.45743264601999495,
                  'l2_leaf_reg': 41.338946049390074,
                  'bagging_temperature': 0.3472567739474319,
                  'random_strength': 1.7332249677756242, 
                  'depth': 1,
                  'min_data_in_leaf': 6}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction) # minimize, maximize
    study.optimize(
        lambda trial: objective_cb(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best Cat trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_cb_params = study.best_trial.params

[32m[I 2023-03-04 01:05:40,731][0m A new study created in memory with name: no-name-218eca11-998d-4a48-a158-50538e5d5a05[0m
[32m[I 2023-03-04 01:05:44,166][0m Trial 0 finished with value: 796.1069029739548 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 1300, 'learning_rate': 0.010192675436761629, 'subsample': 0.16, 'colsample_bytree': 0.5, 'max_depth': 2, 'gamma': 98.4, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 5.716219870388506e-08, 'reg_alpha': 1.023143367596458e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 14.671316899674583}. Best is trial 0 with value: 796.1069029739548.[0m


Number of boosting rounds: 1299


[32m[I 2023-03-04 01:12:28,546][0m Trial 1 finished with value: 644.1276311370863 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 2500, 'learning_rate': 0.06350593348443016, 'subsample': 0.14, 'colsample_bytree': 0.44, 'max_depth': 15, 'gamma': 71.8, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 1.74286330312337e-06, 'reg_alpha': 0.3000589686447146, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.29155464940541354}. Best is trial 1 with value: 644.1276311370863.[0m


Number of boosting rounds: 161


[32m[I 2023-03-04 01:12:50,732][0m Trial 2 finished with value: 580.1890641552936 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 1200, 'learning_rate': 0.15761589234098655, 'subsample': 0.5, 'colsample_bytree': 0.7000000000000001, 'max_depth': 11, 'gamma': 9.9, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 7.497081232095155e-08, 'reg_alpha': 2.6258981143755554e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 16.755498972218774}. Best is trial 2 with value: 580.1890641552936.[0m


Number of boosting rounds: 39


[32m[I 2023-03-04 01:13:18,598][0m Trial 3 finished with value: 572.6266501131319 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 2500, 'learning_rate': 0.01340517535606625, 'subsample': 0.37, 'colsample_bytree': 1.0, 'max_depth': 8, 'gamma': 18.2, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 9.85790185693839, 'reg_alpha': 4.602886803629867e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.19254719685373928}. Best is trial 3 with value: 572.6266501131319.[0m


Number of boosting rounds: 1168


[32m[I 2023-03-04 01:14:09,167][0m Trial 4 finished with value: 571.3280640878486 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 4700, 'learning_rate': 0.011330294072861033, 'subsample': 0.67, 'colsample_bytree': 0.48, 'max_depth': 8, 'gamma': 27.400000000000002, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.020961233877195097, 'reg_alpha': 6.070135427632566e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 2.0999054906113805}. Best is trial 4 with value: 571.3280640878486.[0m


Number of boosting rounds: 1645


[32m[I 2023-03-04 01:14:36,768][0m Trial 5 finished with value: 579.0063446996438 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 2400, 'learning_rate': 0.04104011798409367, 'subsample': 0.33, 'colsample_bytree': 0.36, 'max_depth': 8, 'gamma': 13.3, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 4.660407304041448e-06, 'reg_alpha': 1.5425258018382004, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.25349208551391655}. Best is trial 4 with value: 571.3280640878486.[0m


Number of boosting rounds: 614


[32m[I 2023-03-04 01:15:09,283][0m Trial 6 finished with value: 578.6516844233299 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 4900, 'learning_rate': 0.05779118311344711, 'subsample': 0.44000000000000006, 'colsample_bytree': 0.5700000000000001, 'max_depth': 9, 'gamma': 38.800000000000004, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 1.5050905378398876, 'reg_alpha': 6.68323414415933e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 75.85421173982239}. Best is trial 4 with value: 571.3280640878486.[0m


Number of boosting rounds: 856


[32m[I 2023-03-04 01:16:08,090][0m Trial 7 finished with value: 600.5669219342004 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 4500, 'learning_rate': 0.08962641131366281, 'subsample': 0.11, 'colsample_bytree': 0.25, 'max_depth': 18, 'gamma': 0.30000000000000004, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 1.5571025959405512e-06, 'reg_alpha': 4.675697349883307e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 36.3993193294072}. Best is trial 4 with value: 571.3280640878486.[0m


Number of boosting rounds: 1856


[32m[I 2023-03-04 01:16:14,395][0m Trial 8 finished with value: 572.0645323119332 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 1000, 'learning_rate': 0.0479352771014703, 'subsample': 0.6799999999999999, 'colsample_bytree': 0.93, 'max_depth': 7, 'gamma': 5.7, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.00016706043058000282, 'reg_alpha': 0.001314062889520936, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 7.690669361918471}. Best is trial 4 with value: 571.3280640878486.[0m


Number of boosting rounds: 278


[32m[I 2023-03-04 01:16:31,143][0m Trial 9 finished with value: 681.5816253853909 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 4000, 'learning_rate': 0.2105153543169889, 'subsample': 0.1, 'colsample_bytree': 0.1, 'max_depth': 18, 'gamma': 2.3000000000000003, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 8.999495660194527e-06, 'reg_alpha': 3.238211988939868e-07, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 172.2889654756934}. Best is trial 4 with value: 571.3280640878486.[0m


Number of boosting rounds: 3898


[32m[I 2023-03-04 01:16:38,274][0m Trial 10 finished with value: 704.8581552666026 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 3700, 'learning_rate': 0.021705949489284372, 'subsample': 0.92, 'colsample_bytree': 0.7300000000000001, 'max_depth': 2, 'gamma': 41.400000000000006, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.03653903075399168, 'reg_alpha': 3.1334959988848126e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 843.8031010849271}. Best is trial 4 with value: 571.3280640878486.[0m


Number of boosting rounds: 3636


[32m[I 2023-03-04 01:16:54,561][0m Trial 11 finished with value: 571.7037198447357 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 3500, 'learning_rate': 0.027059534986503347, 'subsample': 0.75, 'colsample_bytree': 0.8700000000000001, 'max_depth': 6, 'gamma': 30.1, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.0017160570254546954, 'reg_alpha': 0.0015993784827481254, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 2.3324634869128666}. Best is trial 4 with value: 571.3280640878486.[0m


Number of boosting rounds: 618


[32m[I 2023-03-04 01:17:05,614][0m Trial 12 finished with value: 575.12191414343 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 3400, 'learning_rate': 0.02144451744729087, 'subsample': 0.73, 'colsample_bytree': 0.78, 'max_depth': 5, 'gamma': 30.5, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.005241858248790908, 'reg_alpha': 0.002230481079818192, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 2.1297530096026516}. Best is trial 4 with value: 571.3280640878486.[0m


Number of boosting rounds: 1328


[32m[I 2023-03-04 01:24:16,999][0m Trial 13 finished with value: 577.8958332660625 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 5000, 'learning_rate': 0.020620505469259445, 'subsample': 0.87, 'colsample_bytree': 0.63, 'max_depth': 12, 'gamma': 60.400000000000006, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.05555265193933808, 'reg_alpha': 5.8138221233556456e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 1.6519411129024535}. Best is trial 4 with value: 571.3280640878486.[0m


Number of boosting rounds: 282


[32m[I 2023-03-04 01:24:30,225][0m Trial 14 finished with value: 576.7271917284214 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 4000, 'learning_rate': 0.032032553641897527, 'subsample': 0.6799999999999999, 'colsample_bytree': 0.8500000000000001, 'max_depth': 5, 'gamma': 25.5, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.0004802607806889588, 'reg_alpha': 1.1813418467715568e-08, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 1.6887483902450087}. Best is trial 4 with value: 571.3280640878486.[0m


Number of boosting rounds: 880


[32m[I 2023-03-04 01:27:35,503][0m Trial 15 finished with value: 588.604366401228 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 3100, 'learning_rate': 0.010628255911839303, 'subsample': 0.8, 'colsample_bytree': 0.31, 'max_depth': 13, 'gamma': 56.1, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.38615466980245844, 'reg_alpha': 0.05978768865865223, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 4.163958566001932}. Best is trial 4 with value: 571.3280640878486.[0m


Number of boosting rounds: 1780


[32m[I 2023-03-04 01:27:44,333][0m Trial 16 finished with value: 944.6270334359572 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 4300, 'learning_rate': 0.015058657102194856, 'subsample': 1.0, 'colsample_bytree': 0.05, 'max_depth': 5, 'gamma': 45.6, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.0030132612837331196, 'reg_alpha': 31.52055941457278, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.5054828785242285}. Best is trial 4 with value: 571.3280640878486.[0m


Number of boosting rounds: 4032


[32m[I 2023-03-04 01:28:50,178][0m Trial 17 finished with value: 591.9821006584508 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 1900, 'learning_rate': 0.028434065576123834, 'subsample': 0.6, 'colsample_bytree': 0.2, 'max_depth': 15, 'gamma': 26.6, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 75.84878266699955, 'reg_alpha': 1.1095586087095132e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.8157109555081014}. Best is trial 4 with value: 571.3280640878486.[0m


Number of boosting rounds: 1899


[32m[I 2023-03-04 01:28:56,587][0m Trial 18 finished with value: 599.5407716716901 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 3100, 'learning_rate': 0.016275744726244285, 'subsample': 0.55, 'colsample_bytree': 0.43, 'max_depth': 3, 'gamma': 71.4, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.0003660021569982357, 'reg_alpha': 0.00047102461508323777, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 5.3704599060922495}. Best is trial 4 with value: 571.3280640878486.[0m


Number of boosting rounds: 3099


[32m[I 2023-03-04 01:31:59,009][0m Trial 19 finished with value: 573.7575861806197 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 4600, 'learning_rate': 0.031562023408253516, 'subsample': 0.76, 'colsample_bytree': 0.6000000000000001, 'max_depth': 10, 'gamma': 32.300000000000004, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.010597278696956213, 'reg_alpha': 0.00690003005809632, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.1309379239351348}. Best is trial 4 with value: 571.3280640878486.[0m


Number of boosting rounds: 218


[32m[I 2023-03-04 01:32:16,067][0m Trial 20 finished with value: 576.4541212826646 and parameters: {'eval_metric': 'mae', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 3600, 'learning_rate': 0.014964732841292587, 'subsample': 0.62, 'colsample_bytree': 0.81, 'max_depth': 6, 'gamma': 19.900000000000002, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.3192361778882011, 'reg_alpha': 2.133693657153262e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.7522030967908986}. Best is trial 4 with value: 571.3280640878486.[0m


Number of boosting rounds: 3264


[32m[I 2023-03-04 01:32:29,294][0m Trial 21 finished with value: 570.3259093412167 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 2000, 'learning_rate': 0.046801915030475295, 'subsample': 0.83, 'colsample_bytree': 0.9600000000000001, 'max_depth': 7, 'gamma': 8.4, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.00021289869690001217, 'reg_alpha': 0.0004437460656500907, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 6.6748325211183674}. Best is trial 21 with value: 570.3259093412167.[0m


Number of boosting rounds: 331


[32m[I 2023-03-04 01:32:34,227][0m Trial 22 finished with value: 578.3127081355254 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 1900, 'learning_rate': 0.04124639405130822, 'subsample': 0.86, 'colsample_bytree': 0.91, 'max_depth': 4, 'gamma': 15.600000000000001, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 7.479768973108583e-05, 'reg_alpha': 0.00014410206869587634, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 3.6066655926735858}. Best is trial 21 with value: 570.3259093412167.[0m


Number of boosting rounds: 1556


[32m[I 2023-03-04 01:32:45,967][0m Trial 23 finished with value: 571.1607373608459 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 1900, 'learning_rate': 0.02405605555627122, 'subsample': 0.8099999999999999, 'colsample_bytree': 0.9600000000000001, 'max_depth': 7, 'gamma': 35.1, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.0017032215735689512, 'reg_alpha': 0.008830147711153785, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 9.448219736033545}. Best is trial 21 with value: 570.3259093412167.[0m


Number of boosting rounds: 592


[32m[I 2023-03-04 01:33:08,130][0m Trial 24 finished with value: 571.8710815576334 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 1800, 'learning_rate': 0.019320447118659614, 'subsample': 0.97, 'colsample_bytree': 0.9800000000000001, 'max_depth': 9, 'gamma': 52.6, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.015737653271004088, 'reg_alpha': 0.011671537482800799, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 9.481231253571321}. Best is trial 21 with value: 570.3259093412167.[0m


Number of boosting rounds: 421


[32m[I 2023-03-04 01:33:18,116][0m Trial 25 finished with value: 569.7688783476295 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 1600, 'learning_rate': 0.0115319869620309, 'subsample': 0.82, 'colsample_bytree': 0.67, 'max_depth': 7, 'gamma': 39.6, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 5.4339741841307475e-05, 'reg_alpha': 1.5877825423590973e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 24.999755489084745}. Best is trial 25 with value: 569.7688783476295.[0m


Number of boosting rounds: 1375


[32m[I 2023-03-04 01:33:20,735][0m Trial 26 finished with value: 944.1434879644955 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 1600, 'learning_rate': 0.01751225154413802, 'subsample': 0.84, 'colsample_bytree': 0.67, 'max_depth': 1, 'gamma': 38.900000000000006, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 9.279181018321862e-05, 'reg_alpha': 0.00019431599406311077, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 24.04083443371856}. Best is trial 25 with value: 569.7688783476295.[0m


Number of boosting rounds: 1599


[32m[I 2023-03-04 01:34:05,466][0m Trial 27 finished with value: 570.2573971529747 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 2200, 'learning_rate': 0.012795878154384226, 'subsample': 0.9, 'colsample_bytree': 0.8, 'max_depth': 10, 'gamma': 63.900000000000006, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.0012192180374516906, 'reg_alpha': 2.0360955500860076e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 6.661200532113394}. Best is trial 25 with value: 569.7688783476295.[0m


Number of boosting rounds: 528


[32m[I 2023-03-04 01:35:15,107][0m Trial 28 finished with value: 571.5006683908173 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 2200, 'learning_rate': 0.012009201360488986, 'subsample': 0.91, 'colsample_bytree': 0.76, 'max_depth': 14, 'gamma': 83.4, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 4.39710682638045e-05, 'reg_alpha': 2.152929219098576e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 40.736804043115335}. Best is trial 25 with value: 569.7688783476295.[0m


Number of boosting rounds: 673


[32m[I 2023-03-04 01:35:42,884][0m Trial 29 finished with value: 571.1995838221941 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 1500, 'learning_rate': 0.010199542572807814, 'subsample': 0.25, 'colsample_bytree': 0.8200000000000001, 'max_depth': 11, 'gamma': 95.5, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.0004150361747157328, 'reg_alpha': 6.283637290578446e-06, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 14.734508181977263}. Best is trial 25 with value: 569.7688783476295.[0m
[32m[I 2023-03-04 01:35:42,897][0m A new study created in memory with name: no-name-fd1ffd6f-4650-45c3-9961-c8676ae708d8[0m


Number of boosting rounds: 814
Number of finished trials: 30
Best XGB trial parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 1600, 'learning_rate': 0.0115319869620309, 'subsample': 0.82, 'colsample_bytree': 0.67, 'max_depth': 7, 'gamma': 39.6, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 5.4339741841307475e-05, 'reg_alpha': 1.5877825423590973e-05, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 24.999755489084745}
Best score: 569.7688783476295
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 265.221	valid_1's l1: 301.357
Early stopping, best iteration is:
[472]	training's l1: 266.111	valid_1's l1: 301.159


[32m[I 2023-03-04 01:36:46,635][0m Trial 0 finished with value: 596.3034898786556 and parameters: {'objective': 'mae', 'n_estimators': 2766, 'reg_alpha': 5.395256704985744e-05, 'reg_lambda': 1.3974425967661749e-07, 'colsample_bytree': 0.25, 'num_leaves': 592, 'feature_fraction': 0.28610602886959724, 'bagging_fraction': 0.5501180965491341, 'bagging_freq': 4, 'min_child_samples': 201, 'subsample': 0.39, 'learning_rate': 0.2317771631190606, 'max_depth': 77, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 596.3034898786556.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 202.45	training's rmse: 358.091	valid_1's l1: 305.061	valid_1's rmse: 598.907
Early stopping, best iteration is:
[138]	training's l1: 257.807	training's rmse: 491.516	valid_1's l1: 295.981	valid_1's rmse: 576.689


[32m[I 2023-03-04 01:38:11,125][0m Trial 1 finished with value: 576.6893812081871 and parameters: {'objective': 'rmse', 'n_estimators': 1040, 'reg_alpha': 1.3114969896569153, 'reg_lambda': 3.1495489023989642, 'colsample_bytree': 0.81, 'num_leaves': 828, 'feature_fraction': 0.7232287996191128, 'bagging_fraction': 0.607193159328994, 'bagging_freq': 8, 'min_child_samples': 21, 'subsample': 0.7, 'learning_rate': 0.044006573880141855, 'max_depth': 69, 'random_state': 42, 'n_jobs': 4}. Best is trial 1 with value: 576.6893812081871.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 310.564	valid_1's l1: 318.579
[1000]	training's l1: 302.08	valid_1's l1: 312.783
[1500]	training's l1: 297.598	valid_1's l1: 310.522
[2000]	training's l1: 294.28	valid_1's l1: 309.21
[2500]	training's l1: 291.684	valid_1's l1: 308.219
[3000]	training's l1: 289.493	valid_1's l1: 307.657
[3500]	training's l1: 287.695	valid_1's l1: 307.003
[4000]	training's l1: 286.148	valid_1's l1: 306.641
Did not meet early stopping. Best iteration is:
[4302]	training's l1: 285.29	valid_1's l1: 306.386


[32m[I 2023-03-04 01:40:57,522][0m Trial 2 finished with value: 645.0155041532822 and parameters: {'objective': 'mae', 'n_estimators': 4302, 'reg_alpha': 7.658147119375426e-08, 'reg_lambda': 0.0021468556201437615, 'colsample_bytree': 0.05, 'num_leaves': 499, 'feature_fraction': 0.3270326271878232, 'bagging_fraction': 0.31501466611754386, 'bagging_freq': 9, 'min_child_samples': 295, 'subsample': 0.45999999999999996, 'learning_rate': 0.03420776146814252, 'max_depth': 13, 'random_state': 42, 'n_jobs': 4}. Best is trial 1 with value: 576.6893812081871.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 364.787	training's rmse: 667.831	valid_1's l1: 368.581	valid_1's rmse: 672.672
[1000]	training's l1: 338.361	training's rmse: 620.318	valid_1's l1: 343.259	valid_1's rmse: 625.966
[1500]	training's l1: 330.128	training's rmse: 607.528	valid_1's l1: 335.764	valid_1's rmse: 615.037
[2000]	training's l1: 325.737	training's rmse: 600.119	valid_1's l1: 331.916	valid_1's rmse: 609.382
[2500]	training's l1: 321.814	training's rmse: 594.263	valid_1's l1: 328.56	valid_1's rmse: 605.31
[3000]	training's l1: 319.988	training's rmse: 590.858	valid_1's l1: 327.121	valid_1's rmse: 603.226
Did not meet early stopping. Best iteration is:
[3031]	training's l1: 319.891	training's rmse: 590.704	valid_1's l1: 326.993	valid_1's rmse: 602.987


[32m[I 2023-03-04 01:41:30,965][0m Trial 3 finished with value: 602.9873906639231 and parameters: {'objective': 'rmse', 'n_estimators': 3031, 'reg_alpha': 8.188830967631413e-05, 'reg_lambda': 2.427487903379938e-08, 'colsample_bytree': 0.56, 'num_leaves': 835, 'feature_fraction': 0.2836942617219703, 'bagging_fraction': 0.6018156851293321, 'bagging_freq': 10, 'min_child_samples': 36, 'subsample': 0.84, 'learning_rate': 0.03326803810750478, 'max_depth': 3, 'random_state': 42, 'n_jobs': 4}. Best is trial 1 with value: 576.6893812081871.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 293.962	valid_1's l1: 313.971
[1000]	training's l1: 284.325	valid_1's l1: 312.491
Did not meet early stopping. Best iteration is:
[1129]	training's l1: 282.625	valid_1's l1: 312.16


[32m[I 2023-03-04 01:42:22,468][0m Trial 4 finished with value: 648.3510956034809 and parameters: {'objective': 'mae', 'n_estimators': 1129, 'reg_alpha': 8.115284156242469e-06, 'reg_lambda': 8.557374856595793e-07, 'colsample_bytree': 0.32, 'num_leaves': 547, 'feature_fraction': 0.345230582701497, 'bagging_fraction': 0.25092605044110056, 'bagging_freq': 14, 'min_child_samples': 233, 'subsample': 0.71, 'learning_rate': 0.1807420003920474, 'max_depth': 56, 'random_state': 42, 'n_jobs': 4}. Best is trial 1 with value: 576.6893812081871.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 280.415	valid_1's l1: 299.671
[1000]	training's l1: 271.408	valid_1's l1: 299.767
Early stopping, best iteration is:
[646]	training's l1: 277.316	valid_1's l1: 299.182


[32m[I 2023-03-04 01:43:28,903][0m Trial 5 finished with value: 604.9703735289363 and parameters: {'objective': 'mae', 'n_estimators': 4375, 'reg_alpha': 0.22498431223180665, 'reg_lambda': 0.09590720520294525, 'colsample_bytree': 0.8200000000000001, 'num_leaves': 610, 'feature_fraction': 0.7457850113470852, 'bagging_fraction': 0.26999045782476033, 'bagging_freq': 15, 'min_child_samples': 177, 'subsample': 0.21000000000000002, 'learning_rate': 0.0695057308278039, 'max_depth': 89, 'random_state': 42, 'n_jobs': 4}. Best is trial 1 with value: 576.6893812081871.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 278.752	valid_1's l1: 292.423
[1000]	training's l1: 270.401	valid_1's l1: 291.414
[1500]	training's l1: 264.95	valid_1's l1: 291.466
Early stopping, best iteration is:
[1090]	training's l1: 269.291	valid_1's l1: 291.403


[32m[I 2023-03-04 01:44:34,425][0m Trial 6 finished with value: 585.1620835985083 and parameters: {'objective': 'mae', 'n_estimators': 3982, 'reg_alpha': 0.1427583617085644, 'reg_lambda': 4.1924061141701735e-05, 'colsample_bytree': 0.31, 'num_leaves': 108, 'feature_fraction': 0.8806250578834626, 'bagging_fraction': 0.32077603000470284, 'bagging_freq': 6, 'min_child_samples': 84, 'subsample': 0.8, 'learning_rate': 0.03823092395824644, 'max_depth': 17, 'random_state': 42, 'n_jobs': 4}. Best is trial 1 with value: 576.6893812081871.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 315.174	valid_1's l1: 326.143
[1000]	training's l1: 288.095	valid_1's l1: 305.336
[1500]	training's l1: 277.689	valid_1's l1: 299.676
Did not meet early stopping. Best iteration is:
[1696]	training's l1: 275.012	valid_1's l1: 298.568


[32m[I 2023-03-04 01:45:32,009][0m Trial 7 finished with value: 589.9164445216592 and parameters: {'objective': 'mae', 'n_estimators': 1696, 'reg_alpha': 5.991405626284127e-07, 'reg_lambda': 0.34187526991630357, 'colsample_bytree': 0.6200000000000001, 'num_leaves': 126, 'feature_fraction': 0.20088587640961325, 'bagging_fraction': 0.34159739837031405, 'bagging_freq': 8, 'min_child_samples': 23, 'subsample': 0.51, 'learning_rate': 0.03923472516817343, 'max_depth': 51, 'random_state': 42, 'n_jobs': 4}. Best is trial 1 with value: 576.6893812081871.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 303.422	training's rmse: 620.799	valid_1's l1: 313.581	valid_1's rmse: 646.302
[1000]	training's l1: 294.788	training's rmse: 600.732	valid_1's l1: 309.049	valid_1's rmse: 632.081
[1500]	training's l1: 290.149	training's rmse: 590.255	valid_1's l1: 307.026	valid_1's rmse: 627.001
[2000]	training's l1: 287.063	training's rmse: 582.573	valid_1's l1: 306.444	valid_1's rmse: 623.567
Did not meet early stopping. Best iteration is:
[2430]	training's l1: 284.759	training's rmse: 577.278	valid_1's l1: 305.75	valid_1's rmse: 621.127


[32m[I 2023-03-04 01:47:55,157][0m Trial 8 finished with value: 621.1270777318113 and parameters: {'objective': 'rmse', 'n_estimators': 2430, 'reg_alpha': 3.54353823261153e-07, 'reg_lambda': 2.2567771726316558e-08, 'colsample_bytree': 0.9500000000000001, 'num_leaves': 365, 'feature_fraction': 0.7043225761014664, 'bagging_fraction': 0.2547151440116959, 'bagging_freq': 13, 'min_child_samples': 218, 'subsample': 0.79, 'learning_rate': 0.024326082555490316, 'max_depth': 49, 'random_state': 42, 'n_jobs': 4}. Best is trial 1 with value: 576.6893812081871.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 274.352	valid_1's l1: 296.91
[1000]	training's l1: 264.971	valid_1's l1: 296.436
[1500]	training's l1: 259.109	valid_1's l1: 296.246
Early stopping, best iteration is:
[1160]	training's l1: 262.913	valid_1's l1: 296.104


[32m[I 2023-03-04 01:49:42,748][0m Trial 9 finished with value: 587.9605228484361 and parameters: {'objective': 'mae', 'n_estimators': 3770, 'reg_alpha': 4.344958191821581e-08, 'reg_lambda': 1.7002387442925264e-08, 'colsample_bytree': 0.68, 'num_leaves': 286, 'feature_fraction': 0.36892154939039834, 'bagging_fraction': 0.5219115262092002, 'bagging_freq': 15, 'min_child_samples': 268, 'subsample': 0.49, 'learning_rate': 0.09314598543869866, 'max_depth': 34, 'random_state': 42, 'n_jobs': 4}. Best is trial 1 with value: 576.6893812081871.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 277.651	training's rmse: 548.509	valid_1's l1: 295.458	valid_1's rmse: 579.469
Did not meet early stopping. Best iteration is:
[733]	training's l1: 266.582	training's rmse: 528.722	valid_1's l1: 292.748	valid_1's rmse: 575.32


[32m[I 2023-03-04 01:51:17,353][0m Trial 10 finished with value: 575.3197743378006 and parameters: {'objective': 'rmse', 'n_estimators': 733, 'reg_alpha': 9.157803983309481, 'reg_lambda': 6.878233047132574, 'colsample_bytree': 0.89, 'num_leaves': 920, 'feature_fraction': 0.9759946167464473, 'bagging_fraction': 0.8980801447886753, 'bagging_freq': 1, 'min_child_samples': 115, 'subsample': 1.0, 'learning_rate': 0.011149981112974154, 'max_depth': 98, 'random_state': 42, 'n_jobs': 4}. Best is trial 10 with value: 575.3197743378006.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 274.492	training's rmse: 541.922	valid_1's l1: 294.524	valid_1's rmse: 576.948
Did not meet early stopping. Best iteration is:
[782]	training's l1: 262.023	training's rmse: 517.791	valid_1's l1: 292.878	valid_1's rmse: 574.283


[32m[I 2023-03-04 01:52:56,754][0m Trial 11 finished with value: 574.2831235729169 and parameters: {'objective': 'rmse', 'n_estimators': 782, 'reg_alpha': 4.247208018108193, 'reg_lambda': 8.6266044994449, 'colsample_bytree': 1.0, 'num_leaves': 902, 'feature_fraction': 0.9309726404004741, 'bagging_fraction': 0.9612480061599719, 'bagging_freq': 0, 'min_child_samples': 110, 'subsample': 0.97, 'learning_rate': 0.011711710790610974, 'max_depth': 100, 'random_state': 42, 'n_jobs': 4}. Best is trial 11 with value: 574.2831235729169.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 274.488	training's rmse: 543.052	valid_1's l1: 294.681	valid_1's rmse: 578.21
Did not meet early stopping. Best iteration is:
[788]	training's l1: 262.554	training's rmse: 519.651	valid_1's l1: 293.278	valid_1's rmse: 575.477


[32m[I 2023-03-04 01:54:40,924][0m Trial 12 finished with value: 575.47652575819 and parameters: {'objective': 'rmse', 'n_estimators': 788, 'reg_alpha': 9.831747030647639, 'reg_lambda': 9.144186408839177, 'colsample_bytree': 0.9900000000000001, 'num_leaves': 995, 'feature_fraction': 0.982927952282748, 'bagging_fraction': 0.9826810481354743, 'bagging_freq': 0, 'min_child_samples': 117, 'subsample': 0.99, 'learning_rate': 0.012037198410640878, 'max_depth': 97, 'random_state': 42, 'n_jobs': 4}. Best is trial 11 with value: 574.2831235729169.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 274.314	training's rmse: 543.23	valid_1's l1: 295.235	valid_1's rmse: 580.327
[1000]	training's l1: 258.371	training's rmse: 511.417	valid_1's l1: 293.543	valid_1's rmse: 576.068
Early stopping, best iteration is:
[758]	training's l1: 263.354	training's rmse: 522.999	valid_1's l1: 293.172	valid_1's rmse: 576.59


[32m[I 2023-03-04 01:57:37,300][0m Trial 13 finished with value: 576.5899572879622 and parameters: {'objective': 'rmse', 'n_estimators': 1716, 'reg_alpha': 0.008462270684734216, 'reg_lambda': 0.0693023351451964, 'colsample_bytree': 0.8400000000000001, 'num_leaves': 980, 'feature_fraction': 0.977566918066417, 'bagging_fraction': 0.9938549548024389, 'bagging_freq': 0, 'min_child_samples': 128, 'subsample': 0.96, 'learning_rate': 0.01059005277784895, 'max_depth': 100, 'random_state': 42, 'n_jobs': 4}. Best is trial 11 with value: 574.2831235729169.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 260.254	training's rmse: 510.246	valid_1's l1: 293.376	valid_1's rmse: 573.839
Early stopping, best iteration is:
[474]	training's l1: 261.771	training's rmse: 513.499	valid_1's l1: 293.32	valid_1's rmse: 573.675


[32m[I 2023-03-04 02:00:08,311][0m Trial 14 finished with value: 573.6754876864577 and parameters: {'objective': 'rmse', 'n_estimators': 1713, 'reg_alpha': 9.82041624780192, 'reg_lambda': 7.8038867171029285, 'colsample_bytree': 0.9800000000000001, 'num_leaves': 771, 'feature_fraction': 0.8547317400707262, 'bagging_fraction': 0.8345549766614588, 'bagging_freq': 3, 'min_child_samples': 72, 'subsample': 0.95, 'learning_rate': 0.016936789420339493, 'max_depth': 82, 'random_state': 42, 'n_jobs': 4}. Best is trial 14 with value: 573.6754876864577.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 257.301	training's rmse: 503.078	valid_1's l1: 293.955	valid_1's rmse: 575.914
Early stopping, best iteration is:
[456]	training's l1: 259.673	training's rmse: 508.195	valid_1's l1: 293.837	valid_1's rmse: 575.807


[32m[I 2023-03-04 02:02:26,867][0m Trial 15 finished with value: 575.8070574300598 and parameters: {'objective': 'rmse', 'n_estimators': 1684, 'reg_alpha': 0.005238286561526679, 'reg_lambda': 0.005338717174137222, 'colsample_bytree': 0.7100000000000001, 'num_leaves': 728, 'feature_fraction': 0.585250935662304, 'bagging_fraction': 0.8127256887016184, 'bagging_freq': 3, 'min_child_samples': 76, 'subsample': 0.67, 'learning_rate': 0.018823691336717234, 'max_depth': 78, 'random_state': 42, 'n_jobs': 4}. Best is trial 14 with value: 573.6754876864577.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 256.945	training's rmse: 503.642	valid_1's l1: 293.332	valid_1's rmse: 574.438
Early stopping, best iteration is:
[390]	training's l1: 263.619	training's rmse: 517.361	valid_1's l1: 293.231	valid_1's rmse: 573.764


[32m[I 2023-03-04 02:04:43,211][0m Trial 16 finished with value: 573.7644271978301 and parameters: {'objective': 'rmse', 'n_estimators': 2140, 'reg_alpha': 0.021408366225093303, 'reg_lambda': 0.41888955564835023, 'colsample_bytree': 0.46, 'num_leaves': 751, 'feature_fraction': 0.8374978082938711, 'bagging_fraction': 0.7732837403277855, 'bagging_freq': 3, 'min_child_samples': 73, 'subsample': 0.85, 'learning_rate': 0.016908769940826872, 'max_depth': 83, 'random_state': 42, 'n_jobs': 4}. Best is trial 14 with value: 573.6754876864577.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 252.621	training's rmse: 490.498	valid_1's l1: 294.358	valid_1's rmse: 575.612
Early stopping, best iteration is:
[330]	training's l1: 264.324	training's rmse: 515.253	valid_1's l1: 294.141	valid_1's rmse: 573.748


[32m[I 2023-03-04 02:06:42,886][0m Trial 17 finished with value: 573.7482788547743 and parameters: {'objective': 'rmse', 'n_estimators': 2268, 'reg_alpha': 0.006900462471471935, 'reg_lambda': 0.45787340677107047, 'colsample_bytree': 0.44, 'num_leaves': 715, 'feature_fraction': 0.851469838004477, 'bagging_fraction': 0.7706529018741977, 'bagging_freq': 5, 'min_child_samples': 62, 'subsample': 0.88, 'learning_rate': 0.018327465771112925, 'max_depth': 64, 'random_state': 42, 'n_jobs': 4}. Best is trial 14 with value: 573.6754876864577.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 268.714	training's rmse: 532.941	valid_1's l1: 293.542	valid_1's rmse: 576.421
Early stopping, best iteration is:
[452]	training's l1: 270.254	training's rmse: 536.187	valid_1's l1: 293.379	valid_1's rmse: 576.222


[32m[I 2023-03-04 02:08:53,328][0m Trial 18 finished with value: 576.2219138704613 and parameters: {'objective': 'rmse', 'n_estimators': 3340, 'reg_alpha': 0.005037875324922371, 'reg_lambda': 0.018183775616518173, 'colsample_bytree': 0.45, 'num_leaves': 680, 'feature_fraction': 0.8309414446309811, 'bagging_fraction': 0.7519238046224879, 'bagging_freq': 5, 'min_child_samples': 160, 'subsample': 0.62, 'learning_rate': 0.023572703953046833, 'max_depth': 64, 'random_state': 42, 'n_jobs': 4}. Best is trial 14 with value: 573.6754876864577.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 263.617	training's rmse: 507.097	valid_1's l1: 294.447	valid_1's rmse: 575.628
Early stopping, best iteration is:
[484]	training's l1: 264.39	training's rmse: 508.892	valid_1's l1: 294.429	valid_1's rmse: 575.529


[32m[I 2023-03-04 02:10:27,407][0m Trial 19 finished with value: 575.5285283572094 and parameters: {'objective': 'rmse', 'n_estimators': 2284, 'reg_alpha': 0.0005910325961361616, 'reg_lambda': 0.000817746100530217, 'colsample_bytree': 0.14, 'num_leaves': 469, 'feature_fraction': 0.5917872984524211, 'bagging_fraction': 0.7044984186677121, 'bagging_freq': 11, 'min_child_samples': 57, 'subsample': 0.32, 'learning_rate': 0.016560614524725476, 'max_depth': 39, 'random_state': 42, 'n_jobs': 4}. Best is trial 14 with value: 573.6754876864577.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 242.486	training's rmse: 428.046	valid_1's l1: 296.848	valid_1's rmse: 579.016
Early stopping, best iteration is:
[342]	training's l1: 255.384	training's rmse: 459.178	valid_1's l1: 297.437	valid_1's rmse: 577.475


[32m[I 2023-03-04 02:11:22,004][0m Trial 20 finished with value: 577.4754205135721 and parameters: {'objective': 'rmse', 'n_estimators': 1476, 'reg_alpha': 0.32739011883729013, 'reg_lambda': 0.8717640810662816, 'colsample_bytree': 0.43, 'num_leaves': 369, 'feature_fraction': 0.46795439829600016, 'bagging_fraction': 0.8665447941206629, 'bagging_freq': 6, 'min_child_samples': 7, 'subsample': 0.9, 'learning_rate': 0.025457025235842912, 'max_depth': 66, 'random_state': 42, 'n_jobs': 4}. Best is trial 14 with value: 573.6754876864577.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 254.627	training's rmse: 494.206	valid_1's l1: 293.799	valid_1's rmse: 574.333
Early stopping, best iteration is:
[413]	training's l1: 260.167	training's rmse: 506.089	valid_1's l1: 293.699	valid_1's rmse: 573.724


[32m[I 2023-03-04 02:13:35,046][0m Trial 21 finished with value: 573.7238997158819 and parameters: {'objective': 'rmse', 'n_estimators': 2143, 'reg_alpha': 0.05401728141014755, 'reg_lambda': 0.3236086182648126, 'colsample_bytree': 0.46, 'num_leaves': 723, 'feature_fraction': 0.8297753376861396, 'bagging_fraction': 0.7815432634677997, 'bagging_freq': 3, 'min_child_samples': 60, 'subsample': 0.87, 'learning_rate': 0.016203066373922846, 'max_depth': 83, 'random_state': 42, 'n_jobs': 4}. Best is trial 14 with value: 573.6754876864577.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 251.585	training's rmse: 479.744	valid_1's l1: 293.943	valid_1's rmse: 573.389
Early stopping, best iteration is:
[377]	training's l1: 261.456	training's rmse: 501.649	valid_1's l1: 294.45	valid_1's rmse: 572.492


[32m[I 2023-03-04 02:15:36,693][0m Trial 22 finished with value: 572.4916766502827 and parameters: {'objective': 'rmse', 'n_estimators': 2065, 'reg_alpha': 0.04932348840900192, 'reg_lambda': 0.9111908559905684, 'colsample_bytree': 0.53, 'num_leaves': 686, 'feature_fraction': 0.8053458602336159, 'bagging_fraction': 0.8742150361805683, 'bagging_freq': 3, 'min_child_samples': 45, 'subsample': 0.9, 'learning_rate': 0.015052103642902402, 'max_depth': 84, 'random_state': 42, 'n_jobs': 4}. Best is trial 22 with value: 572.4916766502827.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 253.641	training's rmse: 490.515	valid_1's l1: 293.726	valid_1's rmse: 573.283
Early stopping, best iteration is:
[472]	training's l1: 255.662	training's rmse: 494.91	valid_1's l1: 293.761	valid_1's rmse: 573.12


[32m[I 2023-03-04 02:18:09,853][0m Trial 23 finished with value: 573.1195857230181 and parameters: {'objective': 'rmse', 'n_estimators': 4909, 'reg_alpha': 0.9058923653781004, 'reg_lambda': 1.2131094434190008, 'colsample_bytree': 0.53, 'num_leaves': 818, 'feature_fraction': 0.7506988089521185, 'bagging_fraction': 0.8621764181759819, 'bagging_freq': 2, 'min_child_samples': 51, 'subsample': 0.61, 'learning_rate': 0.014664170968324149, 'max_depth': 88, 'random_state': 42, 'n_jobs': 4}. Best is trial 22 with value: 572.4916766502827.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 252.988	training's rmse: 484.194	valid_1's l1: 293.906	valid_1's rmse: 573.842
Early stopping, best iteration is:
[478]	training's l1: 254.788	training's rmse: 488.288	valid_1's l1: 293.971	valid_1's rmse: 573.616


[32m[I 2023-03-04 02:20:34,831][0m Trial 24 finished with value: 573.6159579480609 and parameters: {'objective': 'rmse', 'n_estimators': 4924, 'reg_alpha': 1.3195587374078568, 'reg_lambda': 1.6378677034521785, 'colsample_bytree': 0.7300000000000001, 'num_leaves': 812, 'feature_fraction': 0.6673668298531263, 'bagging_fraction': 0.898173944535147, 'bagging_freq': 2, 'min_child_samples': 39, 'subsample': 0.1, 'learning_rate': 0.013681407246365322, 'max_depth': 87, 'random_state': 42, 'n_jobs': 4}. Best is trial 22 with value: 572.4916766502827.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 237.35	training's rmse: 404.107	valid_1's l1: 299.131	valid_1's rmse: 582.526
[1000]	training's l1: 202.031	training's rmse: 331.811	valid_1's l1: 296.437	valid_1's rmse: 584.867
Early stopping, best iteration is:
[559]	training's l1: 231.035	training's rmse: 391.876	valid_1's l1: 297.622	valid_1's rmse: 582.162


[32m[I 2023-03-04 02:22:50,000][0m Trial 25 finished with value: 582.1624959078105 and parameters: {'objective': 'rmse', 'n_estimators': 4918, 'reg_alpha': 1.9706249381984708, 'reg_lambda': 0.03398856098263066, 'colsample_bytree': 0.56, 'num_leaves': 838, 'feature_fraction': 0.6701795872764162, 'bagging_fraction': 0.9245609303464793, 'bagging_freq': 2, 'min_child_samples': 3, 'subsample': 0.2, 'learning_rate': 0.010125481228407838, 'max_depth': 89, 'random_state': 42, 'n_jobs': 4}. Best is trial 22 with value: 572.4916766502827.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 256.044	training's rmse: 488.289	valid_1's l1: 293.806	valid_1's rmse: 572.729
Early stopping, best iteration is:
[474]	training's l1: 257.928	training's rmse: 492.547	valid_1's l1: 293.917	valid_1's rmse: 572.45


[32m[I 2023-03-04 02:24:58,886][0m Trial 26 finished with value: 572.4497758136298 and parameters: {'objective': 'rmse', 'n_estimators': 4990, 'reg_alpha': 0.784546444819696, 'reg_lambda': 1.3339706043405715, 'colsample_bytree': 0.7100000000000001, 'num_leaves': 647, 'feature_fraction': 0.764455367408114, 'bagging_fraction': 0.8886513001825234, 'bagging_freq': 2, 'min_child_samples': 43, 'subsample': 0.12000000000000001, 'learning_rate': 0.01361274328633273, 'max_depth': 91, 'random_state': 42, 'n_jobs': 4}. Best is trial 26 with value: 572.4497758136298.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 270.32	training's rmse: 531.976	valid_1's l1: 293.509	valid_1's rmse: 574.357
[1000]	training's l1: 253.931	training's rmse: 497.229	valid_1's l1: 294.118	valid_1's rmse: 575.256
Early stopping, best iteration is:
[615]	training's l1: 265.147	training's rmse: 521.605	valid_1's l1: 292.756	valid_1's rmse: 573.009


[32m[I 2023-03-04 02:27:02,311][0m Trial 27 finished with value: 573.009209369839 and parameters: {'objective': 'rmse', 'n_estimators': 4581, 'reg_alpha': 0.4946400546492129, 'reg_lambda': 2.0107034310483107, 'colsample_bytree': 0.63, 'num_leaves': 639, 'feature_fraction': 0.7675937236189154, 'bagging_fraction': 0.8495192675853548, 'bagging_freq': 1, 'min_child_samples': 96, 'subsample': 0.56, 'learning_rate': 0.013102722135990253, 'max_depth': 72, 'random_state': 42, 'n_jobs': 4}. Best is trial 26 with value: 572.4497758136298.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 266.021	training's rmse: 522.384	valid_1's l1: 292.888	valid_1's rmse: 573.863
[1000]	training's l1: 250.665	training's rmse: 488.534	valid_1's l1: 294.206	valid_1's rmse: 576.267
Early stopping, best iteration is:
[589]	training's l1: 262.101	training's rmse: 514.316	valid_1's l1: 292.634	valid_1's rmse: 573.483


[32m[I 2023-03-04 02:29:04,590][0m Trial 28 finished with value: 573.4830465778551 and parameters: {'objective': 'rmse', 'n_estimators': 4527, 'reg_alpha': 0.07008101717033619, 'reg_lambda': 0.12287355705166156, 'colsample_bytree': 0.63, 'num_leaves': 648, 'feature_fraction': 0.7819639099452385, 'bagging_fraction': 0.935076590504363, 'bagging_freq': 1, 'min_child_samples': 101, 'subsample': 0.33999999999999997, 'learning_rate': 0.013863053637313024, 'max_depth': 75, 'random_state': 42, 'n_jobs': 4}. Best is trial 26 with value: 572.4497758136298.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 270.46	training's rmse: 536.258	valid_1's l1: 293.8	valid_1's rmse: 577.429
Early stopping, best iteration is:
[452]	training's l1: 272.135	training's rmse: 539.703	valid_1's l1: 293.736	valid_1's rmse: 577.545


[32m[I 2023-03-04 02:31:05,790][0m Trial 29 finished with value: 577.5445677285253 and parameters: {'objective': 'rmse', 'n_estimators': 3434, 'reg_alpha': 0.4108644512588926, 'reg_lambda': 0.010299997453208483, 'colsample_bytree': 0.77, 'num_leaves': 581, 'feature_fraction': 0.648024018884061, 'bagging_fraction': 0.687954259929333, 'bagging_freq': 5, 'min_child_samples': 135, 'subsample': 0.44000000000000006, 'learning_rate': 0.021057885642081588, 'max_depth': 73, 'random_state': 42, 'n_jobs': 4}. Best is trial 26 with value: 572.4497758136298.[0m
[32m[I 2023-03-04 02:31:05,801][0m A new study created in memory with name: no-name-4411830d-9d95-451c-85cd-c2a57cb6c088[0m


Number of finished trials: 30
Best LGBM trial parameters: {'objective': 'rmse', 'n_estimators': 4990, 'reg_alpha': 0.784546444819696, 'reg_lambda': 1.3339706043405715, 'colsample_bytree': 0.7100000000000001, 'num_leaves': 647, 'feature_fraction': 0.764455367408114, 'bagging_fraction': 0.8886513001825234, 'bagging_freq': 2, 'min_child_samples': 43, 'subsample': 0.12000000000000001, 'learning_rate': 0.01361274328633273, 'max_depth': 91, 'random_state': 42, 'n_jobs': 4}
Best score: 572.4497758136298


[32m[I 2023-03-04 02:31:08,480][0m Trial 0 finished with value: 586.8308616353585 and parameters: {'learning_rate': 0.9631922337277982, 'l2_leaf_reg': 8.054553418221587, 'bagging_temperature': 0.1760070626095971, 'random_strength': 1.45612906355611, 'depth': 5, 'min_data_in_leaf': 204}. Best is trial 0 with value: 586.8308616353585.[0m
[32m[I 2023-03-04 02:31:10,414][0m Trial 1 finished with value: 684.8699820545496 and parameters: {'learning_rate': 0.695193217005486, 'l2_leaf_reg': 95.79444118891156, 'bagging_temperature': 0.2877538443927507, 'random_strength': 1.3147342833494546, 'depth': 2, 'min_data_in_leaf': 101}. Best is trial 0 with value: 586.8308616353585.[0m
[32m[I 2023-03-04 02:31:13,332][0m Trial 2 finished with value: 602.734676091085 and parameters: {'learning_rate': 0.3374083719702761, 'l2_leaf_reg': 40.296009712549846, 'bagging_temperature': 1.258238963532634, 'random_strength': 1.1531109689935155, 'depth': 6, 'min_data_in_leaf': 185}. Best is trial 0 with value

Number of finished trials: 30
Best Cat trial parameters: {'learning_rate': 0.28842116798866463, 'l2_leaf_reg': 2.7111843076784496, 'bagging_temperature': 0.7574433226694433, 'random_strength': 1.9026999514695244, 'depth': 9, 'min_data_in_leaf': 233}
Best score: 574.577905669467
CPU times: user 2h 3min 15s, sys: 12min 6s, total: 2h 15min 21s
Wall time: 1h 27min 23s


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Train Models with Cross Validation</h1>
</div>

In [30]:
train = create_folds(train, Config.N_FOLDS)
# train = create_strat_folds(train, TARGET, Config.N_FOLDS)

n_folds=5, seed=42


In [31]:
all_cv_scores = pd.DataFrame(
    {
        "Model": pd.Series(dtype="str"),
        "Score": pd.Series(dtype="float"),
        "StdDev": pd.Series(dtype="float"),
        "RunTime": pd.Series(dtype="float"),
    }
)

oof = train[[ID, TARGET, "fold"]].copy().reset_index(drop=True).copy()
oof.set_index(ID, inplace=True)
oof.head()

Unnamed: 0_level_0,price,fold
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,13619,1
1.0,13387,2
2.0,2772,3
3.0,666,2
4.0,14453,0


In [32]:
def show_tree_model_fi(model, features:List[str]) -> None:
    print("\n=== Model Feature Importance ===")
    for i in model.feature_importances_.argsort()[::-1]:
        print(features[i], model.feature_importances_[i]/model.feature_importances_.sum())

def save_oof_predictions(model_name:str, final_valid_predictions, oof:pd.DataFrame) -> pd.DataFrame:
    final_valid_predictions_df = process_valid_predictions(
        final_valid_predictions, ID, model_name
    )
    display(final_valid_predictions_df.head())
    oof[f"pred_{model_name}"] = final_valid_predictions_df[f"pred_{model_name}"]

    return oof

def save_test_predictions(model_name:str, final_test_predictions, submission_df:pd.DataFrame, result_field:str=TARGET) -> None:
    result = merge_test_predictions(final_test_predictions, Config.calc_probability)
    # result[:20]
    submission_df[f"target_{model_name}"] = result #.astype(int)
    #     submission_df.head(10)
    ss = submission_df[[ID, f"target_{model_name}"]].copy().reset_index(drop=True)
    ss.rename(columns={f"target_{model_name}": result_field}, inplace=True)
    ss.to_csv(
        f"submission_{model_name}.csv", index=False
    )  # Can submit the individual model
    print("=== Target Value Counts ===")
#     display(ss[TARGET].value_counts())
    ss.head(10)

def process_valid_predictions(final_valid_predictions, train_id, model_name:str) -> pd.DataFrame:
    model = f"pred_{model_name}"
    final_valid_predictions_df = pd.DataFrame.from_dict(
        final_valid_predictions, orient="index"
    ).reset_index()
    final_valid_predictions_df.columns = [train_id, model]
    final_valid_predictions_df.set_index(train_id, inplace=True)
    final_valid_predictions_df.sort_index(inplace=True)
    final_valid_predictions_df.to_csv(f"train_pred_{model_name}.csv", index=True)

    return final_valid_predictions_df

def add_score(score_df:pd.DataFrame, model_name:str, score:float, std:float):
    dict1 = {"Model": model_name, "Score": cv_score, "StdDev": std_dev}
    score_df = score_df.append(dict1, ignore_index=True)
    return score_df

In [33]:
def train_cv_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid,
    params,
    n_folds:int=5,
    seed:int=42,
):

    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(
            drop=True
        )  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)
        xtest = test.copy()

        valid_ids = xvalid.id.values.tolist()  # Id's of everything in validation fold

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[FEATURES]
        xvalid = xvalid[FEATURES]

        scaler = preprocessing.StandardScaler()
#         scaler = preprocessing.MinMaxScaler()
        xtrain = scaler.fit(xtrain).transform(xtrain)
        xvalid = scaler.transform(xvalid)
        xtest = scaler.transform(xtest)

        model = get_model_fn # ()

        model.fit(
            xtrain,
            ytrain,
        )
        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        preds_valid_class = model.predict(xvalid)
        
        final_test_predictions.append(test_preds)
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

#         fold_score = metrics.accuracy_score(yvalid, preds_valid_class)  # Validation Set Score
        fold_score = metrics.mean_absolute_error(
            yvalid, preds_valid
        ) 
#         fold_score = metrics.roc_auc_score(yvalid.values, preds_valid)  # Validation Set Score
#         show_classification_scores(yvalid.values, preds_valid_class)

#         fold_score = metrics.roc_auc_score(yvalid, preds_valid)  # Validation Set Score
#         fold_score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        fold_scores.append(fold_score)
        #         importance_list.append(model.coef_.ravel())

        fi = []
        # Feature importance
#         fi = pd.DataFrame(
#             index=FEATURES,
#             data=model.coef_.ravel(),
#             columns=[f"{fold}_importance"],
#         )
        
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )


def train_xgb_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid:str,
    params,
    n_folds:int=5,
    seed:int=42,
):

    print(params)
    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(
            drop=True
        )  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)
        xtest = test.copy()

        valid_ids = xvalid.id.values.tolist()  # Id's of everything in validation fold

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[FEATURES]
        xvalid = xvalid[FEATURES]

        model = get_model_fn # (params)

        model.fit(
            xtrain,
            ytrain,
            eval_set=[(xvalid, yvalid)],
            #             eval_metric="acc",  # auc
            verbose=0,
            #             early_stopping_rounds=3000,
            #             callbacks=[
            #                 xgb.log_evaluation(0),
            #                 xgb.early_stopping(500, False, True),
            #             ],
        )

        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        preds_valid_class = model.predict(xvalid)
        
        final_test_predictions.append(test_preds)
        if Config.debug:
            print(f"GT Type: {type(yvalid.values)}")
            print(f"Preds Type: {type(preds_valid_class)}")
            print(f"         GT:{yvalid.values[:20]}")
            print(f"Preds Class:{preds_valid_class[:20]}")
            print(f"Preds Prob:{preds_valid[:20]}")
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid_class)))

#         fold_score = metrics.cohen_kappa_score(yvalid,  preds_valid_class, weights = "quadratic")
#         fold_score = metrics.roc_auc_score(yvalid.values, preds_valid)  # Validation Set Score
#         show_classification_scores(yvalid.values, preds_valid_class)
        fold_score = metrics.mean_absolute_error(
            yvalid, preds_valid
        )  # Validation Set Score
#         fold_score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        fold_scores.append(fold_score)

        # Feature importance
        fi = pd.DataFrame(
            index=FEATURES,
            data=model.feature_importances_,
            columns=[f"{fold}_importance"],
        )
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )        

In [34]:
def run_linear_model(model_dict, model_name:str, features:List[str], oof:pd.DataFrame) -> (float, float, pd.DataFrame):
    (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_cv_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        False, #Config.calc_probability,
        ID,
        {},
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    oof.head()
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof


def run_tree_model(model_dict, model_name:str, features:List[str], params, oof:pd.DataFrame) -> (float, float, pd.DataFrame):
    (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_xgb_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        Config.calc_probability,
        ID,
        params,
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)
    show_tree_model_fi(model, features)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    oof.head()
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof

In [35]:
%%time

def run_models4features(model_dict, model_lst:List[str], target:str, feature_lst:List[str], all_cv_scores:pd.DataFrame, linear_models:bool=True) -> pd.DataFrame:

    oof = train[[ID, target, "fold"]].copy().reset_index(drop=True).copy()
    oof.set_index(ID, inplace=True)

    for idx, m in enumerate(model_lst):
        model = model_lst[idx]
        start_time = time.time()

        print(f"Model={model}")

        params = {}
        if linear_models:
                cv_score, std_dev, oof = run_linear_model(model_dict, model, feature_lst, oof)

        else:
            cv_score, std_dev, oof = run_tree_model(model_dict, model, feature_lst, params, oof)

        run_time = time.time() - start_time

        score_dict = {"Model": model, "Score": cv_score, "StdDev": std_dev, "RunTime": run_time}
        all_cv_scores = all_cv_scores.append(score_dict, ignore_index=True)
        print(f"Model Run Time: {run_time:.2f}")

    return all_cv_scores




CPU times: user 14 µs, sys: 0 ns, total: 14 µs
Wall time: 17.6 µs


In [36]:
lgbm_params = {'n_estimators': Config.N_ESTIMATORS,
                 'num_rounds': 404,
                 'learning_rate': 0.19,
                 'num_leaves': 17,
                 'max_depth': 8,
                 'min_data_in_leaf': 36,
                 'lambda_l1': 0.96,
                 'lambda_l2': 0.01,
                 'min_gain_to_split': 11.32,
                 'bagging_fraction': 0.6,
                 'feature_fraction': 0.9}


lgbm_params3 = {
    "n_estimators": Config.N_ESTIMATORS,
    'max_depth': 9,
    'learning_rate': 0.01,
    'min_data_in_leaf': 36, 
    'num_leaves': 100, 
    'feature_fraction': 0.8, 
    'bagging_fraction': 0.89, 
    'bagging_freq': 5, 
    'lambda_l2': 28,
    
    'seed': Config.seed,
    'objective': 'regression',
#     'boosting_type': 'gbdt',
#     'device': 'gpu', 
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0,
    'n_jobs': -1,
    'metric': 'rmse',
    'verbose': -1
}
    
lgbm_params = gpu_ify_lgbm(lgbm_params)

In [37]:
xgb_params = {
    "n_estimators": Config.N_ESTIMATORS,  # 10_000,
    "max_depth": 10,  # 10
    "objective": "reg:squarederror", # Normal dist
#     "objective": "reg:gamma", # Gamma dist
    #     "enable_categorical": True,  # Only works with gpu_hist
    #     "eval_metric": "mae",
    #     "metric": "mae",
    #     "enable_categorical": True,
    "n_jobs": 8,  # 4
    "seed": Config.seed,
    "tree_method": "hist",
    #         "gpu_id": 0,
    "subsample": 0.9,  # 0.7
    "colsample_bytree": 0.7,
    "use_label_encoder": False,
    "learning_rate": 0.05,  # 0.01
}

xgb_params3 = {
    'n_estimators': Config.N_ESTIMATORS,
    'learning_rate': 0.05,
    'max_depth': 10,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:squarederror'
}

xgb_params_gamma = {
    "n_estimators": Config.N_ESTIMATORS,  # 10_000,
    "max_depth": 10,  # 10
    "objective": "reg:gamma", # "reg:gamma", "reg:squarederror"
    #     "enable_categorical": True,  # Only works with gpu_hist
    #     "eval_metric": "mae",
    #     "metric": "mae",
    #     "enable_categorical": True,
    "n_jobs": 8,  # 4
    "seed": Config.seed,
    "tree_method": "hist",
    #         "gpu_id": 0,
    "subsample": 0.9,  # 0.7
    "colsample_bytree": 0.7,
    "use_label_encoder": False,
    "learning_rate": 0.05,  # 0.01
}
if Config.gpu:
    xgb_params["tree_method"] = "gpu_hist"
else:
    xgb_params["tree_method"] = "hist"

In [38]:
cb_params = {
    #     "learning_rate": 0.3277295792305584,
    "learning_rate": 0.05,
    "l2_leaf_reg": 3.1572972266001518,
    "bagging_temperature": 0.6799604234141348,
    "random_strength": 1.99590400593318,
    "depth": 10,
    "min_data_in_leaf": 93,
    # "iterations": 100,  # 10000
    "n_estimators": Config.N_ESTIMATORS,  # 10000
    "use_best_model": True,
    #     "task_type": "GPU",
    "random_seed": Config.seed,
}

cb_params = gpu_ify_cb(cb_params)

In [39]:
model_estimator_dict = {
    "xgb2": xgb.XGBRegressor(**xgb_params),
    "xgb_best_params": xgb.XGBRegressor(**best_xgb_params),
    "xgb3": xgb.XGBRegressor(**xgb_params3),
    "xgb_params_gamma": xgb.XGBRegressor(**xgb_params_gamma),

    "lgbm1": lgb.LGBMRegressor(**lgbm_params),

    "cat1": cb.CatBoostRegressor(),
    "cat2": cb.CatBoostRegressor(**cb_params),
    "cat_best_params": cb.CatBoostRegressor(**best_cb_params),

    "xgb1": xgb.XGBRegressor(),
    "lgbm0": lgb.LGBMRegressor(),
    "lgbm3": lgb.LGBMRegressor(lgbm_params3),
    "lgbm2": lgb.LGBMRegressor(
        learning_rate=0.05,
        max_depth=15,
        num_leaves=11,
        feature_fraction=0.3,
        subsample=0.1,
        n_jobs=-1,
    ),
    "lgbm3": lgb.LGBMRegressor(**lgbm_params),
    "lgbm_best_params": lgb.LGBMRegressor(**best_lgbm_params),


    "lin_reg": linear_model.LinearRegression(),
    "lasso": linear_model.Lasso(),
    "ridge": linear_model.Ridge(max_iter=7000),
    "ridge_25": linear_model.Ridge(fit_intercept=True, solver='auto', alpha=0.25, max_iter=7000),
    "ridge_50": linear_model.Ridge(fit_intercept=True, solver='auto', alpha=0.5, max_iter=7000),
}

## Tree Models

In [40]:
%%time

# model_lst = ["xgb3","xgb_best_params", "lgbm_best_params", "cat_best_params", "xgb1", "xgb2", "lgbm1", "lgbm2", "cat1", "cat2"]
model_lst = ["xgb_best_params", "lgbm_best_params", "cat_best_params", "xgb_params_gamma", "xgb3", "xgb1", "xgb2", "lgbm0", "lgbm1", "lgbm2", "lgbm3", "cat1", "cat2"]
# model_lst = = []
all_cv_scores = run_models4features(model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=False)    

all_cv_scores.sort_values(by=["Score"], ascending=False)

Model=xgb_best_params
{}
fold: 1, Score: 294.1201998863048, Run Time: 18.21
fold: 2, Score: 290.1304295434297, Run Time: 19.63
fold: 3, Score: 290.32007349423066, Run Time: 20.30
fold: 4, Score: 291.9801989274606, Run Time: 23.44
fold: 5, Score: 297.5847824185884, Run Time: 23.36
Scores -> Adjusted: 290.04929222 , mean: 292.82713685, std: 2.77784464

=== Model Feature Importance ===
y 0.43886214
carat 0.21556789
x 0.090292156
z 0.033733465
clarity_SI2 0.028149271
color_J 0.02659148
clarity_I1 0.024543898
clarity_VVS2 0.021163555
clarity_SI1 0.018161416
color_I 0.017798143
clarity_VVS1 0.012982401
color_H 0.011187059
clarity_VS1 0.0111757135
clarity_IF 0.011095275
color_F 0.007819594
clarity_VS2 0.007238352
color_E 0.007163059
color_D 0.0063902694
color_G 0.004671726
cut_Ideal 0.0019277955
cut_Fair 0.0008524018
depth 0.0006639135
is_original 0.0005414914
table 0.00048607073
cut_Premium 0.00038002737
cut_Very Good 0.00029432314
cut_Good 0.00026703018


Unnamed: 0_level_0,pred_xgb_best_params
id,Unnamed: 1_level_1
0.0,13762.2793
1.0,12356.74121
2.0,2843.24146
3.0,708.65704
4.0,14887.68262


Mode
=== Target Value Counts ===
Model Run Time: 109.16
Model=lgbm_best_params
{}
fold: 1, Score: 314.9758571328582, Run Time: 1112.43
fold: 2, Score: 309.7501149041017, Run Time: 1114.42
fold: 3, Score: 313.5752536359352, Run Time: 1117.96
fold: 4, Score: 314.05327151189584, Run Time: 1099.12
fold: 5, Score: 317.89238467785293, Run Time: 1111.25
Scores -> Adjusted: 311.42885978 , mean: 314.04937637, std: 2.62051660

=== Model Feature Importance ===
depth 0.1843653940966561
y 0.15298609691174522
x 0.15087592031283112
z 0.14676819674379366
carat 0.10664691666307266
table 0.0896804886698627
clarity_SI1 0.016377216794660053
cut_Premium 0.014146033156549288
clarity_SI2 0.013715372206792963
color_G 0.013269507808522185
clarity_VS2 0.012820850940116098
cut_Ideal 0.01253043404604411
color_H 0.01183045486546034
cut_Very Good 0.009401936671121174
color_F 0.008560782611474633
color_I 0.00841278169430333
color_E 0.007727695687774535
clarity_VS1 0.0076910833015560575
is_original 0.0069994815313782

Unnamed: 0_level_0,pred_lgbm_best_params
id,Unnamed: 1_level_1
0.0,13264.41015
1.0,14224.49895
2.0,2815.32838
3.0,677.108
4.0,15423.14499


Mode
=== Target Value Counts ===
Model Run Time: 5560.02
Model=cat_best_params
{}
fold: 1, Score: 299.366871922689, Run Time: 39.11
fold: 2, Score: 296.16261572930995, Run Time: 41.35
fold: 3, Score: 295.9606859067408, Run Time: 41.56
fold: 4, Score: 298.01406959459933, Run Time: 44.07
fold: 5, Score: 301.29225826173524, Run Time: 45.38
Scores -> Adjusted: 296.15353852 , mean: 298.15930028, std: 2.00576177

=== Model Feature Importance ===
y 0.2486942075466028
carat 0.23030064796383376
z 0.10758446547479472
x 0.09122022151492021
clarity_SI2 0.08890996296836357
color_J 0.042406630868047177
color_I 0.03759282286552758
clarity_SI1 0.03603827198361834
clarity_VVS2 0.019090853149046738
color_H 0.01738349638944037
clarity_I1 0.013703524681989463
clarity_VVS1 0.01140334440019902
color_D 0.009065394398367118
color_E 0.008456842637524392
clarity_VS1 0.008259744541598132
clarity_IF 0.006854306294152699
color_F 0.006284398257318293
depth 0.0054958285792530565
table 0.0029708571823599153
cut_Ideal

Unnamed: 0_level_0,pred_cat_best_params
id,Unnamed: 1_level_1
0.0,13492.85059
1.0,12483.10408
2.0,2888.63882
3.0,713.93876
4.0,14772.89468


Mode
=== Target Value Counts ===
Model Run Time: 215.74
Model=xgb_params_gamma
{}
fold: 1, Score: 3898.0606449563315, Run Time: 5.98
fold: 2, Score: 3849.514816166863, Run Time: 7.76
fold: 3, Score: 3899.194363449622, Run Time: 9.87
fold: 4, Score: 3901.365350579137, Run Time: 11.03
fold: 5, Score: 3922.379299632488, Run Time: 11.95
Scores -> Adjusted: 3870.09463834 , mean: 3894.10289496, std: 24.00825662

=== Model Feature Importance ===
carat 0.33110264
y 0.29750657
x 0.24287269
z 0.06549757
clarity_SI1 0.009474104
clarity_SI2 0.007550494
clarity_IF 0.00572446
color_I 0.005653258
color_H 0.0055321204
clarity_VVS1 0.0054904507
color_G 0.003685114
color_J 0.0036021525
clarity_VVS2 0.0025136047
clarity_VS1 0.0023223537
color_E 0.0023187047
color_D 0.0022153168
clarity_VS2 0.0021623692
cut_Very Good 0.0012833503
cut_Good 0.0011209853
clarity_I1 0.00085408404
depth 0.0008469944
color_F 0.0006705894
cut_Premium 0.0
cut_Ideal 0.0
cut_Fair 0.0
is_original 0.0
table 0.0


Unnamed: 0_level_0,pred_xgb_params_gamma
id,Unnamed: 1_level_1
0.0,73.71144
1.0,73.78519
2.0,72.22537
3.0,67.37559
4.0,73.77097


Mode
=== Target Value Counts ===
Model Run Time: 50.54
Model=xgb3
{}
fold: 1, Score: 293.9553247403797, Run Time: 41.99
fold: 2, Score: 290.14835049553847, Run Time: 43.40
fold: 3, Score: 291.14316505212423, Run Time: 45.42
fold: 4, Score: 292.3973887933678, Run Time: 46.74
fold: 5, Score: 297.8757010282207, Run Time: 47.69
Scores -> Adjusted: 290.39940023 , mean: 293.10398602, std: 2.70458579

=== Model Feature Importance ===
y 0.43240333
carat 0.18689513
z 0.08496361
clarity_SI2 0.057773497
clarity_SI1 0.035919834
x 0.026572527
color_J 0.022003427
clarity_VVS2 0.019483283
color_I 0.017445471
clarity_I1 0.017142376
clarity_VS1 0.015768565
clarity_VS2 0.013745308
color_H 0.011543305
clarity_VVS1 0.011440151
clarity_IF 0.010974195
color_G 0.008150793
color_F 0.008114331
color_D 0.0061891032
color_E 0.0060649826
cut_Ideal 0.0035599978
cut_Premium 0.0006901788
cut_Fair 0.0006765477
depth 0.0005626726
table 0.00054979615
is_original 0.0005249471
cut_Good 0.0004448577
cut_Very Good 0.000397

Unnamed: 0_level_0,pred_xgb3
id,Unnamed: 1_level_1
0.0,13612.75195
1.0,12763.50098
2.0,2853.98657
3.0,679.80695
4.0,14959.10059


Mode
=== Target Value Counts ===
Model Run Time: 229.22
Model=xgb1
{}
fold: 1, Score: 300.261136499225, Run Time: 24.04
fold: 2, Score: 297.09857987114526, Run Time: 24.75
fold: 3, Score: 297.1217139446997, Run Time: 26.34
fold: 4, Score: 300.4263153978607, Run Time: 29.09
fold: 5, Score: 305.8266371373839, Run Time: 29.50
Scores -> Adjusted: 296.95957228 , mean: 300.14687657, std: 3.18730429

=== Model Feature Importance ===
y 0.44432917
carat 0.189247
clarity_SI2 0.060984682
clarity_VVS2 0.04688364
clarity_SI1 0.043726966
color_J 0.040509053
clarity_I1 0.031042345
color_I 0.02868953
clarity_VVS1 0.020526055
color_H 0.012832618
clarity_IF 0.011939833
color_E 0.009981424
clarity_VS2 0.00914228
z 0.008043429
color_D 0.008023813
clarity_VS1 0.008016802
color_F 0.006049277
cut_Ideal 0.0052890666
x 0.0049167485
color_G 0.00472682
cut_Fair 0.0011309641
cut_Good 0.0007778143
cut_Premium 0.00071043696
depth 0.00068980316
is_original 0.00064850535
cut_Very Good 0.00057741825
table 0.0005644214

Unnamed: 0_level_0,pred_xgb1
id,Unnamed: 1_level_1
0.0,13991.81445
1.0,12913.0957
2.0,2835.02319
3.0,707.76874
4.0,14808.0752


Mode
=== Target Value Counts ===
Model Run Time: 137.95
Model=xgb2
{}
fold: 1, Score: 294.15562369963266, Run Time: 6.01
fold: 2, Score: 291.277911813454, Run Time: 6.91
fold: 3, Score: 292.01654974398275, Run Time: 8.47
fold: 4, Score: 293.13299444271587, Run Time: 10.47
fold: 5, Score: 298.64995591906927, Run Time: 11.54
Scores -> Adjusted: 291.25337749 , mean: 293.84660712, std: 2.59322963

=== Model Feature Importance ===
y 0.45479876
carat 0.18637744
clarity_SI2 0.06844441
x 0.044280816
clarity_SI1 0.03406336
clarity_VVS2 0.02556466
color_J 0.023295663
clarity_I1 0.019355468
color_I 0.017065585
clarity_VS2 0.01701653
clarity_VS1 0.015333255
clarity_VVS1 0.0143781165
z 0.012738794
clarity_IF 0.011985934
color_F 0.0109004965
color_H 0.010716295
color_G 0.009897274
color_E 0.0096010985
color_D 0.007891184
cut_Ideal 0.0029659434
cut_Premium 0.00057467626
cut_Fair 0.00050951843
is_original 0.0004969559
table 0.0004886386
depth 0.00047096738
cut_Good 0.000426837
cut_Very Good 0.00036131

Unnamed: 0_level_0,pred_xgb2
id,Unnamed: 1_level_1
0.0,13773.1084
1.0,12376.01758
2.0,2831.91333
3.0,687.37469
4.0,14881.08301


Mode
=== Target Value Counts ===
Model Run Time: 47.32
Model=lgbm0
{}
fold: 1, Score: 301.81822778522013, Run Time: 4.35
fold: 2, Score: 297.9886424990666, Run Time: 5.66
fold: 3, Score: 299.19160103771475, Run Time: 7.08
fold: 4, Score: 299.76294967145043, Run Time: 9.03
fold: 5, Score: 305.60606462705243, Run Time: 9.87
Scores -> Adjusted: 298.20238546 , mean: 300.87349712, std: 2.67111166

=== Model Feature Importance ===
carat 0.11066666666666666
y 0.09866666666666667
z 0.07966666666666666
x 0.06966666666666667
clarity_SI2 0.060333333333333336
depth 0.058333333333333334
color_J 0.051
clarity_SI1 0.044
color_I 0.04033333333333333
color_D 0.037
clarity_I1 0.035333333333333335
color_H 0.031
color_E 0.03
clarity_VVS1 0.028666666666666667
clarity_IF 0.028333333333333332
clarity_VS2 0.02666666666666667
table 0.026
clarity_VS1 0.025333333333333333
color_F 0.024666666666666667
color_G 0.023
is_original 0.022
clarity_VVS2 0.021
cut_Ideal 0.011333333333333334
cut_Fair 0.006
cut_Premium 0.005

Unnamed: 0_level_0,pred_lgbm0
id,Unnamed: 1_level_1
0.0,13878.21112
1.0,12348.72859
2.0,2815.0948
3.0,687.40082
4.0,14898.97732


Mode
=== Target Value Counts ===
Model Run Time: 40.33
Model=lgbm1
{}
fold: 1, Score: 297.3174586707234, Run Time: 15.24
fold: 2, Score: 294.63096874473155, Run Time: 10.07
fold: 3, Score: 294.7490123787296, Run Time: 12.61
fold: 4, Score: 296.1903796403899, Run Time: 13.53
fold: 5, Score: 301.8428073042293, Run Time: 15.38
Scores -> Adjusted: 294.30514865 , mean: 296.94612535, std: 2.64097670

=== Model Feature Importance ===
carat 0.153125
y 0.14703125
z 0.131875
depth 0.116875
x 0.11671875
table 0.0559375
clarity_SI2 0.03078125
color_J 0.02359375
clarity_SI1 0.0215625
color_I 0.02140625
color_H 0.018125
color_G 0.01390625
color_D 0.01390625
is_original 0.0134375
color_F 0.0134375
clarity_VS2 0.01265625
color_E 0.0125
clarity_VS1 0.01234375
cut_Premium 0.011875
cut_Ideal 0.01015625
clarity_IF 0.0096875
clarity_VVS2 0.009375
clarity_VVS1 0.00890625
cut_Very Good 0.00828125
clarity_I1 0.00828125
cut_Good 0.00234375
cut_Fair 0.001875


Unnamed: 0_level_0,pred_lgbm1
id,Unnamed: 1_level_1
0.0,13620.89036
1.0,12412.74697
2.0,2767.30541
3.0,679.72866
4.0,14789.4429


Mode
=== Target Value Counts ===
Model Run Time: 70.86
Model=lgbm2
{}
fold: 1, Score: 481.1366234502266, Run Time: 3.18
fold: 2, Score: 470.6337274541548, Run Time: 5.10
fold: 3, Score: 480.0217206321727, Run Time: 5.91
fold: 4, Score: 478.23042049303217, Run Time: 8.00
fold: 5, Score: 482.3914465108882, Run Time: 8.58
Scores -> Adjusted: 474.32682312 , mean: 478.48278771, std: 4.15596459

=== Model Feature Importance ===
z 0.164
x 0.111
color_I 0.084
y 0.075
cut_Ideal 0.058
clarity_SI2 0.048
clarity_VS1 0.047
clarity_VVS2 0.045
clarity_VVS1 0.041
color_E 0.04
carat 0.038
clarity_SI1 0.036
depth 0.034
clarity_I1 0.027
clarity_VS2 0.026
clarity_IF 0.02
table 0.019
color_J 0.018
color_H 0.017
color_G 0.015
color_D 0.014
color_F 0.012
cut_Very Good 0.004
is_original 0.003
cut_Fair 0.002
cut_Premium 0.002
cut_Good 0.0


Unnamed: 0_level_0,pred_lgbm2
id,Unnamed: 1_level_1
0.0,11709.42353
1.0,14352.70518
2.0,2740.57168
3.0,895.77281
4.0,13894.80972


Mode
=== Target Value Counts ===
Model Run Time: 34.79
Model=lgbm3
{}
fold: 1, Score: 297.3169758627501, Run Time: 9.36
fold: 2, Score: 294.6304776586671, Run Time: 9.71
fold: 3, Score: 294.7497961401588, Run Time: 11.55
fold: 4, Score: 296.1903795042576, Run Time: 13.49
fold: 5, Score: 301.83878523010657, Run Time: 14.39
Scores -> Adjusted: 294.30585547 , mean: 296.94528288, std: 2.63942741

=== Model Feature Importance ===
carat 0.153125
y 0.14703125
z 0.13171875
depth 0.116875
x 0.116875
table 0.0559375
clarity_SI2 0.03078125
color_J 0.02359375
clarity_SI1 0.0215625
color_I 0.02140625
color_H 0.018125
color_G 0.01390625
color_D 0.01390625
is_original 0.0134375
color_F 0.0134375
clarity_VS2 0.01265625
color_E 0.0125
clarity_VS1 0.01234375
cut_Premium 0.011875
cut_Ideal 0.01015625
clarity_IF 0.0096875
clarity_VVS2 0.009375
clarity_VVS1 0.00890625
cut_Very Good 0.00828125
clarity_I1 0.00828125
cut_Good 0.00234375
cut_Fair 0.001875


Unnamed: 0_level_0,pred_lgbm3
id,Unnamed: 1_level_1
0.0,13620.89037
1.0,12412.74691
2.0,2767.30543
3.0,679.72867
4.0,14789.44291


Mode
=== Target Value Counts ===
Model Run Time: 62.71
Model=cat1
{}
fold: 1, Score: 297.72296419318485, Run Time: 26.13
fold: 2, Score: 291.6784022678925, Run Time: 27.79
fold: 3, Score: 291.6128145710852, Run Time: 29.06
fold: 4, Score: 295.7730765160127, Run Time: 30.94
fold: 5, Score: 300.35632376270496, Run Time: 31.46
Scores -> Adjusted: 292.01435149 , mean: 295.42871626, std: 3.41436477

=== Model Feature Importance ===
x 0.2530733423065527
y 0.17857548223168243
carat 0.17761405168248673
z 0.12402791876514688
clarity_SI2 0.07761358476545545
color_J 0.03499208068243589
clarity_SI1 0.032191704053607495
color_I 0.02955532674473758
clarity_VVS2 0.015774120262406384
color_H 0.015611113786326365
clarity_I1 0.012255643031152844
clarity_VVS1 0.008365661793805232
color_D 0.007170403591097322
color_E 0.006425951555025823
clarity_VS1 0.00618564692780601
clarity_IF 0.0057802683485366005
color_F 0.004895962636701908
depth 0.002275949668450936
cut_Ideal 0.0017033031362601223
clarity_VS2 0.001

Unnamed: 0_level_0,pred_cat1
id,Unnamed: 1_level_1
0.0,13674.64029
1.0,12508.3531
2.0,2873.14491
3.0,703.9258
4.0,14802.73695


Mode
=== Target Value Counts ===
Model Run Time: 149.73
Model=cat2
{}
fold: 1, Score: 319.5716960088878, Run Time: 85.43
fold: 2, Score: 315.53009823735727, Run Time: 4.78
fold: 3, Score: 316.54124577095706, Run Time: 6.13
fold: 4, Score: 319.8212963767485, Run Time: 8.59
fold: 5, Score: 322.9397193116316, Run Time: 8.63
Scores -> Adjusted: 316.25260213 , mean: 318.88081114, std: 2.62820901

=== Model Feature Importance ===
y 0.2396729137081466
carat 0.14630771264688647
z 0.13836182323958687
clarity_SI2 0.11513913709879933
x 0.085144322541499
color_J 0.0550207968375544
clarity_SI1 0.04729547287581312
color_I 0.04581085554042531
color_H 0.02189404762716318
clarity_VVS2 0.02049723109739917
clarity_I1 0.0175743588024976
clarity_VVS1 0.011564945146463682
color_D 0.011314144390975157
color_E 0.010379479637307776
clarity_VS1 0.00866224540139778
color_F 0.00840503672897158
clarity_IF 0.007715691449493301
color_G 0.0034420462976239396
cut_Ideal 0.0024900439916045753
depth 0.001713099554623829


Unnamed: 0_level_0,pred_cat2
id,Unnamed: 1_level_1
0.0,13386.04856
1.0,12542.78712
2.0,2834.98344
3.0,761.96224
4.0,14372.34794


Mode
=== Target Value Counts ===
Model Run Time: 117.67
CPU times: user 3h 12min 57s, sys: 12min 32s, total: 3h 25min 29s
Wall time: 1h 53min 46s


Unnamed: 0,Model,Score,StdDev,RunTime
3,xgb_params_gamma,3894.10289,24.00826,50.54402
9,lgbm2,478.48279,4.15596,34.78863
12,cat2,318.88081,2.62821,117.67013
1,lgbm_best_params,314.04938,2.62052,5560.02003
7,lgbm0,300.8735,2.67111,40.32963
5,xgb1,300.14688,3.1873,137.95487
2,cat_best_params,298.1593,2.00576,215.73798
8,lgbm1,296.94613,2.64098,70.85603
10,lgbm3,296.94528,2.63943,62.70788
11,cat1,295.42872,3.41436,149.72717


## Linear Models

In [41]:
model_lst = ["lin_reg", "lasso", "ridge", "ridge_25", "ridge_50"]
model_lst = ["lasso", "ridge",  "ridge_50"]
# model_lst = []
# all_cv_scores = run_models4features(model_lst, TARGET, FEATURES, all_cv_scores, linear_models=True)    
all_cv_scores = run_models4features(model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=True)    

all_cv_scores.head()

Model=lasso
fold: 1, Score: 643.7632608971782, Run Time: 4.73
fold: 2, Score: 638.5058185741259, Run Time: 5.51
fold: 3, Score: 645.5589170948598, Run Time: 7.82
fold: 4, Score: 644.6071047344354, Run Time: 9.26
fold: 5, Score: 640.2022445297636, Run Time: 10.86
Scores -> Adjusted: 639.82110315 , mean: 642.52746917, std: 2.70636601


Unnamed: 0_level_0,pred_lasso
id,Unnamed: 1_level_1
0.0,11371.10932
1.0,13266.83948
2.0,3577.55091
3.0,960.97536
4.0,12896.54654


Mode
=== Target Value Counts ===
Model Run Time: 42.37
Model=ridge
fold: 1, Score: 643.5668346413521, Run Time: 1.30
fold: 2, Score: 638.4624429630502, Run Time: 2.82
fold: 3, Score: 645.3704922937586, Run Time: 4.45
fold: 4, Score: 644.4580188631362, Run Time: 6.20
fold: 5, Score: 640.2129918663342, Run Time: 6.66
Scores -> Adjusted: 639.77949762 , mean: 642.41415613, std: 2.63465851


Unnamed: 0_level_0,pred_ridge
id,Unnamed: 1_level_1
0.0,11367.43003
1.0,13284.75178
2.0,3572.58221
3.0,972.38309
4.0,12908.35637


Mode
=== Target Value Counts ===
Model Run Time: 25.61
Model=ridge_50
fold: 1, Score: 643.5559094214277, Run Time: 1.27
fold: 2, Score: 638.4514138275706, Run Time: 2.64
fold: 3, Score: 645.3596516273961, Run Time: 4.08
fold: 4, Score: 644.4472570894835, Run Time: 6.57
fold: 5, Score: 640.2023042180139, Run Time: 6.74
Scores -> Adjusted: 639.76861292 , mean: 642.40330724, std: 2.63469432


Unnamed: 0_level_0,pred_ridge_50
id,Unnamed: 1_level_1
0.0,11367.73494
1.0,13285.11925
2.0,3572.51496
3.0,972.53727
4.0,12908.4995


Mode
=== Target Value Counts ===
Model Run Time: 25.31


Unnamed: 0,Model,Score,StdDev,RunTime
0,xgb_best_params,292.82714,2.77784,109.16344
1,lgbm_best_params,314.04938,2.62052,5560.02003
2,cat_best_params,298.1593,2.00576,215.73798
3,xgb_params_gamma,3894.10289,24.00826,50.54402
4,xgb3,293.10399,2.70459,229.21891


In [42]:
sample_submission.head(20)

Unnamed: 0,id,price,target_xgb_best_params,target_lgbm_best_params,target_cat_best_params,target_xgb_params_gamma,target_xgb3,target_xgb1,target_xgb2,target_lgbm0,target_lgbm1,target_lgbm2,target_lgbm3,target_cat1,target_cat2,target_lasso,target_ridge,target_ridge_50
0,193573,3969.155,869.698,862.84825,877.49311,67.889,863.24261,854.5777,862.88324,844.24004,828.44948,829.53492,828.44949,868.52349,897.55326,1124.47124,1128.32029,1128.42433
1,193574,3969.155,2478.49609,2403.49433,2553.81628,72.39225,2446.89233,2429.01392,2451.67285,2517.56192,2516.44411,2642.13246,2516.44413,2510.52047,2537.17215,2338.33009,2328.24324,2328.14518
2,193575,3969.155,2304.28687,2098.42567,2277.91738,72.2181,2283.91919,2290.60376,2314.21143,2312.94226,2247.34908,2463.50194,2247.34915,2242.43196,2321.20291,2276.93179,2260.22677,2260.00429
3,193576,3969.155,823.59064,884.00077,798.36989,68.04408,823.10901,837.1922,822.96832,830.24297,838.36446,872.48554,838.36443,834.8684,847.65865,1288.785,1298.04216,1298.11253
4,193577,3969.155,5688.43604,5747.58888,5608.50994,73.38731,5757.1499,5573.97803,5763.28418,5695.66514,5535.9447,5719.79569,5535.94476,5652.01644,5806.48923,6795.60765,6784.09533,6784.00964
5,193578,3969.155,690.17584,748.73395,697.1984,67.27662,684.28058,727.43109,684.68817,648.79182,702.79588,992.67523,702.79588,720.89036,717.35216,600.64509,611.00995,611.10975
6,193579,3969.155,12376.19043,12718.32279,12275.0795,73.70206,12292.58398,12324.79297,12210.69043,12226.73699,12317.65154,11618.073,12317.65154,12253.93612,12241.94859,11053.27722,11058.11016,11058.20624
7,193580,3969.155,2954.94629,2995.20987,2951.60553,72.27367,2892.91821,2878.87354,2924.43726,2894.70938,2925.28725,2728.58745,2925.28724,2944.63318,2915.9979,3524.39489,3516.87527,3516.75697
8,193581,3969.155,15578.22168,14203.80021,15464.3968,73.78304,14952.72168,14781.29004,15585.53809,14917.44616,14873.12446,14525.17098,14873.1244,15511.43232,15339.23239,15537.79086,15545.98746,15546.49174
9,193582,3969.155,1906.15674,1872.08147,1784.22501,71.18585,1843.74231,1848.28992,1846.95813,1959.34844,1869.86673,1900.15767,1869.86674,1773.80944,1908.17055,2245.41956,2239.28853,2239.40512


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Blend Models</h1>
</div>

In [43]:
all_blend_scores = pd.DataFrame(
    {
        "Model": pd.Series(dtype="str"),
        "Score": pd.Series(dtype="float"),
        "StdDev": pd.Series(dtype="float"),
    }
)

In [44]:
model_lst

['lasso', 'ridge', 'ridge_50']

In [45]:
model_lst = ["xgb1", "xgb2", "cat1", "lgbm0", "lgbm1"]

In [46]:
len(model_lst)

5

In [47]:
target_names = [f"target_{model}" for model in model_lst]
target_names

['target_xgb1', 'target_xgb2', 'target_cat1', 'target_lgbm0', 'target_lgbm1']

In [48]:
sample_submission[TARGET] = sample_submission[target_names].sum(axis=1) / len(model_lst)

In [49]:
sample_submission[[ID, TARGET]].to_csv("submission_models_wt_avg.csv", index=False)
sample_submission[[ID, TARGET]].tail(8)

Unnamed: 0,id,price
129042,322615,2909.58013
129043,322616,654.64475
129044,322617,4075.96364
129045,322618,3835.98352
129046,322619,2500.6376
129047,322620,7592.65518
129048,322621,5358.86373
129049,322622,4132.83593


In [50]:
sample_submission[TARGET] = (
#     (sample_submission["target_xgb_bp"] * 2 )
#     + (sample_submission["target_lgbm_bp"]  )
    (sample_submission["target_xgb1"] * 3 )
    + (sample_submission["target_lgbm1"])
#     + (sample_submission["target_lgbm2"])    
#     + (sample_submission["target_lgbm2"])
    + (sample_submission["target_cat1"] )
    + (sample_submission["target_cat2"] )    
#     + (sample_submission["target_cat_bp"] )
#     + (sample_submission["target_svc"] )
#     + (sample_submission["target_log_reg3"] )
#     + (sample_submission["target_cat2"] )
)/6

# sample_submission[TARGET] = sample_submission[TARGET].astype(int)

In [51]:
sample_submission[[ID, TARGET]].to_csv("submission_wt_avg.csv", index=False)
sample_submission[[ID, TARGET]].tail(8)

Unnamed: 0,id,price
129042,322615,2841.6476
129043,322616,663.57607
129044,322617,4100.38075
129045,322618,3830.50236
129046,322619,2491.78892
129047,322620,7633.52296
129048,322621,5484.56148
129049,322622,4118.39727


In [52]:
all_cv_scores.sort_values(by=["Score"], ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime
3,xgb_params_gamma,3894.10289,24.00826,50.54402
13,lasso,642.52747,2.70637,42.37215
14,ridge,642.41416,2.63466,25.61062
15,ridge_50,642.40331,2.63469,25.31384
9,lgbm2,478.48279,4.15596,34.78863
12,cat2,318.88081,2.62821,117.67013
1,lgbm_best_params,314.04938,2.62052,5560.02003
7,lgbm0,300.8735,2.67111,40.32963
5,xgb1,300.14688,3.1873,137.95487
2,cat_best_params,298.1593,2.00576,215.73798


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Level 1 Stack Models</h1>
</div>

In [53]:
## TODO: Generate these dictionaries from model names

train_oof_dict = {
    "train_pred_cat1": "train_pred_cat1.csv",
    "train_pred_cat2": "train_pred_cat2.csv",
    "train_pred_lgbm1": "train_pred_lgbm1.csv",    
    "train_pred_lgbm2": "train_pred_lgbm2.csv",    
    "train_pred_xgb1": "train_pred_xgb1.csv"
}

test_pred_dict = {
    "submission_cat1": "submission_cat1.csv",
    "submission_cat2": "submission_cat2.csv",
    "submission_lgbm1": "submission_lgbm1.csv",
    "submission_lgbm2": "submission_lgbm2.csv",
    "submission_xgb1": "submission_xgb1.csv",
}

In [54]:
def blend_results(train_oof_dict, test_pred_dict):
    oof_df = pd.DataFrame()
    test_preds_df = pd.DataFrame()

    for name, train_oof_fname in train_oof_dict.items():
        fname = "../working/" + train_oof_fname
        print(f"Processing {name}, {train_oof_fname}")
        df = pd.read_csv(fname)
        print(df.head())
#         print(df.iloc[:,1])
        preds = pd.Series(df.iloc[:,1], name=name)
#         print(preds[:5])
        oof_df = pd.concat([oof_df, preds], axis=1)
    #     oof_df = pd.concat([oof_df, pd.Series(np.load(TRAIN_PATH / train_oof), name=name)], axis=1)

    for name, test_pred_fname in test_pred_dict.items():
        fname = "../working/" + test_pred_fname
        print(f"{name}, {test_pred_fname}")
        df = pd.read_csv(fname)
        print(df.head())
        preds = pd.Series(df.iloc[:,1], name=name)
        test_preds_df = pd.concat([test_preds_df, preds], axis=1)

    print("=== oof ===")
    print(oof_df.head())
    print("=== test_preds ===")
    print(test_preds_df.head())
    return oof_df, test_preds_df
    
# (oof_df, preds_df) = blend_results(train_oof_dict, test_pred_dict)    

In [55]:
def load_oof_results(train_oof_dict, test_pred_dict):
    oof_df = pd.DataFrame()
    test_preds_df = pd.DataFrame()

    for name, train_oof_fname in train_oof_dict.items():
        fname = "../working/" + train_oof_fname
        print(f"Processing {name}, {train_oof_fname}")
        df = pd.read_csv(fname)
        print(df.head())
#         print(df.iloc[:,1])
        preds = pd.Series(df.iloc[:,1], name=name)
#         print(preds[:5])
        oof_df = pd.concat([oof_df, preds], axis=1)
    #     oof_df = pd.concat([oof_df, pd.Series(np.load(TRAIN_PATH / train_oof), name=name)], axis=1)

    for name, test_pred_fname in test_pred_dict.items():
        fname = "../working/" + test_pred_fname
        print(f"{name}, {test_pred_fname}")
        df = pd.read_csv(fname)
        print(df.head())
        preds = pd.Series(df.iloc[:,1], name=name)
        test_preds_df = pd.concat([test_preds_df, preds], axis=1)

    print("=== oof ===")
    print(oof_df.head())
    print("=== test_preds ===")
    print(test_preds_df.head())
    return oof_df, test_preds_df
    
(oof_df, preds_df) = load_oof_results(train_oof_dict, test_pred_dict) 

Processing train_pred_cat1, train_pred_cat1.csv
    id    pred_cat1
0  0.0  13674.64029
1  1.0  12508.35310
2  2.0   2873.14491
3  3.0    703.92580
4  4.0  14802.73695
Processing train_pred_cat2, train_pred_cat2.csv
    id    pred_cat2
0  0.0  13386.04856
1  1.0  12542.78712
2  2.0   2834.98344
3  3.0    761.96224
4  4.0  14372.34794
Processing train_pred_lgbm1, train_pred_lgbm1.csv
    id   pred_lgbm1
0  0.0  13620.89036
1  1.0  12412.74697
2  2.0   2767.30541
3  3.0    679.72866
4  4.0  14789.44290
Processing train_pred_lgbm2, train_pred_lgbm2.csv
    id   pred_lgbm2
0  0.0  11709.42353
1  1.0  14352.70518
2  2.0   2740.57168
3  3.0    895.77281
4  4.0  13894.80972
Processing train_pred_xgb1, train_pred_xgb1.csv
    id    pred_xgb1
0  0.0  13991.81400
1  1.0  12913.09600
2  2.0   2835.02320
3  3.0    707.76874
4  4.0  14808.07500
submission_cat1, submission_cat1.csv
       id       price
0  193573   868.52349
1  193574  2510.52047
2  193575  2242.43196
3  193576   834.86840
4  193577

In [56]:
oof_df.head()

Unnamed: 0,train_pred_cat1,train_pred_cat2,train_pred_lgbm1,train_pred_lgbm2,train_pred_xgb1
0,13674.64029,13386.04856,13620.89036,11709.42353,13991.814
1,12508.3531,12542.78712,12412.74697,14352.70518,12913.096
2,2873.14491,2834.98344,2767.30541,2740.57168,2835.0232
3,703.9258,761.96224,679.72866,895.77281,707.76874
4,14802.73695,14372.34794,14789.4429,13894.80972,14808.075


In [57]:
preds_df.head()

Unnamed: 0,submission_cat1,submission_cat2,submission_lgbm1,submission_lgbm2,submission_xgb1
0,868.52349,897.55326,828.44948,829.53492,854.5777
1,2510.52047,2537.17215,2516.44411,2642.13246,2429.014
2,2242.43196,2321.20291,2247.34908,2463.50194,2290.6038
3,834.8684,847.65865,838.36446,872.48554,837.1922
4,5652.01644,5806.48923,5535.9447,5719.79569,5573.978


In [58]:
type(preds_df)

pandas.core.frame.DataFrame

In [59]:
def run_lr(useful_features:List[str], TARGET:str, train_df:pd.DataFrame, test_df:pd.DataFrame) -> (List[float],List[float]):
    final_predictions = []
    scores = []

    kfold = model_selection.KFold(n_splits=Config.N_FOLDS, shuffle=True, random_state=Config.seed)

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(train_df)):
        xtrain = train_df.iloc[train_idx].reset_index(drop=True)
        xvalid = train_df.iloc[valid_idx].reset_index(drop=True)

        xtest = test_df[useful_features].copy()

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]

#         model = LogisticRegression()
        model = linear_model.LinearRegression()
        # Smaller C means more regularization; default=1.0
        # 2947.0517025518097
#         model = LogisticRegression(max_iter=500, C=2947.0517025518097, penalty='l2',solver='newton-cg')
#         model = LogisticRegression(C = 2947.0517025518097,
#                         max_iter = 500,
#                         penalty = 'l2',
#                         solver = 'liblinear')
        model.fit(xtrain, ytrain)

        preds_valid = model.predict_proba(xvalid)[:,-1]
        test_preds = model.predict_proba(xtest)[:,-1]

        final_predictions.append(test_preds)
#         score = metrics.roc_auc_score(yvalid, preds_valid)
        score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        print(f"Fold={fold}, Score={score}")
        scores.append(score)
    return scores, final_predictions


In [60]:
# useful_features = ["pred_lda", "pred_gbc","pred_gbc2", "pred_cat_bp", "pred_cat1", "pred_lgbm1", "pred_lgbm2", "pred_lgbm_bp", "pred_xgb1", "pred_xgb_bp"]
useful_features = [ "train_pred_cat1", "train_pred_cat2", "train_pred_lgbm1", "train_pred_lgbm2", "train_pred_xgb1"]

In [61]:
oof_df[useful_features].head()

Unnamed: 0,train_pred_cat1,train_pred_cat2,train_pred_lgbm1,train_pred_lgbm2,train_pred_xgb1
0,13674.64029,13386.04856,13620.89036,11709.42353,13991.814
1,12508.3531,12542.78712,12412.74697,14352.70518,12913.096
2,2873.14491,2834.98344,2767.30541,2740.57168,2835.0232
3,703.9258,761.96224,679.72866,895.77281,707.76874
4,14802.73695,14372.34794,14789.4429,13894.80972,14808.075


In [62]:
# preds_df[useful_features].head()

In [63]:
# fold_scores, final_predictions = run_lr(useful_features, TARGET, oof_df, preds_df)
# test_preds = np.mean(np.column_stack(final_predictions), axis=1)
# cv_score, std_dev = show_fold_scores(fold_scores)
# create_submission("level1_lr", TARGET, test_preds)

In [64]:
pd.options.display.max_colwidth = 100
pd.set_option("display.max_rows", 999)
pd.set_option("display.precision", 5)
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.max_colwidth

100

In [65]:
all_cv_scores.sort_values(by=["Score"], ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime
3,xgb_params_gamma,3894.1,24.01,50.54
13,lasso,642.53,2.71,42.37
14,ridge,642.41,2.63,25.61
15,ridge_50,642.4,2.63,25.31
9,lgbm2,478.48,4.16,34.79
12,cat2,318.88,2.63,117.67
1,lgbm_best_params,314.05,2.62,5560.02
7,lgbm0,300.87,2.67,40.33
5,xgb1,300.15,3.19,137.95
2,cat_best_params,298.16,2.01,215.74
