<a href="https://www.kaggle.com/code/mmellinger66/s3e8-gemstone-pricing-models?scriptVersionId=120984796" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

 <div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Playground Season 3: Episode 8 - Gemstone Pricing Models</h1>
</div>

## Problem Type

Regression

## Evaluation Metric

$$RMSE = \sqrt{\frac{1}{N} \sum_{i=1}^N (y_i - \hat{y_i})^2}$$

```python
score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
```

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Import Libraries</h1>
</div>

In [1]:
from typing import List, Set, Dict, Tuple, Optional

import os
import time
from pathlib import Path
import glob
import gc

import pandas as pd
import numpy as np

from sklearn import impute
from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import cluster
from sklearn import model_selection
from sklearn import ensemble
from sklearn import datasets

import xgboost as xgb
import catboost as cb
import lightgbm as lgb

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Visualization Libraries
import matplotlib as mpl
import matplotlib.pylab as plt
import seaborn as sns
import missingno as msno
from folium import Map
from folium.plugins import HeatMap
from IPython.display import display_html, display_markdown, display_latex
from colorama import Fore, Style

import warnings
warnings.filterwarnings('ignore')

pd.set_option("display.max_rows", 999)
pd.set_option("display.precision", 5)

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Configuration</h1>
</div>

In [2]:
TARGET="price"
ID="id"

In [3]:
class Config:
    path:str = "../input/playground-series-s3e8/"
    gpu:bool = True
    optimize:bool = True
    n_optuna_trials:int = 2 # 5, 10, 30
    fast_render:bool = False
    calc_probability:bool = False
    debug:bool = False
    seed:int = 42
    N_ESTIMATORS:int = 100  # 100, 300, 1000, 2000, 5000, 15_000, 20_000 GBDT
    GPU_N_ESTIMATORS:int = 2000 # Want models to run fast during dev
    N_FOLDS:int = 5

In [4]:
class clr:
    S = Style.BRIGHT + Fore.LIGHTRED_EX
    E = Style.RESET_ALL

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Library</h1>
</div>

In [5]:
def read_data(path: str, analyze:bool=True) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    data_dir = Path(path)

    train = pd.read_csv(data_dir / "train.csv")
    test = pd.read_csv(data_dir / "test.csv")
    submission_df = pd.read_csv(data_dir / "sample_submission.csv")

    if analyze:
        print(clr.S + "=== Shape of Data ==="+clr.E)
        print(f" train data: Rows={train.shape[0]}, Columns={train.shape[1]}")
        print(f" test data : Rows={test.shape[0]}, Columns={test.shape[1]}")

        print(clr.S + "\n=== Train Data: First 5 Rows ===\n"+clr.E)
        display(train.head())
        print(f"\n{clr.S}=== Train Column Names ==={clr.E}\n")
        display(train.columns)
        print(f"\n{clr.S}=== Features/Explanatory Variables ==={clr.E}\n")
        eval_features(train)
        print(f"\n{clr.S}=== Skewness ==={clr.E}\n")
        check_skew(train)
    return train, test, submission_df

def create_submission(model_name: str, target, preds, seed:int=42, nfolds:int=5) -> pd.DataFrame:
    sample_submission[target] = preds #.astype(int)

    if len(model_name) > 0:
        fname = f"submission_{model_name}_k{nfolds}_s{seed}.csv"
    else:
        fname = "submission.csv"

    sample_submission.to_csv(fname, index=False)

    return sample_submission

def show_classification_scores(ground_truth:List[int], yhat:List[int]) -> None:
    accuracy = metrics.accuracy_score(ground_truth, yhat)
    precision = metrics.precision_score(ground_truth, yhat)
    recall = metrics.recall_score(ground_truth, yhat)
    roc = metrics.roc_auc_score(ground_truth, yhat)
    f1 = metrics.f1_score(ground_truth, yhat)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"ROC: {roc:.4f}")
    print(f"f1: {f1:.4f}")
    

def label_encoder(train:pd.DataFrame, test:pd.DataFrame, columns:List[str]) -> (pd.DataFrame, pd.DataFrame) :
    for col in columns:
        train[col] = train[col].astype(str)
        test[col] = test[col].astype(str)
        train[col] = preprocessing.LabelEncoder().fit_transform(train[col])
        test[col] = preprocessing.LabelEncoder().fit_transform(test[col])
    return train, test   

def create_strat_folds(df:pd.DataFrame, TARGET, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    print(f"TARGET={TARGET}, n_folds={n_folds}, seed={seed}")
    df["fold"] = -1

    kf = model_selection.StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    # kf = GroupKFold(n_splits=Config.N_FOLDS)
    for fold, (train_idx, valid_idx) in enumerate(kf.split(df, df[TARGET])):
        df.loc[valid_idx, "fold"] = fold

    # df.to_csv(f"train_fold{num_folds}.csv", index=False)
    return df


def create_folds(df:pd.DataFrame, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    print(f"n_folds={n_folds}, seed={seed}")
    df["fold"] = -1

    kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=seed)

    for fold, (train_idx, valid_idx) in enumerate(kf.split(df)):
        df.loc[valid_idx, "fold"] = fold

    # df.to_csv(f"train_fold{num_folds}.csv", index=False)
    return df

def show_fold_scores(scores: List[float]) -> (float, float):
    cv_score = np.mean(scores)  # Used in filename
    std_dev = np.std(scores)
    print(
        f"Scores -> Adjusted: {np.mean(scores) - np.std(scores):.8f} , mean: {np.mean(scores):.8f}, std: {np.std(scores):.8f}"
    )
    return cv_score, std_dev


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    continuous_features = list(df.select_dtypes(include=['int64', 'float64', 'uint8']).columns)
    categorical_features = list(df.select_dtypes(include=['object', 'bool']).columns)
    if display:
        print(f"{clr.S}Continuous Features={continuous_features}{clr.E}\n")
        print(f"{clr.S}Categorical Features={categorical_features}{clr.E}")
    return continuous_features, categorical_features   

def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    print("=== Cardinality ===")
    print(df[features].nunique())

## === Model Support ===    

from scipy.stats import mode


def merge_test_predictions(final_test_predictions:List[float], calc_probability:bool=True) -> List[float]:

    if calc_probability:
        print("Mean")
        result = np.mean(np.column_stack(final_test_predictions), axis=1)
    else:
        print("Mode")
        mode_result = mode(np.column_stack(final_test_predictions), axis=1)
        result = mode_result[0].ravel()

    return result

def summary_statistics(X:pd.DataFrame, enhanced=True) -> None:
    desc = X.describe()
    if enhanced:
        desc.loc["var"] = X.var(numeric_only=True).tolist()
        desc.loc["skew"] = X.skew(numeric_only=True).tolist()
        desc.loc["kurt"] = X.kurtosis(numeric_only=True).tolist()

    with pd.option_context("display.precision", 2):
        style = desc.transpose().style.background_gradient(
            cmap="coolwarm"
        )  # .set_precision(4)
    display(style)
    
def show_missing_features(df:pd.DataFrame) -> None:
    missing_vals = df.isna().sum().sort_values(ascending=False)
    print(missing_vals[missing_vals > 0])


def show_duplicate_records(df:pd.DataFrame) -> None:
    dups = df.duplicated()
    print(dups.sum())


def eval_features(df:pd.DataFrame) -> (List[str], List[str], List[str]):
    ## Separate Categorical and Numerical Features
    categorical_features = list(
        df.select_dtypes(include=["category", "object"]).columns
    )
    continuous_features = list(df.select_dtypes(include=["number"]).columns)

    print(f"{clr.S}Continuous features:{clr.E} {continuous_features}")
    print(f"{clr.S}Categorical features:{clr.E} {categorical_features}")
    print("\n --- Cardinality of Categorical Features ---\n")

    for feature in categorical_features:
        cardinality = df[feature].nunique()
        if cardinality < 10:
            print(f"{clr.S}{feature}{clr.E}: cardinality={cardinality}, {df[feature].unique()}")
        else:
            print(f"{clr.S}{feature}{clr.E}: cardinality={cardinality}")
    all_features = categorical_features + continuous_features
    return all_features, categorical_features, continuous_features


def show_feature_importance(feature_importance_lst:List[str]) -> None:
    fis_df = pd.concat(feature_importance_lst, axis=1)

    fis_df.sort_values("0_importance", ascending=True).head(40).plot(
        kind="barh", figsize=(12, 12), title="Feature Importance Across Folds"
    )
    plt.show()


def show_feature_target_crosstab(df:pd.DataFrame, feature_lst:List[str], target:str) -> None:
    for feature in feature_lst:
        print(f"\n=== {feature} vs {target} ===\n")
        display(
            pd.crosstab(df[feature], df[target], margins=True)
        )  # display keeps bold formatting


def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    print(f"{clr.S}=== Cardinality ==={clr.E}")
    print(df[features].nunique())


def show_unique_features(df:pd.DataFrame, features:List[str]) -> None:
    for col in features:
        print(col, sorted(df[col].dropna().unique()))


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    continuous_features = list(
        df.select_dtypes(include=["int64", "float64", "uint8"]).columns
    )
    categorical_features = list(df.select_dtypes(include=["object", "bool"]).columns)
    if display:
        print(f"{clr.S}Continuous Features={clr.E}{continuous_features}\n")
        print(f"{clr.S}Categorical Features={clr.E}{categorical_features}")
    return continuous_features, categorical_features


def describe(X:pd.DataFrame) -> None:
    """Deprecated: Use summary_statistics()"""
    desc = X.describe()
    desc.loc['var'] = X.var(numeric_only=True).tolist()
    desc.loc['skew'] = X.skew(numeric_only=True).tolist()
    desc.loc['kurt'] = X.kurtosis(numeric_only=True).tolist()

    with pd.option_context('display.precision', 2):
        style = desc.transpose().style.background_gradient(cmap='coolwarm') #.set_precision(4)
    display(style)
  

def check_skew(df:pd.DataFrame) -> None:
    skew = df.skew(skipna=True,numeric_only=True).sort_values(ascending=False)
    print(skew)
    
def gpu_ify_lgbm(lgbm_dict):
    if Config.gpu:
        lgbm_dict["device"] = "gpu"
        lgbm_dict["boosting_type"] = "gbdt"
        lgbm_dict["gpu_platform_id"] = 0
        lgbm_dict["gpu_device_id"] = 0
    return lgbm_dict

def gpu_ify_cb(params):
    if Config.gpu:
        params["task_type"] = "GPU"
    return params    


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Optuna Hyperparameter Optimization Library</h1>
</div>

In [6]:
def objective_xgb(trial, X_train, X_valid, y_train, y_valid):

    xgb_params = {
        #         "objective": trial.suggest_categorical("objective", ["multi:softmax"]),
        #         "eval_metric": "mlogloss",
        #         "objective": "multi:softmax",
#         "eval_metric": "rmse",  # auc, rmse, mae
        "eval_metric": trial.suggest_categorical("eval_metric", ["rmse", "mae"]),
        "objective": trial.suggest_categorical("objective", ["reg:squarederror"]), # "reg:squarederror",
        #         "enable_categorical": trial.suggest_categorical("use_label_encoder", [True]),
        "use_label_encoder": trial.suggest_categorical("use_label_encoder", [False]),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 5000, 100),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "max_depth": trial.suggest_int("max_depth", 1, 20),  # 10
        "gamma": trial.suggest_float("gamma", 0, 100, step=0.1),
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "tree_method": trial.suggest_categorical(
            "tree_method", ["gpu_hist"]
        ),  # hist, gpu_hist
        "predictor": "gpu_predictor",
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 100),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }

    # Model loading and training
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=5000,
        verbose=0,
    )

    print(f"Number of boosting rounds: {model.best_iteration}")
    #     oof = model.predict_proba(X_valid)[:, 1] # Probability
    oof = model.predict(X_valid)  # Classification: 0,1

    return metrics.mean_squared_error(y_valid, oof, squared=False)


def objective_lgbm(trial, X_train, X_valid, y_train, y_valid):

    lgbm_params = {
        "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [1_000]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [5000]),
        "n_estimators": trial.suggest_int("n_estimators", 700, 5000),
        "importance_type": "gain",
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 300),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        #         'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }

    # Model loading and training
    model = lgb.LGBMRegressor(**lgbm_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric="mae",
        callbacks=[
            lgb.log_evaluation(500),
            lgb.early_stopping(500, False, True),
        ],
    )

    #     print(f"Number of boosting rounds: {model.best_iteration}")
    oof = model.predict(X_valid)

    return metrics.mean_squared_error(y_valid, oof, squared=False)
#     return metrics.mean_absolute_error(y_valid, oof)


def objective_clf_lgbm(trial, X_train, X_valid, y_train, y_valid):

    params = {
        "boosting_type": "gbdt",
        # "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),
        #         "objective": trial.suggest_categorical("objective", ["multi:softprob"]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [1_000]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [5000]),
        "n_estimators": trial.suggest_int("n_estimators", 700, 1000),
        "importance_type": "gain",
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 300),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        #         'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }
    if Config.gpu:
        params["device_type"] = "gpu"

    # Model loading and training
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="mae",
        callbacks=[
            lgb.log_evaluation(500),
            lgb.early_stopping(500, False, True),
        ],
    )

    #     print(f"Number of boosting rounds: {model.best_iteration}")
    oof = model.predict(X_valid)

    #     return accuracy_score(y_valid, oof)
    return metrics.roc_auc_score(y_valid, oof)


def objective_cb(trial, X_train, X_valid, y_train, y_valid):

    cb_params = {
        "iterations": 100,
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.1, 1.0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 100),
        "bagging_temperature": trial.suggest_loguniform(
            "bagging_temperature", 0.1, 20.0
        ),
        "random_strength": trial.suggest_float("random_strength", 1.0, 2.0),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
          "use_best_model": True,
#         "task_type": "GPU",
        "random_seed": 42,
    }

    # Model loading and training
    #  model = CatBoostClassifier(**cb_params)
    model = cb.CatBoostRegressor(**cb_params)

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="accuracy",
        early_stopping_rounds=500,
        verbose=False,
    )

#     print(f"Number of boosting rounds: {model.best_iteration}")
    # oof = model.predict_proba(X_valid)[:, 1]
    oof = model.predict(X_valid)  # Classification
    return metrics.mean_squared_error(y_valid, oof, squared=False)
#     return metrics.mean_absolute_error(y_valid, oof)
# 
#     return accuracy_score(y_valid, oof)

def objective_clf_cb(trial, X_train, X_valid, y_train, y_valid):

    cb_params = {
        "iterations": 10,  # 1000
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.1, 1.0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 100),
        "bagging_temperature": trial.suggest_loguniform(
            "bagging_temperature", 0.1, 20.0
        ),
        "random_strength": trial.suggest_float("random_strength", 1.0, 2.0),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
        "use_best_model": True,
#             "task_type": "GPU",
        "random_seed": 42,
    }

    # Model loading and training
    model = cb.CatBoostClassifier(**cb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="accuracy",
        early_stopping_rounds=500,
        verbose=False,
    )

    # print(f"Number of boosting rounds: {model.best_iteration}")
    # oof = model.predict_proba(X_valid)[:, 1]
    oof = model.predict(X_valid)  # Classification

    return metrics.accuracy_score(y_valid, oof)

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Load Train/Test Data and Analyze</h1>
</div>

## Load the following files

 - train.csv - Data used to build our machine learning model
 - test.csv - Data used to build our machine learning model. Does not contain the target variable
 - sample_submission.csv - A file in the proper format to submit test predictions

In [7]:
%%time
train, test, sample_submission = read_data(Config.path, analyze=True)                                

[1m[91m=== Shape of Data ===[0m
 train data: Rows=193573, Columns=11
 test data : Rows=129050, Columns=10
[1m[91m
=== Train Data: First 5 Rows ===
[0m


Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453



[1m[91m=== Train Column Names ===[0m



Index(['id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y',
       'z', 'price'],
      dtype='object')


[1m[91m=== Features/Explanatory Variables ===[0m

[1m[91mContinuous features:[0m ['id', 'carat', 'depth', 'table', 'x', 'y', 'z', 'price']
[1m[91mCategorical features:[0m ['cut', 'color', 'clarity']

 --- Cardinality of Categorical Features ---

[1m[91mcut[0m: cardinality=5, ['Premium' 'Very Good' 'Ideal' 'Good' 'Fair']
[1m[91mcolor[0m: cardinality=7, ['F' 'J' 'G' 'E' 'D' 'H' 'I']
[1m[91mclarity[0m: cardinality=8, ['VS2' 'SI2' 'VS1' 'SI1' 'IF' 'VVS2' 'VVS1' 'I1']

[1m[91m=== Skewness ===[0m

price    1.60558
carat    0.99513
z        0.68567
table    0.61906
x        0.36105
y        0.35676
id       0.00000
depth   -0.27638
dtype: float64
CPU times: user 337 ms, sys: 64.5 ms, total: 401 ms
Wall time: 732 ms


In [8]:
train.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [9]:
original = pd.read_csv("../input/gemstone-price-prediction/cubic_zirconia.csv", index_col=[0])
original = original[-original.depth.isna()]
original.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
1,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
2,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7,984
3,0.9,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289
4,0.42,Ideal,F,VS1,61.6,56.0,4.82,4.8,2.96,1082
5,0.31,Ideal,F,VVS1,60.4,59.0,4.35,4.43,2.65,779


In [10]:
original.shape

(26270, 10)

In [11]:
train['is_original']    = 0
test['is_original']     = 0
original['is_original'] = 1
combined = pd.concat([train, original], ignore_index=True).drop_duplicates()
train = combined

In [12]:
combined.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price,is_original
0,0.0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619,0
1,1.0,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387,0
2,2.0,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772,0
3,3.0,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666,0
4,4.0,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453,0


In [13]:
summary_statistics(train.drop(columns=[ID], axis=1), enhanced=True)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,var,skew,kurt
carat,219809.0,0.79,0.46,0.2,0.4,0.7,1.03,4.5,0.22,1.01,0.63
depth,219809.0,61.81,1.13,50.8,61.2,61.9,62.4,73.6,1.27,-0.24,3.07
table,219809.0,57.25,1.96,49.0,56.0,57.0,58.0,79.0,3.84,0.66,1.04
x,219809.0,5.72,1.11,0.0,4.7,5.7,6.52,10.23,1.24,0.36,-0.78
y,219809.0,5.72,1.11,0.0,4.71,5.72,6.51,58.9,1.23,0.85,23.12
z,219809.0,3.53,0.69,0.0,2.9,3.53,4.03,31.3,0.48,0.65,11.15
price,219809.0,3965.19,4032.64,326.0,949.0,2398.0,5405.0,18818.0,16262215.44,1.61,2.11
is_original,219809.0,0.12,0.32,0.0,0.0,0.0,0.0,1.0,0.11,2.35,3.51


## Outlier Detection

In [14]:
# https://www.kaggle.com/code/lyasdemir/best-algorithm-for-prediction-xgboost
    
def iqr(data:pd.DataFrame, var:str):# outliers detecion .
    q1 = np.quantile(data[var], 0.25)
    q3 = np.quantile(data[var], 0.75)
    diff = q3 - q1
    lower_t = q1 - (1.5 * diff)
    upper_t = q3 + (1.5 * diff)
    return data[(data[var] < lower_t) | (data[var] > upper_t)]

# iqr(train, "squareMeters")

In [15]:
# https://www.kaggle.com/code/sujithmandala/playground-s3-e8-ensemble-model-98-accuracy

def detect_outliers(data:pd.DataFrame) -> pd.DataFrame:
    outlier_percents = {}
    for column in data.columns:
        if data[column].dtype != object:
            q1 = np.quantile(data[column], 0.25)
            q3 = np.quantile(data[column], 0.75)
            iqr = q3 - q1
            upper_bound = q3 + (1.5 * iqr)
            lower_bound = q1 - (1.5 * iqr)
            outliers = data[(data[column] > upper_bound) | (data[column] < lower_bound)][column]
            outlier_percentage = len(outliers) / len(data[column]) * 100
            outlier_percents[column] = outlier_percentage
            outlier_dataframe = pd.DataFrame(data = outlier_percents.values() ,index=outlier_percents.keys() ,columns=['Outlier_percentage'])
    
    return outlier_dataframe.sort_values(by = 'Outlier_percentage', ascending = False)

detect_outliers(train)


Unnamed: 0,Outlier_percentage
is_original,11.93582
price,6.53067
depth,4.58989
carat,3.96799
table,2.54721
z,0.0182
x,0.01456
y,0.01319
id,0.0


In [16]:
# https://www.kaggle.com/code/sujithmandala/playground-s3-e8-ensemble-model-98-accuracy
    
def detect_outliers(data:pd.DataFrame) -> pd.DataFrame:
    outlier_percents = {}
    for column in data.columns:
        if data[column].dtype != object:
            q1 = np.quantile(data[column], 0.25)
            q3 = np.quantile(data[column], 0.75)
            iqr = q3 - q1
            upper_bound = q3 + (1.5 * iqr)
            lower_bound = q1 - (1.5 * iqr)
            outliers = data[(data[column] > upper_bound) | (data[column] < lower_bound)][column]
            outlier_percentage = len(outliers) / len(data[column]) * 100
            outlier_percents[column] = outlier_percentage
            outlier_dataframe = pd.DataFrame(data = outlier_percents.values() ,index=outlier_percents.keys() ,columns=['Outlier_percentage'])
    
    return outlier_dataframe.sort_values(by = 'Outlier_percentage', ascending = False)

detect_outliers(test)


Unnamed: 0,Outlier_percentage
depth,5.06083
carat,3.92096
table,2.30918
z,0.01937
x,0.00697
y,0.00697
id,0.0
is_original,0.0


In [17]:
# iqr(train,"floors")

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Feature Engineering</h1>
</div>

## Categorical/Numerical Variables

In [18]:
# train.drop(['cityCode'], axis=1, inplace=True)
# test.drop(['cityCode'], axis=1, inplace=True)

## Handle Outliers
- https://www.kaggle.com/code/lyasdemir/best-algorithm-for-prediction-xgboost
- https://www.kaggle.com/code/mnokno/paris-housing-price-prediction-using-xgboost

In [19]:
# features_with_outliers = ['attic', 'garage', 'made', 'basement', 'floors', 'cityCode', 'squareMeters']
# features_with_outliers = ['attic', 'garage', 'made', 'basement', 'floors',  'squareMeters']

In [20]:
# https://www.kaggle.com/code/mnokno/paris-housing-price-prediction-using-xgboost

def remove_outliers(df:pd.DataFrame) -> pd.DataFrame:
    for c in features_with_outliers:
        if c == 'garage':
            first_percentile = df[c].quantile(0.001)
            df = df[df[c] > first_percentile]

        ninety_ninth_percentile = df[c].quantile(0.999)
        df = df[df[c] < ninety_ninth_percentile]
        #df_t = df_t[(df_t[c] > first_percentile) & (df_t[c] < ninety_ninth_percentile)]
    return df


In [21]:
# print(f'Before: {len(train)}')
# train = remove_outliers(train)
# print(f'After: {len(train)}')

In [22]:
train.head(10)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price,is_original
0,0.0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619,0
1,1.0,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387,0
2,2.0,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772,0
3,3.0,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666,0
4,4.0,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453,0
5,5.0,1.51,Very Good,J,SI1,62.8,58.0,7.34,7.29,4.59,7506,0
6,6.0,0.74,Ideal,E,VS2,61.8,57.0,5.76,5.79,3.57,3229,0
7,7.0,1.34,Premium,G,SI2,62.5,57.0,7.0,7.05,4.38,6224,0
8,8.0,0.3,Ideal,F,IF,62.0,56.0,4.35,4.37,2.7,886,0
9,9.0,0.3,Good,J,VS1,63.6,57.0,4.26,4.28,2.72,421,0


In [23]:
train = train.reset_index(drop=True).copy()
train.head(10)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price,is_original
0,0.0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619,0
1,1.0,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387,0
2,2.0,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772,0
3,3.0,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666,0
4,4.0,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453,0
5,5.0,1.51,Very Good,J,SI1,62.8,58.0,7.34,7.29,4.59,7506,0
6,6.0,0.74,Ideal,E,VS2,61.8,57.0,5.76,5.79,3.57,3229,0
7,7.0,1.34,Premium,G,SI2,62.5,57.0,7.0,7.05,4.38,6224,0
8,8.0,0.3,Ideal,F,IF,62.0,56.0,4.35,4.37,2.7,886,0
9,9.0,0.3,Good,J,VS1,63.6,57.0,4.26,4.28,2.72,421,0


In [24]:
excluded_features = [TARGET, ID, "fold"]

In [25]:
cont_features, cat_features = feature_distribution_types(train, display=True)
show_cardinality(train, cat_features)

cont_features = [feature for feature in cont_features if feature not in excluded_features]
cat_features = [feature for feature in cat_features if feature not in excluded_features]

FEATURES = cont_features + cat_features
FEATURES

[1m[91mContinuous Features=[0m['id', 'carat', 'depth', 'table', 'x', 'y', 'z', 'price', 'is_original']

[1m[91mCategorical Features=[0m['cut', 'color', 'clarity']
[1m[91m=== Cardinality ===[0m
cut        5
color      7
clarity    8
dtype: int64


['carat',
 'depth',
 'table',
 'x',
 'y',
 'z',
 'is_original',
 'cut',
 'color',
 'clarity']

In [26]:
# train, test = label_encoder(train, test, cat_features)
train = pd.get_dummies(train,columns=['cut','color','clarity']) # Will remove original feature names
test = pd.get_dummies(test,columns=['cut','color','clarity'])

In [27]:
train.head()

Unnamed: 0,id,carat,depth,table,x,y,z,price,is_original,cut_Fair,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0.0,1.52,62.2,58.0,7.27,7.33,4.55,13619,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1.0,2.03,62.0,58.0,8.06,8.12,5.05,13387,0,0,...,0,1,0,0,0,1,0,0,0,0
2,2.0,0.7,61.2,57.0,5.69,5.73,3.5,2772,0,0,...,0,0,0,0,0,0,1,0,0,0
3,3.0,0.32,61.6,56.0,4.38,4.41,2.71,666,0,0,...,0,0,0,0,0,0,1,0,0,0
4,4.0,1.7,62.6,59.0,7.65,7.61,4.77,14453,0,0,...,0,0,0,0,0,0,0,1,0,0


In [28]:
cont_features, cat_features = feature_distribution_types(train, display=True)
show_cardinality(train, cat_features)

cont_features = [feature for feature in cont_features if feature not in excluded_features]
cat_features = [feature for feature in cat_features if feature not in excluded_features]

FEATURES = cont_features + cat_features
FEATURES

[1m[91mContinuous Features=[0m['id', 'carat', 'depth', 'table', 'x', 'y', 'z', 'price', 'is_original', 'cut_Fair', 'cut_Good', 'cut_Ideal', 'cut_Premium', 'cut_Very Good', 'color_D', 'color_E', 'color_F', 'color_G', 'color_H', 'color_I', 'color_J', 'clarity_I1', 'clarity_IF', 'clarity_SI1', 'clarity_SI2', 'clarity_VS1', 'clarity_VS2', 'clarity_VVS1', 'clarity_VVS2']

[1m[91mCategorical Features=[0m[]
[1m[91m=== Cardinality ===[0m
Series([], dtype: float64)


['carat',
 'depth',
 'table',
 'x',
 'y',
 'z',
 'is_original',
 'cut_Fair',
 'cut_Good',
 'cut_Ideal',
 'cut_Premium',
 'cut_Very Good',
 'color_D',
 'color_E',
 'color_F',
 'color_G',
 'color_H',
 'color_I',
 'color_J',
 'clarity_I1',
 'clarity_IF',
 'clarity_SI1',
 'clarity_SI2',
 'clarity_VS1',
 'clarity_VS2',
 'clarity_VVS1',
 'clarity_VVS2']

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Optuna Hyperparameter Optimization</h1>
</div>

In [29]:
%%time

objective_direction = "minimize"  # minimize, maximize

if Config.optimize:
    y = train[TARGET]
    X = train[FEATURES].copy()

    X_test = test[FEATURES].copy()
    X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
        X, y, test_size=0.2, random_state=Config.seed
    )

# === XGB ===

time_limit = 3600 * 3
best_xgb_params = {}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction)
    study.optimize(
        lambda trial: objective_xgb(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best XGB trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_xgb_params = study.best_trial.params

## === LGBM ===

time_limit = 3600 * 3
best_lgbm_params = {}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction) # minimize, maximize
    study.optimize(
        lambda trial: objective_lgbm(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best LGBM trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_lgbm_params = study.best_trial.params

## === CatBoost

time_limit = 3600 * 3
# best_cb_params = {}
best_cb_params = {'learning_rate': 0.45743264601999495,
                  'l2_leaf_reg': 41.338946049390074,
                  'bagging_temperature': 0.3472567739474319,
                  'random_strength': 1.7332249677756242, 
                  'depth': 1,
                  'min_data_in_leaf': 6}

if Config.optimize:
    study = optuna.create_study(direction=objective_direction) # minimize, maximize
    study.optimize(
        lambda trial: objective_cb(trial, X_train, X_valid, y_train, y_valid),
        n_trials=Config.n_optuna_trials,
        # timeout=time_limit,  # this or n_trials
    )

if Config.optimize:
    print("Number of finished trials:", len(study.trials))
    print("Best Cat trial parameters:", study.best_trial.params)
    print("Best score:", study.best_value)
    best_cb_params = study.best_trial.params

[32m[I 2023-03-04 00:56:01,225][0m A new study created in memory with name: no-name-f34f8133-a810-4424-91d4-ef7eaab4fd25[0m
[32m[I 2023-03-04 00:56:36,930][0m Trial 0 finished with value: 575.6519633704363 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 1600, 'learning_rate': 0.09220331193948289, 'subsample': 0.55, 'colsample_bytree': 0.52, 'max_depth': 9, 'gamma': 1.7000000000000002, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 3.7598889365637425e-05, 'reg_alpha': 0.007002819211567355, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.12174826697804277}. Best is trial 0 with value: 575.6519633704363.[0m


Number of boosting rounds: 83


[32m[I 2023-03-04 00:56:43,120][0m Trial 1 finished with value: 638.4441047835744 and parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 2100, 'learning_rate': 0.01984585004023678, 'subsample': 0.7, 'colsample_bytree': 0.8700000000000001, 'max_depth': 5, 'gamma': 62.300000000000004, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 0.00018888717202660398, 'reg_alpha': 0.0013610266529830706, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 658.8287731829346}. Best is trial 0 with value: 575.6519633704363.[0m
[32m[I 2023-03-04 00:56:43,124][0m A new study created in memory with name: no-name-5d4e5412-ea80-44d9-a471-8b308a26a1a0[0m


Number of boosting rounds: 2099
Number of finished trials: 2
Best XGB trial parameters: {'eval_metric': 'rmse', 'objective': 'reg:squarederror', 'use_label_encoder': False, 'n_estimators': 1600, 'learning_rate': 0.09220331193948289, 'subsample': 0.55, 'colsample_bytree': 0.52, 'max_depth': 9, 'gamma': 1.7000000000000002, 'booster': 'gbtree', 'tree_method': 'gpu_hist', 'reg_lambda': 3.7598889365637425e-05, 'reg_alpha': 0.007002819211567355, 'random_state': 42, 'n_jobs': 4, 'min_child_weight': 0.12174826697804277}
Best score: 575.6519633704363
Training until validation scores don't improve for 500 rounds
[500]	training's l1: 291.96	valid_1's l1: 302.373
[1000]	training's l1: 284.578	valid_1's l1: 299.833
[1500]	training's l1: 280.338	valid_1's l1: 298.989
[2000]	training's l1: 277.051	valid_1's l1: 298.633
[2500]	training's l1: 274.459	valid_1's l1: 298.35
[3000]	training's l1: 272.258	valid_1's l1: 298.188
[3500]	training's l1: 270.363	valid_1's l1: 298.211
Early stopping, best iteratio

[32m[I 2023-03-04 01:00:06,383][0m Trial 0 finished with value: 605.4833880300791 and parameters: {'objective': 'mae', 'n_estimators': 4785, 'reg_alpha': 2.785891554018794, 'reg_lambda': 0.07203758150086396, 'colsample_bytree': 0.6200000000000001, 'num_leaves': 351, 'feature_fraction': 0.7541092032437099, 'bagging_fraction': 0.3691098185392664, 'bagging_freq': 4, 'min_child_samples': 265, 'subsample': 0.33999999999999997, 'learning_rate': 0.02449109373119847, 'max_depth': 99, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 605.4833880300791.[0m


Training until validation scores don't improve for 500 rounds
[500]	training's l1: 315.877	training's rmse: 633.151	valid_1's l1: 324.959	valid_1's rmse: 656.779
[1000]	training's l1: 306.45	training's rmse: 615.242	valid_1's l1: 318.61	valid_1's rmse: 644.032
[1500]	training's l1: 301.005	training's rmse: 603.895	valid_1's l1: 315.142	valid_1's rmse: 635.912
[2000]	training's l1: 297.56	training's rmse: 595.556	valid_1's l1: 313.878	valid_1's rmse: 631.211
[2500]	training's l1: 294.826	training's rmse: 589.207	valid_1's l1: 312.91	valid_1's rmse: 627.073
[3000]	training's l1: 292.805	training's rmse: 584.226	valid_1's l1: 312.146	valid_1's rmse: 624.436
[3500]	training's l1: 290.821	training's rmse: 579.645	valid_1's l1: 311.824	valid_1's rmse: 622.367
Did not meet early stopping. Best iteration is:
[3627]	training's l1: 290.444	training's rmse: 578.599	valid_1's l1: 311.647	valid_1's rmse: 621.644


[32m[I 2023-03-04 01:02:26,644][0m Trial 1 finished with value: 621.6439085170022 and parameters: {'objective': 'rmse', 'n_estimators': 3634, 'reg_alpha': 0.00019406787282598976, 'reg_lambda': 0.007105230774213588, 'colsample_bytree': 0.36, 'num_leaves': 101, 'feature_fraction': 0.32051905442111706, 'bagging_fraction': 0.18687108293667598, 'bagging_freq': 3, 'min_child_samples': 278, 'subsample': 0.98, 'learning_rate': 0.06459849171817288, 'max_depth': 30, 'random_state': 42, 'n_jobs': 4}. Best is trial 0 with value: 605.4833880300791.[0m
[32m[I 2023-03-04 01:02:26,647][0m A new study created in memory with name: no-name-b148c09c-baba-4f8b-a6a8-6414dc176d14[0m


Number of finished trials: 2
Best LGBM trial parameters: {'objective': 'mae', 'n_estimators': 4785, 'reg_alpha': 2.785891554018794, 'reg_lambda': 0.07203758150086396, 'colsample_bytree': 0.6200000000000001, 'num_leaves': 351, 'feature_fraction': 0.7541092032437099, 'bagging_fraction': 0.3691098185392664, 'bagging_freq': 4, 'min_child_samples': 265, 'subsample': 0.33999999999999997, 'learning_rate': 0.02449109373119847, 'max_depth': 99, 'random_state': 42, 'n_jobs': 4}
Best score: 605.4833880300791


[32m[I 2023-03-04 01:02:29,201][0m Trial 0 finished with value: 593.7915047962265 and parameters: {'learning_rate': 0.47519106891090757, 'l2_leaf_reg': 14.070141508803138, 'bagging_temperature': 4.158457154571016, 'random_strength': 1.8810943167136995, 'depth': 5, 'min_data_in_leaf': 252}. Best is trial 0 with value: 593.7915047962265.[0m
[32m[I 2023-03-04 01:02:33,245][0m Trial 1 finished with value: 576.139243347044 and parameters: {'learning_rate': 0.21885400853822787, 'l2_leaf_reg': 2.3992501495776275, 'bagging_temperature': 5.783677603931897, 'random_strength': 1.3629223890741367, 'depth': 9, 'min_data_in_leaf': 116}. Best is trial 1 with value: 576.139243347044.[0m


Number of finished trials: 2
Best Cat trial parameters: {'learning_rate': 0.21885400853822787, 'l2_leaf_reg': 2.3992501495776275, 'bagging_temperature': 5.783677603931897, 'random_strength': 1.3629223890741367, 'depth': 9, 'min_data_in_leaf': 116}
Best score: 576.139243347044
CPU times: user 10min 43s, sys: 45.7 s, total: 11min 29s
Wall time: 6min 32s


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Train Models with Cross Validation</h1>
</div>

In [30]:
train = create_folds(train, Config.N_FOLDS)
# train = create_strat_folds(train, TARGET, Config.N_FOLDS)

n_folds=5, seed=42


In [31]:
all_cv_scores = pd.DataFrame(
    {
        "Model": pd.Series(dtype="str"),
        "Score": pd.Series(dtype="float"),
        "StdDev": pd.Series(dtype="float"),
        "RunTime": pd.Series(dtype="float"),
    }
)

oof = train[[ID, TARGET, "fold"]].copy().reset_index(drop=True).copy()
oof.set_index(ID, inplace=True)
oof.head()

Unnamed: 0_level_0,price,fold
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,13619,1
1.0,13387,2
2.0,2772,3
3.0,666,2
4.0,14453,0


In [32]:
def show_tree_model_fi(model, features:List[str]) -> None:
    print("\n=== Model Feature Importance ===")
    for i in model.feature_importances_.argsort()[::-1]:
        print(features[i], model.feature_importances_[i]/model.feature_importances_.sum())

def save_oof_predictions(model_name:str, final_valid_predictions, oof:pd.DataFrame) -> pd.DataFrame:
    final_valid_predictions_df = process_valid_predictions(
        final_valid_predictions, ID, model_name
    )
    display(final_valid_predictions_df.head())
    oof[f"pred_{model_name}"] = final_valid_predictions_df[f"pred_{model_name}"]

    return oof

def save_test_predictions(model_name:str, final_test_predictions, submission_df:pd.DataFrame, result_field:str=TARGET) -> None:
    result = merge_test_predictions(final_test_predictions, Config.calc_probability)
    # result[:20]
    submission_df[f"target_{model_name}"] = result #.astype(int)
    #     submission_df.head(10)
    ss = submission_df[[ID, f"target_{model_name}"]].copy().reset_index(drop=True)
    ss.rename(columns={f"target_{model_name}": result_field}, inplace=True)
    ss.to_csv(
        f"submission_{model_name}.csv", index=False
    )  # Can submit the individual model
    print("=== Target Value Counts ===")
#     display(ss[TARGET].value_counts())
    ss.head(10)

def process_valid_predictions(final_valid_predictions, train_id, model_name:str) -> pd.DataFrame:
    model = f"pred_{model_name}"
    final_valid_predictions_df = pd.DataFrame.from_dict(
        final_valid_predictions, orient="index"
    ).reset_index()
    final_valid_predictions_df.columns = [train_id, model]
    final_valid_predictions_df.set_index(train_id, inplace=True)
    final_valid_predictions_df.sort_index(inplace=True)
    final_valid_predictions_df.to_csv(f"train_pred_{model_name}.csv", index=True)

    return final_valid_predictions_df

def add_score(score_df:pd.DataFrame, model_name:str, score:float, std:float):
    dict1 = {"Model": model_name, "Score": cv_score, "StdDev": std_dev}
    score_df = score_df.append(dict1, ignore_index=True)
    return score_df

In [33]:
def train_cv_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid,
    params,
    n_folds:int=5,
    seed:int=42,
):

    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(
            drop=True
        )  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)
        xtest = test.copy()

        valid_ids = xvalid.id.values.tolist()  # Id's of everything in validation fold

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[FEATURES]
        xvalid = xvalid[FEATURES]

        scaler = preprocessing.StandardScaler()
#         scaler = preprocessing.MinMaxScaler()
        xtrain = scaler.fit(xtrain).transform(xtrain)
        xvalid = scaler.transform(xvalid)
        xtest = scaler.transform(xtest)

        model = get_model_fn # ()

        model.fit(
            xtrain,
            ytrain,
        )
        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        preds_valid_class = model.predict(xvalid)
        
        final_test_predictions.append(test_preds)
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

#         fold_score = metrics.accuracy_score(yvalid, preds_valid_class)  # Validation Set Score
        fold_score = metrics.mean_absolute_error(
            yvalid, preds_valid
        ) 
#         fold_score = metrics.roc_auc_score(yvalid.values, preds_valid)  # Validation Set Score
#         show_classification_scores(yvalid.values, preds_valid_class)

#         fold_score = metrics.roc_auc_score(yvalid, preds_valid)  # Validation Set Score
#         fold_score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        fold_scores.append(fold_score)
        #         importance_list.append(model.coef_.ravel())

        fi = []
        # Feature importance
#         fi = pd.DataFrame(
#             index=FEATURES,
#             data=model.coef_.ravel(),
#             columns=[f"{fold}_importance"],
#         )
        
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )


def train_xgb_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid:str,
    params,
    n_folds:int=5,
    seed:int=42,
):

    print(params)
    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(
            drop=True
        )  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)
        xtest = test.copy()

        valid_ids = xvalid.id.values.tolist()  # Id's of everything in validation fold

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[FEATURES]
        xvalid = xvalid[FEATURES]

        model = get_model_fn # (params)

        model.fit(
            xtrain,
            ytrain,
            eval_set=[(xvalid, yvalid)],
            #             eval_metric="acc",  # auc
            verbose=0,
            #             early_stopping_rounds=3000,
            #             callbacks=[
            #                 xgb.log_evaluation(0),
            #                 xgb.early_stopping(500, False, True),
            #             ],
        )

        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        preds_valid_class = model.predict(xvalid)
        
        final_test_predictions.append(test_preds)
        if Config.debug:
            print(f"GT Type: {type(yvalid.values)}")
            print(f"Preds Type: {type(preds_valid_class)}")
            print(f"         GT:{yvalid.values[:20]}")
            print(f"Preds Class:{preds_valid_class[:20]}")
            print(f"Preds Prob:{preds_valid[:20]}")
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid_class)))

#         fold_score = metrics.cohen_kappa_score(yvalid,  preds_valid_class, weights = "quadratic")
#         fold_score = metrics.roc_auc_score(yvalid.values, preds_valid)  # Validation Set Score
#         show_classification_scores(yvalid.values, preds_valid_class)
        fold_score = metrics.mean_absolute_error(
            yvalid, preds_valid
        )  # Validation Set Score
#         fold_score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        fold_scores.append(fold_score)

        # Feature importance
        fi = pd.DataFrame(
            index=FEATURES,
            data=model.feature_importances_,
            columns=[f"{fold}_importance"],
        )
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )        

In [34]:
def run_linear_model(model_dict, model_name:str, features:List[str], oof:pd.DataFrame) -> (float, float, pd.DataFrame):
    (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_cv_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        False, #Config.calc_probability,
        ID,
        {},
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    oof.head()
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof


def run_tree_model(model_dict, model_name:str, features:List[str], params, oof:pd.DataFrame) -> (float, float, pd.DataFrame):
    (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_xgb_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        Config.calc_probability,
        ID,
        params,
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)
    show_tree_model_fi(model, features)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    oof.head()
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof

In [35]:
%%time

def run_models4features(model_dict, model_lst:List[str], target:str, feature_lst:List[str], all_cv_scores:pd.DataFrame, linear_models:bool=True) -> pd.DataFrame:

    oof = train[[ID, target, "fold"]].copy().reset_index(drop=True).copy()
    oof.set_index(ID, inplace=True)

    for idx, m in enumerate(model_lst):
        model = model_lst[idx]
        start_time = time.time()

        print(f"Model={model}")

        params = {}
        if linear_models:
                cv_score, std_dev, oof = run_linear_model(model_dict, model, feature_lst, oof)

        else:
            cv_score, std_dev, oof = run_tree_model(model_dict, model, feature_lst, params, oof)

        run_time = time.time() - start_time

        score_dict = {"Model": model, "Score": cv_score, "StdDev": std_dev, "RunTime": run_time}
        all_cv_scores = all_cv_scores.append(score_dict, ignore_index=True)
        print(f"Model Run Time: {run_time:.2f}")

    return all_cv_scores




CPU times: user 16 µs, sys: 0 ns, total: 16 µs
Wall time: 21 µs


In [36]:
lgbm_params = {'n_estimators': Config.N_ESTIMATORS,
                 'num_rounds': 404,
                 'learning_rate': 0.19,
                 'num_leaves': 17,
                 'max_depth': 8,
                 'min_data_in_leaf': 36,
                 'lambda_l1': 0.96,
                 'lambda_l2': 0.01,
                 'min_gain_to_split': 11.32,
                 'bagging_fraction': 0.6,
                 'feature_fraction': 0.9}


lgbm_params3 = {
    "n_estimators": Config.N_ESTIMATORS,
    'max_depth': 9,
    'learning_rate': 0.01,
    'min_data_in_leaf': 36, 
    'num_leaves': 100, 
    'feature_fraction': 0.8, 
    'bagging_fraction': 0.89, 
    'bagging_freq': 5, 
    'lambda_l2': 28,
    
    'seed': Config.seed,
    'objective': 'regression',
#     'boosting_type': 'gbdt',
#     'device': 'gpu', 
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0,
    'n_jobs': -1,
    'metric': 'rmse',
    'verbose': -1
}
    
lgbm_params = gpu_ify_lgbm(lgbm_params)

In [37]:
xgb_params = {
    "n_estimators": Config.N_ESTIMATORS,  # 10_000,
    "max_depth": 10,  # 10
    "objective": "reg:squarederror", # Normal dist
#     "objective": "reg:gamma", # Gamma dist
    #     "enable_categorical": True,  # Only works with gpu_hist
    #     "eval_metric": "mae",
    #     "metric": "mae",
    #     "enable_categorical": True,
    "n_jobs": 8,  # 4
    "seed": Config.seed,
    "tree_method": "hist",
    #         "gpu_id": 0,
    "subsample": 0.9,  # 0.7
    "colsample_bytree": 0.7,
    "use_label_encoder": False,
    "learning_rate": 0.05,  # 0.01
}

xgb_params3 = {
    'n_estimators': Config.N_ESTIMATORS,
    'learning_rate': 0.05,
    'max_depth': 10,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:squarederror'
}

xgb_params_gamma = {
    "n_estimators": Config.N_ESTIMATORS,  # 10_000,
    "max_depth": 10,  # 10
    "objective": "reg:gamma", # "reg:gamma", "reg:squarederror"
    #     "enable_categorical": True,  # Only works with gpu_hist
    #     "eval_metric": "mae",
    #     "metric": "mae",
    #     "enable_categorical": True,
    "n_jobs": 8,  # 4
    "seed": Config.seed,
    "tree_method": "hist",
    #         "gpu_id": 0,
    "subsample": 0.9,  # 0.7
    "colsample_bytree": 0.7,
    "use_label_encoder": False,
    "learning_rate": 0.05,  # 0.01
}
if Config.gpu:
    xgb_params["tree_method"] = "gpu_hist"
else:
    xgb_params["tree_method"] = "hist"

In [38]:
cb_params = {
    #     "learning_rate": 0.3277295792305584,
    "learning_rate": 0.05,
    "l2_leaf_reg": 3.1572972266001518,
    "bagging_temperature": 0.6799604234141348,
    "random_strength": 1.99590400593318,
    "depth": 10,
    "min_data_in_leaf": 93,
    # "iterations": 100,  # 10000
    "n_estimators": Config.N_ESTIMATORS,  # 10000
    "use_best_model": True,
    #     "task_type": "GPU",
    "random_seed": Config.seed,
}

cb_params = gpu_ify_cb(cb_params)

In [39]:
model_estimator_dict = {
    "xgb2": xgb.XGBRegressor(**xgb_params),
    "xgb_best_params": xgb.XGBRegressor(**best_xgb_params),
    "xgb3": xgb.XGBRegressor(**xgb_params3),
    "xgb_params_gamma": xgb.XGBRegressor(**xgb_params_gamma),

    "lgbm1": lgb.LGBMRegressor(**lgbm_params),

    "cat1": cb.CatBoostRegressor(),
    "cat2": cb.CatBoostRegressor(**cb_params),
    "cat_best_params": cb.CatBoostRegressor(**best_cb_params),

    "xgb1": xgb.XGBRegressor(),
    "lgbm0": lgb.LGBMRegressor(),
    "lgbm3": lgb.LGBMRegressor(lgbm_params3),
    "lgbm2": lgb.LGBMRegressor(
        learning_rate=0.05,
        max_depth=15,
        num_leaves=11,
        feature_fraction=0.3,
        subsample=0.1,
        n_jobs=-1,
    ),
    "lgbm3": lgb.LGBMRegressor(**lgbm_params),
    "lgbm_best_params": lgb.LGBMRegressor(**best_lgbm_params),


    "lin_reg": linear_model.LinearRegression(),
    "lasso": linear_model.Lasso(),
    "ridge": linear_model.Ridge(max_iter=7000),
    "ridge_25": linear_model.Ridge(fit_intercept=True, solver='auto', alpha=0.25, max_iter=7000),
    "ridge_50": linear_model.Ridge(fit_intercept=True, solver='auto', alpha=0.5, max_iter=7000),
}

## Tree Models

In [40]:
%%time

# model_lst = ["xgb3","xgb_best_params", "lgbm_best_params", "cat_best_params", "xgb1", "xgb2", "lgbm1", "lgbm2", "cat1", "cat2"]
model_lst = ["xgb_best_params", "lgbm_best_params", "cat_best_params", "xgb_params_gamma", "xgb3", "xgb1", "xgb2", "lgbm0", "lgbm1", "lgbm2", "lgbm3", "cat1", "cat2"]
# model_lst = = []
all_cv_scores = run_models4features(model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=False)    

all_cv_scores.sort_values(by=["Score"], ascending=False)

Model=xgb_best_params
{}
fold: 1, Score: 319.2612288096578, Run Time: 46.84
fold: 2, Score: 314.83140588547525, Run Time: 48.15
fold: 3, Score: 316.41031411921847, Run Time: 48.85
fold: 4, Score: 319.3810146999589, Run Time: 50.98
fold: 5, Score: 323.56722375226707, Run Time: 53.13
Scores -> Adjusted: 315.70102916 , mean: 318.69023745, std: 2.98920830

=== Model Feature Importance ===
y 0.36102158
carat 0.13341436
clarity_I1 0.08066696
x 0.05838
clarity_SI2 0.04196569
clarity_VVS2 0.037829366
clarity_IF 0.037190985
color_J 0.03607598
clarity_VVS1 0.03523055
clarity_VS1 0.020747129
clarity_SI1 0.019672653
color_I 0.017734574
color_D 0.016784556
clarity_VS2 0.015575747
color_E 0.015520849
color_G 0.013883084
color_F 0.013213825
z 0.011656929
color_H 0.008479193
cut_Ideal 0.0054299952
depth 0.0032561698
cut_Fair 0.003087725
cut_Premium 0.0028709718
table 0.002728317
is_original 0.0025969257
cut_Good 0.0025568162
cut_Very Good 0.0024291591


Unnamed: 0_level_0,pred_xgb_best_params
id,Unnamed: 1_level_1
0.0,13636.50488
1.0,14036.6709
2.0,2807.94067
3.0,691.91339
4.0,15399.25586


Mode
=== Target Value Counts ===
Model Run Time: 253.01
Model=lgbm_best_params
{}
fold: 1, Score: 298.34634988544605, Run Time: 411.72
fold: 2, Score: 291.68699136367354, Run Time: 407.45
fold: 3, Score: 294.6298882879146, Run Time: 415.62
fold: 4, Score: 295.2367681590484, Run Time: 416.40
fold: 5, Score: 302.0027887576014, Run Time: 421.88
Scores -> Adjusted: 292.86270329 , mean: 296.38055729, std: 3.51785400

=== Model Feature Importance ===
y 0.19699441623199063
x 0.19437849892545422
depth 0.17627267460915438
z 0.168314867916131
carat 0.09673935522415497
table 0.07990218428490399
clarity_SI1 0.007528472137396733
clarity_VS2 0.006438808941285174
color_G 0.005962308054195324
color_E 0.005747036079824783
clarity_SI2 0.005523298353653153
cut_Ideal 0.00545194415990112
cut_Premium 0.005292304268794876
color_F 0.0050153532455878315
color_H 0.0049681869141246235
clarity_VS1 0.004814593988590585
color_D 0.004716633146320845
cut_Very Good 0.004667048028628754
is_original 0.003854335855724239

Unnamed: 0_level_0,pred_lgbm_best_params
id,Unnamed: 1_level_1
0.0,13790.10605
1.0,12188.14926
2.0,2782.11713
3.0,655.11788
4.0,14991.63043


Mode
=== Target Value Counts ===
Model Run Time: 2077.25
Model=cat_best_params
{}
fold: 1, Score: 299.7327796805236, Run Time: 38.84
fold: 2, Score: 293.2686176779583, Run Time: 40.00
fold: 3, Score: 296.34119853812206, Run Time: 41.73
fold: 4, Score: 295.9577939832175, Run Time: 43.42
fold: 5, Score: 302.22313863186616, Run Time: 44.36
Scores -> Adjusted: 294.37679747 , mean: 297.50470570, std: 3.12790823

=== Model Feature Importance ===
carat 0.2916780654970438
y 0.16307030544810144
x 0.14916016563993878
clarity_SI2 0.0955788051393412
z 0.04832496355543635
color_J 0.047347470686482096
color_I 0.041117774406408666
clarity_SI1 0.03810104604713608
color_H 0.020410493913469115
clarity_VVS2 0.01867764931049449
clarity_I1 0.015371387314011959
clarity_VVS1 0.01164449956027548
clarity_VS1 0.010561226370388224
color_D 0.009889198956955576
color_E 0.00898867948877044
clarity_IF 0.007838036798612745
color_F 0.0067744934122351075
depth 0.004185629007063709
cut_Ideal 0.0030745722426278785
table 

Unnamed: 0_level_0,pred_cat_best_params
id,Unnamed: 1_level_1
0.0,13676.42104
1.0,12192.14263
2.0,2901.575
3.0,728.53601
4.0,14872.39225


Mode
=== Target Value Counts ===
Model Run Time: 212.62
Model=xgb_params_gamma
{}
fold: 1, Score: 3898.0606449563315, Run Time: 6.76
fold: 2, Score: 3849.514816166863, Run Time: 7.84
fold: 3, Score: 3899.194363449622, Run Time: 9.09
fold: 4, Score: 3901.365350579137, Run Time: 11.08
fold: 5, Score: 3922.379299632488, Run Time: 11.66
Scores -> Adjusted: 3870.09463834 , mean: 3894.10289496, std: 24.00825662

=== Model Feature Importance ===
carat 0.33110264
y 0.29750657
x 0.24287269
z 0.06549757
clarity_SI1 0.009474104
clarity_SI2 0.007550494
clarity_IF 0.00572446
color_I 0.005653258
color_H 0.0055321204
clarity_VVS1 0.0054904507
color_G 0.003685114
color_J 0.0036021525
clarity_VVS2 0.0025136047
clarity_VS1 0.0023223537
color_E 0.0023187047
color_D 0.0022153168
clarity_VS2 0.0021623692
cut_Very Good 0.0012833503
cut_Good 0.0011209853
clarity_I1 0.00085408404
depth 0.0008469944
color_F 0.0006705894
cut_Premium 0.0
cut_Ideal 0.0
cut_Fair 0.0
is_original 0.0
table 0.0


Unnamed: 0_level_0,pred_xgb_params_gamma
id,Unnamed: 1_level_1
0.0,73.71144
1.0,73.78519
2.0,72.22537
3.0,67.37559
4.0,73.77097


Mode
=== Target Value Counts ===
Model Run Time: 50.32
Model=xgb3
{}
fold: 1, Score: 293.9553247403797, Run Time: 42.86
fold: 2, Score: 290.14835049553847, Run Time: 45.22
fold: 3, Score: 291.14316505212423, Run Time: 45.47
fold: 4, Score: 292.3973887933678, Run Time: 47.44
fold: 5, Score: 297.8757010282207, Run Time: 47.62
Scores -> Adjusted: 290.39940023 , mean: 293.10398602, std: 2.70458579

=== Model Feature Importance ===
y 0.43240333
carat 0.18689513
z 0.08496361
clarity_SI2 0.057773497
clarity_SI1 0.035919834
x 0.026572527
color_J 0.022003427
clarity_VVS2 0.019483283
color_I 0.017445471
clarity_I1 0.017142376
clarity_VS1 0.015768565
clarity_VS2 0.013745308
color_H 0.011543305
clarity_VVS1 0.011440151
clarity_IF 0.010974195
color_G 0.008150793
color_F 0.008114331
color_D 0.0061891032
color_E 0.0060649826
cut_Ideal 0.0035599978
cut_Premium 0.0006901788
cut_Fair 0.0006765477
depth 0.0005626726
table 0.00054979615
is_original 0.0005249471
cut_Good 0.0004448577
cut_Very Good 0.000397

Unnamed: 0_level_0,pred_xgb3
id,Unnamed: 1_level_1
0.0,13612.75195
1.0,12763.50098
2.0,2853.98657
3.0,679.80695
4.0,14959.10059


Mode
=== Target Value Counts ===
Model Run Time: 232.52
Model=xgb1
{}
fold: 1, Score: 300.261136499225, Run Time: 24.46
fold: 2, Score: 297.09857987114526, Run Time: 25.66
fold: 3, Score: 297.1217139446997, Run Time: 26.88
fold: 4, Score: 300.4263153978607, Run Time: 29.13
fold: 5, Score: 305.8266371373839, Run Time: 30.01
Scores -> Adjusted: 296.95957228 , mean: 300.14687657, std: 3.18730429

=== Model Feature Importance ===
y 0.44432917
carat 0.189247
clarity_SI2 0.060984682
clarity_VVS2 0.04688364
clarity_SI1 0.043726966
color_J 0.040509053
clarity_I1 0.031042345
color_I 0.02868953
clarity_VVS1 0.020526055
color_H 0.012832618
clarity_IF 0.011939833
color_E 0.009981424
clarity_VS2 0.00914228
z 0.008043429
color_D 0.008023813
clarity_VS1 0.008016802
color_F 0.006049277
cut_Ideal 0.0052890666
x 0.0049167485
color_G 0.00472682
cut_Fair 0.0011309641
cut_Good 0.0007778143
cut_Premium 0.00071043696
depth 0.00068980316
is_original 0.00064850535
cut_Very Good 0.00057741825
table 0.0005644214

Unnamed: 0_level_0,pred_xgb1
id,Unnamed: 1_level_1
0.0,13991.81445
1.0,12913.0957
2.0,2835.02319
3.0,707.76874
4.0,14808.0752


Mode
=== Target Value Counts ===
Model Run Time: 140.06
Model=xgb2
{}
fold: 1, Score: 294.15562369963266, Run Time: 5.91
fold: 2, Score: 291.277911813454, Run Time: 6.95
fold: 3, Score: 292.01654974398275, Run Time: 8.59
fold: 4, Score: 293.13299444271587, Run Time: 10.29
fold: 5, Score: 298.64995591906927, Run Time: 11.14
Scores -> Adjusted: 291.25337749 , mean: 293.84660712, std: 2.59322963

=== Model Feature Importance ===
y 0.45479876
carat 0.18637744
clarity_SI2 0.06844441
x 0.044280816
clarity_SI1 0.03406336
clarity_VVS2 0.02556466
color_J 0.023295663
clarity_I1 0.019355468
color_I 0.017065585
clarity_VS2 0.01701653
clarity_VS1 0.015333255
clarity_VVS1 0.0143781165
z 0.012738794
clarity_IF 0.011985934
color_F 0.0109004965
color_H 0.010716295
color_G 0.009897274
color_E 0.0096010985
color_D 0.007891184
cut_Ideal 0.0029659434
cut_Premium 0.00057467626
cut_Fair 0.00050951843
is_original 0.0004969559
table 0.0004886386
depth 0.00047096738
cut_Good 0.000426837
cut_Very Good 0.00036131

Unnamed: 0_level_0,pred_xgb2
id,Unnamed: 1_level_1
0.0,13773.1084
1.0,12376.01758
2.0,2831.91333
3.0,687.37469
4.0,14881.08301


Mode
=== Target Value Counts ===
Model Run Time: 46.86
Model=lgbm0
{}
fold: 1, Score: 301.81822778522013, Run Time: 5.20
fold: 2, Score: 297.9886424990666, Run Time: 5.64
fold: 3, Score: 299.19160103771475, Run Time: 7.16
fold: 4, Score: 299.76294967145043, Run Time: 9.04
fold: 5, Score: 305.60606462705243, Run Time: 10.40
Scores -> Adjusted: 298.20238546 , mean: 300.87349712, std: 2.67111166

=== Model Feature Importance ===
carat 0.11066666666666666
y 0.09866666666666667
z 0.07966666666666666
x 0.06966666666666667
clarity_SI2 0.060333333333333336
depth 0.058333333333333334
color_J 0.051
clarity_SI1 0.044
color_I 0.04033333333333333
color_D 0.037
clarity_I1 0.035333333333333335
color_H 0.031
color_E 0.03
clarity_VVS1 0.028666666666666667
clarity_IF 0.028333333333333332
clarity_VS2 0.02666666666666667
table 0.026
clarity_VS1 0.025333333333333333
color_F 0.024666666666666667
color_G 0.023
is_original 0.022
clarity_VVS2 0.021
cut_Ideal 0.011333333333333334
cut_Fair 0.006
cut_Premium 0.00

Unnamed: 0_level_0,pred_lgbm0
id,Unnamed: 1_level_1
0.0,13878.21112
1.0,12348.72859
2.0,2815.0948
3.0,687.40082
4.0,14898.97732


Mode
=== Target Value Counts ===
Model Run Time: 41.51
Model=lgbm1
{}
fold: 1, Score: 297.31571386785674, Run Time: 14.53
fold: 2, Score: 294.63195414350315, Run Time: 10.05
fold: 3, Score: 294.750624606023, Run Time: 12.21
fold: 4, Score: 296.19037927128073, Run Time: 13.04
fold: 5, Score: 301.84166355055635, Run Time: 14.96
Scores -> Adjusted: 294.30600444 , mean: 296.94606709, std: 2.64006265

=== Model Feature Importance ===
carat 0.15296875
y 0.14703125
z 0.131875
depth 0.116875
x 0.116875
table 0.0559375
clarity_SI2 0.03078125
color_J 0.02359375
clarity_SI1 0.0215625
color_I 0.02140625
color_H 0.018125
color_G 0.01390625
color_D 0.01390625
is_original 0.0134375
color_F 0.0134375
clarity_VS2 0.01265625
color_E 0.0125
clarity_VS1 0.01234375
cut_Premium 0.011875
cut_Ideal 0.01015625
clarity_IF 0.0096875
clarity_VVS2 0.009375
clarity_VVS1 0.00890625
cut_Very Good 0.00828125
clarity_I1 0.00828125
cut_Good 0.00234375
cut_Fair 0.001875


Unnamed: 0_level_0,pred_lgbm1
id,Unnamed: 1_level_1
0.0,13620.89036
1.0,12412.74697
2.0,2767.30544
3.0,679.72866
4.0,14789.44288


Mode
=== Target Value Counts ===
Model Run Time: 68.81
Model=lgbm2
{}
fold: 1, Score: 481.1366234502266, Run Time: 3.46
fold: 2, Score: 470.6337274541548, Run Time: 4.34
fold: 3, Score: 480.0217206321727, Run Time: 5.97
fold: 4, Score: 478.23042049303217, Run Time: 7.92
fold: 5, Score: 482.3914465108882, Run Time: 8.74
Scores -> Adjusted: 474.32682312 , mean: 478.48278771, std: 4.15596459

=== Model Feature Importance ===
z 0.164
x 0.111
color_I 0.084
y 0.075
cut_Ideal 0.058
clarity_SI2 0.048
clarity_VS1 0.047
clarity_VVS2 0.045
clarity_VVS1 0.041
color_E 0.04
carat 0.038
clarity_SI1 0.036
depth 0.034
clarity_I1 0.027
clarity_VS2 0.026
clarity_IF 0.02
table 0.019
color_J 0.018
color_H 0.017
color_G 0.015
color_D 0.014
color_F 0.012
cut_Very Good 0.004
is_original 0.003
cut_Fair 0.002
cut_Premium 0.002
cut_Good 0.0


Unnamed: 0_level_0,pred_lgbm2
id,Unnamed: 1_level_1
0.0,11709.42353
1.0,14352.70518
2.0,2740.57168
3.0,895.77281
4.0,13894.80972


Mode
=== Target Value Counts ===
Model Run Time: 34.67
Model=lgbm3
{}
fold: 1, Score: 297.31571436440964, Run Time: 8.25
fold: 2, Score: 294.63524161595745, Run Time: 9.98
fold: 3, Score: 294.7498699949544, Run Time: 11.21
fold: 4, Score: 296.19037937020187, Run Time: 12.72
fold: 5, Score: 301.8428075644608, Run Time: 14.66
Scores -> Adjusted: 294.30676608 , mean: 296.94680258, std: 2.64003650

=== Model Feature Importance ===
carat 0.15296875
y 0.1471875
z 0.131875
depth 0.116875
x 0.11671875
table 0.0559375
clarity_SI2 0.03078125
color_J 0.02359375
clarity_SI1 0.0215625
color_I 0.02140625
color_H 0.018125
color_G 0.01390625
color_D 0.01390625
is_original 0.0134375
color_F 0.0134375
clarity_VS2 0.01265625
color_E 0.0125
clarity_VS1 0.01234375
cut_Premium 0.011875
cut_Ideal 0.01015625
clarity_IF 0.0096875
clarity_VVS2 0.009375
clarity_VVS1 0.00890625
cut_Very Good 0.00828125
clarity_I1 0.00828125
cut_Good 0.00234375
cut_Fair 0.001875


Unnamed: 0_level_0,pred_lgbm3
id,Unnamed: 1_level_1
0.0,13620.89036
1.0,12412.74694
2.0,2767.30544
3.0,679.72867
4.0,14789.44286


Mode
=== Target Value Counts ===
Model Run Time: 60.90
Model=cat1
{}
fold: 1, Score: 297.72296419318485, Run Time: 26.06
fold: 2, Score: 291.6784022678925, Run Time: 27.23
fold: 3, Score: 291.6128145710852, Run Time: 28.36
fold: 4, Score: 295.7730765160127, Run Time: 29.61
fold: 5, Score: 300.35632376270496, Run Time: 31.50
Scores -> Adjusted: 292.01435149 , mean: 295.42871626, std: 3.41436477

=== Model Feature Importance ===
x 0.2530733423065527
y 0.17857548223168243
carat 0.17761405168248673
z 0.12402791876514688
clarity_SI2 0.07761358476545545
color_J 0.03499208068243589
clarity_SI1 0.032191704053607495
color_I 0.02955532674473758
clarity_VVS2 0.015774120262406384
color_H 0.015611113786326365
clarity_I1 0.012255643031152844
clarity_VVS1 0.008365661793805232
color_D 0.007170403591097322
color_E 0.006425951555025823
clarity_VS1 0.00618564692780601
clarity_IF 0.0057802683485366005
color_F 0.004895962636701908
depth 0.002275949668450936
cut_Ideal 0.0017033031362601223
clarity_VS2 0.001

Unnamed: 0_level_0,pred_cat1
id,Unnamed: 1_level_1
0.0,13674.64029
1.0,12508.3531
2.0,2873.14491
3.0,703.9258
4.0,14802.73695


Mode
=== Target Value Counts ===
Model Run Time: 147.48
Model=cat2
{}
fold: 1, Score: 319.5716960088878, Run Time: 83.13
fold: 2, Score: 315.53009823735727, Run Time: 4.29
fold: 3, Score: 316.54124577095706, Run Time: 5.91
fold: 4, Score: 319.8212963767485, Run Time: 7.57
fold: 5, Score: 322.9397193116316, Run Time: 8.27
Scores -> Adjusted: 316.25260213 , mean: 318.88081114, std: 2.62820901

=== Model Feature Importance ===
y 0.2396729137081466
carat 0.14630771264688647
z 0.13836182323958687
clarity_SI2 0.11513913709879933
x 0.085144322541499
color_J 0.0550207968375544
clarity_SI1 0.04729547287581312
color_I 0.04581085554042531
color_H 0.02189404762716318
clarity_VVS2 0.02049723109739917
clarity_I1 0.0175743588024976
clarity_VVS1 0.011564945146463682
color_D 0.011314144390975157
color_E 0.010379479637307776
clarity_VS1 0.00866224540139778
color_F 0.00840503672897158
clarity_IF 0.007715691449493301
color_G 0.0034420462976239396
cut_Ideal 0.0024900439916045753
depth 0.001713099554623829


Unnamed: 0_level_0,pred_cat2
id,Unnamed: 1_level_1
0.0,13386.04856
1.0,12542.78712
2.0,2834.98344
3.0,761.96224
4.0,14372.34794


Mode
=== Target Value Counts ===
Model Run Time: 113.31
CPU times: user 1h 35min 50s, sys: 3min 55s, total: 1h 39min 45s
Wall time: 57min 59s


Unnamed: 0,Model,Score,StdDev,RunTime
3,xgb_params_gamma,3894.10289,24.00826,50.31614
9,lgbm2,478.48279,4.15596,34.66761
12,cat2,318.88081,2.62821,113.3122
0,xgb_best_params,318.69024,2.98921,253.00543
7,lgbm0,300.8735,2.67111,41.50599
5,xgb1,300.14688,3.1873,140.05538
2,cat_best_params,297.50471,3.12791,212.61917
10,lgbm3,296.9468,2.64004,60.90155
8,lgbm1,296.94607,2.64006,68.81129
1,lgbm_best_params,296.38056,3.51785,2077.25315


## Linear Models

In [41]:
model_lst = ["lin_reg", "lasso", "ridge", "ridge_25", "ridge_50"]
model_lst = ["lasso", "ridge",  "ridge_50"]
# model_lst = []
# all_cv_scores = run_models4features(model_lst, TARGET, FEATURES, all_cv_scores, linear_models=True)    
all_cv_scores = run_models4features(model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=True)    

all_cv_scores.head()

Model=lasso
fold: 1, Score: 643.7632608971782, Run Time: 4.24
fold: 2, Score: 638.5058185741259, Run Time: 6.29
fold: 3, Score: 645.5589170948598, Run Time: 7.05
fold: 4, Score: 644.6071047344354, Run Time: 9.45
fold: 5, Score: 640.2022445297636, Run Time: 11.25
Scores -> Adjusted: 639.82110315 , mean: 642.52746917, std: 2.70636601


Unnamed: 0_level_0,pred_lasso
id,Unnamed: 1_level_1
0.0,11371.10932
1.0,13266.83948
2.0,3577.55091
3.0,960.97536
4.0,12896.54654


Mode
=== Target Value Counts ===
Model Run Time: 43.09
Model=ridge
fold: 1, Score: 643.5668346413521, Run Time: 1.48
fold: 2, Score: 638.4624429630502, Run Time: 2.83
fold: 3, Score: 645.3704922937586, Run Time: 4.39
fold: 4, Score: 644.4580188631362, Run Time: 6.19
fold: 5, Score: 640.2129918663342, Run Time: 7.17
Scores -> Adjusted: 639.77949762 , mean: 642.41415613, std: 2.63465851


Unnamed: 0_level_0,pred_ridge
id,Unnamed: 1_level_1
0.0,11367.43003
1.0,13284.75178
2.0,3572.58221
3.0,972.38309
4.0,12908.35637


Mode
=== Target Value Counts ===
Model Run Time: 25.93
Model=ridge_50
fold: 1, Score: 643.5559094214277, Run Time: 1.43
fold: 2, Score: 638.4514138275706, Run Time: 3.20
fold: 3, Score: 645.3596516273961, Run Time: 4.23
fold: 4, Score: 644.4472570894835, Run Time: 6.31
fold: 5, Score: 640.2023042180139, Run Time: 7.08
Scores -> Adjusted: 639.76861292 , mean: 642.40330724, std: 2.63469432


Unnamed: 0_level_0,pred_ridge_50
id,Unnamed: 1_level_1
0.0,11367.73494
1.0,13285.11925
2.0,3572.51496
3.0,972.53727
4.0,12908.4995


Mode
=== Target Value Counts ===
Model Run Time: 26.43


Unnamed: 0,Model,Score,StdDev,RunTime
0,xgb_best_params,318.69024,2.98921,253.00543
1,lgbm_best_params,296.38056,3.51785,2077.25315
2,cat_best_params,297.50471,3.12791,212.61917
3,xgb_params_gamma,3894.10289,24.00826,50.31614
4,xgb3,293.10399,2.70459,232.52046


In [42]:
sample_submission.head(20)

Unnamed: 0,id,price,target_xgb_best_params,target_lgbm_best_params,target_cat_best_params,target_xgb_params_gamma,target_xgb3,target_xgb1,target_xgb2,target_lgbm0,target_lgbm1,target_lgbm2,target_lgbm3,target_cat1,target_cat2,target_lasso,target_ridge,target_ridge_50
0,193573,3969.155,859.69055,833.07401,866.29472,67.889,863.24261,854.5777,862.88324,844.24004,828.44949,829.53492,828.4495,868.52349,897.55326,1124.47124,1128.32029,1128.42433
1,193574,3969.155,2381.61206,2407.44864,2547.74689,72.39225,2446.89233,2429.01392,2451.67285,2517.56192,2516.44411,2642.13246,2516.44412,2510.52047,2537.17215,2338.33009,2328.24324,2328.14518
2,193575,3969.155,2058.93823,2201.02069,2272.7446,72.2181,2283.91919,2290.60376,2314.21143,2312.94226,2247.34907,2463.50194,2247.34916,2242.43196,2321.20291,2276.93179,2260.22677,2260.00429
3,193576,3969.155,845.57281,909.89682,825.15511,68.04408,823.10901,837.1922,822.96832,830.24297,838.36443,872.48554,838.36445,834.8684,847.65865,1288.785,1298.04216,1298.11253
4,193577,3969.155,5592.9502,5614.80172,5613.12396,73.38731,5757.1499,5573.97803,5763.28418,5695.66514,5535.9447,5719.79569,5535.9447,5652.01644,5806.48923,6795.60765,6784.09533,6784.00964
5,193578,3969.155,732.12122,766.01104,685.163,67.27662,684.28058,727.43109,684.68817,648.79182,702.79588,992.67523,702.79589,720.89036,717.35216,600.64509,611.00995,611.10975
6,193579,3969.155,12589.99707,12565.67509,12260.88241,73.70206,12292.58398,12324.79297,12210.69043,12226.73699,12317.65156,11618.073,12317.65158,12253.93612,12241.94859,11053.27722,11058.11016,11058.20624
7,193580,3969.155,3041.39062,2982.3558,2925.07031,72.27367,2892.91821,2878.87354,2924.43726,2894.70938,2925.28725,2728.58745,2925.28726,2944.63318,2915.9979,3524.39489,3516.87527,3516.75697
8,193581,3969.155,15222.61719,15145.65221,15654.40686,73.78304,14952.72168,14781.29004,15585.53809,14917.44616,14873.12457,14525.17098,14873.12442,15511.43232,15339.23239,15537.79086,15545.98746,15546.49174
9,193582,3969.155,1833.08289,1858.57798,1815.80984,71.18585,1843.74231,1848.28992,1846.95813,1959.34844,1869.86671,1900.15767,1869.86673,1773.80944,1908.17055,2245.41956,2239.28853,2239.40512


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Blend Models</h1>
</div>

In [43]:
all_blend_scores = pd.DataFrame(
    {
        "Model": pd.Series(dtype="str"),
        "Score": pd.Series(dtype="float"),
        "StdDev": pd.Series(dtype="float"),
    }
)

In [44]:
model_lst

['lasso', 'ridge', 'ridge_50']

In [45]:
model_lst = ["xgb1", "xgb2", "cat1", "lgbm0", "lgbm1"]

In [46]:
len(model_lst)

5

In [47]:
target_names = [f"target_{model}" for model in model_lst]
target_names

['target_xgb1', 'target_xgb2', 'target_cat1', 'target_lgbm0', 'target_lgbm1']

In [48]:
sample_submission[TARGET] = sample_submission[target_names].sum(axis=1) / len(model_lst)

In [49]:
sample_submission[[ID, TARGET]].to_csv("submission_models_wt_avg.csv", index=False)
sample_submission[[ID, TARGET]].tail(8)

Unnamed: 0,id,price
129042,322615,2909.58013
129043,322616,654.64475
129044,322617,4075.96364
129045,322618,3835.98352
129046,322619,2500.6376
129047,322620,7592.65518
129048,322621,5358.86374
129049,322622,4132.83593


In [50]:
sample_submission[TARGET] = (
#     (sample_submission["target_xgb_bp"] * 2 )
#     + (sample_submission["target_lgbm_bp"]  )
    (sample_submission["target_xgb1"] * 3 )
    + (sample_submission["target_lgbm1"])
#     + (sample_submission["target_lgbm2"])    
#     + (sample_submission["target_lgbm2"])
    + (sample_submission["target_cat1"] )
    + (sample_submission["target_cat2"] )    
#     + (sample_submission["target_cat_bp"] )
#     + (sample_submission["target_svc"] )
#     + (sample_submission["target_log_reg3"] )
#     + (sample_submission["target_cat2"] )
)/6

# sample_submission[TARGET] = sample_submission[TARGET].astype(int)

In [51]:
sample_submission[[ID, TARGET]].to_csv("submission_wt_avg.csv", index=False)
sample_submission[[ID, TARGET]].tail(8)

Unnamed: 0,id,price
129042,322615,2841.6476
129043,322616,663.57607
129044,322617,4100.38075
129045,322618,3830.50236
129046,322619,2491.78892
129047,322620,7633.52296
129048,322621,5484.56149
129049,322622,4118.39727


In [52]:
all_cv_scores.sort_values(by=["Score"], ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime
3,xgb_params_gamma,3894.10289,24.00826,50.31614
13,lasso,642.52747,2.70637,43.09057
14,ridge,642.41416,2.63466,25.9259
15,ridge_50,642.40331,2.63469,26.43128
9,lgbm2,478.48279,4.15596,34.66761
12,cat2,318.88081,2.62821,113.3122
0,xgb_best_params,318.69024,2.98921,253.00543
7,lgbm0,300.8735,2.67111,41.50599
5,xgb1,300.14688,3.1873,140.05538
2,cat_best_params,297.50471,3.12791,212.61917


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Level 1 Stack Models</h1>
</div>

In [53]:
## TODO: Generate these dictionaries from model names

train_oof_dict = {
    "train_pred_cat1": "train_pred_cat1.csv",
    "train_pred_cat2": "train_pred_cat2.csv",
    "train_pred_lgbm1": "train_pred_lgbm1.csv",    
    "train_pred_lgbm2": "train_pred_lgbm2.csv",    
    "train_pred_xgb1": "train_pred_xgb1.csv"
}

test_pred_dict = {
    "submission_cat1": "submission_cat1.csv",
    "submission_cat2": "submission_cat2.csv",
    "submission_lgbm1": "submission_lgbm1.csv",
    "submission_lgbm2": "submission_lgbm2.csv",
    "submission_xgb1": "submission_xgb1.csv",
}

In [54]:
def blend_results(train_oof_dict, test_pred_dict):
    oof_df = pd.DataFrame()
    test_preds_df = pd.DataFrame()

    for name, train_oof_fname in train_oof_dict.items():
        fname = "../working/" + train_oof_fname
        print(f"Processing {name}, {train_oof_fname}")
        df = pd.read_csv(fname)
        print(df.head())
#         print(df.iloc[:,1])
        preds = pd.Series(df.iloc[:,1], name=name)
#         print(preds[:5])
        oof_df = pd.concat([oof_df, preds], axis=1)
    #     oof_df = pd.concat([oof_df, pd.Series(np.load(TRAIN_PATH / train_oof), name=name)], axis=1)

    for name, test_pred_fname in test_pred_dict.items():
        fname = "../working/" + test_pred_fname
        print(f"{name}, {test_pred_fname}")
        df = pd.read_csv(fname)
        print(df.head())
        preds = pd.Series(df.iloc[:,1], name=name)
        test_preds_df = pd.concat([test_preds_df, preds], axis=1)

    print("=== oof ===")
    print(oof_df.head())
    print("=== test_preds ===")
    print(test_preds_df.head())
    return oof_df, test_preds_df
    
# (oof_df, preds_df) = blend_results(train_oof_dict, test_pred_dict)    

In [55]:
def load_oof_results(train_oof_dict, test_pred_dict):
    oof_df = pd.DataFrame()
    test_preds_df = pd.DataFrame()

    for name, train_oof_fname in train_oof_dict.items():
        fname = "../working/" + train_oof_fname
        print(f"Processing {name}, {train_oof_fname}")
        df = pd.read_csv(fname)
        print(df.head())
#         print(df.iloc[:,1])
        preds = pd.Series(df.iloc[:,1], name=name)
#         print(preds[:5])
        oof_df = pd.concat([oof_df, preds], axis=1)
    #     oof_df = pd.concat([oof_df, pd.Series(np.load(TRAIN_PATH / train_oof), name=name)], axis=1)

    for name, test_pred_fname in test_pred_dict.items():
        fname = "../working/" + test_pred_fname
        print(f"{name}, {test_pred_fname}")
        df = pd.read_csv(fname)
        print(df.head())
        preds = pd.Series(df.iloc[:,1], name=name)
        test_preds_df = pd.concat([test_preds_df, preds], axis=1)

    print("=== oof ===")
    print(oof_df.head())
    print("=== test_preds ===")
    print(test_preds_df.head())
    return oof_df, test_preds_df
    
(oof_df, preds_df) = load_oof_results(train_oof_dict, test_pred_dict) 

Processing train_pred_cat1, train_pred_cat1.csv
    id    pred_cat1
0  0.0  13674.64029
1  1.0  12508.35310
2  2.0   2873.14491
3  3.0    703.92580
4  4.0  14802.73695
Processing train_pred_cat2, train_pred_cat2.csv
    id    pred_cat2
0  0.0  13386.04856
1  1.0  12542.78712
2  2.0   2834.98344
3  3.0    761.96224
4  4.0  14372.34794
Processing train_pred_lgbm1, train_pred_lgbm1.csv
    id   pred_lgbm1
0  0.0  13620.89036
1  1.0  12412.74697
2  2.0   2767.30544
3  3.0    679.72866
4  4.0  14789.44288
Processing train_pred_lgbm2, train_pred_lgbm2.csv
    id   pred_lgbm2
0  0.0  11709.42353
1  1.0  14352.70518
2  2.0   2740.57168
3  3.0    895.77281
4  4.0  13894.80972
Processing train_pred_xgb1, train_pred_xgb1.csv
    id    pred_xgb1
0  0.0  13991.81400
1  1.0  12913.09600
2  2.0   2835.02320
3  3.0    707.76874
4  4.0  14808.07500
submission_cat1, submission_cat1.csv
       id       price
0  193573   868.52349
1  193574  2510.52047
2  193575  2242.43196
3  193576   834.86840
4  193577

In [56]:
oof_df.head()

Unnamed: 0,train_pred_cat1,train_pred_cat2,train_pred_lgbm1,train_pred_lgbm2,train_pred_xgb1
0,13674.64029,13386.04856,13620.89036,11709.42353,13991.814
1,12508.3531,12542.78712,12412.74697,14352.70518,12913.096
2,2873.14491,2834.98344,2767.30544,2740.57168,2835.0232
3,703.9258,761.96224,679.72866,895.77281,707.76874
4,14802.73695,14372.34794,14789.44288,13894.80972,14808.075


In [57]:
preds_df.head()

Unnamed: 0,submission_cat1,submission_cat2,submission_lgbm1,submission_lgbm2,submission_xgb1
0,868.52349,897.55326,828.44949,829.53492,854.5777
1,2510.52047,2537.17215,2516.44411,2642.13246,2429.014
2,2242.43196,2321.20291,2247.34907,2463.50194,2290.6038
3,834.8684,847.65865,838.36443,872.48554,837.1922
4,5652.01644,5806.48923,5535.9447,5719.79569,5573.978


In [58]:
type(preds_df)

pandas.core.frame.DataFrame

In [59]:
def run_lr(useful_features:List[str], TARGET:str, train_df:pd.DataFrame, test_df:pd.DataFrame) -> (List[float],List[float]):
    final_predictions = []
    scores = []

    kfold = model_selection.KFold(n_splits=Config.N_FOLDS, shuffle=True, random_state=Config.seed)

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(train_df)):
        xtrain = train_df.iloc[train_idx].reset_index(drop=True)
        xvalid = train_df.iloc[valid_idx].reset_index(drop=True)

        xtest = test_df[useful_features].copy()

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[useful_features]
        xvalid = xvalid[useful_features]

#         model = LogisticRegression()
        model = linear_model.LinearRegression()
        # Smaller C means more regularization; default=1.0
        # 2947.0517025518097
#         model = LogisticRegression(max_iter=500, C=2947.0517025518097, penalty='l2',solver='newton-cg')
#         model = LogisticRegression(C = 2947.0517025518097,
#                         max_iter = 500,
#                         penalty = 'l2',
#                         solver = 'liblinear')
        model.fit(xtrain, ytrain)

        preds_valid = model.predict_proba(xvalid)[:,-1]
        test_preds = model.predict_proba(xtest)[:,-1]

        final_predictions.append(test_preds)
#         score = metrics.roc_auc_score(yvalid, preds_valid)
        score = metrics.mean_squared_error(yvalid, preds_valid, squared=False)
        print(f"Fold={fold}, Score={score}")
        scores.append(score)
    return scores, final_predictions


In [60]:
# useful_features = ["pred_lda", "pred_gbc","pred_gbc2", "pred_cat_bp", "pred_cat1", "pred_lgbm1", "pred_lgbm2", "pred_lgbm_bp", "pred_xgb1", "pred_xgb_bp"]
useful_features = [ "train_pred_cat1", "train_pred_cat2", "train_pred_lgbm1", "train_pred_lgbm2", "train_pred_xgb1"]

In [61]:
oof_df[useful_features].head()

Unnamed: 0,train_pred_cat1,train_pred_cat2,train_pred_lgbm1,train_pred_lgbm2,train_pred_xgb1
0,13674.64029,13386.04856,13620.89036,11709.42353,13991.814
1,12508.3531,12542.78712,12412.74697,14352.70518,12913.096
2,2873.14491,2834.98344,2767.30544,2740.57168,2835.0232
3,703.9258,761.96224,679.72866,895.77281,707.76874
4,14802.73695,14372.34794,14789.44288,13894.80972,14808.075


In [62]:
# preds_df[useful_features].head()

In [63]:
# fold_scores, final_predictions = run_lr(useful_features, TARGET, oof_df, preds_df)
# test_preds = np.mean(np.column_stack(final_predictions), axis=1)
# cv_score, std_dev = show_fold_scores(fold_scores)
# create_submission("level1_lr", TARGET, test_preds)

In [64]:
pd.options.display.max_colwidth = 100
pd.set_option("display.max_rows", 999)
pd.set_option("display.precision", 5)
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.max_colwidth

100

In [65]:
all_cv_scores.sort_values(by=["Score"], ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime
3,xgb_params_gamma,3894.1,24.01,50.32
13,lasso,642.53,2.71,43.09
14,ridge,642.41,2.63,25.93
15,ridge_50,642.4,2.63,26.43
9,lgbm2,478.48,4.16,34.67
12,cat2,318.88,2.63,113.31
0,xgb_best_params,318.69,2.99,253.01
7,lgbm0,300.87,2.67,41.51
5,xgb1,300.15,3.19,140.06
2,cat_best_params,297.5,3.13,212.62
