In [5]:
import warnings

warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import re

import lightgbm as lgb
from lightgbm import log_evaluation, early_stopping
from catboost import CatBoostRegressor, Pool

from sklearn.svm import SVR
from xgboost import XGBRegressor


import random

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import optuna

In [8]:
import warnings
import numpy as np
import pandas as pd
import io
import matplotlib.pyplot as plt
import seaborn as sns
from gc import collect
from IPython.display import display_html, clear_output
from typing import Optional
from wordcloud import WordCloud, STOPWORDS


class myTextFormatter:
    """
    A simple text formatter class that provides colorama-based text formatting.

    This class initializes colorama settings for text formatting with various colors.

    Attributes:
        CLR (str): Black text with bright style.
        RED (str): Red text with bright style.
        BLUE (str): Blue text with bright style.
        CYAN (str): Cyan text with bright style.
        RESET (str): Reset all formatting and colors.

    Example:
        formatter = myTextFormatter()
        print(formatter.RED + "This is a red text." + formatter.RESET)

    Note:
        This class requires the `colorama` module to be installed.
    """

    def __init__(self):

        from colorama import init, Fore, Style

        # init(autoreset=True)

        # Colorama settings.
        self.CLR = Style.BRIGHT + Fore.BLACK
        self.RED = Style.BRIGHT + Fore.RED
        self.BLUE = Style.BRIGHT + Fore.BLUE
        self.CYAN = Style.BRIGHT + Fore.CYAN
        self.RESET = Style.RESET_ALL

        # Underline text
        self.U_START = "\033[4m"
        self.U_END = "\033[0m"

        self.FONT_COLOR = "#2F486B"
        self.BACKGROUND_COLOR = "#FFFCFA"

        self.CELL_HOVER = {  # for row hover use <tr> instead of <td>
            "selector": "td:hover",
            "props": "background-color: #FFFCFA",
        }
        self.TEXT_HIGHLIGHT = {
            "selector": "td",
            "props": "color: #DCEEF0; background-color: #757E7F; font-weight: bold",
        }
        self.INDEX_NAMES = {
            "selector": ".index_name",
            "props": "font-style: italic; background-color: #2F486B; color: #FFFCFA;",
        }
        self.HEADERS = {
            "selector": "th:not(.index_name)",
            "props": "font-style: italic; background-color: #2F486B; color: #FFFCFA;",
        }
        self.DF_STYLE = (self.INDEX_NAMES, self.HEADERS, self.TEXT_HIGHLIGHT)
        self.DF_CMAP = sns.light_palette("#D4D0A9", as_cmap=True)


def pretty_print(text, color):
    print(color + text + myTextFormatter().RESET)


def pretty_print_html(text, color, font_size=16):
    """
    Pretty print text with customizable color and font size.

    Args:
        text (str): The text to print.
        color (str): The color (as a CSS value) to apply.
        font_size (int): The font size for the printed text.
    """
    # Format the text with font size and color
    formatted_text = (
        f'<span style="color:{color}; font-size:{font_size}px;">{text}</span>'
    )

    # Display the text as HTML
    display_html(formatted_text, raw=True)


def DisplayAdjTbl(*args):
    """
    This function displays pandas tables in an adjacent manner, sourced from the below link-
    https://stackoverflow.com/questions/38783027/jupyter-notebook-display-two-pandas-tables-side-by-side
    """

    html_str = ""
    for df in args:
        html_str += df.to_html()
    display_html(html_str.replace("table", 'table style="display:inline"'), raw=True)
    collect()


formatter = myTextFormatter()


def get_null_stats(train, test):
    pretty_print_html(
        text="Train & Test Datasets NULL Stats".center(100, "="), color="grey"
    )
    a = pd.DataFrame(
        np.c_[train.isna().sum(), (train.isna().sum() / len(train)) * 100],
        columns=["# of nulls", "% of nulls"],
    )
    a.index.name = "Train"
    b = pd.DataFrame(
        np.c_[test.isna().sum(), (test.isna().sum() / len(test)) * 100],
        columns=["# of nulls", "% of nulls"],
    )
    b.index.name = "Test"

    DisplayAdjTbl(
        *[
            a.style.set_table_styles(formatter.DF_STYLE),
            b.style.set_table_styles(formatter.DF_STYLE),
        ]
    )


def get_unique_stats(train, test):
    pretty_print_html(
        text="Train & Test Datasets Unique Stats".center(100, "="), color="grey"
    )

    a = pd.DataFrame(
        train.nunique(),
        columns=["# of uniques"],
    )
    a.index.name = "Train"
    b = pd.DataFrame(
        test.nunique(),
        columns=["# of uniques"],
    )
    b.index.name = "Test"

    DisplayAdjTbl(
        *[
            a.style.set_table_styles(formatter.DF_STYLE),
            b.style.set_table_styles(formatter.DF_STYLE),
        ]
    )


def get_datasets_info(train, test):
    pretty_print_html(text="Train & Test Datasets INFO".center(100, "="), color="grey")

    def foramat_info_str(info_str, index_name):
        # Parse the info string to extract relevant details
        lines = info_str.splitlines()

        # Extract column details (skip first 3 lines and last 2 lines)
        info_data = []
        index = []
        for i, line in enumerate(lines[3:-2]):
            if i < 2:
                continue
            parts = line.split()
            index.append(parts[0])
            col_name = parts[1]
            non_null_count = parts[2]
            dtype = parts[-1]
            info_data.append([col_name, non_null_count, dtype])

        # Create a DataFrame with the parsed information
        info_df = pd.DataFrame(
            info_data,
            columns=["Column", "Non-Null Count", "Dtype"],
            index=index,
        )
        info_df.index.name = index_name

        # Optionally, add total memory usage (from the last line)
        memory_usage = lines[-1].split(": ")[-1]
        info_df.loc["Total Memory Usage"] = ["", "", memory_usage]

        return info_df.style.set_table_styles(formatter.DF_STYLE)

    # Capture the output of df.info() into a string buffer
    buffer = io.StringIO()
    train.info(buf=buffer)
    train_info_str = buffer.getvalue()

    buffer = io.StringIO()
    test.info(buf=buffer)
    test_info_str = buffer.getvalue()

    DisplayAdjTbl(
        *[
            foramat_info_str(train_info_str, "Train Info"),
            foramat_info_str(test_info_str, "Test Info"),
        ]
    )


def get_descibe_stats(train, test):
    pretty_print_html(
        text="Train & Test Datasets Describe Stats".center(100, "="), color="grey"
    )
    a = train.describe()
    a.index.name = "Train"

    b = test.describe()
    b.index.name = "Test"
    DisplayAdjTbl(
        *[
            a.style.set_table_styles(formatter.DF_STYLE),
            b.style.set_table_styles(formatter.DF_STYLE),
        ]
    )

In [9]:
train = pd.read_csv("inputs/train.csv")
test = pd.read_csv("inputs/test.csv")
sample_sub = pd.read_csv("inputs/sample_submission.csv")
Original = pd.read_csv("inputs/used_cars.csv")

train.drop(columns=["id"], inplace=True)
test.drop(columns=["id"], inplace=True)

Original[["milage", "price"]] = Original[["milage", "price"]].map(
    lambda x: int("".join(re.findall(r"\d+", x)))
)

train = pd.concat([train, Original], ignore_index=True)

In [11]:
get_datasets_info(train.drop(columns="price"), test)

get_null_stats(train.drop(columns="price"), test)

get_descibe_stats(train.drop(columns="price"), test)

get_unique_stats(train.drop(columns="price"), test)

Unnamed: 0_level_0,Column,Non-Null Count,Dtype
Train Info,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,brand,192542.0,object
1,model,192542.0,object
2,model_year,192542.0,int64
3,milage,192542.0,int64
4,fuel_type,187289.0,object
5,engine,192542.0,object
6,transmission,192542.0,object
7,ext_col,192542.0,object
8,int_col,192542.0,object
9,accident,189977.0,object

Unnamed: 0_level_0,Column,Non-Null Count,Dtype
Test Info,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,brand,125690.0,object
1,model,125690.0,object
2,model_year,125690.0,int64
3,milage,125690.0,int64
4,fuel_type,122307.0,object
5,engine,125690.0,object
6,transmission,125690.0,object
7,ext_col,125690.0,object
8,int_col,125690.0,object
9,accident,124058.0,object


Unnamed: 0_level_0,# of nulls,% of nulls
Train,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,5253.0,2.728236
5,0.0,0.0
6,0.0,0.0
7,0.0,0.0
8,0.0,0.0
9,2565.0,1.332177

Unnamed: 0_level_0,# of nulls,% of nulls
Test,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,3383.0,2.691543
5,0.0,0.0
6,0.0,0.0
7,0.0,0.0
8,0.0,0.0
9,1632.0,1.298433


Train,model_year,milage
Train,Unnamed: 1_level_1,Unnamed: 2_level_1
count,192542.0,192542.0
mean,2015.823452,65684.728927
std,5.670724,49851.51298
min,1974.0,100.0
25%,2013.0,24115.0
50%,2017.0,57550.0
75%,2020.0,95400.0
max,2024.0,405000.0

Test,model_year,milage
Test,Unnamed: 1_level_1,Unnamed: 2_level_1
count,125690.0,125690.0
mean,2015.797526,66042.58151
std,5.673797,50223.858435
min,1974.0,100.0
25%,2013.0,24500.0
50%,2017.0,57500.0
75%,2020.0,95798.0
max,2024.0,405000.0


Unnamed: 0_level_0,# of uniques
Train,Unnamed: 1_level_1
brand,57
model,1898
model_year,34
milage,6652
fuel_type,7
engine,1146
transmission,62
ext_col,319
int_col,156
accident,2

Unnamed: 0_level_0,# of uniques
Test,Unnamed: 1_level_1
brand,55
model,1891
model_year,36
milage,5700
fuel_type,7
engine,1117
transmission,52
ext_col,317
int_col,156
accident,2


In [14]:
def extract_age_features(df):
    current_year = 2024

    df["Vehicle_Age"] = current_year - df["model_year"]

    df["Mileage_per_Year"] = df["milage"] / df["Vehicle_Age"]
    df["milage_with_age"] = df.groupby("Vehicle_Age")["milage"].transform("mean")

    df["Mileage_per_Year_with_age"] = df.groupby("Vehicle_Age")[
        "Mileage_per_Year"
    ].transform("mean")

    #     df['milage_brand'] =  df.groupby('brand')['milage'].transform('mean')

    return df


def extract_engine_features(df):

    def extract_horsepower(engine):
        try:
            return float(engine.split("HP")[0])
        except:
            return None

    def extract_engine_size(engine):
        try:
            return float(engine.split(" ")[1].replace("L", ""))
        except:
            return None

    df["Horsepower"] = df["engine"].apply(extract_horsepower)
    df["Engine_Size"] = df["engine"].apply(extract_engine_size)
    df["Power_to_Weight_Ratio"] = df["Horsepower"] / df["Engine_Size"]

    return df


def extract_other_features(df):

    luxury_brands = [
        "Mercedes-Benz",
        "BMW",
        "Audi",
        "Porsche",
        "Land",
        "Lexus",
        "Jaguar",
        "Bentley",
        "Maserati",
        "Lamborghini",
        "Rolls-Royce",
        "Ferrari",
        "McLaren",
        "Aston",
        "Maybach",
    ]
    df["Is_Luxury_Brand"] = df["brand"].apply(lambda x: 1 if x in luxury_brands else 0)

    #     df['luxary_with_accident'] = df.apply(lambda row: 1 if row['Is_Luxury_Brand'] == 1  and row['accident'] == 'At least 1 accident or damage reported' else 0  ,  axis =1)

    #     df.drop(columns = ['Is_Luxury_Brand'] , inplace=True)

    return df

In [15]:
%%time

train = extract_age_features(train)
test = extract_age_features(test)

# train = extract_engine_features(train)
# test = extract_engine_features(test)

train = extract_other_features(train)
test = extract_other_features(test)

CPU times: user 78.9 ms, sys: 13.3 ms, total: 92.1 ms
Wall time: 93.4 ms


In [16]:
def update(df):

    t = 100

    cat_c = [
        "brand",
        "model",
        "fuel_type",
        "engine",
        "transmission",
        "ext_col",
        "int_col",
        "accident",
        "clean_title",
    ]
    re_ = ["model", "engine", "transmission", "ext_col", "int_col"]

    for col in re_:
        df.loc[df[col].value_counts(dropna=False)[df[col]].values < t, col] = "noise"

    for col in cat_c:
        df[col] = df[col].fillna("missing")
        df[col] = df[col].astype("category")

    return df


train = update(train)
test = update(test)

X = train.drop("price", axis=1)
y = train["price"]

In [24]:
import numpy as np
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold


callbacks = [log_evaluation(period=300), early_stopping(stopping_rounds=200)]

cat_cols = train.select_dtypes(include=["object", "category"]).columns.tolist()

print(f"cat_cols--------{cat_cols}")


def get_MAE_oof(df, target, lgb_params, cat_params=None, model_type="LGBM"):

    oof_predictions = np.zeros(len(df))
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    models = []
    rmse_scores = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
        print(f"Training fold {fold + 1}/{5} with {model_type}")

        X_train, X_val = df.iloc[train_idx], df.iloc[val_idx]
        y_train, y_val = target.iloc[train_idx], target.iloc[val_idx]

        if model_type == "LGBM":
            train_data = lgb.Dataset(X_train, label=y_train)
            val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

            model = lgb.train(
                lgb_params,
                train_data,
                valid_sets=[train_data, val_data],
                valid_names=["train", "valid"],
                callbacks=callbacks,
            )

        elif model_type == "CAT":
            train_data = Pool(data=X_train, label=y_train, cat_features=cat_cols)
            val_data = Pool(data=X_val, label=y_val, cat_features=cat_cols)

            model = CatBoostRegressor(**cat_params)
            model.fit(
                train_data, eval_set=val_data, verbose=150, early_stopping_rounds=200
            )

        models.append(model)

        if model_type == "LGBM":
            pred = model.predict(X_val, num_iteration=model.best_iteration)
        elif model_type == "CAT":
            pred = model.predict(X_val)

        rmse = np.sqrt(mean_squared_error(y_val, pred))
        rmse_scores.append(rmse)

        print(f"{model_type} Fold RMSE: {rmse}")

        oof_predictions[val_idx] = pred

    print(f"Mean RMSE: {np.mean(rmse_scores)}")
    return oof_predictions, models


lgb_params = {
    "objective": "MAE",
    "n_estimators": 1000,
    "random_state": 42,
    "verbose": -1,
}

oof_predictions_lgbm, models_lgbm = get_MAE_oof(X, y, lgb_params, model_type="LGBM")
X["LGBM_MAE"] = oof_predictions_lgbm


LGBM_preds = np.zeros(len(test))
for model in models_lgbm:
    LGBM_preds += model.predict(test) / len(models_lgbm)
test["LGBM_MAE"] = LGBM_preds


lgb_params = {
    "objective": "MSE",
    "n_estimators": 1000,
    "random_state": 42,
    "verbose": -1,
}

oof_predictions_lgbm, models_lgbm = get_MAE_oof(X, y, lgb_params, model_type="LGBM")

X["LGBM_MSE_diff"] = oof_predictions_lgbm - X["LGBM_MAE"]


LGBM_preds = np.zeros(len(test))
for model in models_lgbm:
    LGBM_preds += model.predict(test) / len(models_lgbm)
test["LGBM_MSE_diff"] = LGBM_preds - test["LGBM_MAE"]

cat_cols--------['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']
Training fold 1/5 with LGBM
Training until validation scores don't improve for 200 rounds
[300]	train's l1: 16367.7	valid's l1: 17566.2
[600]	train's l1: 16046.9	valid's l1: 17522.2
[900]	train's l1: 15787.2	valid's l1: 17494
Did not meet early stopping. Best iteration is:
[998]	train's l1: 15733.7	valid's l1: 17489.2
LGBM Fold RMSE: 76044.50790861654
Training fold 2/5 with LGBM
Training until validation scores don't improve for 200 rounds
[300]	train's l1: 16452.4	valid's l1: 17161.6
[600]	train's l1: 16094.5	valid's l1: 17105.8
[900]	train's l1: 15864.6	valid's l1: 17086.6
Did not meet early stopping. Best iteration is:
[998]	train's l1: 15802.2	valid's l1: 17079.5
LGBM Fold RMSE: 68946.49628175847
Training fold 3/5 with LGBM
Training until validation scores don't improve for 200 rounds
[300]	train's l1: 16438.3	valid's l1: 17230.3
[600]	train's l1: 16069.1	valid

In [27]:
def objective_lgb(trial):
    lgb_params = {
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-4, 1e-1),
        "max_depth": trial.suggest_int("max_depth", 5, 50),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-4, 1.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-4, 10.0),
        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
        "subsample": trial.suggest_uniform("subsample", 0.2, 1.0),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.2, 1.0),
        "n_estimators": 1000,
        "random_state": 42,
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse_scores = []

    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        train_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

        model = lgb.train(
            lgb_params, train_data, valid_sets=[val_data], callbacks=callbacks
        )

        y_pred = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_scores.append(rmse)

    return np.mean(rmse_scores)


def objective_cat(trial):

    cat_params = {
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-4, 1e-1),
        "depth": trial.suggest_int("depth", 5, 16),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1e-4, 10.0),
        "iterations": 1000,
        "random_strength": trial.suggest_int("random_strength", 0, 100),
        "cat_features": cat_cols,
        "random_seed": 42,
        "early_stopping_rounds": 200,
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse_scores_cat = []

    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        train_pool = Pool(X_train, y_train, cat_features=cat_cols)
        val_pool = Pool(X_val, y_val, cat_features=cat_cols)

        model_cat = CatBoostRegressor(**cat_params)
        model_cat.fit(train_pool, eval_set=val_pool, verbose=300)

        y_pred_cat = model_cat.predict(X_val)
        rmse_cat = np.sqrt(mean_squared_error(y_val, y_pred_cat))
        rmse_scores_cat.append(rmse_cat)

    return np.mean(rmse_scores_cat)


# study_lgb = optuna.create_study(direction='minimize')
# study_lgb.optimize(objective_lgb, n_trials=20)

# print("Best LGBM Parameters: ", study_lgb.best_params)
# print("Best LGBM RMSE: ", study_lgb.best_value)

study_cat = optuna.create_study(direction="minimize")
study_cat.optimize(objective_cat, n_trials=2)

print("Best CatBoost Parameters: ", study_cat.best_params)
print("Best CatBoost RMSE: ", study_cat.best_value)

[I 2024-09-20 17:55:42,108] A new study created in memory with name: no-name-b7f2c601-a3c5-453a-8b89-5b4c2b295b90


0:	learn: 78271.5121586	test: 80956.3568315	best: 80956.3568315 (0)	total: 195ms	remaining: 3m 15s
300:	learn: 77944.7374520	test: 80653.3429998	best: 80653.3429998 (300)	total: 29s	remaining: 1m 7s
600:	learn: 77640.0799157	test: 80370.9085719	best: 80370.9085719 (600)	total: 1m 1s	remaining: 41s
900:	learn: 77348.7376757	test: 80101.3291578	best: 80101.3291578 (900)	total: 1m 34s	remaining: 10.4s
999:	learn: 77257.6136877	test: 80016.9282979	best: 80016.9282979 (999)	total: 1m 46s	remaining: 0us

bestTest = 80016.9283
bestIteration = 999

0:	learn: 79871.6403507	test: 74442.9697014	best: 74442.9697014 (0)	total: 138ms	remaining: 2m 17s
300:	learn: 79553.3524038	test: 74110.2940732	best: 74110.2940732 (300)	total: 34.1s	remaining: 1m 19s
600:	learn: 79244.0403813	test: 73788.5536980	best: 73788.5536980 (600)	total: 1m 10s	remaining: 46.7s
900:	learn: 78955.0778985	test: 73489.5517375	best: 73489.5517375 (900)	total: 1m 46s	remaining: 11.7s
999:	learn: 78867.5558194	test: 73398.4130237

[I 2024-09-20 18:06:21,731] Trial 0 finished with value: 77769.52059564216 and parameters: {'learning_rate': 0.00015614070239840496, 'depth': 12, 'l2_leaf_reg': 0.0001887853156450769, 'random_strength': 52}. Best is trial 0 with value: 77769.52059564216.


0:	learn: 78252.1213641	test: 80937.1410648	best: 80937.1410648 (0)	total: 48ms	remaining: 47.9s
300:	learn: 74123.5468055	test: 77050.3657130	best: 77050.3657130 (300)	total: 12.7s	remaining: 29.6s
600:	learn: 73063.1846480	test: 76095.9347017	best: 76095.9347017 (600)	total: 25s	remaining: 16.6s
900:	learn: 72728.4546341	test: 75812.3518431	best: 75812.3518431 (900)	total: 37.3s	remaining: 4.1s
999:	learn: 72662.2249597	test: 75760.7924337	best: 75760.7924337 (999)	total: 41s	remaining: 0us

bestTest = 75760.79243
bestIteration = 999

0:	learn: 79852.4587857	test: 74422.8665399	best: 74422.8665399 (0)	total: 46.6ms	remaining: 46.5s
300:	learn: 75783.9190716	test: 70080.8744574	best: 70080.8744574 (300)	total: 12.7s	remaining: 29.6s
600:	learn: 74795.7808397	test: 69055.0471290	best: 69055.0471290 (600)	total: 25.3s	remaining: 16.8s
900:	learn: 74465.3034407	test: 68729.3410130	best: 68729.3410130 (900)	total: 37.7s	remaining: 4.15s
999:	learn: 74398.4205296	test: 68667.4837370	best: 

[I 2024-09-20 18:09:53,580] Trial 1 finished with value: 73232.18593712663 and parameters: {'learning_rate': 0.004690966916533772, 'depth': 5, 'l2_leaf_reg': 0.0008867775971736063, 'random_strength': 52}. Best is trial 1 with value: 73232.18593712663.


Best CatBoost Parameters:  {'learning_rate': 0.004690966916533772, 'depth': 5, 'l2_leaf_reg': 0.0008867775971736063, 'random_strength': 52}
Best CatBoost RMSE:  73232.18593712663


In [29]:
lgb_params_1 = {
    "learning_rate": 0.08762612216851398,
    "max_depth": 19,
    "reg_alpha": 0.00821522329400186,
    "reg_lambda": 1.2490975172659358,
    "num_leaves": 22,
    "subsample": 0.2996060388743155,
    "colsample_bytree": 0.20643426461715494,
    "n_estimators": 1000,
    "random_state": 42,
}

cat_params = {
    "learning_rate": 0.004690966916533772,
    "depth": 5,
    "l2_leaf_reg": 0.0008867775971736063,
    "random_strength": 52,
    "cat_features": cat_cols,
    "random_seed": 42,
    "early_stopping_rounds": 200,
    "iterations": 1000,
}

In [30]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

rmse_scores = []
rmse_scores_cat = []
LGBM_model = []
CAT_model = []

callbacks = [log_evaluation(period=150), early_stopping(stopping_rounds=200)]

for train_index, val_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    model_1 = lgb.train(
        lgb_params_1,
        train_data,
        valid_sets=[train_data, val_data],
        valid_names=["train", "valid"],
        callbacks=callbacks,
    )
    # model_2 = lgb.train(lgb_params_2,
    #               train_data,
    #               valid_sets=[train_data, val_data],
    #               valid_names=['train', 'valid'],
    #               callbacks=callbacks
    #               )

    LGBM_model.append(model_1)
    # LGBM_model.append(model_2)

    y_pred = model_1.predict(X_val)  # * 0.5 + model_2.predict(X_val) * 0.5

    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores.append(rmse)

    print(f"LGBM Fold RMSE: {rmse}")

    model_cat = CatBoostRegressor(**cat_params)

    train_pool = Pool(X_train, y_train, cat_features=cat_cols)
    val_pool = Pool(X_val, y_val, cat_features=cat_cols)
    model_cat.fit(train_pool, eval_set=val_pool, verbose=300)

    CAT_model.append(model_cat)
    y_pred_cat = model_cat.predict(X_val)
    rmse_cat = np.sqrt(mean_squared_error(y_val, y_pred_cat))
    rmse_scores_cat.append(rmse_cat)

    print(f"CAT Fold RMSE: {rmse_cat}")


print(f"Mean LGBM RMSE: {np.mean(rmse_scores)}")
print(f"Mean CAT RMSE: {np.mean(rmse_scores_cat)}")

Training until validation scores don't improve for 200 rounds
[150]	train's l2: 4.6422e+09	valid's l2: 5.66571e+09
Early stopping, best iteration is:
[61]	train's l2: 4.89177e+09	valid's l2: 5.64553e+09
LGBM Fold RMSE: 75136.71436413504
0:	learn: 78252.1213641	test: 80937.1410648	best: 80937.1410648 (0)	total: 46ms	remaining: 45.9s
300:	learn: 74123.5468055	test: 77050.3657130	best: 77050.3657130 (300)	total: 8.24s	remaining: 19.1s
600:	learn: 73063.1846480	test: 76095.9347017	best: 76095.9347017 (600)	total: 16.1s	remaining: 10.7s
900:	learn: 72728.4546341	test: 75812.3518431	best: 75812.3518431 (900)	total: 24s	remaining: 2.63s
999:	learn: 72662.2249597	test: 75760.7924337	best: 75760.7924337 (999)	total: 26.4s	remaining: 0us

bestTest = 75760.79243
bestIteration = 999

CAT Fold RMSE: 75760.79243372563
Training until validation scores don't improve for 200 rounds
[150]	train's l2: 4.87247e+09	valid's l2: 4.62313e+09
Early stopping, best iteration is:
[57]	train's l2: 5.16451e+09	vali

In [31]:
LGBM_preds = np.zeros(len(test))
for model in LGBM_model:
    LGBM_preds += model.predict(test) / len(LGBM_model)

In [32]:
CAT_preds = np.zeros(len(test))
for model in CAT_model:
    CAT_preds += model.predict(test) / len(CAT_model)

In [33]:
test_preds = LGBM_preds * 0.8 + CAT_preds * 0.2

In [35]:
sample_sub["price"] = test_preds
sample_sub.to_csv("submission.csv", index=False)
sample_sub.head()

Unnamed: 0,id,price
0,188533,20609.843644
1,188534,78275.057864
2,188535,59836.34754
3,188536,29589.435553
4,188537,30674.214075
