In [32]:
import numpy as np
import pandas as pd
import os, gc, datetime, pickle, warnings
import pickle

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit

import lightgbm as lgb

from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as sf

pd.DataFrame.iteritems = pd.DataFrame.items

warnings.filterwarnings('ignore')

In [33]:
dir_ = 'D:/Github/knowledge/time-series/data/daikin/sell-out/'

raw_data_dir = dir_
processed_data_dir = dir_+'processed/'
model_data_dir = dir_+'model/'
predict_data_dir = dir_+'predict/'

os.makedirs(raw_data_dir, exist_ok=True)
os.makedirs(processed_data_dir, exist_ok=True)

SEED = 42
TARGET_COL = 'qty'
DATE_COL = 'date'
P_HORIZON = 12
STEP = 4

group_columns = ['item','location_name']

COMMON_LAGS = [26,39,52,78,104]
LAGS = list(range(STEP,STEP*3+P_HORIZON+1))
ROLL_LAGS = [4,8,12,16,20,26,39,52]
ROLL_WINDOWS = [4,8,12,16,26,39,52]
EWM_LAGS = [4,8,12,16,20,26,39,52]
EWM_ALPHAS = [0.99, 0.95, 0.9, 0.8, 0.5, 0.2, 0.1]
ENC_COLS = [
    ['item'],
    # ['location_name'],
    ['item','location_name'],
]

In [34]:
# remove_features = [
#     'item',
#     'location_name'
# ] + [TARGET_COL, DATE_COL]
# features = [c for c in df.columns if c not in remove_features]

In [35]:
spark = SparkSession.builder \
    .appName("Training Models Week v1") \
    .master("local[*]") \
    .config("spark.driver.memory", "64g") \
    .getOrCreate()

spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [36]:
df = spark.read.csv(f'{processed_data_dir}features_data.csv', header=True, inferSchema=True)
df = df.withColumn('partition_key', sf.col('location_name'))
# df = df.withColumn('partition_key', sf.concat(sf.col('location_name'),sf.lit('_'),sf.col('item')))
# partition_keys = df.filter('date < "2024-01-01"').select('partition_key').drop_duplicates().toPandas()['partition_key'].to_list()
# df = df.filter(sf.col('partition_key').isin(partition_keys))
inputs = df.repartition(spark.sparkContext.defaultParallelism, 'partition_key').cache()

In [37]:
other_features = [
    'is_holiday',
    'is_offline',
    'year',
    'week_of_year',
    'release',
    # 'location_name_le',
    'item_le',
 ]

stages = []
for i, start in enumerate(range(1, P_HORIZON + 1, STEP), 1):
    end = start + STEP - 1
    if end > P_HORIZON:
        end = P_HORIZON  # To ensure end doesn't exceed P_HORIZON

    _lags = list(range(i*STEP, i*STEP+STEP*3+1,1)) + COMMON_LAGS
    _lag_cols = [f'{TARGET_COL}_lag_{lag}' for lag in _lags]
    _roll_cols = []
    for lag in [l for l in ROLL_LAGS if l>=i*STEP]:
        for window in ROLL_WINDOWS:
            _roll_cols.append(f'{TARGET_COL}_roll_mean_lag_{lag}_win_{window}')
            _roll_cols.append(f'{TARGET_COL}_roll_std_lag_{lag}_win_{window}')
    _ewm_cols = []
    for lag in [l for l in EWM_LAGS if l>=i*STEP]:
        for alpha in EWM_ALPHAS:
            _ewm_cols.append(f'{TARGET_COL}_ewm_lag_{lag}_alpha_{str(alpha).replace(".", "")}')
    _enc_cols = []
    for col in ENC_COLS:
        col_name = '_'.join(col)
        _enc_cols.append(f'enc_{col_name}_mean')
        _enc_cols.append(f'enc_{col_name}_std')
    stages.append({
        "stage": i, "start": start, "end": end, 
        "other_features": other_features,
        "lag_features": _lag_cols,
        "roll_features": _roll_cols,
        "ewm_features": _ewm_cols,
        "enc_features": _enc_cols,
    })

# print(stages)

In [38]:
params = {
    "stage_1": {
        "boost_from_average": False,
        "boosting_type": "gbdt",
        "feature_fraction": 0.4,
        "learning_rate": 0.05,
        "max_bin": 100,
        "metric": "rmse",
        "min_data_in_leaf": 2 ** 12 - 1,
        "num_leaves": 2 ** 11 - 1,
        "n_estimators": 1500,
        "objective": "tweedie",
        "subsample": 0.6,
        "subsample_freq": 1,
        "tweedie_variance_power": 1.2,
        "verbose": -1,
    },
    "stage_2": {
        "boost_from_average": False,
        "boosting_type": "gbdt",
        "feature_fraction": 0.6,
        "learning_rate": 0.05,
        "max_bin": 150,
        "metric": "rmse",
        "min_data_in_leaf": 2 ** 12 - 1,
        "num_leaves": 2 ** 11 - 1,
        "n_estimators": 1500,
        "objective": "tweedie",
        "subsample": 0.6,
        "subsample_freq": 5,
        "tweedie_variance_power": 1.1,
        "verbose": -1,
    },
    "stage_3": {
        "boost_from_average": False,
        "boosting_type": "gbdt",
        "feature_fraction": 0.6,
        "learning_rate": 0.05,
        "max_bin": 100,
        "metric": "rmse",
        "min_data_in_leaf": 2 ** 12 - 1,
        "num_leaves": 2 ** 11 - 1,
        "n_estimators": 1500,
        "objective": "tweedie",
        "subsample": 0.6,
        "subsample_freq": 1,
        "tweedie_variance_power": 1.2,
        "verbose": -1,
    },
}


params_adjust = {
    "stage_1": {
        "boost_from_average": False,
        "boosting_type": "gbdt",
        "feature_fraction": 0.4,
        "learning_rate": 0.05,
        "max_bin": 100,
        "metric": "rmse",
        "min_data_in_leaf": 2 ** 12 - 1,
        "num_leaves": 2 ** 11 - 1,
        "n_estimators": 1500,
        "objective": "tweedie",
        "subsample": 0.6,
        "subsample_freq": 1,
        "tweedie_variance_power": 1.1,
        "verbose": -1,
    },
    "stage_2": {
        "boost_from_average": False,
        "boosting_type": "gbdt",
        "feature_fraction": 0.4,
        "learning_rate": 0.05,
        "max_bin": 100,
        "metric": "rmse",
        "min_data_in_leaf": 2 ** 12 - 1,
        "num_leaves": 2 ** 11 - 1,
        "n_estimators": 1500,
        "objective": "tweedie",
        "subsample": 0.6,
        "subsample_freq": 1,
        "tweedie_variance_power": 1.1,
        "verbose": -1,
    },
    "stage_3": {
        "boost_from_average": False,
        "boosting_type": "gbdt",
        "feature_fraction": 0.5,
        "learning_rate": 0.05,
        "max_bin": 150,
        "metric": "rmse",
        "min_data_in_leaf": 2 ** 12 - 1,
        "num_leaves": 2 ** 11 - 1,
        "n_estimators": 1500,
        "objective": "tweedie",
        "subsample": 0.6,
        "subsample_freq": 1,
        "tweedie_variance_power": 1.1,
        "verbose": -1,
    },
}


In [39]:
def get_forecast(df: pd.DataFrame ) -> pd.DataFrame:
    
    label_cols = ['item']
    # label_cols = ['location_name']
    le = LabelEncoder()
    for c in label_cols:
        df[f'{c}_le'] = le.fit_transform(df[c])

    CUT_OFF = df[DATE_COL].max() - pd.Timedelta(weeks=P_HORIZON)
    partition_key = df.loc[df.index.min(),'partition_key']
    list_predict_pd = []
    for stage in stages:
        features = [c for k in stage.keys() if k.endswith('features') for c in stage.get(k)]
        stage_number, start, end = stage.get('stage'), stage.get('start'), stage.get('end')
        cut_off_start = CUT_OFF + pd.Timedelta(weeks=start)
        cut_off_end = CUT_OFF + pd.Timedelta(weeks=end)

        idx_train, idx_test = (df[DATE_COL]<=CUT_OFF), (df[DATE_COL]>=cut_off_start) & (df[DATE_COL]<=cut_off_end)
        X, y = df[features], df[TARGET_COL]
        X_train, y_train = X[idx_train], y[idx_train]
        X_test, y_test = X[idx_test], y[idx_test]

        best_params = params.get(f'stage_{stage_number}')
        # best_params = {
        #         'boosting_type': 'gbdt',
        #         'objective': 'tweedie',
        #         'tweedie_variance_power': 1.1,
        #         'metric': 'rmse',
        #         'subsample': 0.5,
        #         'subsample_freq': 1,
        #         'learning_rate': 0.03,
        #         'num_leaves': 2 ** 11 - 1,
        #         'min_data_in_leaf': 2 ** 12 - 1,
        #         'feature_fraction': 0.5,
        #         'max_bin': 100,
        #         'n_estimators': 1500,
        #         'boost_from_average': False,
        #         'verbose': -1,
        # }
        if partition_key == 'Tiền Giang':
            best_params['tweedie_variance_power'] = 1.4
            best_params['learning_rate'] = 0.01
        elif partition_key == 'Bình Dương':
            best_params['tweedie_variance_power'] = 1.1
            best_params['learning_rate'] = 0.15
        else:
            pass
        
        if partition_key in [
"Mỹ Tho",
"Bắc Kạn",
"Lào Cai",
"Tuyên Quang",
"Hà Giang",
"Lai Châu",
"Điện Biên",
"Yên Bái",
"Sơn La",
"Bắc Giang",
"Cao Bằng",
"Hòa Bình",
"Hà Nam",
"Hà Tĩnh",
"Kon Tum",
"Lạng Sơn",
"Vĩnh Phúc",
"Thái Nguyên",
"Ninh Bình",
"Quảng Ninh",
"Phú Thọ",
"Hưng Yên",
"Hải Dương",
"Quảng Bình",
"Thái Bình",
"Nam Định",
"Quảng Trị",
"Bắc Ninh",
"Đắk Nông",
"Nghệ An",
"Thanh Hóa",
"Phú Yên",
        ]:
            best_params = params_adjust.get(f'stage_{stage_number}')

        # best_params = {"max_depth": None, "max_features": 0.8, "min_samples_leaf": 5, "min_samples_split": 2, "n_estimators": 200, "max_samples": 0.4}
        # model = RandomForestRegressor(
        #     **best_params,
        #     n_jobs=-1,
        #     random_state=SEED,
        # )
        model = lgb.LGBMRegressor(**best_params,random_state=SEED,n_jobs=-1).fit(X_train, y_train, eval_metric=root_mean_squared_error)

        model.fit(X_train, y_train)

        _model_data_dir = f'{model_data_dir}{partition_key}/'
        os.makedirs(_model_data_dir, exist_ok=True)
        _model_path = f'{_model_data_dir}model_stage_{stage_number}.bin'
        pickle.dump(model, open(_model_path, 'wb'))

        y_pred = model.predict(X_test)
        pred_col = f'{TARGET_COL}_hat'
        predict_pd = pd.DataFrame({
            pred_col : y_pred
            }, index=X_test.index)
        predict_pd = predict_pd.join(df)[group_columns+[DATE_COL,TARGET_COL,pred_col,'partition_key']]
        predict_pd['stage'] = stage_number
        list_predict_pd.append(predict_pd)
        # break
    concat_predict_pd = pd.concat(list_predict_pd)
    return concat_predict_pd

In [40]:
# structure of the dataset returned by the function
result_schema =StructType([
    StructField('item',StringType()),
    StructField('location_name',StringType()),
    StructField('date',TimestampType()),
    StructField('qty', FloatType()),
    StructField('qty_hat', FloatType()),
    StructField('partition_key', StringType()),
    StructField('stage', IntegerType()),
    # StructField('best_params', StringType()),
])
forecast = (
  inputs
    .groupBy('partition_key')
    .applyInPandas(get_forecast, schema=result_schema)
    .withColumn('training_date', sf.current_date())
)
# .show()
# COMMAND ----------
forecast.write.format('parquet').mode('overwrite').save(f'{predict_data_dir}sellout_v9')

In [41]:
predict_df = spark.read.parquet(f'{predict_data_dir}sellout_v9')
predict_pdf = predict_df.toPandas()
# predict_pdf = predict_pdf[predict_pdf['location_name']=='Hồ Chí Minh']

y_actual = predict_pdf.qty
y_predict = predict_pdf.qty_hat
print(root_mean_squared_error(y_actual,y_predict))
print((sum(y_actual), sum(y_predict)))
# 2.562573 
# 2.4085937 v3
# (27887.0, 37678.3501129481)

# 2.3762782 v7
# (27887.0, 32574.525435439835)

# 2.8254292 v9
# (27887.0, 35896.13929265072)

2.7840652
(27887.0, 35498.36761380592)


In [42]:
rmse_results = predict_pdf.groupby('location_name').agg(
    rmse=('qty', lambda x: root_mean_squared_error(x, predict_pdf.loc[x.index, 'qty_hat']))
)

display(rmse_results.sort_values('rmse', ascending=False))

Unnamed: 0_level_0,rmse
location_name,Unnamed: 1_level_1
Hồ Chí Minh,14.984781
Bình Dương,4.208397
Đồng Nai,3.536896
Vĩnh Long,2.427471
Long An,2.426690
...,...
Điện Biên,0.136714
Tuyên Quang,0.114223
Lào Cai,0.108064
Bắc Kạn,0.072358


In [43]:
# df = pd.read_csv(f'{processed_data_dir}features_data.csv')
# df[DATE_COL] = pd.to_datetime(df[DATE_COL])

# CUT_OFF = df[DATE_COL].max() - pd.Timedelta(weeks=P_HORIZON)
# CUT_OFF

df = spark.read.csv(f'{processed_data_dir}features_data.csv', header=True, inferSchema=True)
df = df.withColumn('partition_key', sf.col('location_name'))
df = df.filter("date between '2023-01-01' and '2024-01-02'")
df.select(group_columns+[DATE_COL,TARGET_COL,'partition_key']).write.format('parquet').mode('append').save(f'{predict_data_dir}sellout_v9')

In [42]:
df = pd.read_csv(f'{processed_data_dir}features_data.csv')
df[DATE_COL] = pd.to_datetime(df[DATE_COL])
df['partition_key'] = df['location_name']
df = df[df['location_name']=='Cần Thơ']

import sys
# Get the parent directory
parent_dir = os.path.dirname(os.getcwd())
# Add the parent directory to sys.path
sys.path.append(parent_dir)

from utils import *

df = Util.reduce_mem_usage(df)


label_cols = ['item']
# label_cols = ['item','location_name']
le = LabelEncoder()
for c in label_cols:
    df[f'{c}_le'] = le.fit_transform(df[c])

n_count = df[label_cols].drop_duplicates().shape[0]
step=3
tscv = TimeSeriesSplit(n_splits=3, test_size=n_count*step)

CUT_OFF = df[DATE_COL].max() - pd.Timedelta(weeks=P_HORIZON)
partition_key = df.loc[df.index.min(),'partition_key']
list_predict_pd = []
for stage in stages:
    features = [c for k in stage.keys() if k.endswith('features') for c in stage.get(k)]
    stage_number, start, end = stage.get('stage'), stage.get('start'), stage.get('end')
    cut_off_start = CUT_OFF + pd.Timedelta(weeks=start)
    cut_off_end = CUT_OFF + pd.Timedelta(weeks=end)

    idx_train, idx_test = (df[DATE_COL]<=CUT_OFF), (df[DATE_COL]>=cut_off_start) & (df[DATE_COL]<=cut_off_end)
    X, y = df[features], df[TARGET_COL]
    X_train, y_train = X[idx_train], y[idx_train]
    X_test, y_test = X[idx_test], y[idx_test]

    best_params = {
                'boosting_type': 'gbdt',
                'objective': 'tweedie',
                'tweedie_variance_power': 1.1,
                'metric': 'rmse',
                'subsample': 0.6,
                'subsample_freq': 1,
                'learning_rate': 0.03,
                'num_leaves': 2 ** 11 - 1,
                'min_data_in_leaf': 2 ** 12 - 1,
                'feature_fraction': 0.6,
                'max_bin': 100,
                'n_estimators': 1500,
                'boost_from_average': False,
                'verbose': -1,
        }

    model = lgb.LGBMRegressor(**best_params,random_state=SEED,n_jobs=-1).fit(X_train, y_train, eval_metric=root_mean_squared_error)

    model.fit(X_train, y_train)
    # print(f"Best Parameters: {model.best_params_}")

    _model_data_dir = f'{model_data_dir}{partition_key}/'
    # _model_data_dir = f'{model_data_dir}common/'
    os.makedirs(_model_data_dir, exist_ok=True)
    _model_path = f'{_model_data_dir}lgb_model_stage_{stage_number}.bin'
    pickle.dump(model, open(_model_path, 'wb'))

    y_pred = model.predict(X_test)
    pred_col = f'{TARGET_COL}_hat'
    predict_pd = pd.DataFrame({
        pred_col : y_pred
        }, index=X_test.index)
    predict_pd = predict_pd.join(df)[group_columns+[DATE_COL,TARGET_COL,pred_col,'partition_key']]
    # predict_pd = predict_pd.join(df)[group_columns+[DATE_COL,TARGET_COL,pred_col]]
    predict_pd['stage'] = stage_number
    list_predict_pd.append(predict_pd)
    # break
concat_predict_pd = pd.concat(list_predict_pd)


y_actual = concat_predict_pd.qty
y_predict = concat_predict_pd.qty_hat
print(root_mean_squared_error(y_actual,y_predict))
print((sum(y_actual), sum(y_predict)))
# 1.8911243768582888
# (339.0, 798.9878884064677)


1.638890385974121
(421.0, 743.719299288032)


In [31]:
df = pd.read_csv(f'{processed_data_dir}features_data.csv')
df[DATE_COL] = pd.to_datetime(df[DATE_COL])
df['partition_key'] = df['location_name']
df = df[df['location_name']=='Nam Định']

import sys
# Get the parent directory
parent_dir = os.path.dirname(os.getcwd())
# Add the parent directory to sys.path
sys.path.append(parent_dir)

from utils import *

df = Util.reduce_mem_usage(df)


label_cols = ['item']
# label_cols = ['item','location_name']
le = LabelEncoder()
for c in label_cols:
    df[f'{c}_le'] = le.fit_transform(df[c])

n_count = df[label_cols].drop_duplicates().shape[0]
step=3
tscv = TimeSeriesSplit(n_splits=3, test_size=n_count*step)

CUT_OFF = df[DATE_COL].max() - pd.Timedelta(weeks=P_HORIZON)
partition_key = df.loc[df.index.min(),'partition_key']
list_predict_pd = []
for stage in stages:
    features = [c for k in stage.keys() if k.endswith('features') for c in stage.get(k)]
    stage_number, start, end = stage.get('stage'), stage.get('start'), stage.get('end')
    cut_off_start = CUT_OFF + pd.Timedelta(weeks=start)
    cut_off_end = CUT_OFF + pd.Timedelta(weeks=end)

    idx_train, idx_test = (df[DATE_COL]<=CUT_OFF), (df[DATE_COL]>=cut_off_start) & (df[DATE_COL]<=cut_off_end)
    X, y = df[features], df[TARGET_COL]
    X_train, y_train = X[idx_train], y[idx_train]
    X_test, y_test = X[idx_test], y[idx_test]

    # best_params = {
    #         'boosting_type': 'gbdt',
    #         'objective': 'tweedie',
    #         # 'tweedie_variance_power': 1.1,
    #         'metric': 'rmse',
    #         'subsample': 0.5,
    #         'subsample_freq': 1,
    #         # 'learning_rate': 0.03,
    #         'num_leaves': 2 ** 11 - 1,
    #         'min_data_in_leaf': 2 ** 12 - 1,
    #         'feature_fraction': 0.5,
    #         'max_bin': 100,
    #         'n_estimators': 1500,
    #         'boost_from_average': False,
    #         'verbose': -1,
    #     }
    # param_grid = {
    # 'tweedie_variance_power': [1.0, 1.1, 1.2, 1.4],
    # 'learning_rate': [0.01, 0.05, 0.1, 0.15],
    # }
    param_grid = {
        'boosting_type': ['gbdt'],  # Tweedie with 'gbdt' or 'dart' boosting methods
        'objective': ['tweedie'],
        'tweedie_variance_power': [1.1, 1.2],  # Adjust power for Tweedie distribution
        'metric': ['rmse'],
        'learning_rate': [0.01, 0.03, 0.05],  # Fine-tuning around the default 0.03
        'subsample': [0.4, 0.5, 0.6],  # Control sample size per tree
        'subsample_freq': [1, 5],  # Frequency of subsampling
        'num_leaves': [2 ** 11 - 1],  # Range around current num_leaves
        'min_data_in_leaf': [2 ** 12 - 1],  # Control leaf size
        'feature_fraction': [0.4, 0.5, 0.6],  # Percentage of features used per iteration
        'max_bin': [50, 100, 150],  # Range around current max_bin
        'n_estimators': [1500],  # Control the number of boosting iterations
        'boost_from_average': [False],
        'verbose': [-1]
    }
    # model = lgb.LGBMRegressor(**best_params,random_state=SEED,n_jobs=-1).fit(X_train, y_train, eval_metric=root_mean_squared_error)
    # best_params = {"max_depth": None, "max_features": 0.8, "min_samples_leaf": 5, "min_samples_split": 2, "n_estimators": 200, "max_samples": 0.4}
    # model = RandomForestRegressor(
    #         **best_params,
    #         n_jobs=-1,
    #         random_state=SEED,
    #     )

    # lgb_model = lgb.LGBMRegressor(**best_params,random_state=SEED,n_jobs=-1)
    lgb_model = lgb.LGBMRegressor(random_state=SEED,n_jobs=-1)
    # Perform grid search
    model = GridSearchCV(estimator=lgb_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=tscv, verbose=1)

    model.fit(X_train, y_train)
    print(f"Best Parameters: {model.best_params_}")

    _model_data_dir = f'{model_data_dir}{partition_key}/'
    # _model_data_dir = f'{model_data_dir}common/'
    os.makedirs(_model_data_dir, exist_ok=True)
    _model_path = f'{_model_data_dir}lgb_model_stage_{stage_number}.bin'
    pickle.dump(model, open(_model_path, 'wb'))

    y_pred = model.predict(X_test)
    pred_col = f'{TARGET_COL}_hat'
    predict_pd = pd.DataFrame({
        pred_col : y_pred
        }, index=X_test.index)
    predict_pd = predict_pd.join(df)[group_columns+[DATE_COL,TARGET_COL,pred_col,'partition_key']]
    # predict_pd = predict_pd.join(df)[group_columns+[DATE_COL,TARGET_COL,pred_col]]
    predict_pd['stage'] = stage_number
    list_predict_pd.append(predict_pd)
    # break
concat_predict_pd = pd.concat(list_predict_pd)


y_actual = concat_predict_pd.qty
y_predict = concat_predict_pd.qty_hat
print(root_mean_squared_error(y_actual,y_predict))
print((sum(y_actual), sum(y_predict)))
# 1.9183264231803465
# (421.0, 889.8702807955929)

# Fitting 3 folds for each of 324 candidates, totalling 972 fits
# Best Parameters: {'boost_from_average': False, 'boosting_type': 'gbdt', 'feature_fraction': 0.4, 'learning_rate': 0.05, 'max_bin': 100, 'metric': 'rmse', 'min_data_in_leaf': 4095, 'n_estimators': 1500, 'num_leaves': 2047, 'objective': 'tweedie', 'subsample': 0.6, 'subsample_freq': 1, 'tweedie_variance_power': 1.1, 'verbose': -1}
# Fitting 3 folds for each of 324 candidates, totalling 972 fits
# Best Parameters: {'boost_from_average': False, 'boosting_type': 'gbdt', 'feature_fraction': 0.4, 'learning_rate': 0.05, 'max_bin': 100, 'metric': 'rmse', 'min_data_in_leaf': 4095, 'n_estimators': 1500, 'num_leaves': 2047, 'objective': 'tweedie', 'subsample': 0.6, 'subsample_freq': 1, 'tweedie_variance_power': 1.1, 'verbose': -1}
# Fitting 3 folds for each of 324 candidates, totalling 972 fits
# Best Parameters: {'boost_from_average': False, 'boosting_type': 'gbdt', 'feature_fraction': 0.5, 'learning_rate': 0.05, 'max_bin': 150, 'metric': 'rmse', 'min_data_in_leaf': 4095, 'n_estimators': 1500, 'num_leaves': 2047, 'objective': 'tweedie', 'subsample': 0.6, 'subsample_freq': 1, 'tweedie_variance_power': 1.1, 'verbose': -1}
# 0.37494410377792314
# (69.0, 118.80495014990751)

Fitting 3 folds for each of 324 candidates, totalling 972 fits
Best Parameters: {'boost_from_average': False, 'boosting_type': 'gbdt', 'feature_fraction': 0.4, 'learning_rate': 0.05, 'max_bin': 100, 'metric': 'rmse', 'min_data_in_leaf': 4095, 'n_estimators': 1500, 'num_leaves': 2047, 'objective': 'tweedie', 'subsample': 0.6, 'subsample_freq': 1, 'tweedie_variance_power': 1.1, 'verbose': -1}
Fitting 3 folds for each of 324 candidates, totalling 972 fits
Best Parameters: {'boost_from_average': False, 'boosting_type': 'gbdt', 'feature_fraction': 0.4, 'learning_rate': 0.05, 'max_bin': 100, 'metric': 'rmse', 'min_data_in_leaf': 4095, 'n_estimators': 1500, 'num_leaves': 2047, 'objective': 'tweedie', 'subsample': 0.6, 'subsample_freq': 1, 'tweedie_variance_power': 1.1, 'verbose': -1}
Fitting 3 folds for each of 324 candidates, totalling 972 fits
Best Parameters: {'boost_from_average': False, 'boosting_type': 'gbdt', 'feature_fraction': 0.5, 'learning_rate': 0.05, 'max_bin': 150, 'metric': 'rm

In [22]:
# from lazypredict.Supervised import LazyRegressor

# reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
# models, predictions = reg.fit(X_train, X_test, y_train, y_test)

# models.sort_values(by='RMSE')

 24%|██▍       | 10/42 [01:10<03:22,  6.32s/it]

GammaRegressor model failed to execute
Some value(s) of y are out of the valid range of the loss 'HalfGammaLoss'.


 40%|████      | 17/42 [17:50<21:19, 51.19s/it]   

Lars model failed to execute
Input contains NaN.


 98%|█████████▊| 41/42 [25:40<00:05,  5.12s/it] 

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.038995 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 45998
[LightGBM] [Info] Number of data points in the train set: 35426, number of used features: 201
[LightGBM] [Info] Start training from score 1.286315


100%|██████████| 42/42 [25:41<00:00, 36.71s/it]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LinearSVR,0.39,0.46,1.17,27.26
LGBMRegressor,0.36,0.44,1.19,1.55
HistGradientBoostingRegressor,0.3,0.38,1.25,2.01
GradientBoostingRegressor,0.29,0.37,1.25,39.66
RandomForestRegressor,0.28,0.36,1.27,15.52
NuSVR,0.28,0.36,1.27,296.81
SVR,0.27,0.35,1.28,25.52
Lasso,0.24,0.33,1.3,0.54
LassoLars,0.24,0.33,1.3,0.38
BaggingRegressor,0.2,0.29,1.33,4.48
