In [73]:
import numpy as np
import pandas as pd
import os, gc, datetime, pickle, warnings
import pickle

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit

from sklearn.svm import LinearSVR
from sklearn.linear_model import TweedieRegressor

import lightgbm as lgb

from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as sf

pd.DataFrame.iteritems = pd.DataFrame.items

warnings.filterwarnings('ignore')

In [74]:
dir_ = 'D:/Github/knowledge/time-series/data/daikin/sell-in/'

raw_data_dir = dir_
processed_data_dir = dir_+'processed/'
model_data_dir = dir_+'model/'
predict_data_dir = dir_+'predict/'

os.makedirs(raw_data_dir, exist_ok=True)
os.makedirs(processed_data_dir, exist_ok=True)

SEED = 42
TARGET_COL = 'qty'
DATE_COL = 'date'
P_HORIZON = 12
STEP = 4

group_columns = ['item','department']

COMMON_LAGS = [26,39,52,78,104]
LAGS = list(range(STEP,STEP*3+P_HORIZON+1))
ROLL_LAGS = [4,8,12,16,20,26,39,52]
ROLL_WINDOWS = [4,8,12,16,26,39,52]
EWM_LAGS = [4,8,12,16,20,26,39,52]
EWM_ALPHAS = [0.99, 0.95, 0.9, 0.8, 0.5, 0.2, 0.1]
ENC_COLS = [
    ['item'],
    ['department'],
    ['item','department'],
]

In [75]:
# remove_features = [
#     'item',
#     'location_name'
# ] + [TARGET_COL, DATE_COL]
# features = [c for c in df.columns if c not in remove_features]

In [76]:
spark = SparkSession.builder \
    .appName("Training Models Week v1") \
    .master("local[*]") \
    .config("spark.driver.memory", "64g") \
    .getOrCreate()

spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [77]:
# df = spark.read.csv(f'{processed_data_dir}features_data.csv', header=True, inferSchema=True)
# df.write.format('parquet').mode('overwrite').save(f'{processed_data_dir}features_data')

In [78]:
df = spark.read.csv(f'{processed_data_dir}features_data.csv', header=True, inferSchema=True)
# df = spark.read.parquet(f'{processed_data_dir}features_data')
df = df.filter('date < "2024-04-01"')
df = df.withColumn('partition_key', sf.col('department'))
# df = df.withColumn('partition_key', sf.concat(sf.col('department'),sf.lit('_'),sf.col('item')))
partition_keys = df.filter('date < "2024-01-01"').select('partition_key').drop_duplicates().toPandas()['partition_key'].to_list()
df = df.filter(sf.col('partition_key').isin(partition_keys))
inputs = df.repartition(spark.sparkContext.defaultParallelism, 'partition_key').cache()

In [79]:
other_features = [
    'is_holiday',
    'is_offline',
    'year',
    'week_of_year',
    'release',
    'department_le',
    'item_le',
 ]

stages = []
for i, start in enumerate(range(1, P_HORIZON + 1, STEP), 1):
    end = start + STEP - 1
    if end > P_HORIZON:
        end = P_HORIZON  # To ensure end doesn't exceed P_HORIZON

    _lags = list(range(i*STEP, i*STEP+STEP*3+1,1)) + COMMON_LAGS
    _lag_cols = [f'{TARGET_COL}_lag_{lag}' for lag in _lags]
    _roll_cols = []
    for lag in [l for l in ROLL_LAGS if l>=i*STEP]:
        for window in ROLL_WINDOWS:
            _roll_cols.append(f'{TARGET_COL}_roll_mean_lag_{lag}_win_{window}')
            _roll_cols.append(f'{TARGET_COL}_roll_std_lag_{lag}_win_{window}')
    _ewm_cols = []
    for lag in [l for l in EWM_LAGS if l>=i*STEP]:
        for alpha in EWM_ALPHAS:
            _ewm_cols.append(f'{TARGET_COL}_ewm_lag_{lag}_alpha_{str(alpha).replace(".", "")}')
    _enc_cols = []
    for col in ENC_COLS:
        col_name = '_'.join(col)
        _enc_cols.append(f'enc_{col_name}_mean')
        _enc_cols.append(f'enc_{col_name}_std')
    stages.append({
        "stage": i, "start": start, "end": end, 
        "other_features": other_features,
        "lag_features": _lag_cols,
        "roll_features": _roll_cols,
        "ewm_features": _ewm_cols,
        "enc_features": _enc_cols,
    })

# print(stages)

In [None]:
params = {
    "stage_1": {
        "boost_from_average": False,
        "boosting_type": "gbdt",
        "feature_fraction": 0.4,
        "learning_rate": 0.05,
        "max_bin": 100,
        "metric": "rmse",
        "min_data_in_leaf": 2 ** 12 - 1,
        "num_leaves": 2 ** 11 - 1,
        "n_estimators": 1500,
        "objective": "tweedie",
        "subsample": 0.6,
        "subsample_freq": 1,
        "tweedie_variance_power": 1.2,
        "verbose": -1,
    },
    "stage_2": {
        "boost_from_average": False,
        "boosting_type": "gbdt",
        "feature_fraction": 0.6,
        "learning_rate": 0.05,
        "max_bin": 150,
        "metric": "rmse",
        "min_data_in_leaf": 2 ** 12 - 1,
        "num_leaves": 2 ** 11 - 1,
        "n_estimators": 1500,
        "objective": "tweedie",
        "subsample": 0.6,
        "subsample_freq": 5,
        "tweedie_variance_power": 1.1,
        "verbose": -1,
    },
    "stage_3": {
        "boost_from_average": False,
        "boosting_type": "gbdt",
        "feature_fraction": 0.6,
        "learning_rate": 0.05,
        "max_bin": 100,
        "metric": "rmse",
        "min_data_in_leaf": 2 ** 12 - 1,
        "num_leaves": 2 ** 11 - 1,
        "n_estimators": 1500,
        "objective": "tweedie",
        "subsample": 0.6,
        "subsample_freq": 1,
        "tweedie_variance_power": 1.2,
        "verbose": -1,
    },
}


In [None]:
def get_forecast(df: pd.DataFrame ) -> pd.DataFrame:
    
    label_cols = ['item']
    # label_cols = ['item','department']
    le = LabelEncoder()
    for c in label_cols:
        df[f'{c}_le'] = le.fit_transform(df[c])

    CUT_OFF = df[DATE_COL].max() - pd.Timedelta(weeks=P_HORIZON)
    partition_key = df.loc[df.index.min(),'partition_key']
    list_predict_pd = []
    for stage in stages:
        features = [c for k in stage.keys() if k.endswith('features') for c in stage.get(k)]
        stage_number, start, end = stage.get('stage'), stage.get('start'), stage.get('end')
        cut_off_start = CUT_OFF + pd.Timedelta(weeks=start)
        cut_off_end = CUT_OFF + pd.Timedelta(weeks=end)

        idx_train, idx_test = (df[DATE_COL]<=CUT_OFF), (df[DATE_COL]>=cut_off_start) & (df[DATE_COL]<=cut_off_end)
        X, y = df[features], df[TARGET_COL]
        X_train, y_train = X[idx_train], y[idx_train]
        X_test, y_test = X[idx_test], y[idx_test]

        # best_params = params.get(f'stage_{stage_number}')
        best_params = {
            'boosting_type': 'gbdt',
            'objective': 'tweedie',
            'tweedie_variance_power': 1.1,
            'metric': 'rmse',
            'subsample': 0.4,
            'subsample_freq': 1,
            'learning_rate': 0.01,
            'num_leaves': 2 ** 11 - 1,
            'min_data_in_leaf': 2 ** 12 - 1,
            'feature_fraction': 0.4,
            'max_bin': 50,
            'n_estimators': 1500,
            'boost_from_average': False,
            'verbose': -1,
        }
        
        model = lgb.LGBMRegressor(**best_params,random_state=SEED,n_jobs=-1).fit(X_train, y_train, eval_metric=root_mean_squared_error)
        model.fit(X_train, y_train)

        _model_data_dir = f'{model_data_dir}{partition_key}/'
        os.makedirs(_model_data_dir, exist_ok=True)
        _model_path = f'{_model_data_dir}lgb_model_stage_{stage_number}.bin'
        pickle.dump(model, open(_model_path, 'wb'))

        y_train_lgb = model.predict(X_train)
        y_residuals = y_train-y_train_lgb

        best_params_2 = {"max_depth": None, "max_features": 0.5, "min_samples_leaf": 50, "min_samples_split": 20, "n_estimators": 200, "max_samples": 0.5}
        model_2 = RandomForestRegressor(
            **best_params_2,
            n_jobs=-1,
            random_state=SEED,
        )
        model_2.fit(X_train, y_residuals)

        _model_data_dir = f'{model_data_dir}{partition_key}/'
        os.makedirs(_model_data_dir, exist_ok=True)
        _model_path = f'{_model_data_dir}rf_model_stage_{stage_number}.bin'
        pickle.dump(model_2, open(_model_path, 'wb'))


        y_pred_1 = model.predict(X_test)
        y_pred_2 = model_2.predict(X_test)
        y_pred = y_pred_1 + y_pred_2
        pred_col = f'{TARGET_COL}_hat'
        predict_pd = pd.DataFrame({
            pred_col : y_pred
            }, index=X_test.index)
        predict_pd.loc[predict_pd[pred_col] < 0, pred_col] = 0
        predict_pd = predict_pd.join(df)[group_columns+[DATE_COL,TARGET_COL,pred_col,'partition_key']]
        predict_pd['stage'] = stage_number
        list_predict_pd.append(predict_pd)
        # break
    concat_predict_pd = pd.concat(list_predict_pd)
    return concat_predict_pd

In [None]:
# structure of the dataset returned by the function
result_schema =StructType([
    StructField('item',StringType()),
    StructField('department',StringType()),
    StructField('date',TimestampType()),
    StructField('qty', FloatType()),
    StructField('qty_hat', FloatType()),
    # StructField('qty_actual', FloatType()),
    StructField('partition_key', StringType()),
    StructField('stage', IntegerType()),
    # StructField('best_params', StringType()),
])
forecast = (
  inputs
    .groupBy('partition_key')
    .applyInPandas(get_forecast, schema=result_schema)
    .withColumn('training_date', sf.current_date())
)
# .show()
# COMMAND ----------

In [None]:
predict_df = spark.read.parquet(f'{predict_data_dir}sellin_v6')
predict_pdf = predict_df.toPandas()

# predict_pdf = get_forecast(inputs.toPandas())

y_actual = predict_pdf.qty
y_predict = predict_pdf.qty_hat
print(root_mean_squared_error(y_actual,y_predict))
print(sum(y_actual))
print(sum(y_predict))

# 101.624146
# 65695.0
# 129633.85832388718

rmse_results = predict_pdf.groupby('department').agg(
    rmse=('qty', lambda x: root_mean_squared_error(x, predict_pdf.loc[x.index, 'qty_hat']))
)

display(rmse_results.sort_values('rmse', ascending=False))

In [64]:
dmx_df = inputs.toPandas()
dmx_df = dmx_df[dmx_df['partition_key'] == 'O2X']

def get_forecast(df: pd.DataFrame ) -> pd.DataFrame:
    
    label_cols = ['item']
    # label_cols = ['item','department']
    le = LabelEncoder()
    for c in label_cols:
        df[f'{c}_le'] = le.fit_transform(df[c])

    CUT_OFF = df[DATE_COL].max() - pd.Timedelta(weeks=P_HORIZON)
    partition_key = df.loc[df.index.min(),'partition_key']
    list_predict_pd = []
    for stage in stages:
        features = [c for k in stage.keys() if k.endswith('features') for c in stage.get(k)]
        stage_number, start, end = stage.get('stage'), stage.get('start'), stage.get('end')
        cut_off_start = CUT_OFF + pd.Timedelta(weeks=start)
        cut_off_end = CUT_OFF + pd.Timedelta(weeks=end)

        idx_train, idx_test = (df[DATE_COL]<=CUT_OFF), (df[DATE_COL]>=cut_off_start) & (df[DATE_COL]<=cut_off_end)
        X, y = df[features], df[TARGET_COL]
        X_train, y_train = X[idx_train].fillna(0), y[idx_train]
        X_test, y_test = X[idx_test].fillna(0), y[idx_test]

        # Step 1: Standardize the features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # best_params = params.get(f'stage_{stage_number}')
        best_params = {
            'boosting_type': 'gbdt',
            'objective': 'tweedie',
            'tweedie_variance_power': 1.1,
            'metric': 'rmse',
            'subsample': 0.5,
            'subsample_freq': 1,
            'learning_rate': 0.01,
            'num_leaves': 2 ** 11 - 1,
            'min_data_in_leaf': 2 ** 12 - 1,
            'feature_fraction': 0.5,
            'max_bin': 100,
            'n_estimators': 1500,
            'boost_from_average': False,
            'verbose': -1,
        }
        best_params_rf = {'max_depth': None, 'max_features': 0.5, 'min_samples_leaf': 50, 'min_samples_split': 5, 'n_estimators': 200}

        # model = lgb.LGBMRegressor(**best_params,random_state=SEED,n_jobs=-1).fit(X_train, y_train, eval_metric=root_mean_squared_error)
        # model = LinearSVR(random_state=SEED)
        model = TweedieRegressor()
        model.fit(X_train, y_train)
        # model.fit(X_train_scaled, y_train)

        _model_data_dir = f'{model_data_dir}{partition_key}/'
        os.makedirs(_model_data_dir, exist_ok=True)
        _model_path = f'{_model_data_dir}lgb_model_stage_{stage_number}.bin'
        pickle.dump(model, open(_model_path, 'wb'))

        y_train_lgb = model.predict(X_train)
        y_residuals = y_train-y_train_lgb

        model_2 = RandomForestRegressor(**best_params_rf, n_jobs=-1, random_state=SEED,)
        model_2.fit(X_train, y_residuals)
        # model_2.fit(X_train_scaled, y_residuals)

        _model_data_dir = f'{model_data_dir}{partition_key}/'
        os.makedirs(_model_data_dir, exist_ok=True)
        _model_path = f'{_model_data_dir}rf_model_stage_{stage_number}.bin'
        pickle.dump(model_2, open(_model_path, 'wb'))


        y_pred_1 = model.predict(X_test)
        y_pred_2 = model_2.predict(X_test)
        # y_pred_1 = model.predict(X_test_scaled)
        # y_pred_2 = model_2.predict(X_test_scaled)
        y_pred = y_pred_1 + y_pred_2
        # y_pred = model.predict(X_test)
        # y_pred = model.predict(X_test_scaled)
        pred_col = f'{TARGET_COL}_hat'
        predict_pd = pd.DataFrame({
            pred_col : y_pred
            }, index=X_test.index)
        predict_pd = predict_pd.join(df)[group_columns+[DATE_COL,TARGET_COL,pred_col,'partition_key']]
        predict_pd['stage'] = stage_number
        list_predict_pd.append(predict_pd)
        # break
    concat_predict_pd = pd.concat(list_predict_pd)
    return concat_predict_pd

predict_pdf = get_forecast(dmx_df)

y_actual = predict_pdf.qty
y_predict = predict_pdf.qty_hat
print(root_mean_squared_error(y_actual,y_predict))
print(sum(y_actual))
print(sum(y_predict))

# 210.03851787142713
# 65689.0
# 91740.41108703778

376.9969506897631
65689.0
150926.23188995893


In [None]:
rmse_results = predict_pdf.groupby('department').agg(
    rmse=('qty', lambda x: root_mean_squared_error(x, predict_pdf.loc[x.index, 'qty_hat']))
)

display(rmse_results.sort_values('rmse', ascending=False))

In [None]:
predict_pdf.to_csv(f'{predict_data_dir}sellin_v2/data.csv', index=False)

In [None]:
# predict_pdf.date.min()
actual_df = inputs.toPandas()
actual_df = actual_df[(actual_df['date']>='2023-01-01') & (actual_df['date']<'2024-06-11')]
actual_df[group_columns+[DATE_COL,TARGET_COL]].to_csv(f'{predict_data_dir}sellin_v2/actual_data.csv', index=False)

In [None]:
df = pd.read_csv(f'{processed_data_dir}features_data.csv')
df[DATE_COL] = pd.to_datetime(df[DATE_COL])

CUT_OFF = df[DATE_COL].max() - pd.Timedelta(weeks=P_HORIZON)
CUT_OFF

# df = spark.read.csv(f'{processed_data_dir}features_data.csv', header=True, inferSchema=True)
# df = df.withColumn('partition_key', sf.col('department'))
# df = df.filter("date between '2023-01-01' and '2024-01-02'")
# df.select(group_columns+[DATE_COL,TARGET_COL,'partition_key']).write.format('parquet').mode('append').save(f'{predict_data_dir}sellin_v1')

In [None]:
df = inputs.toPandas()
# df = pd.read_csv(f'{processed_data_dir}features_data.csv')
# df[DATE_COL] = pd.to_datetime(df[DATE_COL])
# df['partition_key'] = df['location_name']
# df = df[df['location_name']=='Nam Định']
df = df[df['department']=='O2X']

import sys
# Get the parent directory
parent_dir = os.path.dirname(os.getcwd())
# Add the parent directory to sys.path
sys.path.append(parent_dir)

from utils import *

df = Util.reduce_mem_usage(df)


label_cols = ['item']
# label_cols = ['item','department']
le = LabelEncoder()
for c in label_cols:
    df[f'{c}_le'] = le.fit_transform(df[c])

n_count = df[label_cols].drop_duplicates().shape[0]
step=3
tscv = TimeSeriesSplit(n_splits=3, test_size=n_count*step)

CUT_OFF = df[DATE_COL].max() - pd.Timedelta(weeks=P_HORIZON)
partition_key = df.loc[df.index.min(),'partition_key']
list_predict_pd = []
for stage in stages:
    features = [c for k in stage.keys() if k.endswith('features') for c in stage.get(k)]
    stage_number, start, end = stage.get('stage'), stage.get('start'), stage.get('end')
    cut_off_start = CUT_OFF + pd.Timedelta(weeks=start)
    cut_off_end = CUT_OFF + pd.Timedelta(weeks=end)

    idx_train, idx_test = (df[DATE_COL]<=CUT_OFF), (df[DATE_COL]>=cut_off_start) & (df[DATE_COL]<=cut_off_end)
    X, y = df[features], df[TARGET_COL]
    X_train, y_train = X[idx_train], y[idx_train]
    X_test, y_test = X[idx_test], y[idx_test]

    best_params = {
            'boosting_type': 'gbdt',
            'objective': 'tweedie',
            'tweedie_variance_power': 1.1,
            'metric': 'rmse',
            'subsample': 0.4,
            'subsample_freq': 1,
            'learning_rate': 0.01,
            'num_leaves': 2 ** 11 - 1,
            'min_data_in_leaf': 2 ** 12 - 1,
            'feature_fraction': 0.4,
            'max_bin': 50,
            'n_estimators': 1500,
            'boost_from_average': False,
            'verbose': -1,
        }
        
    model = lgb.LGBMRegressor(**best_params,random_state=SEED,n_jobs=-1).fit(X_train, y_train, eval_metric=root_mean_squared_error)
    model.fit(X_train, y_train)

    y_train_lgb = model.predict(X_train)
    y_residuals = y_train-y_train_lgb

    # param_grid = {
    #     'boosting_type': ['gbdt'],  # Tweedie with 'gbdt' or 'dart' boosting methods
    #     'objective': ['tweedie'],
    #     'tweedie_variance_power': [1.1, 1.2],  # Adjust power for Tweedie distribution
    #     'metric': ['rmse'],
    #     'learning_rate': [0.01, 0.03, 0.05, 0.1],  # Fine-tuning around the default 0.03
    #     'subsample': [0.4, 0.5, 0.6],  # Control sample size per tree
    #     'subsample_freq': [1, 5],  # Frequency of subsampling
    #     'num_leaves': [2 ** 11 - 1],  # Range around current num_leaves
    #     'min_data_in_leaf': [2 ** 12 - 1],  # Control leaf size
    #     'feature_fraction': [0.4, 0.5, 0.6],  # Percentage of features used per iteration
    #     'max_bin': [50, 100, 150],  # Range around current max_bin
    #     'n_estimators': [1500],  # Control the number of boosting iterations
    #     'boost_from_average': [False],
    #     'verbose': [-1]
    # }
    # model = lgb.LGBMRegressor(**best_params,random_state=SEED,n_jobs=-1).fit(X_train, y_train, eval_metric=root_mean_squared_error)
    # best_params = {"max_depth": None, "max_features": 0.8, "min_samples_leaf": 5, "min_samples_split": 2, "n_estimators": 200, "max_samples": 0.4}
    # model = RandomForestRegressor(
    #         **best_params,
    #         n_jobs=-1,
    #         random_state=SEED,
    #     )

    # lgb_model = lgb.LGBMRegressor(**best_params,random_state=SEED,n_jobs=-1)
    # lgb_model = lgb.LGBMRegressor(random_state=SEED,n_jobs=-1)
    # # Perform grid search
    # model = GridSearchCV(estimator=lgb_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=tscv, verbose=1)

    # best_params_2 = {"max_depth": None, "max_features": 0.8, "min_samples_leaf": 5, "min_samples_split": 2, "n_estimators": 200, "max_samples": 0.4}
    param_grid = {
    'n_estimators': [200, 300],
    'max_depth': [None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [2, 4, 10],
    'max_features': ['auto', 'sqrt', 'log2', 0.5, 0.8, 0.9]
    }
    rf_model = RandomForestRegressor(random_state=SEED,n_jobs=-1)
    # Perform grid search
    model_2 = GridSearchCV(estimator=rf_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=tscv, verbose=1)

    model_2.fit(X_train, y_residuals)
    print(f"Best Parameters: {model_2.best_params_}")

    y_pred_1 = model.predict(X_test)
    y_pred_2 = model_2.predict(X_test)
    y_pred = y_pred_1 + y_pred_2
    # y_pred = model.predict(X_test)
    pred_col = f'{TARGET_COL}_hat'
    predict_pd = pd.DataFrame({
        pred_col : y_pred
        }, index=X_test.index)
    predict_pd = predict_pd.join(df)[group_columns+[DATE_COL,TARGET_COL,pred_col,'partition_key']]
    # predict_pd = predict_pd.join(df)[group_columns+[DATE_COL,TARGET_COL,pred_col]]
    predict_pd['stage'] = stage_number
    list_predict_pd.append(predict_pd)
    # break
concat_predict_pd = pd.concat(list_predict_pd)


y_actual = concat_predict_pd.qty
y_predict = concat_predict_pd.qty_hat
print(root_mean_squared_error(y_actual,y_predict))
print((sum(y_actual), sum(y_predict)))
# 1.9183264231803465
# (421.0, 889.8702807955929)


In [81]:
dmx_df = inputs.toPandas()

# label_cols = ['item']
label_cols = ['item','department']
le = LabelEncoder()
for c in label_cols:
    dmx_df[f'{c}_le'] = le.fit_transform(dmx_df[c])

stage = stages[1]

features = [c for k in stage.keys() if k.endswith('features') for c in stage.get(k)]
stage_number, start, end = stage.get('stage'), stage.get('start'), stage.get('end')

c_cut_off = CUT_OFF + pd.Timedelta(weeks=-2)

idx_train, idx_test = (dmx_df[DATE_COL]<=CUT_OFF), (dmx_df[DATE_COL]>CUT_OFF)
X, y = dmx_df[features], dmx_df[TARGET_COL]
X_train, y_train = X[idx_train], y[idx_train]
X_test, y_test = X[idx_test], y[idx_test]

In [None]:
from lazypredict.Supervised import LazyRegressor

reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

models.sort_values(by='RMSE')

 24%|██▍       | 10/42 [03:03<08:06, 15.20s/it]

GammaRegressor model failed to execute
Some value(s) of y are out of the valid range of the loss 'HalfGammaLoss'.


 26%|██▌       | 11/42 [03:05<05:44, 11.11s/it]

GaussianProcessRegressor model failed to execute
Unable to allocate 312. GiB for an array with shape (204757, 204757) and data type float64


 38%|███▊      | 16/42 [06:19<08:41, 20.07s/it]

KernelRidge model failed to execute
Unable to allocate 312. GiB for an array with shape (204757, 204757) and data type float64


 60%|█████▉    | 25/42 [10:05<15:33, 54.89s/it]