In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta, date

from pathlib import Path

import holidays

from mlforecast import MLForecast
from mlforecast.lag_transforms import RollingMean, RollingStd
from lightgbm import LGBMRegressor

import optuna
from optuna.samplers import TPESampler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_dir = Path("../Test-Task-for-DS-time-series-forecasting-2026-01/data")
prepared = data_dir / "data_prepared.csv"

print("prepared:", prepared.exists())

prepared: True


### Some prepare

In [3]:
df = pd.read_csv(prepared)
print(df.shape, "\n")
df.head()

(350730, 12) 



Unnamed: 0,unique_id,ds,y,day_of_week,month,day_of_month,year,week_of_year,is_weekend,holiday_type,is_any_holiday,all_holiday_name
0,0_FOODS_1_0,2014-01-01,23,2,1,1,2014,1,0,National,1,NewYear | New Year's Day
1,0_FOODS_1_0,2014-01-02,28,3,1,2,2014,1,0,,0,
2,0_FOODS_1_0,2014-01-03,43,4,1,3,2014,1,0,,0,
3,0_FOODS_1_0,2014-01-04,33,5,1,4,2014,1,1,,0,
4,0_FOODS_1_0,2014-01-05,32,6,1,5,2014,1,1,,0,


In [4]:
df['ds'] = pd.to_datetime(df['ds'])
df["all_holiday_name"] = df["all_holiday_name"].fillna("None")
df["holiday_type"] = df["holiday_type"].fillna("None")
df['unique_id'] = df['unique_id'].astype('category')
df['holiday_type'] = df['holiday_type'].astype('category')
df['all_holiday_name'] = df['all_holiday_name'].astype('category')
df[["holiday_type", "all_holiday_name"]].isna().sum()
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 350730 entries, 0 to 350729
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   unique_id         350730 non-null  category      
 1   ds                350730 non-null  datetime64[us]
 2   y                 350730 non-null  int64         
 3   day_of_week       350730 non-null  int64         
 4   month             350730 non-null  int64         
 5   day_of_month      350730 non-null  int64         
 6   year              350730 non-null  int64         
 7   week_of_year      350730 non-null  int64         
 8   is_weekend        350730 non-null  int64         
 9   holiday_type      350730 non-null  category      
 10  is_any_holiday    350730 non-null  int64         
 11  all_holiday_name  350730 non-null  category      
dtypes: category(3), datetime64[us](1), int64(8)
memory usage: 25.4 MB


## MLForecast + LightGBM

### Validation

In [5]:
cut_off_date = pd.Timestamp('2016-05-08') 
train_df = df[df['ds'] <= cut_off_date].copy()
valid_df = df[(df['ds'] > cut_off_date) & (df['ds'] <= pd.Timestamp('2016-05-15'))].copy()

In [6]:
lgbm = LGBMRegressor(n_estimators=1100, 
                     learning_rate=0.05,
                     num_leaves=134,
                     colsample_bytree=0.8,
                     subsample=0.9,
                     random_state=42,
                    #  objective='tweedie'
                    )

lags = [1, 7, 14, 28, 56, 112]

lag_transforms = {1: 
                  [
                      RollingMean(3),
                      RollingMean(7),
                      RollingMean(28),
                      RollingMean(56),
                      RollingStd(7),

                  ]
            }

In [7]:
X_valid = valid_df.drop(columns=['y'])

frct = MLForecast(
    models=[lgbm],
    lags=lags,
    lag_transforms=lag_transforms,
    freq='D'
)
frct.fit(train_df, 
         id_col='unique_id',
         time_col='ds', 
         target_col='y', 
         static_features=[]
        )

pred_valid = frct.predict(h=7, X_df=X_valid)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005190 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2505
[LightGBM] [Info] Number of data points in the train set: 302535, number of used features: 20
[LightGBM] [Info] Start training from score 6.033976


In [8]:
pred_col = [c for c in pred_valid.columns if c not in ['unique_id', 'ds']][0]
check = valid_df.merge(pred_valid, on=['unique_id', 'ds'], how='inner').copy()

check['y_pred'] = check[pred_col].clip(lower=0).round().astype('int64')

weekly = (check
          .groupby('unique_id', as_index=False)
          .agg(y_true=('y', 'sum'),
               y_pred=('y_pred', 'sum')))

denom = weekly['y_true'].abs().clip(lower=1)   # якщо y_true=0 -> 1
weekly['ape'] = (weekly['y_true'] - weekly['y_pred']).abs() / denom

mape_weekly = weekly['ape'].mean() * 100
print("Weekly MAPE (%):", mape_weekly)
print("How many series:", weekly['unique_id'].nunique(), "rows:", len(weekly))


Weekly MAPE (%): 47.153374620816116
How many series: 405 rows: 405


### OPTUNA

In [9]:
cut_off_date = pd.Timestamp('2016-05-08') 
train_df = df[df['ds'] <= cut_off_date].copy()
valid_df = df[(df['ds'] > cut_off_date) & (df['ds'] <= pd.Timestamp('2016-05-15'))].copy()

In [10]:
# optuna
def weekly_mape(y_true, y_pred, eps=1e-8):
    mask = y_true > 0
    y_true = y_true[mask]
    y_pred = y_pred[mask]
    if len(y_true) == 0:
        return np.inf
    return np.mean(np.abs(y_true - y_pred) / np.maximum(np.abs(y_true), eps)) * 100

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 800, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 0.02, 0.07),
        "num_leaves": trial.suggest_int("num_leaves", 80, 255),
        "max_depth": trial.suggest_int("max_depth", 5, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 200),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0.0, 1.0),

        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),

        "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 2.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 2.0),

        "random_state": 42,
        "objective": "poisson",
        "n_jobs": -1,
        "verbosity": -1,
    }

    lgbm_opt = LGBMRegressor(**params)
    
    frct_opt = MLForecast(
        models=[lgbm_opt],
        lags=lags,
        lag_transforms=lag_transforms,
        freq='D',
        )
    
    frct_opt.fit(train_df, id_col='unique_id', time_col='ds', target_col='y', static_features=[])

    X_valid = valid_df.drop(columns=['y'])
    pred = frct_opt.predict(h=7, X_df=X_valid)

    pred_col = [c for c in pred.columns if c not in ['unique_id', 'ds']][0]

    pred_adj = pred.copy()
    # pred_adj[pred_col] = pred_adj[pred_col].clip(lower=0).round()
    pred_adj[pred_col] = pred_adj[pred_col].clip(lower=0)
    y_pred_week = pred_adj.groupby('unique_id')[pred_col].sum()
    y_true_week = valid_df.groupby('unique_id')['y'].sum()

    y_true_week = y_true_week.reindex(y_pred_week.index)
    if trial.number == 0:
        fitted_model = frct_opt.models_['LGBMRegressor']
        print("GET_PARAMS:", fitted_model.get_params())
        print("BOOSTER_PARAMS:", fitted_model.booster_.params)

    return weekly_mape(y_true_week.values, y_pred_week.values)


In [11]:
study = optuna.create_study(
    direction="minimize",
    sampler=TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(n_startup_trials=10)
)

study.optimize(objective, n_trials=20, show_progress_bar=True)

print("Best MAPE:", study.best_value)
print("Best params:", study.best_params)


[32m[I 2026-01-29 19:38:24,882][0m A new study created in memory with name: no-name-a3056ff6-754e-4dcd-917a-1437dcd38169[0m
Best trial: 0. Best value: 37.6111:   5%|▌         | 1/20 [00:27<08:43, 27.58s/it]

GET_PARAMS: {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.0675357153204958, 'max_depth': 9, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 1249, 'n_jobs': -1, 'num_leaves': 208, 'objective': 'poisson', 'random_state': 42, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'min_data_in_leaf': 39, 'min_gain_to_split': 0.15599452033620265, 'feature_fraction': 0.6232334448672797, 'bagging_fraction': 0.9464704583099741, 'bagging_freq': 7, 'lambda_l1': 1.416145155592091, 'lambda_l2': 0.041168988591604894, 'verbosity': -1}
BOOSTER_PARAMS: {'boosting_type': 'gbdt', 'colsample_bytree': 1.0, 'learning_rate': 0.0675357153204958, 'max_depth': 9, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'num_leaves': 208, 'random_state': 42, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 1.0, 'subsample_for_bi

Best trial: 0. Best value: 37.6111:  10%|█         | 2/20 [00:44<06:27, 21.54s/it]

[32m[I 2026-01-29 19:39:09,783][0m Trial 1 finished with value: 37.755077011231336 and parameters: {'n_estimators': 1964, 'learning_rate': 0.06162213204002109, 'num_leaves': 117, 'max_depth': 6, 'min_data_in_leaf': 45, 'min_gain_to_split': 0.3042422429595377, 'feature_fraction': 0.8099025726528951, 'bagging_fraction': 0.7727780074568463, 'bagging_freq': 3, 'lambda_l1': 1.223705789444759, 'lambda_l2': 0.27898772130408367}. Best is trial 0 with value: 37.61113308598099.[0m


Best trial: 0. Best value: 37.6111:  15%|█▌        | 3/20 [01:09<06:32, 23.07s/it]

[32m[I 2026-01-29 19:39:34,673][0m Trial 2 finished with value: 37.78772096497939 and parameters: {'n_estimators': 1150, 'learning_rate': 0.03831809216468458, 'num_leaves': 160, 'max_depth': 11, 'min_data_in_leaf': 48, 'min_gain_to_split': 0.5142344384136116, 'feature_fraction': 0.836965827544817, 'bagging_fraction': 0.6185801650879991, 'bagging_freq': 7, 'lambda_l1': 0.34104824737458306, 'lambda_l2': 0.13010318597055903}. Best is trial 0 with value: 37.61113308598099.[0m


Best trial: 0. Best value: 37.6111:  20%|██        | 4/20 [01:32<06:04, 22.78s/it]

[32m[I 2026-01-29 19:39:57,002][0m Trial 3 finished with value: 37.93158836440821 and parameters: {'n_estimators': 1939, 'learning_rate': 0.06828160165372797, 'num_leaves': 222, 'max_depth': 7, 'min_data_in_leaf': 28, 'min_gain_to_split': 0.6842330265121569, 'feature_fraction': 0.7760609974958405, 'bagging_fraction': 0.6488152939379115, 'bagging_freq': 5, 'lambda_l1': 0.06877704223043679, 'lambda_l2': 1.8186408041575641}. Best is trial 0 with value: 37.61113308598099.[0m


Best trial: 4. Best value: 37.2726:  25%|██▌       | 5/20 [01:49<05:13, 20.87s/it]

[32m[I 2026-01-29 19:40:14,482][0m Trial 4 finished with value: 37.2726282145666 and parameters: {'n_estimators': 1110, 'learning_rate': 0.053126114217699104, 'num_leaves': 134, 'max_depth': 9, 'min_data_in_leaf': 114, 'min_gain_to_split': 0.18485445552552704, 'feature_fraction': 0.9878338511058234, 'bagging_fraction': 0.9100531293444458, 'bagging_freq': 10, 'lambda_l1': 1.7896547008552977, 'lambda_l2': 1.1957999576221703}. Best is trial 4 with value: 37.2726282145666.[0m


Best trial: 4. Best value: 37.2726:  30%|███       | 6/20 [02:03<04:17, 18.39s/it]

[32m[I 2026-01-29 19:40:28,071][0m Trial 5 finished with value: 37.71962764642257 and parameters: {'n_estimators': 1907, 'learning_rate': 0.024424625102595975, 'num_leaves': 114, 'max_depth': 5, 'min_data_in_leaf': 72, 'min_gain_to_split': 0.388677289689482, 'feature_fraction': 0.7085396127095583, 'bagging_fraction': 0.9314950036607718, 'bagging_freq': 4, 'lambda_l1': 0.5618690193747615, 'lambda_l2': 1.085392166316497}. Best is trial 4 with value: 37.2726282145666.[0m


Best trial: 4. Best value: 37.2726:  35%|███▌      | 7/20 [02:21<04:00, 18.50s/it]

[32m[I 2026-01-29 19:40:46,792][0m Trial 6 finished with value: 37.48927997140291 and parameters: {'n_estimators': 969, 'learning_rate': 0.06010984903770199, 'num_leaves': 93, 'max_depth': 12, 'min_data_in_leaf': 157, 'min_gain_to_split': 0.1987156815341724, 'feature_fraction': 0.602208846849441, 'bagging_fraction': 0.9261845713819337, 'bagging_freq': 8, 'lambda_l1': 1.4580143360819746, 'lambda_l2': 1.5425406933718915}. Best is trial 4 with value: 37.2726282145666.[0m


Best trial: 4. Best value: 37.2726:  40%|████      | 8/20 [02:37<03:30, 17.54s/it]

[32m[I 2026-01-29 19:41:02,277][0m Trial 7 finished with value: 37.95810693312952 and parameters: {'n_estimators': 888, 'learning_rate': 0.03792328642721363, 'num_leaves': 100, 'max_depth': 11, 'min_data_in_leaf': 129, 'min_gain_to_split': 0.3308980248526492, 'feature_fraction': 0.6254233401144095, 'bagging_fraction': 0.7243929286862648, 'bagging_freq': 4, 'lambda_l1': 1.4592123566761281, 'lambda_l2': 1.2751149427104262}. Best is trial 4 with value: 37.2726282145666.[0m


Best trial: 4. Best value: 37.2726:  45%|████▌     | 9/20 [03:05<03:47, 20.73s/it]

[32m[I 2026-01-29 19:41:30,008][0m Trial 8 finished with value: 37.78660813742956 and parameters: {'n_estimators': 1865, 'learning_rate': 0.043610746258097466, 'num_leaves': 101, 'max_depth': 10, 'min_data_in_leaf': 155, 'min_gain_to_split': 0.5612771975694962, 'feature_fraction': 0.9083868719818244, 'bagging_fraction': 0.7975182385457563, 'bagging_freq': 6, 'lambda_l1': 0.8550820367170993, 'lambda_l2': 0.05083825348819038}. Best is trial 4 with value: 37.2726282145666.[0m


Best trial: 4. Best value: 37.2726:  50%|█████     | 10/20 [03:16<02:56, 17.69s/it]

[32m[I 2026-01-29 19:41:40,910][0m Trial 9 finished with value: 37.94916599128957 and parameters: {'n_estimators': 929, 'learning_rate': 0.02157145928433671, 'num_leaves': 192, 'max_depth': 7, 'min_data_in_leaf': 107, 'min_gain_to_split': 0.907566473926093, 'feature_fraction': 0.69971689165955, 'bagging_fraction': 0.7641531692142519, 'bagging_freq': 8, 'lambda_l1': 0.4575963309832449, 'lambda_l2': 0.15395981965758598}. Best is trial 4 with value: 37.2726282145666.[0m


Best trial: 4. Best value: 37.2726:  55%|█████▌    | 11/20 [03:34<02:42, 18.02s/it]

[32m[I 2026-01-29 19:41:59,664][0m Trial 10 finished with value: 38.15990922975043 and parameters: {'n_estimators': 1665, 'learning_rate': 0.0545943118356076, 'num_leaves': 250, 'max_depth': 8, 'min_data_in_leaf': 192, 'min_gain_to_split': 0.03872519996118312, 'feature_fraction': 0.9892311998902633, 'bagging_fraction': 0.870628951897112, 'bagging_freq': 10, 'lambda_l1': 1.984725330920674, 'lambda_l2': 0.5615219084794083}. Best is trial 4 with value: 37.2726282145666.[0m


Best trial: 4. Best value: 37.2726:  60%|██████    | 12/20 [03:56<02:32, 19.02s/it]

[32m[I 2026-01-29 19:42:20,970][0m Trial 11 finished with value: 37.79373734787671 and parameters: {'n_estimators': 1097, 'learning_rate': 0.05545228559554789, 'num_leaves': 149, 'max_depth': 12, 'min_data_in_leaf': 160, 'min_gain_to_split': 0.01129343399230634, 'feature_fraction': 0.9945076478961321, 'bagging_fraction': 0.8633481755095029, 'bagging_freq': 10, 'lambda_l1': 1.8690089756804005, 'lambda_l2': 1.5816715964425572}. Best is trial 4 with value: 37.2726282145666.[0m


Best trial: 4. Best value: 37.2726:  65%|██████▌   | 13/20 [04:17<02:17, 19.68s/it]

[32m[I 2026-01-29 19:42:42,162][0m Trial 12 finished with value: 37.49563370381783 and parameters: {'n_estimators': 1378, 'learning_rate': 0.052634060011249054, 'num_leaves': 83, 'max_depth': 12, 'min_data_in_leaf': 92, 'min_gain_to_split': 0.22713901165087869, 'feature_fraction': 0.8742665937306486, 'bagging_fraction': 0.9959867842793767, 'bagging_freq': 9, 'lambda_l1': 1.7006306512333405, 'lambda_l2': 1.4571127005915012}. Best is trial 4 with value: 37.2726282145666.[0m


Best trial: 4. Best value: 37.2726:  70%|███████   | 14/20 [04:29<01:43, 17.31s/it]

[32m[I 2026-01-29 19:42:54,017][0m Trial 13 finished with value: 37.76383960422002 and parameters: {'n_estimators': 1051, 'learning_rate': 0.048290087312693346, 'num_leaves': 134, 'max_depth': 9, 'min_data_in_leaf': 146, 'min_gain_to_split': 0.1533541680089895, 'feature_fraction': 0.7437278301599928, 'bagging_fraction': 0.8663373657505511, 'bagging_freq': 1, 'lambda_l1': 1.5930673626707759, 'lambda_l2': 0.7999613987674299}. Best is trial 4 with value: 37.2726282145666.[0m


Best trial: 4. Best value: 37.2726:  75%|███████▌  | 15/20 [04:51<01:34, 18.88s/it]

[32m[I 2026-01-29 19:43:16,531][0m Trial 14 finished with value: 37.510560402299994 and parameters: {'n_estimators': 1460, 'learning_rate': 0.0605281967844935, 'num_leaves': 179, 'max_depth': 10, 'min_data_in_leaf': 189, 'min_gain_to_split': 0.4291662589160654, 'feature_fraction': 0.942800124661297, 'bagging_fraction': 0.9144838196105148, 'bagging_freq': 8, 'lambda_l1': 1.054111543542078, 'lambda_l2': 1.879620073899702}. Best is trial 4 with value: 37.2726282145666.[0m


Best trial: 4. Best value: 37.2726:  80%|████████  | 16/20 [05:03<01:07, 16.83s/it]

[32m[I 2026-01-29 19:43:28,610][0m Trial 15 finished with value: 37.546730125249844 and parameters: {'n_estimators': 815, 'learning_rate': 0.06197696347413638, 'num_leaves': 80, 'max_depth': 10, 'min_data_in_leaf': 120, 'min_gain_to_split': 0.6918774367494049, 'feature_fraction': 0.6812699712112874, 'bagging_fraction': 0.960085934713202, 'bagging_freq': 9, 'lambda_l1': 1.2179165607992086, 'lambda_l2': 0.9829062557868783}. Best is trial 4 with value: 37.2726282145666.[0m


Best trial: 4. Best value: 37.2726:  85%|████████▌ | 17/20 [05:19<00:49, 16.61s/it]

[32m[I 2026-01-29 19:43:44,693][0m Trial 16 finished with value: 37.950396559408276 and parameters: {'n_estimators': 1289, 'learning_rate': 0.04537458820696781, 'num_leaves': 137, 'max_depth': 8, 'min_data_in_leaf': 171, 'min_gain_to_split': 0.1750850405793089, 'feature_fraction': 0.8695692969019595, 'bagging_fraction': 0.8437231918310621, 'bagging_freq': 10, 'lambda_l1': 1.6835722940383384, 'lambda_l2': 1.5488454782953232}. Best is trial 4 with value: 37.2726282145666.[0m


Best trial: 4. Best value: 37.2726:  90%|█████████ | 18/20 [05:41<00:36, 18.05s/it]

[32m[I 2026-01-29 19:44:06,104][0m Trial 17 finished with value: 37.39701805123755 and parameters: {'n_estimators': 989, 'learning_rate': 0.030617396984464963, 'num_leaves': 126, 'max_depth': 11, 'min_data_in_leaf': 78, 'min_gain_to_split': 0.2832228186874048, 'feature_fraction': 0.9369005230609271, 'bagging_fraction': 0.9965108784574399, 'bagging_freq': 8, 'lambda_l1': 0.8926440169000723, 'lambda_l2': 1.2652607855246365}. Best is trial 4 with value: 37.2726282145666.[0m


Best trial: 4. Best value: 37.2726:  95%|█████████▌| 19/20 [06:07<00:20, 20.65s/it]

[32m[I 2026-01-29 19:44:32,817][0m Trial 18 finished with value: 37.358447060656566 and parameters: {'n_estimators': 1571, 'learning_rate': 0.03209951663626511, 'num_leaves': 131, 'max_depth': 11, 'min_data_in_leaf': 72, 'min_gain_to_split': 0.6058846602813074, 'feature_fraction': 0.9381506292218816, 'bagging_fraction': 0.989906016593321, 'bagging_freq': 9, 'lambda_l1': 0.7793570644679071, 'lambda_l2': 0.7087766217163594}. Best is trial 4 with value: 37.2726282145666.[0m


Best trial: 4. Best value: 37.2726: 100%|██████████| 20/20 [06:33<00:00, 19.70s/it]

[32m[I 2026-01-29 19:44:58,823][0m Trial 19 finished with value: 37.46236437579809 and parameters: {'n_estimators': 1583, 'learning_rate': 0.03168347003678216, 'num_leaves': 162, 'max_depth': 9, 'min_data_in_leaf': 10, 'min_gain_to_split': 0.9496289214145974, 'feature_fraction': 0.960630688242106, 'bagging_fraction': 0.8872556922528623, 'bagging_freq': 9, 'lambda_l1': 0.7175802683990034, 'lambda_l2': 0.5726238705881048}. Best is trial 4 with value: 37.2726282145666.[0m
Best MAPE: 37.2726282145666
Best params: {'n_estimators': 1110, 'learning_rate': 0.053126114217699104, 'num_leaves': 134, 'max_depth': 9, 'min_data_in_leaf': 114, 'min_gain_to_split': 0.18485445552552704, 'feature_fraction': 0.9878338511058234, 'bagging_fraction': 0.9100531293444458, 'bagging_freq': 10, 'lambda_l1': 1.7896547008552977, 'lambda_l2': 1.1957999576221703}





In [12]:
best_params = study.best_params
best_mape = study.best_value
print(best_mape)
print(best_params)

37.2726282145666
{'n_estimators': 1110, 'learning_rate': 0.053126114217699104, 'num_leaves': 134, 'max_depth': 9, 'min_data_in_leaf': 114, 'min_gain_to_split': 0.18485445552552704, 'feature_fraction': 0.9878338511058234, 'bagging_fraction': 0.9100531293444458, 'bagging_freq': 10, 'lambda_l1': 1.7896547008552977, 'lambda_l2': 1.1957999576221703}


### Prediction by day

In [33]:
exog_cols = [
    'day_of_week',
    'month',
    'day_of_month',
    'week_of_year',
    'is_weekend',
    'is_any_holiday',
    'all_holiday_name',
]

In [34]:
train_full = df[['unique_id', 'ds', 'y'] + exog_cols].copy()
train_full

Unnamed: 0,unique_id,ds,y,day_of_week,month,day_of_month,week_of_year,is_weekend,is_any_holiday,all_holiday_name
0,0_FOODS_1_0,2014-01-01,23,2,1,1,1,0,1,NewYear | New Year's Day
1,0_FOODS_1_0,2014-01-02,28,3,1,2,1,0,0,
2,0_FOODS_1_0,2014-01-03,43,4,1,3,1,0,0,
3,0_FOODS_1_0,2014-01-04,33,5,1,4,1,1,0,
4,0_FOODS_1_0,2014-01-05,32,6,1,5,1,1,0,
...,...,...,...,...,...,...,...,...,...,...
350725,3_HOUSEHOLD_2_180,2016-05-11,3,2,5,11,19,0,0,
350726,3_HOUSEHOLD_2_180,2016-05-12,2,3,5,12,19,0,0,
350727,3_HOUSEHOLD_2_180,2016-05-13,2,4,5,13,19,0,0,
350728,3_HOUSEHOLD_2_180,2016-05-14,1,5,5,14,19,1,0,


In [35]:
lgbm_best = LGBMRegressor(
    **best_params,
    random_state=42,
    objective='tweedie',
    tweedie_variance_power=1.1,
    n_jobs=-1,
    verbosity=-1,
)

In [36]:
frct = MLForecast(
    models=[lgbm_best],
    lags=lags,
    lag_transforms=lag_transforms,
    freq='D'
)
frct.fit(
    train_full,
    id_col='unique_id',
    time_col='ds',
    target_col='y',
    static_features=[]
)


MLForecast(models=[LGBMRegressor], freq=D, lag_features=['lag1', 'lag2', 'lag4', 'lag8', 'lag12', 'rolling_mean_lag1_window_size4', 'rolling_mean_lag1_window_size8', 'rolling_std_lag1_window_size4'], date_features=[], num_threads=1)

In [37]:
future = frct.make_future_dataframe(h=7)

# календарні фічі
future['day_of_week']  = future['ds'].dt.dayofweek
future['month']        = future['ds'].dt.month
future['day_of_month'] = future['ds'].dt.day
future['year']         = future['ds'].dt.year
future['week_of_year'] = future['ds'].dt.isocalendar().week.astype('int16')
future['is_weekend']   = future['day_of_week'].isin([5, 6]).astype('int8')

us_holidays = holidays.US(years=[2016])
future_dates = future['ds'].dt.date
future['us_holiday_name'] = future_dates.map(us_holidays.get)
future['is_any_holiday'] = future['us_holiday_name'].notna().astype('int8')
future['all_holiday_name'] = future['us_holiday_name'].fillna('None').astype('category')

X_future = future[['unique_id', 'ds'] + exog_cols]

pred_7d = frct.predict(h=7, X_df=X_future)


pred_col = [c for c in pred_7d.columns if c not in ['unique_id', 'ds']][0]

submission = (
    pred_7d
    .groupby('unique_id', as_index=False)[pred_col]
    .sum()
    .rename(columns={pred_col: 'y'})
)

submission['y'] = submission['y'].clip(lower=0).round().astype('int64')


In [38]:
submission

Unnamed: 0,unique_id,y
0,0_FOODS_1_0,26
1,0_FOODS_1_1,10
2,0_FOODS_1_10,16
3,0_FOODS_1_11,19
4,0_FOODS_1_13,65
...,...,...
400,3_HOUSEHOLD_2_169,5
401,3_HOUSEHOLD_2_171,6
402,3_HOUSEHOLD_2_177,4
403,3_HOUSEHOLD_2_179,7


In [39]:
sub = submission.rename(columns={'unique_id': 'index'})
sub.to_csv(data_dir / "submission.csv", index=False)
print("Saved:", sub.shape, sub.columns.tolist())

Saved: (405, 2) ['index', 'y']


### Prediction by week

In [20]:
week_period = df['ds'].dt.to_period('W-SUN')
df['ds_week'] = week_period.apply(lambda p: p.end_time.normalize())

# # Якщо у тебе вже є is_any_holiday по днях — використовуємо
# if 'is_any_holiday' not in df.columns:
#     us_h = holidays.US(years=[2014, 2015, 2016])
#     df['is_any_holiday'] = df['ds'].dt.date.map(lambda d: 1 if d in us_h else 0).astype('int8')

# Агрегація до тижня
weekly = (
    df.groupby(['unique_id', 'ds_week'], as_index=False)
      .agg(
          y=('y', 'sum'),
          holiday_days=('is_any_holiday', 'sum'),     # скільки святкових днів у тижні
          is_holiday_week=('is_any_holiday', 'max')   # чи було хоч 1 свято в тижні
      )
)

# Тижневі календарні фічі
weekly['day_of_week']  = weekly['ds_week'].dt.dayofweek     # неділя = 6
weekly['month']        = weekly['ds_week'].dt.month
weekly['year']         = weekly['ds_week'].dt.year
weekly['week_of_year'] = weekly['ds_week'].dt.isocalendar().week.astype('int16')

weekly = weekly.sort_values(['unique_id', 'ds_week']).reset_index(drop=True)
weekly['unique_id'] = weekly['unique_id'].astype('category')

weekly.head()

Unnamed: 0,unique_id,ds_week,y,holiday_days,is_holiday_week,day_of_week,month,year,week_of_year
0,0_FOODS_1_0,2014-01-05,159,1,1,6,1,2014,1
1,0_FOODS_1_0,2014-01-12,190,1,1,6,1,2014,2
2,0_FOODS_1_0,2014-01-19,87,0,0,6,1,2014,3
3,0_FOODS_1_0,2014-01-26,77,1,1,6,1,2014,4
4,0_FOODS_1_0,2014-02-02,43,1,1,6,2,2014,5


In [21]:
cutoff_week_end = pd.Timestamp('2016-05-08')  # тренуємось до тижня 02-08
valid_week_end  = pd.Timestamp('2016-05-15')  # валід тиждень 09-15

train_w = weekly[weekly['ds_week'] <= cutoff_week_end].copy()
valid_w = weekly[weekly['ds_week'] == valid_week_end].copy()

train_w['ds_week'].max(), valid_w['ds_week'].unique()

(Timestamp('2016-05-08 00:00:00'),
 <DatetimeArray>
 ['2016-05-15 00:00:00']
 Length: 1, dtype: datetime64[us])

In [40]:
exog_cols = [
    'holiday_days',
    'is_holiday_week',
    'day_of_week',
    'month',
    'year',
    'week_of_year',
]

lgbm_best = LGBMRegressor(
    **best_params,
    random_state=42,
    objective='tweedie',
    tweedie_variance_power=1.1,
    n_jobs=-1,
    verbosity=-1,
)

lags = [1, 2, 4, 8, 12]  # 1/2/4/8/12 тижнів назад
lag_transforms = {
    1: [RollingMean(4), RollingMean(8), RollingStd(4)]
}

frct = MLForecast(
    models=[lgbm],
    lags=lags,
    lag_transforms=lag_transforms,
    freq='W-SUN'   # важливо: тижнева частота, кінець НЕДІЛЯ
)

frct.fit(
    train_w[['unique_id','ds_week','y'] + exog_cols],
    id_col='unique_id',
    time_col='ds_week',
    target_col='y',
    static_features=[]
)

MLForecast(models=[LGBMRegressor], freq=W-SUN, lag_features=['lag1', 'lag2', 'lag4', 'lag8', 'lag12', 'rolling_mean_lag1_window_size4', 'rolling_mean_lag1_window_size8', 'rolling_std_lag1_window_size4'], date_features=[], num_threads=1)

In [41]:
def weekly_mape(y_true, y_pred, eps=1e-8):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    mask = y_true > 0
    if mask.sum() == 0:
        return np.inf
    return (np.abs(y_true[mask] - y_pred[mask]) / np.maximum(np.abs(y_true[mask]), eps)).mean() * 100

# X_valid має мати рівно 1 рядок на unique_id на потрібний тиждень (тут 2016-05-15)
X_valid = valid_w[['unique_id','ds_week'] + exog_cols].copy()

pred = frct.predict(h=1, X_df=X_valid)

pred_col = [c for c in pred.columns if c not in ['unique_id','ds_week']][0]
pred['y_pred'] = pred[pred_col].clip(lower=0).round().astype('int64')

check = valid_w.merge(pred[['unique_id','ds_week','y_pred']], on=['unique_id','ds_week'], how='inner')

mape = weekly_mape(check['y'].values, check['y_pred'].values)
print("Weekly MAPE (%):", mape)
print("rows:", len(check), "series:", check['unique_id'].nunique())

Weekly MAPE (%): 39.50626338119678
rows: 405 series: 405


In [42]:
train_full_w = weekly[weekly['ds_week'] <= pd.Timestamp('2016-05-15')].copy()

frct.fit(
    train_full_w[['unique_id','ds_week','y'] + exog_cols],
    id_col='unique_id',
    time_col='ds_week',
    target_col='y',
    static_features=[]
)

future = frct.make_future_dataframe(h=1)  # дасть ds_week = 2016-05-22 для кожного id

# треба додати exog_cols на цей майбутній тиждень
# holiday_days / is_holiday_week по днях тижня 16-22
us_h = holidays.US(years=[2016])

def holiday_days_in_week(week_end):
    # week_end це неділя; тиждень = попередні 6 днів + неділя
    days = pd.date_range(week_end - pd.Timedelta(days=6), week_end, freq='D')
    names = [us_h.get(d.date()) for d in days]
    cnt = sum(n is not None for n in names)
    return cnt

future['holiday_days'] = future['ds_week'].apply(holiday_days_in_week).astype('int16')
future['is_holiday_week'] = (future['holiday_days'] > 0).astype('int8')

future['day_of_week']  = future['ds_week'].dt.dayofweek
future['month']        = future['ds_week'].dt.month
future['year']         = future['ds_week'].dt.year
future['week_of_year'] = future['ds_week'].dt.isocalendar().week.astype('int16')

X_future = future[['unique_id','ds_week'] + exog_cols].copy()

pred_f = frct.predict(h=1, X_df=X_future)
pred_col = [c for c in pred_f.columns if c not in ['unique_id','ds_week']][0]

submission = pred_f[['unique_id', pred_col]].rename(columns={pred_col: 'y'})
submission['y'] = submission['y'].clip(lower=0).round().astype('int64')

sub = submission.rename(columns={'unique_id': 'index'})
sub.to_csv(data_dir / 'submission_by_week.csv', index=False)
sub.head(), sub.shape

(          index   y
 0   0_FOODS_1_0  26
 1   0_FOODS_1_1  14
 2  0_FOODS_1_10  27
 3  0_FOODS_1_11  22
 4  0_FOODS_1_13  67,
 (405, 2))