In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta, date

from sklearn.model_selection import train_test_split

from pathlib import Path

import holidays
import gc

from mlforecast import MLForecast
from mlforecast.lag_transforms import RollingMean, RollingStd
from lightgbm import LGBMRegressor

# import torch

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print("Using device:", device)

In [2]:
data_dir = Path("../Test-Task-for-DS-time-series-forecasting-2026-01/data")
prepared = data_dir / "data_prepared.csv"

print("prepared:", prepared.exists())

prepared: True


### Some prepare

In [3]:
df = pd.read_csv(prepared)
print(df.shape, "\n")
df.head()

(350730, 13) 



Unnamed: 0,unique_id,ds,y,is_observed,day_of_week,month,day_of_month,year,week_of_year,is_weekend,holiday_type,is_any_holiday,all_holiday_name
0,0_FOODS_1_0,2014-01-01,23,1,2,1,1,2014,1,0,National,1,NewYear | New Year's Day
1,0_FOODS_1_0,2014-01-02,28,1,3,1,2,2014,1,0,,0,
2,0_FOODS_1_0,2014-01-03,43,1,4,1,3,2014,1,0,,0,
3,0_FOODS_1_0,2014-01-04,33,1,5,1,4,2014,1,1,,0,
4,0_FOODS_1_0,2014-01-05,32,1,6,1,5,2014,1,1,,0,


In [4]:
df['ds'] = pd.to_datetime(df['ds'])
df["all_holiday_name"] = df["all_holiday_name"].fillna("None")
df["holiday_type"] = df["holiday_type"].fillna("None")
df['unique_id'] = df['unique_id'].astype('category')
df['holiday_type'] = df['holiday_type'].astype('category')
df['all_holiday_name'] = df['all_holiday_name'].astype('category')
df[["holiday_type", "all_holiday_name"]].isna().sum()
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 350730 entries, 0 to 350729
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   unique_id         350730 non-null  category      
 1   ds                350730 non-null  datetime64[us]
 2   y                 350730 non-null  int64         
 3   is_observed       350730 non-null  int64         
 4   day_of_week       350730 non-null  int64         
 5   month             350730 non-null  int64         
 6   day_of_month      350730 non-null  int64         
 7   year              350730 non-null  int64         
 8   week_of_year      350730 non-null  int64         
 9   is_weekend        350730 non-null  int64         
 10  holiday_type      350730 non-null  category      
 11  is_any_holiday    350730 non-null  int64         
 12  all_holiday_name  350730 non-null  category      
dtypes: category(3), datetime64[us](1), int64(9)
memory usage: 28.1 MB


## MLForecast + LightGBM

### Validation

In [5]:
cut_off_date = pd.Timestamp('2016-05-08') 
train_df = df[df['ds'] <= cut_off_date].copy()
valid_df = df[(df['ds'] > cut_off_date) & (df['ds'] <= pd.Timestamp('2016-05-15'))].copy()

In [6]:
lgbm = LGBMRegressor(n_estimators=1000, 
                     learning_rate=0.05,
                     num_leaves=64,
                     colsample_bytree=0.8,
                     subsample=0.8,
                     random_state=42
                    )

lags = [1, 7, 14, 28]

lag_transforms = {1: [RollingMean(7),
                RollingMean(28),
                RollingStd(7)
                ]
            }


In [7]:
exog_cols = ['is_observed',
             'day_of_week',
             'month',
             'day_of_month',
             'year',
             'week_of_year',
             'is_weekend',
            #  'holiday_type',
             'is_any_holiday',
             'all_holiday_name',
             ]

In [8]:
X_valid = valid_df.drop(columns=['y'])

frct = MLForecast(
    models=[lgbm],
    lags=lags,
    lag_transforms=lag_transforms,
    freq='D'
)
frct.fit(train_df, 
         id_col='unique_id',
         time_col='ds', 
         target_col='y', 
         static_features=[]
        )

pred_valid = frct.predict(h=7, X_df=X_valid)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011179 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1628
[LightGBM] [Info] Number of data points in the train set: 336555, number of used features: 17
[LightGBM] [Info] Start training from score 6.013962


In [9]:
pred_col = [c for c in pred_valid.columns if c not in ['unique_id', 'ds']][0]
check = valid_df.merge(pred_valid, on=['unique_id', 'ds'], how='inner').copy()

check['y_pred'] = check[pred_col].clip(lower=0).round().astype('int64')

weekly = (check
          .groupby('unique_id', as_index=False)
          .agg(y_true=('y', 'sum'),
               y_pred=('y_pred', 'sum')))

eps = 1e-8
weekly['ape'] = (weekly['y_true'] - weekly['y_pred']).abs() / np.maximum(weekly['y_true'].abs(), eps)

mape_weekly = weekly['ape'].mean() * 100
print("Weekly MAPE (%):", mape_weekly)
print("How many series:", weekly['unique_id'].nunique(), "rows:", len(weekly))


Weekly MAPE (%): 26.927613981704486
How many series: 405 rows: 405


### Prediction

In [None]:
exog_cols = [
    # 'is_observed',
    'day_of_week',
    'month',
    'day_of_month',
    'year',
    'week_of_year',
    'is_weekend',
    'is_any_holiday',
    'all_holiday_name',
]

In [58]:
train_full = df[['unique_id', 'ds', 'y'] + exog_cols].copy()

In [59]:
frct = MLForecast(
    models=[lgbm],
    lags=lags,
    lag_transforms=lag_transforms,
    freq='D'
)
frct.fit(
    train_full,
    id_col='unique_id',
    time_col='ds',
    target_col='y',
    static_features=[]
)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005088 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1592
[LightGBM] [Info] Number of data points in the train set: 339390, number of used features: 14
[LightGBM] [Info] Start training from score 6.018079


MLForecast(models=[LGBMRegressor], freq=D, lag_features=['lag1', 'lag7', 'lag14', 'lag28', 'rolling_mean_lag1_window_size7', 'rolling_mean_lag1_window_size28', 'rolling_std_lag1_window_size7'], date_features=[], num_threads=1)

In [None]:
future = frct.make_future_dataframe(h=7)

# календарні фічі
future['day_of_week']  = future['ds'].dt.dayofweek
future['month']        = future['ds'].dt.month
future['day_of_month'] = future['ds'].dt.day
future['year']         = future['ds'].dt.year
future['week_of_year'] = future['ds'].dt.isocalendar().week.astype('int16')
future['is_weekend']   = future['day_of_week'].isin([5, 6]).astype('int8')

# future['is_observed'] = 0

us_holidays = holidays.US(years=[2016])
future_dates = future['ds'].dt.date
future['us_holiday_name'] = future_dates.map(us_holidays.get)
future['is_any_holiday'] = future['us_holiday_name'].notna().astype('int8')
future['all_holiday_name'] = future['us_holiday_name'].fillna('None').astype('category')

X_future = future[['unique_id', 'ds'] + exog_cols]

pred_7d = frct.predict(h=7, X_df=X_future)


pred_col = [c for c in pred_7d.columns if c not in ['unique_id', 'ds']][0]

submission = (
    pred_7d
    .groupby('unique_id', as_index=False)[pred_col]
    .sum()
    .rename(columns={pred_col: 'y'})
)

submission['y'] = submission['y'].clip(lower=0).round().astype('int64')


In [61]:
submission

Unnamed: 0,unique_id,y
0,0_FOODS_1_0,23
1,0_FOODS_1_1,16
2,0_FOODS_1_10,27
3,0_FOODS_1_11,21
4,0_FOODS_1_13,63
...,...,...
400,3_HOUSEHOLD_2_169,3
401,3_HOUSEHOLD_2_171,5
402,3_HOUSEHOLD_2_177,3
403,3_HOUSEHOLD_2_179,6


In [62]:
sub = submission.rename(columns={'unique_id': 'index'})
sub.to_csv(data_dir / "submission.csv", index=False)
print("Saved:", sub.shape, sub.columns.tolist())

Saved: (405, 2) ['index', 'y']
