In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta, date

from pathlib import Path
import gc

import holidays

from mlforecast import MLForecast
from mlforecast.lag_transforms import RollingMean, RollingStd
from lightgbm import LGBMRegressor

import optuna
from optuna.samplers import TPESampler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_dir = Path("../Test-Task-for-DS-time-series-forecasting-2026-01/data")
prepared = data_dir / "data_prepared.parquet"

print("prepared:", prepared.exists())

prepared: True


In [3]:
df = pd.read_parquet(prepared)
print(df.shape, "\n")
df.head()

(350730, 15) 



Unnamed: 0,unique_id,ds,store_id,cat_id,dept_id,y,day_of_week,month,day_of_month,year,week_of_year,is_weekend,holiday_type,is_any_holiday,all_holiday_name
0,0_FOODS_1_0,2014-01-01,0,FOODS,FOODS_1,23,2,1,1,2014,1,0,National,1,NewYear | New Year's Day
1,0_FOODS_1_0,2014-01-02,0,FOODS,FOODS_1,28,3,1,2,2014,1,0,,0,
2,0_FOODS_1_0,2014-01-03,0,FOODS,FOODS_1,43,4,1,3,2014,1,0,,0,
3,0_FOODS_1_0,2014-01-04,0,FOODS,FOODS_1,33,5,1,4,2014,1,1,,0,
4,0_FOODS_1_0,2014-01-05,0,FOODS,FOODS_1,32,6,1,5,2014,1,1,,0,


In [4]:
df["store_id"] = df["store_id"].astype("category")
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 350730 entries, 0 to 350729
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   unique_id         350730 non-null  category      
 1   ds                350730 non-null  datetime64[us]
 2   store_id          350730 non-null  category      
 3   cat_id            350730 non-null  category      
 4   dept_id           350730 non-null  category      
 5   y                 350730 non-null  int64         
 6   day_of_week       350730 non-null  int32         
 7   month             350730 non-null  int32         
 8   day_of_month      350730 non-null  int32         
 9   year              350730 non-null  int32         
 10  week_of_year      350730 non-null  int16         
 11  is_weekend        350730 non-null  int8          
 12  holiday_type      350730 non-null  category      
 13  is_any_holiday    350730 non-null  int8          
 14  all_holiday_nam

In [5]:
gc.collect()

31

In [6]:
def add_lag_rolling_features(df, id_col: str='unique_id', data: str="ds", y_col: str="y"):
    d = df.copy()

    d = d.sort_values([id_col, data])

    g = d.groupby(id_col, sort=False)[y_col]

    # add lags
    d["lag_1"]  = g.shift(1)
    d["lag_7"]  = g.shift(7)
    d["lag_14"] = g.shift(14)
    d["lag_28"] = g.shift(28)
    # базова серія “тільки минуле” для ролінгів (щоб не потрапив поточний день)
    y_past = g.shift(1)

    d["roll_mean_7"]  = y_past.rolling(7).mean()
    d["roll_mean_14"] = y_past.rolling(14).mean()
    d["roll_mean_28"] = y_past.rolling(28).mean()

    d["roll_std_7"] = y_past.rolling(7).std()

    return d

In [7]:
df = add_lag_rolling_features(df)

In [8]:
lag_cols = ["lag_1","lag_7","lag_14","lag_28","roll_mean_7","roll_mean_14","roll_mean_28","roll_std_7"]

In [9]:
def drop_na_lag_rows(df_part, lag_cols=lag_cols):
    return df_part.dropna(subset=lag_cols).copy()

In [10]:
# folds for train
h=7
fold_train_ends = [
    '2015-05-15',
    '2016-02-15',
    '2016-03-15',
    '2016-04-15'
]

In [11]:
def iter_time_folds(df, data: str="ds", train_end_list=fold_train_ends, pred_horizont: int=7):
    d = df.copy()

    # data_min = d[data].min()
    # data_max = d[data].max()

    for fold, data_end in enumerate(train_end_list):
        train_end = pd.Timestamp(data_end)

        val_start = train_end + pd.Timedelta(days=1)
        val_end = train_end + pd.Timedelta(days=pred_horizont)

        train_mask = d[data] <=train_end
        val_mask = (d[data] >=val_start) & (d[data] <= val_end)

        train_idx = d.index[train_mask].to_numpy()
        val_idx = d.index[val_mask].to_numpy()

        yield {
            "fold": fold,
            "train_end": train_end,
            'val_start': val_start,
            'val_end': val_end,
            'train_idx': train_idx,
            "val_idx": val_idx
        }

In [None]:
def add_lag_rolling_features(df, id_col: str='unique_id', data: str="ds", y_col: str="y"):
    d = df.copy()

    d = d.sort_values([id_col, data])

    g = d.groupby(id_col, sort=False)[y_col]

    # add lags
    d["lag_1"]  = g.shift(1)
    d["lag_7"]  = g.shift(7)
    d["lag_14"] = g.shift(14)
    d["lag_28"] = g.shift(28)
    # базова серія “тільки минуле” для ролінгів (щоб не потрапив поточний день)
    y_past = g.shift(1)

    d["roll_mean_7"]  = y_past.rolling(7).mean()
    d["roll_mean_14"] = y_past.rolling(14).mean()
    d["roll_mean_28"] = y_past.rolling(28).mean()

    d["roll_std_7"] = y_past.rolling(7).std()

    return d

In [12]:
def show_folds(df, folds, data: str = "ds", id_col: str='unique_id'):
    rows = []

    for fold in folds:
        train_idx = fold['train_idx']
        val_idx = fold['val_idx']

        train_rows = len(train_idx)
        val_rows = len(val_idx)

        train_unique = df.loc[train_idx, id_col].nunique()
        val_unique = df.loc[val_idx, id_col].nunique()

        val_days = df.loc[val_idx, data].nunique()

        rows.append({
            "fold": fold['fold'],
            'train_end': fold["train_end"].date(),
            "val_start": fold["val_start"].date(),
            "val_end": fold["val_end"].date(),
            "train_rows": train_rows,
            'val_rows': val_rows,
            'train_unique': train_unique,
            'val_unique': val_unique,
            'val_days': val_days
        })

    return pd.DataFrame(rows).sort_values("train_end").reset_index(drop=True)

In [13]:
folds = list(iter_time_folds(df))
summary_folds = show_folds(df, folds)
summary_folds

Unnamed: 0,fold,train_end,val_start,val_end,train_rows,val_rows,train_unique,val_unique,val_days
0,0,2015-05-15,2015-05-16,2015-05-22,202500,2835,405,405,7
1,1,2016-02-15,2016-02-16,2016-02-22,314280,2835,405,405,7
2,2,2016-03-15,2016-03-16,2016-03-22,326025,2835,405,405,7
3,3,2016-04-15,2016-04-16,2016-04-22,338580,2835,405,405,7


In [14]:
def fit_uid_stats(df, train_idx, id_col: str='unique_id', y_col: str="y"):

    train = df.loc[train_idx, [id_col, y_col]].copy()

    global_mean = train[y_col].mean()

    uid_mean = train.groupby(id_col)[y_col].mean()

    return {
        'global_mean': global_mean,
        'uid_mean': uid_mean
    }



In [15]:
def apply_uid_stats(df, train_idx, stats, id_col: str='unique_id'):
    part = df.loc[train_idx].copy()
    part['uid_y_mean_past'] = part[id_col].map(stats['uid_mean']).fillna(stats['global_mean'])

    return part

In [16]:
fold = folds[0]
stats = fit_uid_stats(df,fold['train_idx'])
train_fold = apply_uid_stats(df, fold['train_idx'], stats)
val_fold = apply_uid_stats(df, fold['val_idx'], stats)

In [17]:
train_fold.info()

<class 'pandas.DataFrame'>
Index: 202500 entries, 0 to 350363
Data columns (total 24 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   unique_id         202500 non-null  category      
 1   ds                202500 non-null  datetime64[us]
 2   store_id          202500 non-null  category      
 3   cat_id            202500 non-null  category      
 4   dept_id           202500 non-null  category      
 5   y                 202500 non-null  int64         
 6   day_of_week       202500 non-null  int32         
 7   month             202500 non-null  int32         
 8   day_of_month      202500 non-null  int32         
 9   year              202500 non-null  int32         
 10  week_of_year      202500 non-null  int16         
 11  is_weekend        202500 non-null  int8          
 12  holiday_type      202500 non-null  category      
 13  is_any_holiday    202500 non-null  int8          
 14  all_holiday_name  20

In [18]:
feature_cols = [
    "day_of_week", "month", "day_of_month", "year", "week_of_year",
    "is_weekend", "is_any_holiday",
    "uid_y_mean_past",
    "store_id", "cat_id", "dept_id", "all_holiday_name",
] + lag_cols

cat_cols = ["store_id", "cat_id", "dept_id", "all_holiday_name"]


In [19]:
def train_one_fold_lgbm(train_fold, val_fold, feature_cols, cat_cols, y_col: str="y", model_params=None):
    if model_params is None:
        model_params = {
            "n_estimators": 1200,
            "learning_rate": 0.05,
            "num_leaves": 64,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "random_state": 42,
            "n_jobs": -1,
        }

    X_train = train_fold[feature_cols]
    y_train = train_fold[y_col]

    X_val = val_fold[feature_cols]
    y_val = val_fold[y_col]

    model=LGBMRegressor(**model_params)

    model.fit(X_train, 
              y_train,
              categorical_feature=cat_cols
            )

    val_pred = model.predict(X_val)

    return model, val_pred, y_val

In [20]:
fold = folds[0]
stats = fit_uid_stats(df, fold["train_idx"])
train_fold = apply_uid_stats(df, fold["train_idx"], stats)
val_fold = apply_uid_stats(df, fold["val_idx"], stats)

train_fold = drop_na_lag_rows(train_fold)
val_fold = drop_na_lag_rows(val_fold)

model, val_pred, y_val = train_one_fold_lgbm(train_fold, val_fold, feature_cols, cat_cols)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005727 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2160
[LightGBM] [Info] Number of data points in the train set: 191160, number of used features: 20
[LightGBM] [Info] Start training from score 5.865673


In [21]:
def weekly_rmse(val_fold, val_pred, id_col: str='unique_id', y_col: str="y"):
    tmp = val_fold[[id_col, y_col]].copy()

    tmp["y_pred"] = val_pred

    agg = tmp.groupby(id_col, as_index=False).agg(
        y_true_week=(y_col, "sum"),
        y_pred_week=("y_pred", "sum"),
    )

    rmse = float(np.sqrt(np.mean((agg["y_true_week"] - agg["y_pred_week"]) ** 2)))
    return rmse, agg

In [22]:
rmse, agg_week = weekly_rmse(val_fold, val_pred)
rmse

13.139155740706773

In [23]:
def cv_rmse_over_folds(df, folds, feature_cols, cat_cols):
    fold_scores = []

    for fold in folds:
        stats = fit_uid_stats(df, fold["train_idx"])

        train_fold = apply_uid_stats(df, fold["train_idx"], stats)
        val_fold   = apply_uid_stats(df, fold["val_idx"], stats)

        train_fold = drop_na_lag_rows(train_fold)
        val_fold   = drop_na_lag_rows(val_fold)

        model, val_pred, y_val = train_one_fold_lgbm(train_fold, val_fold, feature_cols, cat_cols)

        rmse, _ = weekly_rmse(val_fold, val_pred)

        fold_scores.append({
            "fold": fold["fold"],
            "train_end": fold["train_end"].date(),
            "rmse": rmse
        })

    mean_rmse = float(np.mean([x["rmse"] for x in fold_scores]))

    return mean_rmse, pd.DataFrame(fold_scores)


In [24]:
mean_rmse, fold_table = cv_rmse_over_folds(df, folds, feature_cols, cat_cols)
mean_rmse
fold_table


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005795 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2160
[LightGBM] [Info] Number of data points in the train set: 191160, number of used features: 20
[LightGBM] [Info] Start training from score 5.865673
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007036 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2154
[LightGBM] [Info] Number of data points in the train set: 302940, number of used features: 20
[LightGBM] [Info] Start training from score 6.007916
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009866 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno

Unnamed: 0,fold,train_end,rmse
0,0,2015-05-15,13.139156
1,1,2016-02-15,9.140966
2,2,2016-03-15,11.264226
3,3,2016-04-15,11.15816


In [26]:
gc.collect()

37

In [29]:
def objective(trial, df, folds, feature_cols, cat_cols):
    model_params = {
        "n_estimators": trial.suggest_int("n_estimators", 600, 4000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 31, 255),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 200),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),

        "random_state": 42,
        "n_jobs": -1,
        "verbose": -1

    }

    rmses = []


    for fold in folds:
        stats = fit_uid_stats(df, fold["train_idx"])

        train_fold = apply_uid_stats(df, fold["train_idx"], stats)
        val_fold   = apply_uid_stats(df, fold["val_idx"], stats)

        train_fold = drop_na_lag_rows(train_fold)
        val_fold   = drop_na_lag_rows(val_fold)

        model, val_pred, y_val = train_one_fold_lgbm(
            train_fold, val_fold,
            feature_cols, cat_cols,
            model_params=model_params
        )

        rmse, _ = weekly_rmse(val_fold, val_pred)
        rmses.append(rmse)

        trial.report(float(np.mean(rmses)), step=len(rmses))
        if trial.should_prune():
            raise optuna.TrialPruned()

    return float(np.mean(rmses))


In [31]:
study = optuna.create_study(direction="minimize")
study.optimize(lambda trial: objective(trial, df, folds, feature_cols, cat_cols), n_trials=10)

print("Best RMSE:", study.best_value)
print("Best params:", study.best_params)


[32m[I 2026-01-30 15:21:31,013][0m A new study created in memory with name: no-name-d0f84836-9ef6-46ce-a5f7-0e54d1d29eb8[0m
[32m[I 2026-01-30 15:25:14,835][0m Trial 0 finished with value: 12.189020362974748 and parameters: {'n_estimators': 3326, 'learning_rate': 0.05333845616529666, 'num_leaves': 165, 'min_child_samples': 55, 'subsample': 0.6845880147811638, 'colsample_bytree': 0.6337806692398213, 'reg_alpha': 1.748171049141858e-05, 'reg_lambda': 2.904962197901249e-08}. Best is trial 0 with value: 12.189020362974748.[0m
[32m[I 2026-01-30 15:28:49,772][0m Trial 1 finished with value: 11.171605109928914 and parameters: {'n_estimators': 2249, 'learning_rate': 0.01018812976117726, 'num_leaves': 205, 'min_child_samples': 122, 'subsample': 0.9194129566873828, 'colsample_bytree': 0.9932398006283248, 'reg_alpha': 9.097908535056818e-05, 'reg_lambda': 0.0025507838519316845}. Best is trial 1 with value: 11.171605109928914.[0m
[32m[I 2026-01-30 15:31:38,093][0m Trial 2 finished with val

Best RMSE: 10.925261048568775
Best params: {'n_estimators': 1002, 'learning_rate': 0.028560603821448094, 'num_leaves': 139, 'min_child_samples': 191, 'subsample': 0.9772738131085273, 'colsample_bytree': 0.947078893699306, 'reg_alpha': 0.5381365747659211, 'reg_lambda': 0.11064476419765833}


In [33]:
gc.collect()

0

In [42]:
# 1) cutoff
TRAIN_END = pd.Timestamp("2016-05-15")
PRED_START = pd.Timestamp("2016-05-16")
PRED_END   = pd.Timestamp("2016-05-22")

# 2) фінальні параметри моделі
best_params = {
    "n_estimators": 1002,
    "learning_rate": 0.028560603821448094,
    "num_leaves": 139,
    "min_child_samples": 191,
    "subsample": 0.9772738131085273,
    "colsample_bytree": 0.947078893699306,
    "reg_alpha": 0.5381365747659211,
    "reg_lambda": 0.11064476419765833,
}

model_params = dict(best_params)
model_params.update({
    "random_state": 42,
    "n_jobs": -1,
    "verbose": -1,
    "force_row_wise": True,
})

# 3) списки колонок (як у нас)
LAG_COLS = [
    "lag_1","lag_7","lag_14","lag_28",
    "roll_mean_7","roll_mean_14","roll_mean_28","roll_std_7",
]
cat_cols = ["store_id","cat_id","dept_id","all_holiday_name"]
feature_cols = [
    "day_of_week","month","day_of_month","year","week_of_year",
    "is_weekend","is_any_holiday",
    "uid_y_mean_past",
    "store_id","cat_id","dept_id","all_holiday_name",
] + LAG_COLS

# 4) train-mask і uid stats (на всьому train до cutoff)
train_mask = df["ds"] <= TRAIN_END
train_idx = df.index[train_mask].to_numpy()

stats = fit_uid_stats(df, train_idx)
train_full = apply_uid_stats(df, train_idx, stats)

# 5) прибираємо NaN лагів у train (це ті самі перші 28 днів у 2014)
train_full = drop_na_lag_rows(train_full)

# 6) fit
X_train = train_full[feature_cols]
y_train = train_full["y"]

final_model = LGBMRegressor(**model_params)
final_model.fit(X_train, y_train, categorical_feature=cat_cols)


0,1,2
,boosting_type,'gbdt'
,num_leaves,139
,max_depth,-1
,learning_rate,0.028560603821448094
,n_estimators,1002
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [43]:
cat_cols = ["store_id", "cat_id", "dept_id", "all_holiday_name"]

train_categories = {
    col: train_full[col].cat.categories
    for col in cat_cols
}

In [44]:
def align_categoricals(df_part, train_categories, cat_cols):
    d = df_part.copy()
    for col in cat_cols:
        # робимо саме category і ставимо ті самі categories, що в train
        d[col] = pd.Categorical(d[col], categories=train_categories[col])
    return d


In [46]:
TRAIN_END  = pd.Timestamp("2016-05-15")
PRED_START = pd.Timestamp("2016-05-16")
PRED_END   = pd.Timestamp("2016-05-22")

id_col = "unique_id"
data   = "ds"
y_col  = "y"

lag_cols = ["lag_1","lag_7","lag_14","lag_28","roll_mean_7","roll_mean_14","roll_mean_28","roll_std_7"]

# --- 1) hist: історія до cutoff (для лагів) ---
hist = df.loc[df[data] <= TRAIN_END, [id_col, data, y_col]].copy()
hist = hist.sort_values([id_col, data])

# --- 2) future_base: створюємо 405×7 рядків на 16–22 з “статичних” колонок ---
# Беремо статичні поля (store_id/cat_id/dept_id) з останнього відомого дня для кожного unique_id
static_cols = ["store_id", "cat_id", "dept_id"]
last_static = (
    df.loc[df[data] == TRAIN_END, [id_col] + static_cols]
      .drop_duplicates(subset=[id_col])
)

# Каркас майбутніх дат
future_days = pd.date_range(PRED_START, PRED_END, freq="D")
future_base = last_static.assign(key=1).merge(
    pd.DataFrame({data: future_days, "key": 1}),
    on="key",
    how="outer"
).drop(columns=["key"])

# --- 3) додаємо календарні фічі з дати ---
future_base["day_of_week"]  = future_base[data].dt.dayofweek.astype("int32")
future_base["month"]        = future_base[data].dt.month.astype("int32")
future_base["day_of_month"] = future_base[data].dt.day.astype("int32")
future_base["year"]         = future_base[data].dt.year.astype("int32")
future_base["week_of_year"] = future_base[data].dt.isocalendar().week.astype("int16")
future_base["is_weekend"]   = (future_base["day_of_week"] >= 5).astype("int8")

# --- 4) свята: якщо є окрема таблиця/мапа — підключи тут.
# Мінімальний safe-default:
future_base["is_any_holiday"] = np.int8(0)
future_base["all_holiday_name"] = "None"

# Якщо у тебе є таблиця з holiday по даті (наприклад, holiday_df з колонками ds/is_any_holiday/all_holiday_name) —
# зроби merge тут і ці 2 колонки будуть реальні.

# --- 5) uid_y_mean_past: по stats, порахованих до 15.05 (вони вже у тебе є) ---
future_base["uid_y_mean_past"] = future_base[id_col].map(stats["uid_mean"]).fillna(stats["global_mean"])

# --- 6) ітеративний прогноз 7 днів ---
pred_rows = []

for day in future_days:
    base_day = future_base.loc[future_base[data] == day].copy()

    # combined = hist + “порожній” day, щоб лаги порахувались саме для цього day
    day_stub = base_day[[id_col, data]].copy()
    day_stub[y_col] = np.nan

    combined = pd.concat([hist, day_stub], ignore_index=True)
    combined = add_lag_rolling_features(combined, id_col=id_col, data=data, y_col=y_col)

    lags_day = combined.loc[combined[data] == day, [id_col, data] + lag_cols].copy()
    base_day = base_day.merge(lags_day, on=[id_col, data], how="left")

    # якщо раптом NaN (не повинно бути), прибираємо
    base_day = base_day.dropna(subset=lag_cols)

    base_day = align_categoricals(base_day, train_categories, cat_cols)

    # прогноз
    y_pred = final_model.predict(base_day[feature_cols])
    base_day["y_pred"] = y_pred

    pred_rows.append(base_day[[id_col, data, "y_pred"]])

    # додаємо прогноз у hist як y для наступного дня
    hist = pd.concat(
        [hist, base_day[[id_col, data, "y_pred"]].rename(columns={"y_pred": y_col})],
        ignore_index=True
    )

pred_df = pd.concat(pred_rows, ignore_index=True)

# --- 7) тижнева сума і submission ---
sub = pred_df.groupby(id_col, as_index=False)["y_pred"].sum().rename(columns={"y_pred": "y"})

print(sub.shape)
sub['y'] = sub['y'].clip(lower=0).round().astype('int64')
sub = sub.rename(columns={'unique_id': 'index'})
sub.head()


(405, 2)


Unnamed: 0,index,y
0,0_FOODS_1_0,33
1,0_FOODS_1_1,19
2,0_FOODS_1_10,27
3,0_FOODS_1_11,20
4,0_FOODS_1_13,69


In [47]:
sub = sub.rename(columns={'unique_id': 'index'})
sub.to_csv(data_dir / "submission_new.csv", index=False)
print("Saved:", sub.shape, sub.columns.tolist())

Saved: (405, 2) ['index', 'y']
