In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
# Interactive login; the saved cell output shows the credentials widget it renders.
# NOTE(review): presumably required before the competition download below — confirm.
kagglehub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

In [None]:
# Fetch the competition files; DATA_PATH is later used as the directory prefix
# when the CSV files are read.
DATA_PATH = kagglehub.competition_download('haier-europe-2025-datathon')

print('Data source import complete.')

Data source import complete.


In [None]:
!pip install catboost -qq
# !pip install pytabkit -qq

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import polars as pl
import sklearn
import joblib
import warnings


from catboost import CatBoostClassifier, Pool


# Raise the pandas display limits so wide / long frames are shown in full.
pd.set_option('display.max_columns', 2000)
pd.set_option('display.max_rows', 2000)

# Same for polars tables.
pl.Config.set_tbl_rows(1000)
pl.Config.set_tbl_cols(1000)

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings from pandas and the modelling libraries.
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd

# Numerator of the leaderboard score computed by `score` (baseline / group rWMAPE).
# NOTE(review): presumably the Group-rWMAPE of the competition baseline — confirm where this value comes from.
BASELINE_GROUP_WMAPE = 0.8775865632406377

def rwmape(y_true: np.ndarray, y_pred: np.ndarray, gamma: float = 0.8, lam: float = 0.2, eps: float = 1e-9) -> float:
    """Regularised WMAPE; smaller is better.

    The numerator is the total absolute error plus ``lam`` times the absolute
    difference between the summed actuals and the summed predictions. The
    denominator is the summed absolute actuals plus ``gamma`` times the summed
    absolute predictions, plus ``eps`` to avoid division by zero.

    Args:
        y_true: Observed values.
        y_pred: Predicted values, aligned element-wise with ``y_true``.
        gamma: Weight of the prediction magnitude in the denominator.
        lam: Weight of the aggregate bias term in the numerator.
        eps: Small constant added to the denominator.

    Returns:
        The metric as a Python float.
    """
    actual = y_true.astype(float)
    forecast = y_pred.astype(float)

    total_abs_error = np.sum(np.abs(actual - forecast))
    aggregate_bias = np.abs(np.sum(actual) - np.sum(forecast))
    scale = np.sum(np.abs(actual)) + gamma * np.sum(np.abs(forecast)) + eps

    return float((total_abs_error + lam * aggregate_bias) / scale)

def group_wmape(df: pd.DataFrame, group_cols, target_col: str, pred_col: str,
                gamma: float = 0.8, lam: float = 0.2, eps: float = 1e-9) -> float:
    """Average rWMAPE over groups.

    Target and prediction columns are coerced to float (non-numeric / missing
    values become 0.0) before grouping by ``group_cols``. For each group:

    * both actuals and predictions sum to zero in absolute value -> group is skipped;
    * actuals are all zero but predictions are not -> the group scores 1.0;
    * otherwise the group scores ``rwmape`` of its rows.

    Returns the mean of the group scores, or 0.0 when no group was scored.
    """
    frame = df.copy()
    frame[target_col] = pd.to_numeric(df[target_col], errors="coerce").fillna(0.0).astype(float).values
    frame[pred_col] = pd.to_numeric(df[pred_col], errors="coerce").fillna(0.0).astype(float).values

    per_group = []
    for _, part in frame.groupby(list(group_cols), sort=False, dropna=False):
        actual = part[target_col].to_numpy(dtype=float)
        forecast = part[pred_col].to_numpy(dtype=float)
        abs_actual = float(np.sum(np.abs(actual)))
        abs_forecast = float(np.sum(np.abs(forecast)))

        if abs_actual == 0.0:
            if abs_forecast == 0.0:
                # Nothing sold and nothing predicted: not scored at all.
                continue
            if abs_forecast > 0.0:
                # Predicting demand for a series with none gets the fixed penalty.
                per_group.append(1.0)
                continue

        per_group.append(rwmape(actual, forecast, gamma=gamma, lam=lam, eps=eps))

    if not per_group:
        return 0.0
    return float(np.mean(per_group))

def score(solution: pd.DataFrame,
          submission: pd.DataFrame,
          target_col: str = "quantity",
          group_cols = ("unique_code",),
          row_id_col: str | None = None,
          baseline_group_wmape: float = BASELINE_GROUP_WMAPE,
          gamma: float = 0.8, lam: float = 0.2, eps: float = 1e-9) -> float:
    """Leaderboard score (higher is better): baseline / (Group-rWMAPE + eps).

    When ``row_id_col`` is given, predictions are joined to the solution on that
    column (inner join, validated one-to-one). Otherwise the two frames must
    have the same length and predictions are attached by row position.

    Raises:
        ValueError: if ``row_id_col`` is not given and the frame lengths differ.
    """
    pred_col = f"{target_col}_pred"

    if not row_id_col:
        if len(solution) != len(submission):
            raise ValueError("Without row_id_col, solution and submission must have same length.")
        merged = solution.copy()
        merged[pred_col] = submission[target_col].values
    else:
        predictions = submission[[row_id_col, target_col]].rename(columns={target_col: pred_col})
        merged = pd.merge(
            solution,
            predictions,
            on=row_id_col, how="inner", validate="one_to_one"
        )

    group_score = group_wmape(
        merged,
        group_cols=group_cols,
        target_col=target_col,
        pred_col=pred_col,
        gamma=gamma, lam=lam, eps=eps,
    )
    return float(baseline_group_wmape / (group_score + eps))


In [None]:
# Load the three competition tables from the downloaded directory.
train, product_master, sample_submission = (
    pd.read_csv(f"{DATA_PATH}/{table_name}.csv")
    for table_name in ("train", "product_master", "submission")
)

In [None]:
# Parse the observation dates of the training and submission tables.
train["date"] = pd.to_datetime(train["date"])
sample_submission["date"] = pd.to_datetime(sample_submission["date"])
# train['quantity'] = train['quantity'].clip(lower=0)

# Production dates may be missing or malformed; unparseable values become NaT.
for production_date_col in ("start_production_date", "end_production_date"):
    product_master[production_date_col] = pd.to_datetime(
        product_master[production_date_col], errors="coerce"
    )

In [None]:
# Series identifier: "<market>-<product_code>".
train['unique_code'] = train['market'] + "-" + train['product_code']
# Attach the product attributes to every training row.
# NOTE(review): this rebinds `train`, so re-running the cell merges the product
# columns a second time; run it once per kernel session.
train = train.merge(product_master, on='product_code', how='left')
train.head()

Unnamed: 0,market,product_code,date,quantity,unique_code,category,business_line_code,business_line,sector,structure_code,factory,brand,start_production_date,end_production_date
0,MKT_001,PRD_0010,2022-01-01,649,MKT_001-PRD_0010,CAT_12,BLC_03,BL_07,SECTOR_02,STR_19,FACTORY_01,BRAND_03,2017-07-19,NaT
1,MKT_001,PRD_0010,2022-02-01,1964,MKT_001-PRD_0010,CAT_12,BLC_03,BL_07,SECTOR_02,STR_19,FACTORY_01,BRAND_03,2017-07-19,NaT
2,MKT_001,PRD_0010,2022-03-01,1505,MKT_001-PRD_0010,CAT_12,BLC_03,BL_07,SECTOR_02,STR_19,FACTORY_01,BRAND_03,2017-07-19,NaT
3,MKT_001,PRD_0010,2022-04-01,1602,MKT_001-PRD_0010,CAT_12,BLC_03,BL_07,SECTOR_02,STR_19,FACTORY_01,BRAND_03,2017-07-19,NaT
4,MKT_001,PRD_0010,2022-05-01,1816,MKT_001-PRD_0010,CAT_12,BLC_03,BL_07,SECTOR_02,STR_19,FACTORY_01,BRAND_03,2017-07-19,NaT


## Interpolation filling

In [None]:
import pandas as pd

# Shape before densification, for comparison with the shape printed after it.
print(train.shape)

import pandas as pd
import numpy as np

def make_dense_timeseries(
    df: pd.DataFrame,
    id_cols,
    date_col: str = "date",
    target_col: str = "quantity",
    freq: str = "MS",
    fill_strategy: str = "zero",
) -> pd.DataFrame:
    """
    Create a dense time series per unique series ID.

    For every group defined by ``id_cols`` a complete date range (frequency
    ``freq``) is built between the group's first and last observed date. All
    columns other than ``date_col`` and ``target_col`` are treated as static and
    copied from the group's first row; ``target_col`` is taken from the
    observed rows and the remaining gaps are filled per ``fill_strategy``.

    fill_strategy:
        - 'zero' → fill missing values with 0.0
        - 'ffill' → forward-fill, remaining (leading) NaNs = 0
        - 'bfill' → backward-fill, remaining (trailing) NaNs = 0
        - 'interpolate' → linear interpolation; trailing NaNs take the last
          valid value (pandas' default forward direction), leading NaNs = 0
        - 'interpolate_then_ffill' → interpolate gaps, then ffill, leading NaNs = 0
        - 'interpolate_then_zero' → interpolate only gaps enclosed by valid
          values; leading and trailing NaNs = 0

    Args:
        df: Long-format input; not modified.
        id_cols: Column name or list of column names identifying one series.
        date_col: Name of the date column (parsed with ``pd.to_datetime``).
        target_col: Name of the value column to densify.
        freq: Pandas frequency string for the dense date range.
        fill_strategy: One of the strategies listed above.

    Returns:
        A new frame sorted by ``id_cols + [date_col]`` with a fresh RangeIndex
        and columns ordered as date, static columns, target. An empty input
        yields an empty frame with those columns.

    Raises:
        ValueError: if ``fill_strategy`` is not one of the known strategies.
    """
    known_strategies = (
        "zero",
        "ffill",
        "bfill",
        "interpolate",
        "interpolate_then_ffill",
        "interpolate_then_zero",
    )
    # Fail before doing any work instead of inside the per-series loop.
    if fill_strategy not in known_strategies:
        raise ValueError(f"Unknown fill_strategy: {fill_strategy}")

    if isinstance(id_cols, str):
        id_cols = [id_cols]

    df = df.copy()
    df[date_col] = pd.to_datetime(df[date_col])
    df = df.sort_values(id_cols + [date_col])

    static_cols = [c for c in df.columns if c not in [date_col, target_col]]

    # pd.concat raises on an empty list, so handle the no-rows case explicitly.
    if df.empty:
        return df[[date_col] + static_cols + [target_col]].reset_index(drop=True)

    dense_parts = []

    for keys, grp in df.groupby(id_cols, sort=False):
        # Depending on the pandas version a single grouping key may arrive as a scalar.
        if not isinstance(keys, tuple):
            keys = (keys,)

        full_dates = pd.date_range(grp[date_col].min(), grp[date_col].max(), freq=freq)
        tmp = pd.DataFrame({date_col: full_dates})

        # Static attributes are taken from the first row of the series.
        first_row = grp.iloc[0]
        for col in static_cols:
            tmp[col] = first_row[col]

        for col, key_val in zip(id_cols, keys):
            tmp[col] = key_val

        tmp = tmp.merge(
            grp[[date_col, target_col]],
            on=date_col,
            how="left"
        )

        tmp[target_col] = _fill_missing_target(tmp[target_col], fill_strategy)

        dense_parts.append(tmp)

    dense_df = pd.concat(dense_parts, axis=0).sort_values(id_cols + [date_col]).reset_index(drop=True)
    return dense_df


def _fill_missing_target(values: pd.Series, fill_strategy: str) -> pd.Series:
    """Fill the NaNs of one series' target column according to ``fill_strategy``."""
    if fill_strategy == "zero":
        return values.fillna(0.0)
    if fill_strategy == "ffill":
        return values.ffill().fillna(0.0)
    if fill_strategy == "bfill":
        return values.bfill().fillna(0.0)
    if fill_strategy == "interpolate":
        return values.interpolate(method="linear").fillna(0.0)
    if fill_strategy == "interpolate_then_ffill":
        return values.interpolate(method="linear").ffill().fillna(0.0)
    if fill_strategy == "interpolate_then_zero":
        # limit_area="inside" stops pandas from extending the last valid value
        # into trailing NaNs, so both edges really end up as 0.
        return values.interpolate(method="linear", limit_area="inside").fillna(0.0)
    raise ValueError(f"Unknown fill_strategy: {fill_strategy}")


# Densify every series between its own first and last observed month, linearly
# interpolating the months that were missing.
# NOTE(review): this runs on the full history before the time-based CV split, so
# interpolated rows in the training window can be derived from observations in the
# validation window, and the validation rows themselves include interpolated
# values — confirm this is intended.
train = make_dense_timeseries(
    df=train,
    id_cols="unique_code",
    date_col="date",
    target_col="quantity",
    freq="MS",
    fill_strategy="interpolate"
)


print(train.shape)

(278901, 14)
(389205, 14)


## Zero filling

In [None]:
# print(train.shape)

# def make_dense_timeseries(
#     df: pd.DataFrame,
#     id_cols,
#     date_col: str = "date",
#     target_col: str = "quantity",
#     freq: str = "MS",
#     fill_strategy: str = "zero",
#     extend_to_global_end: bool = True
# ) -> pd.DataFrame:

#     if isinstance(id_cols, str):
#         id_cols = [id_cols]

#     df = df.copy()
#     df[date_col] = pd.to_datetime(df[date_col])

#     global_max_date = df[date_col].max()

#     dense_parts = []

#     static_cols = [c for c in df.columns if c not in [date_col, target_col]]

#     for keys, grp in df.groupby(id_cols, sort=False):
#         if not isinstance(keys, tuple):
#             keys = (keys,)

#         first_date = grp[date_col].min()

#         if extend_to_global_end:
#             last_date = global_max_date
#         else:
#             last_date = grp[date_col].max()

#         full_dates = pd.date_range(first_date, last_date, freq=freq)
#         tmp = pd.DataFrame({date_col: full_dates})

#         for col in static_cols:
#             tmp[col] = grp.iloc[0][col]

#         for col, key_val in zip(id_cols, keys):
#             tmp[col] = key_val

#         tmp = tmp.merge(
#             grp[[date_col, target_col]],
#             on=date_col,
#             how="left"
#         )

#         if fill_strategy == "zero":
#             tmp[target_col] = tmp[target_col].fillna(0.0)

#         dense_parts.append(tmp)

#     dense_df = pd.concat(dense_parts, axis=0).sort_values(id_cols+[date_col]).reset_index(drop=True)
#     return dense_df

# # train = make_dense_timeseries(
# #     df=train,
# #     id_cols="unique_code",
# #     date_col="date",
# #     target_col="quantity",
# #     freq="MS",
# #     fill_strategy="zero",
# #     extend_to_global_end=True
# # )

# # print(train.shape)

In [None]:
def add_time_and_lifecycle_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with production-lifecycle features added.

    Reads the datetime columns ``date``, ``start_production_date`` and
    ``end_production_date`` and adds, in this order:

    * ``start_year`` / ``start_month`` and ``end_year`` / ``end_month``;
    * ``months_since_startprod`` / ``months_since_endprod``: day difference
      between ``date`` and the respective production date divided by 30
      (NaN when the production date is missing);
    * ``has_end_date``: whether an end-of-production date is present;
    * ``is_post_endprod``: whether ``date`` is after the end-of-production date
      (False when that date is missing);
    * ``years_since_endprod``: ``months_since_endprod`` divided by 12.

    The input frame is not modified.
    """
    out = df.copy()

    observed = out["date"]
    prod_start = out["start_production_date"]
    prod_end = out["end_production_date"]

    # Calendar parts of both production dates.
    for prefix, stamp in (("start", prod_start), ("end", prod_end)):
        out[f"{prefix}_year"] = stamp.dt.year
        out[f"{prefix}_month"] = stamp.dt.month

    # Approximate month counts: elapsed days / 30.
    months_from_start = (observed - prod_start).dt.days / 30.0
    months_from_end = (observed - prod_end).dt.days / 30.0
    out["months_since_startprod"] = months_from_start
    out["months_since_endprod"] = months_from_end

    out["has_end_date"] = prod_end.notna()
    out["is_post_endprod"] = observed > prod_end
    out["years_since_endprod"] = months_from_end / 12

    return out

# Attach the lifecycle features to the densified training frame.
train = add_time_and_lifecycle_features(train)
# sample_submission = add_time_and_lifecycle_features(sample_submission)
train.head()

Unnamed: 0,date,market,product_code,unique_code,category,business_line_code,business_line,sector,structure_code,factory,brand,start_production_date,end_production_date,quantity,start_year,start_month,end_year,end_month,months_since_startprod,months_since_endprod,has_end_date,is_post_endprod,years_since_endprod
0,2022-01-01,MKT_001,PRD_0010,MKT_001-PRD_0010,CAT_12,BLC_03,BL_07,SECTOR_02,STR_19,FACTORY_01,BRAND_03,2017-07-19,NaT,649.0,2017.0,7.0,,,54.233333,,False,False,
1,2022-02-01,MKT_001,PRD_0010,MKT_001-PRD_0010,CAT_12,BLC_03,BL_07,SECTOR_02,STR_19,FACTORY_01,BRAND_03,2017-07-19,NaT,1964.0,2017.0,7.0,,,55.266667,,False,False,
2,2022-03-01,MKT_001,PRD_0010,MKT_001-PRD_0010,CAT_12,BLC_03,BL_07,SECTOR_02,STR_19,FACTORY_01,BRAND_03,2017-07-19,NaT,1505.0,2017.0,7.0,,,56.2,,False,False,
3,2022-04-01,MKT_001,PRD_0010,MKT_001-PRD_0010,CAT_12,BLC_03,BL_07,SECTOR_02,STR_19,FACTORY_01,BRAND_03,2017-07-19,NaT,1602.0,2017.0,7.0,,,57.233333,,False,False,
4,2022-05-01,MKT_001,PRD_0010,MKT_001-PRD_0010,CAT_12,BLC_03,BL_07,SECTOR_02,STR_19,FACTORY_01,BRAND_03,2017-07-19,NaT,1816.0,2017.0,7.0,,,58.233333,,False,False,


In [None]:
def build_cv_folds(df: pd.DataFrame, horizon_months: int = 12, val_starts=None):
    """
    Creates time-based folds:
    Fold k: train < val_start_k, validate in [val_start_k, val_start_k + horizon)

    Args:
        df: Frame with a ``date`` column (parsed with ``pd.to_datetime``); not modified.
        horizon_months: Length of each validation window in calendar months.
        val_starts: Optional iterable of validation start dates (anything
            ``pd.Timestamp`` accepts). Defaults to a single fold starting at
            2023-11-01. Pass several dates, e.g.
            ``["2023-01-01", "2023-05-01", "2023-09-01"]``, for multi-fold CV.

    Returns:
        A list of ``(train_index, validation_index)`` tuples holding index
        labels of ``df``. A start date for which either side would be empty is
        skipped.
    """
    if val_starts is None:
        val_starts = [pd.Timestamp("2023-11-01")]

    # Only the dates are needed, so parse that column instead of copying the whole frame.
    dates = pd.to_datetime(df["date"])
    folds = []

    for vs in val_starts:
        vs = pd.Timestamp(vs)
        ve = vs + pd.DateOffset(months=horizon_months)
        tr_idx = dates[dates < vs].index
        va_idx = dates[(dates >= vs) & (dates < ve)].index
        if len(tr_idx) > 0 and len(va_idx) > 0:
            folds.append((tr_idx, va_idx))
    return folds

# (train_index, validation_index) pairs consumed by the CV loop; 12-month validation window.
folds = build_cv_folds(train, horizon_months=12)

In [None]:
target_col = "quantity"

# Every string-typed column except the series identifier is treated as categorical.
categorical_cols = [
    name for name in train.columns
    if name not in ['unique_code'] and train[name].dtype == 'object'
]

# Missing categories get an explicit "NAN" level before the dtype conversion.
for col in categorical_cols:
    train[col] = train[col].fillna("NAN").astype('category')

# Columns that are never fed to the model: raw dates, the target and the series id.
drop_cols = ["date", "quantity", "start_production_date", "end_production_date", 'unique_code']

feature_cols = [c for c in train.columns if c not in drop_cols]

In [None]:
from catboost import CatBoostRegressor, Pool

## VOTING REGRESSOR

In [None]:
from operator import add
from sklearn.ensemble import VotingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Time-based cross-validation: for every fold, rebuild the training frame, fit the
# voting ensemble, predict the validation window and score it with the competition metric.
fold_scores = []
oof_list = []

for fold_id, (tr_idx, va_idx) in enumerate(folds, 1):
    print(f"\n===== Fold {fold_id} =====")

    train_df = train.loc[tr_idx].reset_index(drop=True)
    val_df   = train.loc[va_idx].reset_index(drop=True)

    train_df = make_dense_timeseries(
        df=train_df,
        id_cols="unique_code",
        date_col="date",
        target_col="quantity",
        freq="MS",
        fill_strategy="interpolate"
    )

    # Recompute the date-dependent features: densification copies static columns
    # from the first row of each series.
    train_df = add_time_and_lifecycle_features(train_df)
    val_df = add_time_and_lifecycle_features(val_df)

    target_col = "quantity"

    drop_cols = [
        "date",
        "quantity",
        "start_production_date",
        "end_production_date",
        'unique_code',
    ]

    # Build the feature list for THIS fold first; the categorical positions below
    # must refer to exactly the columns the models are fitted on, not to a
    # `feature_cols` left over from an earlier cell or a previous iteration.
    feature_cols = [c for c in train_df.columns if c not in drop_cols]

    categorical_cols = [col for col in train_df.columns if train_df[col].dtype == 'object' and col not in ['unique_code']]
    cat_indices = [feature_cols.index(c) for c in categorical_cols if c in feature_cols]

    for col in categorical_cols:
        train_df[col] = train_df[col].fillna("NAN")
        train_df[col] = train_df[col].astype('category')
        val_df[col] = val_df[col].fillna("NAN")
        val_df[col] = val_df[col].astype('category')

    X_tr = train_df[feature_cols]
    y_tr = train_df[target_col]
    X_va = val_df[feature_cols]
    y_va = val_df[target_col]

    print(f"Training from dates: {train_df.date.min()} to {train_df.date.max()}")
    print(f"Validating on dates: {val_df.date.min()} to {val_df.date.max()}")

    # Candidate base models; only those listed in `estimators` below are actually fitted.
    cb_tweedie = CatBoostRegressor(
        iterations=1000,
        learning_rate=0.01,
        depth=6,
        loss_function="Tweedie:variance_power=1.1",
        random_seed=42,
        verbose=100,
        # allow_writing_files=False,
        cat_features=cat_indices,
        task_type="CPU"
    )

    cb_mae = CatBoostRegressor(
        iterations=1000,
        learning_rate=0.01,
        depth=6,
        loss_function="MAE",
        random_seed=42,
        verbose=100,
        # allow_writing_files=False,
        cat_features=cat_indices,
        task_type="CPU"
    )

    cb_mae2 = CatBoostRegressor(
        iterations=1000,
        learning_rate=0.01,
        depth=6,
        loss_function="MAE",
        random_seed=1,
        verbose=100,
        # allow_writing_files=False,
        cat_features=cat_indices,
        task_type="CPU"
    )

    lgb_tweedie = LGBMRegressor(
        n_estimators=1000,
        learning_rate=0.01,
        max_depth=6,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=20,
        boosting_type= "gbdt",
        objective="tweedie",
        tweedie_variance_power=1.1,
        metric="mae",
        random_state=42,
        verbose=-1,
        # allow_writing_files=False,
        # cat_features=cat_indices,
        # task_type="CPU"
    )


    lgb_mae = LGBMRegressor(
        n_estimators=1000,
        learning_rate=0.01,
        max_depth=6,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=20,
        boosting_type= "gbdt",
        objective="mae",
        metric="mae",
        random_state=42,
        verbose=-1,
        # allow_writing_files=False,
        # cat_features=cat_indices,
        # task_type="CPU"
    )

    # xgb_tweedie = XGBRegressor(
    #     n_estimators=1000,
    #     learning_rate=0.01,
    #     max_depth=6,
    #     feature_fraction=0.8,
    #     bagging_fraction=0.8,
    #     boosting_type= "gbdt",
    #     objective="reg:absoluteerror",
    #     # tweedie_variance_power=1.1,
    #     metric="mae",
    #     random_state=42,
    #     verbose=0,
    #     enable_categorical=True,
    # )

    voting_model = VotingRegressor(
        estimators=[
            # ('lgb_tweedie', lgb_tweedie),
            # ('xgb_tweedie', xgb_tweedie),
            # ('lgb_mae', lgb_mae),
            # ('cb_tweedie', cb_tweedie),
            ('cb_mae', cb_mae),
            # ('cb_mae2', cb_mae2),
        ],
        # weights=[1, 1, 1],
        n_jobs=1
    )

    # ---------------------------------------------------------
    # TRAIN & PREDICT
    # ---------------------------------------------------------
    print("Fitting VotingRegressor...")
    voting_model.fit(X_tr, y_tr)

    print("Predicting...")
    val_pred = voting_model.predict(X_va)

    # Quantities cannot be negative.
    val_pred = np.clip(val_pred, 0, None)
    # val_pred[val_pred < 0.01] = 0

    val_df = val_df.copy()
    val_df["quantity_pred"] = val_pred

    gw = group_wmape(val_df,
                     group_cols=("unique_code",),
                     target_col="quantity",
                     pred_col="quantity_pred")
    lb_score = BASELINE_GROUP_WMAPE / (gw + 1e-9)

    fold_scores.append(lb_score)
    print(f"Fold {fold_id} Voting Score: {lb_score:.6f}")

    oof_list.append(val_df[["unique_code", "date", "quantity", "quantity_pred"]])

print("\nCV fold scores:", fold_scores)
print("Mean CV score:", float(np.mean(fold_scores)))

# ============================================================
# GLOBAL OOF SCORE
# ============================================================
oof_df = pd.concat(oof_list, axis=0).reset_index(drop=True)

oof_solution = oof_df[["unique_code", "date", "quantity"]].copy()
oof_submission = oof_df[["unique_code", "date"]].copy()
oof_submission["quantity"] = oof_df["quantity_pred"].values

oof_lb_score = score(
    solution=oof_solution,
    submission=oof_submission,
    target_col="quantity",
    group_cols=("unique_code",),
    row_id_col=None,
    baseline_group_wmape=BASELINE_GROUP_WMAPE,
)

print(f"\n*** GLOBAL OOF VOTING SCORE: {oof_lb_score:.6f} ***")


===== Fold 1 =====
Training from dates: 2022-01-01 00:00:00 to 2023-10-01 00:00:00
Validating on dates: 2023-11-01 00:00:00 to 2024-10-01 00:00:00
Fitting VotingRegressor...
0:	learn: 114.6473619	total: 86ms	remaining: 1m 25s
100:	learn: 104.7282386	total: 6.96s	remaining: 1m 1s
200:	learn: 101.9926350	total: 14.3s	remaining: 56.9s
300:	learn: 100.8398517	total: 23.1s	remaining: 53.7s
400:	learn: 99.9738238	total: 32.6s	remaining: 48.8s
500:	learn: 99.2274912	total: 42.1s	remaining: 41.9s
600:	learn: 98.7450015	total: 51.3s	remaining: 34s
700:	learn: 98.2643587	total: 1m	remaining: 25.7s
800:	learn: 97.8592567	total: 1m 9s	remaining: 17.2s
900:	learn: 97.5058056	total: 1m 18s	remaining: 8.59s
999:	learn: 97.2499007	total: 1m 26s	remaining: 0us
Predicting...
Fold 1 Voting Score: 1.262672

CV fold scores: [1.262671591635941]
Mean CV score: 1.262671591635941

*** GLOBAL OOF VOTING SCORE: 1.262672 ***


## FULL FIT

In [None]:
from operator import add
from sklearn.ensemble import VotingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Final model: refit on the complete training frame.
# `feature_cols`, `target_col` and `categorical_cols` are the values left behind by
# the cross-validation cell, so that cell must have been run first.
full_X = train[feature_cols]
full_y = train[target_col]

# Positions of the categorical columns inside the feature matrix (passed to CatBoost).
cat_indices = [full_X[feature_cols].columns.get_loc(c) for c in categorical_cols]

cb_tweedie = CatBoostRegressor(
    iterations=1300,
    learning_rate=0.01,
    depth=6,
    loss_function="Tweedie:variance_power=1.1",
    random_seed=42,
    verbose=100,
    # allow_writing_files=False,
    cat_features=cat_indices,
    task_type="CPU"
)

cb_mae = CatBoostRegressor(
    iterations=1300,
    learning_rate=0.01,
    depth=6,
    loss_function="MAE",
    random_seed=42,
    verbose=100,
    # allow_writing_files=False,
    cat_features=cat_indices,
    task_type="CPU"
)

# Same configuration as cb_mae but with a different random seed.
cb_mae2 = CatBoostRegressor(
    iterations=1300,
    learning_rate=0.01,
    depth=6,
    loss_function="MAE",
    random_seed=1,
    verbose=100,
    # allow_writing_files=False,
    cat_features=cat_indices,
    task_type="CPU"
)

# Defined but not included in the final ensemble below.
lgb_tweedie = LGBMRegressor(
    n_estimators=1300,
    learning_rate=0.01,
    max_depth=6,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=20,
    boosting_type= "gbdt",
    objective="tweedie",
    tweedie_variance_power=1.1,
    metric="mae",
    random_state=42,
    verbose=-1,
    # allow_writing_files=False,
    # cat_features=cat_indices,
    # task_type="CPU"
)


lgb_mae = LGBMRegressor(
    n_estimators=1300,
    learning_rate=0.01,
    max_depth=6,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=20,
    boosting_type= "gbdt",
    objective="mae",
    metric="mae",
    random_state=42,
    verbose=-1,
    # allow_writing_files=False,
    # cat_features=cat_indices,
    # task_type="CPU"
)

# NOTE(review): this ensemble (lgb_mae + cb_tweedie + cb_mae + cb_mae2, 1300 iterations)
# is not the configuration scored in the CV cell (cb_mae alone, 1000 iterations), so the
# reported CV score does not describe the model that produces the submission.
# NOTE(review): the saved training log shows the loss of the first verbose model jumping
# from ~698 at iteration 800 to ~7080 at iteration 900 — presumably cb_tweedie; check
# that model for instability before trusting its contribution.
final_voting_model = VotingRegressor(
        estimators=[
            # ('lgb_tweedie', lgb_tweedie),
            # ('xgb_tweedie', xgb_tweedie),
            ('lgb_mae', lgb_mae),
            ('cb_tweedie', cb_tweedie),
            ('cb_mae', cb_mae),
            ('cb_mae2', cb_mae2),
        ],
        # weights=[1, 1, 1],
        n_jobs=1
)


final_voting_model.fit(full_X, full_y)

0:	learn: 1070.7189185	total: 270ms	remaining: 5m 50s
100:	learn: 742.1689409	total: 14.6s	remaining: 2m 53s
200:	learn: 712.5779074	total: 27s	remaining: 2m 27s
300:	learn: 706.1361979	total: 39.5s	remaining: 2m 11s
400:	learn: 703.1183384	total: 55s	remaining: 2m 3s
500:	learn: 701.3899423	total: 1m 11s	remaining: 1m 53s
600:	learn: 700.0676436	total: 1m 27s	remaining: 1m 41s
700:	learn: 698.9609877	total: 1m 43s	remaining: 1m 28s
800:	learn: 697.9938553	total: 2m	remaining: 1m 15s
900:	learn: 7080.4761857	total: 2m 18s	remaining: 1m 1s
1000:	learn: 3049.5810293	total: 2m 35s	remaining: 46.6s
1100:	learn: 1568.3429382	total: 2m 54s	remaining: 31.5s
1200:	learn: 1023.2386969	total: 3m 11s	remaining: 15.8s
1299:	learn: 821.2915724	total: 3m 30s	remaining: 0us
0:	learn: 105.5807084	total: 204ms	remaining: 4m 25s
100:	learn: 97.3392403	total: 18.4s	remaining: 3m 37s
200:	learn: 94.7067840	total: 34.2s	remaining: 3m 6s
300:	learn: 93.4971629	total: 53.4s	remaining: 2m 57s
400:	learn: 92.6

In [None]:
# Build the submission: predict SKU-level rows with the final ensemble, then derive
# the category-level rows by summing the SKU predictions per market / category / date.
sub_pred = sample_submission.copy()

# unique_code is "<market>-<item>". Split once, on the first hyphen only, so an item
# code that itself contains a hyphen stays intact.
code_parts = sub_pred["unique_code"].str.split("-", n=1, expand=True)
sub_pred["market_anon"] = code_parts[0]
sub_pred["item_anon"] = code_parts[1]

# ---------- SKU-level rows: MKT_xxx-PRD_yyyy ----------
# na=False keeps the mask strictly boolean even if a code has no item part.
sku_mask = sub_pred["item_anon"].str.startswith("PRD_", na=False)
sub_sku = sub_pred[sku_mask].copy()

sub_sku["market"] = sub_sku["market_anon"]
sub_sku["product_code"] = sub_sku["item_anon"]
sub_sku["unique_code"] = sub_sku["market"] + "-" + sub_sku["product_code"]

# Predictions are written back by position further down, so this merge must not add
# rows; validate= turns a duplicated product_code into an immediate, explicit error.
sub_sku = sub_sku.merge(product_master, on="product_code", how="left", validate="many_to_one")
sub_sku = add_time_and_lifecycle_features(sub_sku)

for col in categorical_cols:
    sub_sku[col] = sub_sku[col].fillna("NAN")
    sub_sku[col] = sub_sku[col].astype('category')


X_test_sku = sub_sku[feature_cols]

preds_sku = final_voting_model.predict(X_test_sku)
# Quantities cannot be negative.
preds_sku = np.clip(preds_sku, 0, None)
sub_sku["quantity_pred"] = preds_sku

# ---------- aggregate cat rows ----------

# Restore the raw category label (the column was converted to a categorical with a
# "NAN" level above) before grouping.
prod_to_cat = product_master.set_index("product_code")["category"]
sub_sku["category"] = sub_sku["product_code"].map(prod_to_cat)

cat_forecast = (
    sub_sku
    .groupby(["market", "category", "date"], as_index=False)["quantity_pred"]
    .sum()
)

cat_mask = sub_pred["item_anon"].str.startswith("CAT_", na=False)
sub_cat = sub_pred[cat_mask].copy()
sub_cat["market"] = sub_cat["market_anon"]
sub_cat["category"] = sub_cat["item_anon"]

# Same positional write-back applies here, so the row count must be preserved.
sub_cat = sub_cat.merge(
    cat_forecast,
    on=["market", "category", "date"],
    how="left",
    validate="many_to_one"
)
# Category rows without any predicted SKU get 0.
sub_cat["quantity_pred"] = sub_cat["quantity_pred"].fillna(0)

# ==========================================
# 6. FINAL SUBMISSION
# ==========================================
sub_all = sub_pred.copy()
# Predictions are floats; cast first so the assignment below does not depend on
# pandas silently upcasting an integer column.
sub_all["quantity"] = sub_all["quantity"].astype(float)
# Rows that are neither PRD_ nor CAT_ keep the value from the sample submission.
sub_all.loc[sku_mask, "quantity"] = sub_sku["quantity_pred"].values
sub_all.loc[cat_mask, "quantity"] = sub_cat["quantity_pred"].values

final_submission = sub_all[["ID", "unique_code", "date", "quantity"]].copy()
final_submission["quantity"] = final_submission["quantity"].clip(lower=0)

final_submission.to_csv("submission_voting_regressor.csv", index=False)
print("Saved fixed submission.")

Saved fixed submission.
