In [None]:
# Install AutoGluon's time-series module (quiet mode).
# NOTE(review): the install is unpinned, so results can drift between runs; consider
# `%pip install -qq autogluon.timeseries==<known-good version>` so the package is pinned
# and installed into the kernel's own environment.
!pip install autogluon.timeseries -qq

In [None]:
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import polars as pl
import sklearn
import joblib
import warnings


from catboost import CatBoostClassifier, Pool


# Let pandas display up to 2000 columns/rows before truncating.
pd.set_option('display.max_columns', 2000)
pd.set_option('display.max_rows', 2000)

# Same idea for polars table rendering (up to 1000 rows/columns).
pl.Config.set_tbl_rows(1000)
pl.Config.set_tbl_cols(1000)

# NOTE(review): this silences *every* warning for the rest of the notebook, including
# deprecation and data-quality warnings — consider a narrower filter.
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd

# Reference Group-rWMAPE used as the numerator of the leaderboard score
# (`score` returns baseline / (group error + eps)).
# NOTE(review): the origin of this exact value is not shown in the notebook —
# presumably the competition's baseline model; confirm.
BASELINE_GROUP_WMAPE = 0.8775865632406377

def rwmape(y_true: np.ndarray, y_pred: np.ndarray, gamma: float = 0.8, lam: float = 0.2, eps: float = 1e-9) -> float:
    """Regularized WMAPE (lower is better).

    Computed as::

        (sum|y - yhat| + lam * |sum(y) - sum(yhat)|) / (sum|y| + gamma * sum|yhat| + eps)

    Args:
        y_true: Observed values.
        y_pred: Predicted values, same length as ``y_true``.
        gamma: Weight of the prediction magnitude in the denominator.
        lam: Weight of the total-bias penalty in the numerator.
        eps: Small constant that keeps the denominator non-zero.

    Returns:
        The metric as a plain Python float.
    """
    actual = y_true.astype(float)
    forecast = y_pred.astype(float)

    abs_error = np.abs(actual - forecast).sum()
    bias_penalty = lam * np.abs(actual.sum() - forecast.sum())
    scale = np.abs(actual).sum() + gamma * np.abs(forecast).sum() + eps

    return float((abs_error + bias_penalty) / scale)

def group_wmape(df: pd.DataFrame, group_cols, target_col: str, pred_col: str,
                gamma: float = 0.8, lam: float = 0.2, eps: float = 1e-9) -> float:
    """Mean rWMAPE across groups.

    Groups where both the target and the prediction are all zero are skipped;
    groups with an all-zero target but a non-zero prediction are penalized with 1.0.

    Args:
        df: Frame holding the grouping, target and prediction columns.
        group_cols: Column name, or iterable of column names, defining the groups.
        target_col: Name of the ground-truth column.
        pred_col: Name of the prediction column.
        gamma, lam, eps: Forwarded unchanged to ``rwmape``.

    Returns:
        The average per-group score, or 0.0 when no group contributes a score.
    """
    # A bare string must be wrapped: list("abc") would split it into single characters
    # and group by columns that do not exist.
    keys = [group_cols] if isinstance(group_cols, str) else list(group_cols)

    # Work on a copy with both value columns coerced to float; non-numeric / missing -> 0.
    work = df.copy()
    for col in (target_col, pred_col):
        work[col] = pd.to_numeric(df[col], errors="coerce").fillna(0.0).astype(float).values

    scores = []
    # dropna=False keeps rows whose group key is missing as their own group.
    for _, g in work.groupby(keys, sort=False, dropna=False):
        y_g = g[target_col].to_numpy(dtype=float)
        p_g = g[pred_col].to_numpy(dtype=float)
        sum_true = float(np.sum(np.abs(y_g)))
        sum_pred = float(np.sum(np.abs(p_g)))
        if sum_true == 0.0 and sum_pred == 0.0:
            continue
        if sum_true == 0.0 and sum_pred > 0.0:
            scores.append(1.0)
        else:
            scores.append(rwmape(y_g, p_g, gamma=gamma, lam=lam, eps=eps))
    return float(np.mean(scores)) if scores else 0.0

def score(solution: pd.DataFrame,
          submission: pd.DataFrame,
          target_col: str = "quantity",
          group_cols = ("unique_code",),
          row_id_col: str | None = None,
          baseline_group_wmape: float = BASELINE_GROUP_WMAPE,
          gamma: float = 0.8, lam: float = 0.2, eps: float = 1e-9) -> float:
    """Higher-is-better leaderboard score = baseline / (Group-rWMAPE + eps).

    Args:
        solution: Ground-truth frame containing ``target_col`` and ``group_cols``.
        submission: Prediction frame containing ``target_col`` (and ``row_id_col`` if given).
        target_col: Column holding the value in both frames.
        group_cols: Columns over which the per-group error is averaged.
        row_id_col: If given, rows are aligned by an inner one-to-one join on this
            column; otherwise the two frames are aligned by position.
        baseline_group_wmape: Reference error placed in the numerator.
        gamma, lam, eps: Forwarded to ``group_wmape``; ``eps`` also guards the final division.

    Raises:
        ValueError: If no ``row_id_col`` is given and the frames differ in length.
    """
    pred_col = f"{target_col}_pred"

    if not row_id_col:
        # Positional alignment only makes sense when both frames have the same number of rows.
        if len(solution) != len(submission):
            raise ValueError("Without row_id_col, solution and submission must have same length.")
        merged = solution.copy()
        merged[pred_col] = submission[target_col].values
    else:
        predictions = submission[[row_id_col, target_col]].rename(columns={target_col: pred_col})
        merged = pd.merge(solution, predictions, on=row_id_col, how="inner", validate="one_to_one")

    group_error = group_wmape(
        merged,
        group_cols=group_cols,
        target_col=target_col,
        pred_col=pred_col,
        gamma=gamma,
        lam=lam,
        eps=eps,
    )
    return float(baseline_group_wmape / (group_error + eps))


In [None]:
# Single place to change when the notebook runs outside Kaggle.
DATA_DIR = '/kaggle/input/haier-europe-2025-datathon'

# Raw inputs: sales history, product attributes, and the submission template.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
product_master = pd.read_csv(f'{DATA_DIR}/product_master.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/submission.csv')

In [None]:
# Smallest quantity in the raw data.
# NOTE(review): presumably checked to see whether negative quantities exist — confirm.
train.quantity.min()

In [None]:
# Parse date columns. Unparseable production dates become NaT (errors="coerce").
train["date"] = pd.to_datetime(train["date"])
# Clipping negative quantities is intentionally left disabled:
# train['quantity'] = train['quantity'].clip(lower=0)

product_master["start_production_date"] = pd.to_datetime(product_master["start_production_date"], errors="coerce")
product_master["end_production_date"]   = pd.to_datetime(product_master["end_production_date"], errors="coerce")
sample_submission["date"] = pd.to_datetime(sample_submission["date"])

# Series identifier in the same "<market>-<product_code>" form the submission uses.
train['unique_code'] = train['market'] + "-" + train['product_code']

# Attach product attributes. validate='many_to_one' fails loudly if product_master has
# duplicate product codes, instead of silently duplicating sales rows.
# Not idempotent: re-running this cell merges product_master again (suffixed columns),
# so use Restart & Run All rather than re-executing it.
train = train.merge(product_master, on='product_code', how='left', validate='many_to_one')


print(train.quantity.min())
train.head()

In [None]:
# Column dtypes and non-null counts after merging in the product attributes.
train.info()

In [None]:
# Number of training rows per product category.
train.category.value_counts()

In [None]:
# Spot check: all rows (every column) of one market/product series.
aa = train[train.unique_code == "MKT_028-PRD_6151"]
aa

In [None]:
# Spot check: date/quantity history of one market/product series.
aa = train[train.unique_code == "MKT_025-PRD_11517"][['date', 'quantity']]
aa

In [None]:
# Spot check: date/quantity history of one market/product series.
aa = train[train.unique_code == "MKT_001-PRD_0075"][['date', 'quantity']]
aa

In [None]:
# Number of distinct quantity values per series.
uc_single = (
    train.groupby("unique_code")["quantity"]
         .nunique()
         .reset_index()
         .rename(columns={"quantity": "nunique_qty"})
)

# Keep only the series whose quantity never changes (exactly one distinct value).
uc_single = uc_single[uc_single["nunique_qty"] == 1]["unique_code"]

# Rows of those constant series, and how many rows each one has.
aa = train[train["unique_code"].isin(uc_single)][["unique_code", "date", "quantity"]]
counts = (
    aa.groupby("unique_code")["date"]
      .size()
      .reset_index(name="n_rows")
)

# Despite the variable name, the threshold is "more than 7 rows", not "more than one".
uc_more_than_one = counts[counts["n_rows"] > 7]["unique_code"]

# Total number of rows belonging to constant series with more than 7 observations.
aa_multi = aa[aa["unique_code"].isin(uc_more_than_one)]
len(aa_multi)

In [None]:
# First 100 rows of the constant-quantity series selected above.
aa_multi.head(100)

In [None]:
# Date of the last strictly positive sale for every product (across all markets).
last_sales = (
    train.loc[train['quantity'] > 0]
    .groupby('product_code')['date']
    .max()
    .reset_index()
    .rename(columns={'date': 'last_sale_date'})
)

# Inner join with the master data: products missing from product_master are dropped.
master_end_dates = product_master[['product_code', 'end_production_date']]
lifecycle = pd.merge(last_sales, master_end_dates, on='product_code')

# Days between end of production and the last positive sale
# (negative when sales stopped before production ended, NaN when no end date is known).
days_after_end = lifecycle['last_sale_date'] - lifecycle['end_production_date']
lifecycle['sell_through_days'] = days_after_end.dt.days
print(lifecycle['sell_through_days'].describe())

In [None]:
# Same distribution, with upper-tail percentiles (75th to 99th) added to the summary.
stats = lifecycle['sell_through_days'].describe(percentiles=[0.75, 0.80, 0.90, 0.95, 0.99])
print(stats)

In [None]:
# Rows of products whose production ended on 1998-04-30.
# NOTE(review): looks like a spot check of an extreme end-of-production date — confirm.
aa = train[train.end_production_date=="1998-04-30"]
aa

In [None]:
import pandas as pd

# Shape before densification; the same print follows the make_dense_timeseries call below.
print(train.shape)

import pandas as pd
import numpy as np

def make_dense_timeseries(
    df: pd.DataFrame,
    id_cols,
    date_col: str = "date",
    target_col: str = "quantity",
    freq: str = "MS",
    fill_strategy: str = "zero",
) -> pd.DataFrame:
    """
    Create a dense, regularly spaced time series per unique series ID.

    Each group defined by ``id_cols`` is re-indexed onto
    ``pd.date_range(first_date, last_date, freq=freq)``. Only gaps between a series'
    first and last observed date are filled; nothing is added before or after.
    Every column other than ``date_col`` and ``target_col`` is treated as static and
    copied from the group's first row (after sorting by id and date).

    fill_strategy:
        - 'zero' → fill missing periods with 0.0
        - 'ffill' → forward-fill missing periods (leading NaNs become 0.0)
        - 'bfill' → backward-fill missing periods (trailing NaNs become 0.0)
        - 'interpolate' → linear interpolation with pandas defaults, which also carry
          the last valid value forward over trailing NaNs; leading NaNs become 0.0
        - 'interpolate_then_ffill' → interpolate gaps, then ffill, remaining NaNs = 0.0
        - 'interpolate_then_zero' → interpolate only NaNs surrounded by valid values,
          edge NaNs = 0.0

    Args:
        df: Long-format frame with one row per (series, date).
        id_cols: Column name or list of column names identifying a series.
        date_col: Name of the timestamp column (parsed with ``pd.to_datetime``).
        target_col: Name of the value column to fill.
        freq: pandas frequency string of the dense grid.
        fill_strategy: One of the strategies listed above.

    Returns:
        A new frame sorted by ``id_cols`` and ``date_col`` with a fresh RangeIndex.
        If ``df`` yields no groups (e.g. it is empty), an empty frame with the same
        columns is returned.

    Raises:
        ValueError: If ``fill_strategy`` is not one of the supported names.
    """
    fillers = {
        "zero": lambda s: s.fillna(0.0),
        "ffill": lambda s: s.ffill().fillna(0.0),
        "bfill": lambda s: s.bfill().fillna(0.0),
        "interpolate": lambda s: s.interpolate(method="linear").fillna(0.0),
        "interpolate_then_ffill": lambda s: s.interpolate(method="linear").ffill().fillna(0.0),
        # limit_area="inside" stops pandas from extrapolating trailing NaNs, so the
        # edges really end up as 0.0 as documented.
        "interpolate_then_zero": lambda s: s.interpolate(method="linear", limit_area="inside").fillna(0.0),
    }
    # Reject a bad strategy before doing any work.
    if fill_strategy not in fillers:
        raise ValueError(f"Unknown fill_strategy: {fill_strategy}")
    fill = fillers[fill_strategy]

    if isinstance(id_cols, str):
        id_cols = [id_cols]

    df = df.copy()
    df[date_col] = pd.to_datetime(df[date_col])
    df = df.sort_values(id_cols + [date_col])

    static_cols = [c for c in df.columns if c not in [date_col, target_col]]

    dense_parts = []

    for keys, grp in df.groupby(id_cols, sort=False):
        # Older pandas yields a scalar key for a single grouping column.
        if not isinstance(keys, tuple):
            keys = (keys,)

        full_dates = pd.date_range(grp[date_col].min(), grp[date_col].max(), freq=freq)
        tmp = pd.DataFrame({date_col: full_dates})

        # Static attributes are taken from the first row; fetch that row once per group.
        first_row = grp.iloc[0]
        for col in static_cols:
            tmp[col] = first_row[col]

        for col, key_val in zip(id_cols, keys):
            tmp[col] = key_val

        # Left join onto the dense grid: periods without an observation get NaN.
        tmp = tmp.merge(
            grp[[date_col, target_col]],
            on=date_col,
            how="left"
        )

        tmp[target_col] = fill(tmp[target_col])

        dense_parts.append(tmp)

    if not dense_parts:
        # pd.concat([]) would raise; return an empty frame with the usual column layout.
        return df.iloc[:0][[date_col] + static_cols + [target_col]].reset_index(drop=True)

    dense_df = pd.concat(dense_parts, axis=0).sort_values(id_cols + [date_col]).reset_index(drop=True)
    return dense_df


# Replace `train` with its dense monthly version: missing months between each series'
# first and last observation are filled by linear interpolation.
# Note that this overwrites the sparse `train` frame loaded earlier.
train = make_dense_timeseries(
    df=train,
    id_cols="unique_code",
    date_col="date",
    target_col="quantity",
    freq="MS",
    fill_strategy="interpolate"
)


# Shape after densification.
print(train.shape)

In [None]:
# Spot check: one series after densification.
zz = train[train.unique_code == "MKT_008-PRD_3681"]
zz

In [None]:
def add_time_and_lifecycle_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with production-lifecycle features added.

    Expects datetime columns ``date``, ``start_production_date`` and
    ``end_production_date``. Adds:

    - ``start_year`` / ``end_year``: calendar year of the production dates.
    - ``start_month`` / ``end_month``: calendar month, stored as object dtype.
    - ``months_since_startprod`` / ``months_since_endprod``: day difference between
      ``date`` and the production date, divided by 30.
    - ``has_end_date``: whether an end date is present (object dtype).
    - ``is_post_endprod``: whether ``date`` is after the end date (object dtype;
      False when the end date is missing).
    - ``years_since_endprod``: ``months_since_endprod`` divided by 12.

    The input frame is not modified.
    """
    out = df.copy()

    observed = out["date"]
    prod_start = out["start_production_date"]
    prod_end = out["end_production_date"]

    # Calendar parts of the production window; months are cast to object dtype.
    out["start_year"] = prod_start.dt.year
    out["start_month"] = prod_start.dt.month.astype("object")
    out["end_year"] = prod_end.dt.year
    out["end_month"] = prod_end.dt.month.astype("object")

    # Elapsed time relative to the production window, in 30-day "months".
    out["months_since_startprod"] = (observed - prod_start).dt.days / 30.0
    out["months_since_endprod"] = (observed - prod_end).dt.days / 30.0

    # Flags are stored as object dtype.
    out["has_end_date"] = prod_end.notna().astype('object')
    out["is_post_endprod"] = (observed > prod_end).astype('object')

    out["years_since_endprod"] = out["months_since_endprod"] / 12

    return out

# Add the lifecycle features to the dense training frame (overwrites `train`).
train = add_time_and_lifecycle_features(train)
# The submission frame is intentionally not transformed here:
# sample_submission = add_time_and_lifecycle_features(sample_submission)
train.head()

In [None]:
# Dtypes and non-null counts after densification and feature engineering.
train.info()

In [None]:
# Convert the long-format frame into AutoGluon's time-series container, one item per
# unique_code. Every remaining column of `train` is passed along with the target.
# NOTE(review): presumably AutoGluon treats those extra columns as covariates — confirm
# that constant product attributes are meant to be modelled that way.
ts_train = TimeSeriesDataFrame.from_data_frame(
    train,
    id_column="unique_code",
    timestamp_column="date",
    # target_column="quantity",
    # static_features_df=sku_meta
)


# Forecast horizon in periods of the "MS" frequency set below (i.e. 12 months).
PRED_LEN = 12 

# NOTE(review): models are tuned for MAE while the notebook's own `score` function is
# based on a group-wise rWMAPE — confirm this mismatch is intended.
predictor = TimeSeriesPredictor(
    target="quantity",
    prediction_length=PRED_LEN,
    freq="MS",
    eval_metric="MAE",
)

predictor.fit(
    train_data=ts_train,
    # NOTE(review): 3600*60 = 216,000 seconds = 60 hours. If 60 minutes was intended
    # this should be 60*60 — confirm.
    time_limit=3600*60,   
    presets="high_quality",   
    refit_full=True,
    # Earlier manual model selection, kept for reference:
    # hyperparameters={
    #     "Chronos": [
    #         {"model_path": "bolt_small", "ag_args": {"name_suffix": "ZeroShot"}},
    #         {"model_path": "bolt_small", "fine_tune": True, "ag_args": {"name_suffix": "FineTuned"}},
    #     ],
    #     "WaveNet": {},
    #     "TiDE": {},       
    #     "TemporalFusionTransformer": {},
    #     # "PatchTST": {},   
    #     # "DLinear": {},    
    #     "DeepAR": {},     
    #     "AutoETS": {},
    #     "NPTS": {},
    #     # "IMAPA": {},
    #     # "ADIDA": {},
    #     # "AutoARIMA":{},
    #     # "AutoCES":{},
    #     "DynamicOptimizedTheta":{},
    #     "DirectTabular": {}
    # },
    excluded_model_types=["SeasonalNaive", "RecursiveTabular"],
    # num_val_windows=2
)

In [None]:
# Latest timestamp present anywhere in the training data.
train_max_date = train["date"].max()
print(train_max_date) 

# Forecast PRED_LEN steps for every series in ts_train.
# NOTE(review): per the AutoGluon docs the forecast starts right after each series' own
# last timestamp. The dense series end at each series' last observed date, so series
# that end before train_max_date may not line up with the submission dates — confirm.
ts_forecast = predictor.predict(ts_train)   

ts_forecast.head()

In [None]:
# Plot history and forecast for up to 100 series; the trailing `;` suppresses the repr.
predictor.plot(
    data=ts_train,
    predictions=ts_forecast,
    # item_ids=data.item_ids[:2],
    # max_history_length=200,
    max_num_item_ids=100
);

In [None]:
# Spot check: date/quantity history of one series in the dense training frame.
aa = train[train.unique_code == "MKT_025-PRD_11517"][['date', 'quantity']]
aa

In [None]:
sample_sub = sample_submission.copy()

# Split "<market>-<item>" once, on the first dash only, so an item code that itself
# contains a dash stays intact.
code_parts = sample_sub["unique_code"].str.split("-", n=1, expand=True)
sample_sub["market_anon"] = code_parts[0]
sample_sub["item_anon"]   = code_parts[1]

# SKU-level rows are those whose item part is a product code. na=False keeps the mask
# strictly boolean even if a code had no dash (item part missing).
sku_mask = sample_sub["item_anon"].str.startswith("PRD_", na=False)
sub_sku  = sample_sub[sku_mask].copy()


# Flatten the forecast index into columns and rename them to the submission's names;
# the point forecast ("mean") becomes quantity_pred.
pred_sku_df = ts_forecast.reset_index()

pred_sku_df = pred_sku_df.rename(columns={
    "item_id": "unique_code",
    "timestamp": "date",
    "mean": "quantity_pred",   
})

# The final assembly writes sub_sku["quantity_pred"].values back through sku_mask by
# position, so this join must keep exactly one row per submission row:
# validate="many_to_one" raises if the forecast has duplicate (unique_code, date) keys.
sub_sku = sub_sku.merge(
    pred_sku_df,
    on=["unique_code", "date"],
    how="left",
    validate="many_to_one",
)

# Negative forecasts are clipped to zero; rows without a forecast stay NaN here.
sub_sku["quantity_pred"] = sub_sku["quantity_pred"].clip(lower=0)

In [None]:
# Overlap between the series identifiers seen in training and those requested in the
# submission file.
# NOTE(review): sub_uc presumably also contains category-level codes, which cannot
# appear in train — confirm before reading the percentages as SKU coverage.
train_uc = set(train.unique_code.unique())
sub_uc = set(sample_submission.unique_code.unique())
common_uc = train_uc.intersection(sub_uc)

sub_only_uc = sub_uc - train_uc

print(f"Train uc: {len(train_uc):,}")
print(f"Sub uc: {len(sub_uc):,}")
# The labels previously said "sessions", but these sets hold unique codes.
print(f"Common unique codes: {len(common_uc):,}")
print(f"Percentage of train unique codes in test: {len(common_uc)/len(train_uc)*100:.2f}%")
print(f"Percentage of test unique codes in train: {len(common_uc)/len(sub_uc)*100:.2f}%")
print(len(sub_only_uc))

In [None]:
# Number of SKU-level submission rows that received no forecast from the join.
sub_sku["quantity_pred"].isnull().sum()

In [None]:
# Lookup product_code -> category. Series.map with this lookup requires product_code
# to be unique in product_master.
prod_to_cat = product_master.set_index("product_code")["category"]


# Alternative kept for reference: aggregate over *all* forecast series rather than only
# the SKUs present in the submission.
# pred_sku_df['market'] = pred_sku_df['unique_code'].str.split("-", expand=True)[0]
# pred_sku_df['product_code'] = pred_sku_df['unique_code'].str.split("-", expand=True)[1]
# pred_sku_df["category"] = pred_sku_df["product_code"].map(prod_to_cat)

# cat_forecast = (
#     pred_sku_df
#     .groupby(["market", "category", "date"], as_index=False)["quantity_pred"]
#     .sum()
# )

# Recover market / product_code from the SKU rows and attach each product's category.
sub_sku['market'] = sub_sku['unique_code'].str.split("-", expand=True)[0]
sub_sku['product_code'] = sub_sku['unique_code'].str.split("-", expand=True)[1]
sub_sku["category"] = sub_sku["product_code"].map(prod_to_cat)

# Category-level forecast = sum of the (clipped) SKU forecasts per market, category and
# date. Only SKUs that appear in the submission contribute.
cat_forecast = (
    sub_sku
    .groupby(["market", "category", "date"], as_index=False)["quantity_pred"]
    .sum()
)

# Category-level submission rows: the item part of the code is used as the category.
# NOTE(review): assumes product_master's category values use the same "CAT_..." labels
# as the submission codes — confirm.
cat_mask = sample_sub["item_anon"].str.startswith("CAT_")
sub_cat  = sample_sub[cat_mask].copy()
sub_cat["market"]   = sub_cat["market_anon"]
sub_cat["category"] = sub_cat["item_anon"]

sub_cat = sub_cat.merge(
    cat_forecast,
    on=["market", "category", "date"],
    how="left",
)

# Categories without any SKU forecast get 0.
sub_cat["quantity_pred"] = sub_cat["quantity_pred"].fillna(0).clip(lower=0)

In [None]:
# Start from the full submission template with an empty prediction column.
sub_all = sample_sub.copy()
sub_all["quantity_pred"] = np.nan

# Write predictions back by position: this relies on sub_sku / sub_cat still having
# exactly the rows selected by sku_mask / cat_mask, in the same order.
sub_all.loc[sku_mask, "quantity_pred"] = sub_sku["quantity_pred"].values
sub_all.loc[cat_mask, "quantity_pred"] = sub_cat["quantity_pred"].values

final_submission = sub_all[["ID", "unique_code", "date"]].copy()
final_submission["quantity"] = sub_all["quantity_pred"].values

# Rows that matched neither mask or received no forecast are submitted as 0.
final_submission["quantity"] = final_submission["quantity"].fillna(0).clip(lower=0)


final_submission.to_csv("submission_autogluon_ts.csv", index=False)
print("Saved submission_autogluon_ts.csv")


In [None]:
# Number of submitted quantities that are positive but below 0.01.
aa = final_submission[(final_submission.quantity<0.01) & (final_submission.quantity>0.0)]
len(aa)