In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from math import sqrt

# -------------------------------------------------------------------------
# 1. LOAD DATA & CREATE FULL HISTORY
# -------------------------------------------------------------------------
# Input files, in chronological order of the days they cover:
#   train      -> d_1    .. d_1913
#   validation -> d_1914 .. d_1941 (treated as part of the history)
#   test       -> d_1942 .. d_1969 (the ground truth we score against)
TRAIN_CSV = "sales_train_validation_afcs2025.csv"
VALID_CSV = "sales_test_validation_afcs2025.csv"
TEST_CSV = "sales_test_evaluation_afcs_2025.csv"

train_df = pd.read_csv(TRAIN_CSV)
valid_df = pd.read_csv(VALID_CSV)
test_df = pd.read_csv(TEST_CSV)

# Extract 'd_' columns and ensure they are sorted numerically
def get_d_cols(df):
    """Return the day columns of ``df`` (``d_<number>``) sorted by day number.

    Only labels of the exact form ``d_`` followed by decimal digits count as
    day columns. Anything else -- non-string labels, or names such as
    ``d_note`` that merely share the prefix -- is skipped, so a stray column
    can no longer break the numeric sort with a ``ValueError``.

    Args:
        df: Object exposing a ``columns`` iterable (e.g. a pandas DataFrame).

    Returns:
        list: The matching column names in ascending day order
        (``d_2`` before ``d_10``, unlike a plain string sort).
    """
    prefix = "d_"
    day_cols = [
        c
        for c in df.columns
        if isinstance(c, str)
        and c.startswith(prefix)
        and c[len(prefix):].isdecimal()
    ]
    return sorted(day_cols, key=lambda c: int(c[len(prefix):]))

# Day columns of each frame, each list in chronological order.
train_cols, valid_cols, test_cols = (
    get_d_cols(frame) for frame in (train_df, valid_df, test_df)
)

# The row order of the training file is the reference order for all series.
ids = train_df['id'].values

# Helper to align and extract data arrays
def get_data_aligned(source_df, target_ids, cols):
    """Return the ``cols`` values of ``source_df`` with rows ordered like ``target_ids``.

    Args:
        source_df: DataFrame with an ``'id'`` column identifying each row.
        target_ids: Sequence of ids giving the desired row order.
        cols: Column labels to extract, in the desired column order.

    Returns:
        2-D array with one row per entry of ``target_ids`` and one column per
        entry of ``cols``. Ids absent from ``source_df`` come back as NaN rows,
        which is how ``DataFrame.reindex`` fills missing labels.
    """
    by_id = source_df.set_index('id')
    in_target_order = by_id.reindex(target_ids)
    return in_target_order[cols].values

# 1. History = train followed by validation, joined along the time axis,
#    so each row is one series and each column is one day.
#    Shape: (n_series, 1913 + 28) = (n_series, 1941)
data_train = get_data_aligned(train_df, ids, train_cols)
data_valid = get_data_aligned(valid_df, ids, valid_cols)
history_data = np.concatenate((data_train, data_valid), axis=1)

# 2. Ground truth for the test period, in the same row order as the history.
y_true = get_data_aligned(test_df, ids, test_cols)

H = 28  # Horizon

# -------------------------------------------------------------------------
# 2. MODELS (Applied to history_data)
# -------------------------------------------------------------------------

def _seasonal_naive(history, lag, horizon):
    """Seasonal-naive forecast: repeat the last ``lag`` observations per series.

    Args:
        history: 2-D array, one row per series and one column per time step.
        lag: Season length in time steps.
        horizon: Number of future steps to forecast.

    Returns:
        Array of shape ``(n_series, horizon)`` where forecast step ``k``
        (0-based) equals the observation ``lag`` steps before it, cycling
        through the last season when ``horizon`` exceeds ``lag``.

    Raises:
        ValueError: If the history holds fewer than ``lag`` observations.
    """
    if history.shape[1] < lag:
        raise ValueError(
            f"need at least {lag} observations for a lag-{lag} seasonal "
            f"naive forecast, got {history.shape[1]}"
        )
    last_cycle = history[:, -lag:]
    if horizon <= lag:
        # The forecast is just the start of the last season; no tiling needed.
        return last_cycle[:, :horizon]
    n_cycles = -(-horizon // lag)  # ceil(horizon / lag) without floats
    return np.tile(last_cycle, (1, n_cycles))[:, :horizon]


# --- A. SNAIVE MODEL (Yearly: Lag 364) ---
# 364 = 52 * 7, so the "same day last year" also falls on the same weekday.
SEASON_LAG_YEAR = 364
snaive_yearly_preds = _seasonal_naive(history_data, SEASON_LAG_YEAR, H)

# --- B. SNAIVE MODEL (Weekly: Lag 7) ---
SEASON_LAG_WEEK = 7
# Last 7 days of history, repeated as often as needed to cover the horizon
last_week_pattern = history_data[:, -SEASON_LAG_WEEK:]
snaive_weekly_preds = _seasonal_naive(history_data, SEASON_LAG_WEEK, H)

# --- C. SNAIVE MODEL (Monthly: Lag 28) ---
SEASON_LAG_MONTH = 28
# Last 28 days of history, cycled if the horizon is longer than 28
snaive_monthly_preds = _seasonal_naive(history_data, SEASON_LAG_MONTH, H)

# --- D. NAIVE MODEL ---
# Last observed value repeated over the whole horizon
last_value = history_data[:, -1:]
naive_preds = np.tile(last_value, (1, H))

# --- E. GLOBAL MEAN MODEL ---
# Per-series mean of the entire history, repeated over the horizon
series_mean = np.mean(history_data, axis=1, keepdims=True)
mean_preds = np.tile(series_mean, (1, H))

# --- F. RW WITH DRIFT ---
# Straight line through the first and last observation of each series,
# extrapolated from the last observation.
n_timesteps = history_data.shape[1]
y_last = history_data[:, -1:]
y_first = history_data[:, :1]  # first observed day

slope = (y_last - y_first) / (n_timesteps - 1)
horizon_indices = np.arange(1, H + 1).reshape(1, H)  # steps ahead: 1..H
drift_preds = y_last + (slope * horizon_indices)

# --- G. CROSTON MODEL ---
def apply_croston(x, alpha=0.1, h=28):
    """Forecast an intermittent-demand series with Croston's method.

    Non-zero demand sizes and the gaps between them are each smoothed with
    simple exponential smoothing; the forecast is the ratio of the two
    smoothed values, held constant over the horizon.

    Args:
        x: 1-D demand history. Any array-like is accepted (list, tuple,
            ndarray, pandas Series); it is converted to a float array so that
            indexing is always positional.
        alpha: Smoothing weight applied to both demand size and interval.
        h: Number of steps to forecast.

    Returns:
        Float array of length ``h`` filled with the per-period demand rate,
        or all zeros when the history contains no non-zero demand.
    """
    series = np.asarray(x, dtype=float)
    nz_idx = np.nonzero(series)[0]
    if nz_idx.size == 0:
        return np.zeros(h)

    # Gaps between consecutive non-zero demands; the leading -1 makes the
    # first gap count from the start of the series.
    intervals = np.diff(np.r_[-1, nz_idx])
    demands = series[nz_idx]

    # Initialise with the first demand and gap, then smooth over the rest.
    z = demands[0]
    p = intervals[0]
    for demand, gap in zip(demands[1:], intervals[1:]):
        z = alpha * demand + (1 - alpha) * z
        p = alpha * gap + (1 - alpha) * p

    return np.full(h, z / p)

# One Croston forecast per series (row), stacked into an (n_series, H) array.
croston_preds = np.array([apply_croston(row, h=H) for row in history_data])

# -------------------------------------------------------------------------
# 3. CALCULATE RMSE
# -------------------------------------------------------------------------
print(f"{'Model':<20} | RMSE")
print("-" * 30)

# Display name -> forecast array; every array is scored against y_true.
models = {
    "SNaive (Yearly)": snaive_yearly_preds,
    "SNaive (Weekly)": snaive_weekly_preds,
    "SNaive (Monthly)": snaive_monthly_preds,
    "Naive": naive_preds,
    "Mean": mean_preds,
    "Drift": drift_preds,
    "Croston": croston_preds,
}

# A single RMSE per model, computed over all series and all forecast days.
for name, preds in models.items():
    rmse = sqrt(mean_squared_error(y_true, preds))
    print(f"{name:<20} | {rmse:.4f}")

# -------------------------------------------------------------------------
# 4. SAVE SUBMISSIONS (Optional)
# -------------------------------------------------------------------------
# One "F<k>" column per forecast step; derived from H so the header always
# matches the width of the prediction arrays.
f_cols = [f"F{i}" for i in range(1, H + 1)]
# Example: Saving Yearly SNaive
sub = pd.DataFrame(snaive_yearly_preds, columns=f_cols)
sub.insert(0, 'id', ids)
sub.to_csv("submission_test_snaive_yearly.csv", index=False)