# In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from math import sqrt

# -------------------------------------------------------------------------
# 1. LOAD DATA
# -------------------------------------------------------------------------
# Training history, one row per series. Per the dataset note this covers
# 1913 days stored in columns d_1 ... d_1913.
train_df = pd.read_csv("sales_train_validation_afcs2025.csv")

# Hold-out ground truth for the next 28 days (d_1914 ... d_1941).
val_df = pd.read_csv("sales_test_validation_afcs2025.csv")


def _day_number(col_name):
    """Return the integer n from a "d_<n>" column label."""
    return int(col_name.split('_')[1])


# Keep only the day columns and order them chronologically. The sort is
# numeric because a plain string sort would place "d_10" before "d_2".
d_cols = sorted(
    (name for name in train_df.columns if name.startswith("d_")),
    key=_day_number,
)

# Series identifiers and the history matrix, both in train_df row order.
ids = train_df['id'].values
train_data = train_df[d_cols].values

# Forecast horizon in days.
H = 28

# -------------------------------------------------------------------------
# 2. SNAIVE MODEL (Lag 364)
# -------------------------------------------------------------------------
# Seasonal naive with a "yearly" season of 52 weeks * 7 days = 364 days:
# the forecast for day T + h is the observation at day T + h - 364.
# 364 (rather than 365) preserves the day-of-week alignment
# (e.g., Monday vs Monday).
SEASON_LAG = 364

# Slice with explicit non-negative bounds. The shorthand
# [-SEASON_LAG : -SEASON_LAG + H] silently yields an empty array when the lag
# equals the horizon (the stop index becomes 0) and picks the wrong columns
# when the lag is shorter than the horizon or longer than the history.
_n_history = train_data.shape[1]
if not H <= SEASON_LAG <= _n_history:
    raise ValueError(
        "yearly SNAIVE needs H <= SEASON_LAG <= history length; "
        f"got H={H}, SEASON_LAG={SEASON_LAG}, history={_n_history}"
    )

# Columns [T - 364, T - 364 + H) of the history predict [T + 1, T + H].
_window_start = _n_history - SEASON_LAG
snaive_preds = train_data[:, _window_start:_window_start + H]

import numpy as np

# -------------------------------------------------------------------------
# SNAIVE MODEL (Weekly Seasonality)
# -------------------------------------------------------------------------
# "lag('week')" corresponds to 7 days.
SEASON_LAG = 7

# The horizon is longer than one season, so a single slice cannot fill it.
# Take the last SEASON_LAG days of history and repeat them end to end.
last_week_pattern = train_data[:, -SEASON_LAG:]

# Number of repetitions is ceil(H / SEASON_LAG), derived from H instead of
# being hard-coded; the result is trimmed to exactly H columns so the block
# stays correct if H is not a multiple of the season length.
_n_repeats = -(-H // SEASON_LAG)
snaive_weekly_preds = np.tile(last_week_pattern, (1, _n_repeats))[:, :H]  # Result shape: [Batch, H]

# -------------------------------------------------------------------------
# SNAIVE MODEL (Monthly Seasonality)
# -------------------------------------------------------------------------
# A calendar month is roughly 30 days, but a 28-day (4-week) season is used
# so each forecast day lines up with the same weekday in the history.
SEASON_LAG = 28

# The season length equals the horizon here, so the forecast for
# [T + 1, T + 28] is simply the final SEASON_LAG columns of the history.
_monthly_window = slice(-SEASON_LAG, None)
snaive_monthly_preds = train_data[:, _monthly_window]

# -------------------------------------------------------------------------
# 3. CROSTON MODEL
# -------------------------------------------------------------------------
def apply_croston(x, alpha=0.1, h=28):
    """
    Forecast an intermittent-demand series with a simple Croston's method.

    The history is split into the non-zero demand sizes and the intervals
    (in periods) between consecutive non-zero demands. Both sequences are
    smoothed with the same exponential-smoothing weight, and the ratio
    smoothed size / smoothed interval is used as a flat forecast.

    Parameters
    ----------
    x : array-like, 1-D
        Demand history in chronological order. Lists and tuples are
        accepted and converted to a NumPy array.
    alpha : float, default 0.1
        Smoothing weight applied to each new non-zero observation.
    h : int, default 28
        Number of future periods to forecast.

    Returns
    -------
    numpy.ndarray
        Array of shape (h,) holding the same forecast value at every step;
        all zeros when the history contains no non-zero demand.
    """
    # Coerce so plain Python sequences can be indexed with an index array.
    x = np.asarray(x)

    # Positions of the non-zero demands.
    nz_idx = np.nonzero(x)[0]
    if nz_idx.size == 0:
        return np.zeros(h)

    # Gaps between consecutive non-zero demands. Prepending -1 makes the
    # first gap the number of periods up to and including the first sale.
    intervals = np.diff(nz_idx, prepend=-1)

    # Demand sizes at those positions.
    sizes = x[nz_idx]

    # Seed both smoothers with the first observed size / interval, then
    # update them once per subsequent non-zero demand.
    z = sizes[0]      # smoothed size
    p = intervals[0]  # smoothed interval
    for size, gap in zip(sizes[1:], intervals[1:]):
        z = alpha * size + (1 - alpha) * z
        p = alpha * gap + (1 - alpha) * p

    # Forecast per period = smoothed size / smoothed interval.
    return np.full(h, z / p)

# Run Croston on every series (one row of train_data each) and stack the
# per-series forecasts into a single 2-D array.
croston_preds = np.array(list(map(apply_croston, train_data)))

# -------------------------------------------------------------------------
# NAIVE MODEL
# -------------------------------------------------------------------------
# Every future step is forecast as the most recent observation.

# Final column of the history, kept 2-D: shape (n_series, 1).
last_value = train_data[:, -1:]

# Repeat that column H times along the time axis: shape (n_series, H).
naive_preds = np.repeat(last_value, H, axis=1)

# -------------------------------------------------------------------------
# GLOBAL MEAN MODEL
# -------------------------------------------------------------------------
# Every future step is forecast as the series' average over its history.

# Per-series mean along the time axis; keepdims keeps shape (n_series, 1).
series_mean = train_data.mean(axis=1, keepdims=True)

# Repeat the mean column H times: shape (n_series, H).
mean_preds = np.repeat(series_mean, H, axis=1)

# -------------------------------------------------------------------------
# RW WITH DRIFT (Slope: First to Last)
# -------------------------------------------------------------------------
# Forecast = last observation + h * (average change per step over history).

# First and last observation of each series, kept as (n_series, 1) columns.
y_first = train_data[:, 0:1]
y_last = train_data[:, -1:]

# T observations span T - 1 steps, so the per-step drift divides the total
# change by T - 1. Shape (n_series, 1).
n_timesteps = train_data.shape[1]
_n_intervals = n_timesteps - 1
slope = (y_last - y_first) / _n_intervals

# Horizon steps 1 ... H as a row vector, shape (1, H).
horizon_indices = np.arange(1, H + 1)[np.newaxis, :]

# Broadcasting (n_series, 1) against (1, H) gives an (n_series, H) forecast.
drift_preds = y_last + slope * horizon_indices

# -------------------------------------------------------------------------
# 4. SAVE SUBMISSIONS & CALCULATE RMSE
# -------------------------------------------------------------------------
# One forecast column per horizon step (F1 ... F<H>); derived from H so the
# header always matches the width of the prediction arrays.
f_cols = [f"F{i}" for i in range(1, H + 1)]


def _save_submission(preds, path):
    """Write `preds` as an id + F-column CSV at `path`; return the frame."""
    frame = pd.DataFrame(preds, columns=f_cols)
    frame.insert(0, 'id', ids)
    frame.to_csv(path, index=False)
    return frame


# Save SNAIVE yearly
sub_snaive = _save_submission(snaive_preds, "submission_yearly_snaive.csv")

# Save SNAIVE weekly
sub_snaive = _save_submission(snaive_weekly_preds, "submission_weekly_snaive.csv")

# Save SNAIVE monthly
sub_snaive = _save_submission(snaive_monthly_preds, "submission_monthly_snaive.csv")

# Save Croston
sub_croston = _save_submission(croston_preds, "submission_croston.csv")

# Evaluation
# Align Ground Truth by ID so row i of y_true matches row i of every forecast.
val_df_sorted = val_df.set_index('id').reindex(ids).reset_index()

# Build the ground-truth matrix the same way as the training matrix: keep
# only the "d_<n>" columns, in numeric day order. This avoids pulling any
# non-day column into y_true and does not rely on the CSV column order.
# If the file has no "d_" columns, fall back to everything except 'id'.
_val_day_cols = sorted(
    (c for c in val_df_sorted.columns if str(c).startswith("d_")),
    key=lambda c: int(c.split('_')[1]),
)
if _val_day_cols:
    y_true = val_df_sorted[_val_day_cols].values
else:
    y_true = val_df_sorted.drop(columns=['id']).values


def _rmse(preds):
    """Return the RMSE of `preds` against `y_true`."""
    return sqrt(mean_squared_error(y_true, preds))


rmse_snaive = _rmse(snaive_preds)
rmse_snaive_w = _rmse(snaive_weekly_preds)
rmse_snaive_m = _rmse(snaive_monthly_preds)
rmse_naive = _rmse(naive_preds)
rmse_mean = _rmse(mean_preds)
rmse_drif = _rmse(drift_preds)
rmse_croston = _rmse(croston_preds)


print("Validation Set results:")
print(f"SNAIVE RMSE Yearly: {rmse_snaive:.4f}")
print(f"SNAIVE RMSE Weekly: {rmse_snaive_w:.4f}")
print(f"SNAIVE RMSE Monthly: {rmse_snaive_m:.4f}")
print(f"NAIVE RMSE: {rmse_naive:.4f}")
print(f"MEAN RMSE: {rmse_mean:.4f}")
print(f"DRIFT RMSE: {rmse_drif:.4f}")
print(f"Croston RMSE: {rmse_croston:.4f}")