# Phase 2: Baseline Modeling

**Objective**: Establish benchmark metrics (MAE, RMSE, MAPE) using naive and statistical methods. These baselines will be used to evaluate advanced models in Phase 3.

## Table of Contents
1. Setup & Data Loading (from processed/)
2. Train/Test Split
3. Naive Forecasts (Naive-1, Naive-7, Moving Average)
4. Statistical Baseline: SARIMA
5. Cross-Validation with TimeSeriesSplit
6. Model Comparison & Visualization
7. Save Results

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pathlib import Path
import warnings
import logging

# Notebook-wide runtime configuration: silence library warnings and emit
# INFO-level log records through a module logger.
warnings.filterwarnings('ignore')
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Plot appearance: seaborn grid style first, then the matplotlib defaults
# (figure size and font size) applied in a single update.
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (14, 6),
    'font.size': 11,
})

# Project layout, resolved relative to the directory the notebook runs from.
PROJECT_ROOT = Path("..").resolve()
PROCESSED_DIR = PROJECT_ROOT.joinpath("Data", "processed")
RESULTS_DIR = PROJECT_ROOT.joinpath("results")
RESULTS_DIR.mkdir(exist_ok=True)  # create the output folder on first run

def mean_absolute_percentage_error(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the mean absolute percentage error, expressed in percent.

    Observations whose true value is exactly zero are left out of the
    average, because the percentage error is undefined for them.

    Args:
        y_true: Actual values. Any array-like (list, tuple, ndarray) is
            accepted; it is converted to a float array.
        y_pred: Predicted values, positionally matched to ``y_true``.

    Returns:
        The MAPE as a plain Python float (e.g. ``10.0`` for 10 %), or
        ``nan`` when no observation has a non-zero true value (this
        includes empty input).
    """
    # Coerce up front so lists/tuples work: comparing a plain list with 0
    # yields a single bool, not an element-wise mask.
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    nonzero = actual != 0
    if not nonzero.any():
        return float('nan')

    pct_error = np.abs((actual[nonzero] - forecast[nonzero]) / actual[nonzero])
    return float(pct_error.mean() * 100)

# Load the cleaned Phase 1 output when it exists; otherwise fall back to the
# original processed file. pd.read_parquet raises if the fallback is missing too.
data_path = PROCESSED_DIR / "daily_sales_clean.parquet"
if data_path.exists():
    df = pd.read_parquet(data_path)
    print(f"✅ Loaded cleaned data from: {data_path}")
else:
    # Fallback to original processed data
    data_path = PROCESSED_DIR / "daily_sales.parquet"
    df = pd.read_parquet(data_path)
    print(f"⚠️ Loaded original data from: {data_path}")

# Index by date in chronological order so positional slicing follows time.
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date').sort_index()

# The baselines in this notebook use positional shift()/rolling(), which only
# mean "yesterday" / "same day last week" when there is exactly one row per
# calendar day. Warn (do not fail) when that does not hold.
if len(df) > 0:
    calendar_days = (df.index.max() - df.index.min()).days + 1
    if df.index.has_duplicates or len(df) != calendar_days:
        logger.warning(
            "Index is not one row per calendar day (%d rows over %d days); "
            "positional shift/rolling baselines assume a gap-free daily series.",
            len(df),
            calendar_days,
        )

print(f"Date range: {df.index.min().date()} to {df.index.max().date()}")
print(f"Total days: {len(df)}")
print("\nData preview:")
df.head()

## 2. Train/Test Split

Using 80/20 split, respecting time order (no shuffling).

In [None]:
# Chronological 80/20 split: the earliest 80% of rows train, the rest test.
train_size = int(len(df) * 0.8)
train, test = df.iloc[:train_size].copy(), df.iloc[train_size:].copy()

print(f"Train: {train.index.min().date()} to {train.index.max().date()} ({len(train)} days)")
print(f"Test:  {test.index.min().date()} to {test.index.max().date()} ({len(test)} days)")

# Plot both segments and mark the last training date with a dashed line.
plt.figure(figsize=(14, 4))
for segment, segment_label, segment_color in (
    (train, 'Train', 'blue'),
    (test, 'Test', 'orange'),
):
    plt.plot(segment.index, segment['sales'], label=segment_label, color=segment_color)

split_date = train.index.max()
plt.axvline(x=split_date, color='red', linestyle='--', label='Train/Test Split')

plt.title("Train/Test Split Visualization")
plt.ylabel("Sales (PLN)")
plt.legend()
plt.show()

## 3. Naive Forecasts

- **Naive-1**: Tomorrow = Today (persistence model)
- **Naive-7**: Tomorrow = Same day last week (seasonal naive)
- **Moving Average**: Tomorrow = Average of last 7 days

In [None]:
# Accumulators shared with the later cells: one metrics row per model, and the
# test-set predictions keyed by model name.
results = []
predictions = {}


def _record_baseline(name, y_true, y_pred):
    """Score one baseline forecast and register it in ``results``/``predictions``.

    Positions where the forecast is NaN (not enough history for the lag or
    window) are excluded from the metrics, matching the masking used in the
    cross-validation cell. The full, unmasked forecast is what gets stored in
    ``predictions``.

    Args:
        name: Model label used in the results table and the printed line.
        y_true: Actual test values (pandas Series).
        y_pred: Forecast for the same rows as ``y_true`` (pandas Series).

    Returns:
        Tuple ``(mae, rmse, mape)``.

    Raises:
        ValueError: If the forecast has no non-NaN value to score.
    """
    valid = y_pred.notna().to_numpy()
    if not valid.any():
        raise ValueError(f"{name}: forecast contains no valid predictions to score")

    actual, forecast = y_true[valid], y_pred[valid]
    mae = mean_absolute_error(actual, forecast)
    rmse = np.sqrt(mean_squared_error(actual, forecast))
    mape = mean_absolute_percentage_error(actual.values, forecast.values)

    results.append({'Model': name, 'MAE': mae, 'RMSE': rmse, 'MAPE': mape})
    predictions[name] = y_pred
    # Label is padded to 15 characters so the metric columns line up.
    print(f"{name + ':':<15} MAE={mae:,.0f}, RMSE={rmse:,.0f}, MAPE={mape:.1f}%")
    return mae, rmse, mape


# All three baselines are one-step-ahead: each test day is predicted from
# actual values observed before it, including earlier test days.
# NOTE(review): shift()/rolling() are positional, so "yesterday" and "last
# week" assume one row per calendar day with no gaps - confirm for this data.
full_series = pd.concat([train['sales'], test['sales']])

# ============================================
# NAIVE-1: Yesterday's value
# ============================================
naive1_pred = full_series.shift(1).loc[test.index]
naive1_mae, naive1_rmse, naive1_mape = _record_baseline('Naive-1', test['sales'], naive1_pred)

# ============================================
# NAIVE-7: Same day last week
# ============================================
naive7_pred = full_series.shift(7).loc[test.index]
naive7_mae, naive7_rmse, naive7_mape = _record_baseline('Naive-7', test['sales'], naive7_pred)

# ============================================
# MOVING AVERAGE (7-day rolling)
# ============================================
# Mean of the 7 days before each test day; shift(1) keeps the day itself out
# of its own window.
ma_pred = full_series.rolling(window=7).mean().shift(1).loc[test.index]
ma_mae, ma_rmse, ma_mape = _record_baseline('Moving Avg (7)', test['sales'], ma_pred)

## 4. Statistical Baseline: SARIMA

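Seasonal ARIMA with weekly seasonality (period = 7), using order (1,1,1) and seasonal order (1,1,1,7). The model is fit once on the training set and forecasts the entire test horizon in a single multi-step forecast, whereas the naive baselines above are one-step-ahead forecasts that use the actual values of earlier test days. Keep this difference in mind when comparing errors.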

In [None]:
# ============================================
# SARIMA (1,1,1)(1,1,1,7)
# ============================================
print("Fitting SARIMA model... (this may take a moment)")

# Model specification: non-seasonal (1,1,1), seasonal (1,1,1) with period 7,
# with the stationarity and invertibility constraints switched off.
sarima_spec = dict(
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 7),
    enforce_stationarity=False,
    enforce_invertibility=False,
)

# Best-effort cell: any failure is reported and SARIMA is left out of the
# results instead of stopping the notebook.
try:
    sarima_model = SARIMAX(train['sales'], **sarima_spec)
    sarima_fit = sarima_model.fit(disp=False)

    # Forecast the whole test horizon in one go from the end of training.
    # NOTE(review): this is a multi-step forecast, while the naive baselines
    # are scored one-step-ahead using actual test values - the errors are not
    # strictly like-for-like.
    sarima_pred = sarima_fit.forecast(steps=len(test))
    sarima_pred.index = test.index  # relabel with the test dates

    actual_sales = test['sales']
    sarima_mae = mean_absolute_error(actual_sales, sarima_pred)
    sarima_rmse = np.sqrt(mean_squared_error(actual_sales, sarima_pred))
    sarima_mape = mean_absolute_percentage_error(actual_sales.values, sarima_pred.values)

    sarima_row = {'Model': 'SARIMA', 'MAE': sarima_mae, 'RMSE': sarima_rmse, 'MAPE': sarima_mape}
    results.append(sarima_row)
    predictions['SARIMA'] = sarima_pred
    print(f"SARIMA:         MAE={sarima_mae:,.0f}, RMSE={sarima_rmse:,.0f}, MAPE={sarima_mape:.1f}%")

    # Information criteria of the fitted model
    print("\nSARIMA Model Summary:")
    print(f"  AIC: {sarima_fit.aic:.0f}")
    print(f"  BIC: {sarima_fit.bic:.0f}")

except Exception as e:
    print(f"SARIMA fitting failed: {e}")
    sarima_pred = None

## 5. Cross-Validation with TimeSeriesSplit

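Validating the three naive baselines (Naive-1, Naive-7, Moving Average) across multiple time periods to check that their errors are stable. SARIMA is not part of this cross-validation.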

In [None]:
# Time Series Cross-Validation
n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)

# Each baseline maps the concatenated (train + validation) series to
# one-step-ahead predictions; the validation part is sliced off afterwards.
cv_forecasters = {
    'Naive-1': lambda series: series.shift(1),
    'Naive-7': lambda series: series.shift(7),
    'Moving Avg (7)': lambda series: series.rolling(7).mean().shift(1),
}
cv_results = {model_name: [] for model_name in cv_forecasters}

print(f"Running {n_splits}-fold Time Series Cross-Validation...")
print("="*50)

for fold, (train_idx, val_idx) in enumerate(tscv.split(df)):
    cv_train = df.iloc[train_idx]
    cv_val = df.iloc[val_idx]

    # Skip if validation set is too small
    if len(cv_val) < 7:
        continue

    cv_full = pd.concat([cv_train['sales'], cv_val['sales']])

    for model_name, forecast in cv_forecasters.items():
        fold_pred = forecast(cv_full).iloc[len(cv_train):]
        # Score only rows with enough history to produce a prediction. The
        # same mask is applied to every model, Naive-1 included.
        valid_mask = fold_pred.notna().to_numpy()
        if valid_mask.any():
            fold_mae = mean_absolute_error(cv_val['sales'][valid_mask], fold_pred[valid_mask])
            cv_results[model_name].append(fold_mae)

    print(f"Fold {fold+1}: Train={len(cv_train)} days, Val={len(cv_val)} days")

# Summary
print("\n" + "="*50)
print("CROSS-VALIDATION RESULTS (MAE)")
print("="*50)
cv_summary = []
for model, scores in cv_results.items():
    if not scores:
        continue  # no fold produced a score for this model
    mean_score = np.mean(scores)
    std_score = np.std(scores)  # population std (ddof=0) across folds
    cv_summary.append({
        'Model': model,
        'CV Mean MAE': mean_score,
        'CV Std MAE': std_score,
    })
    print(f"{model:15s}: {mean_score:,.0f} ± {std_score:,.0f}")

cv_df = pd.DataFrame(cv_summary)

## 6. Model Comparison & Visualization

Comparing all baseline models on the test set with comprehensive visualizations.

In [None]:
# Results summary table, best (lowest MAE) model first
results_df = pd.DataFrame(results).sort_values('MAE')
print("="*60)
print("BASELINE MODEL COMPARISON (Test Set)")
print("="*60)
print(results_df.to_string(index=False))

# Identify best model: the first row after sorting by MAE
best_model = results_df.iloc[0]['Model']
print(f"\n🏆 Best Baseline Model: {best_model}")

# Bar chart comparison: one horizontal-bar panel per metric, with the best
# model highlighted in green.
metric_panels = [
    ('MAE', 'MAE (PLN)', 'Mean Absolute Error'),
    ('RMSE', 'RMSE (PLN)', 'Root Mean Squared Error'),
    ('MAPE', 'MAPE (%)', 'Mean Absolute Percentage Error'),
]
colors = ['green' if m == best_model else 'steelblue' for m in results_df['Model']]

fig, axes = plt.subplots(1, len(metric_panels), figsize=(15, 5))
for panel_ax, (metric, xlabel, title) in zip(axes, metric_panels):
    panel_ax.barh(results_df['Model'], results_df[metric], color=colors)
    panel_ax.set_xlabel(xlabel)
    panel_ax.set_title(title)
    panel_ax.invert_yaxis()  # keep the first (best-MAE) row at the top

plt.tight_layout()
plt.show()

In [None]:
def _finish_forecast_axes(ax, title):
    """Apply the title, axis labels, legend and light grid shared by both plots."""
    ax.set_title(title)
    ax.set_xlabel("Date")
    ax.set_ylabel("Sales (PLN)")
    ax.legend(loc='upper left')
    ax.grid(True, alpha=0.3)


# One colour per model, reused by both figures so the lines match.
colors = plt.cm.tab10(np.linspace(0, 1, len(predictions)))
forecast_line = dict(linewidth=1.5, alpha=0.7)

# Time series comparison plot - all predictions vs actual
fig, ax = plt.subplots(figsize=(16, 6))
ax.plot(test.index, test['sales'], 'k-', linewidth=2, label='Actual', alpha=0.8)
for color, name in zip(colors, predictions):
    ax.plot(test.index, predictions[name], '--', label=name, color=color, **forecast_line)
_finish_forecast_axes(ax, "Baseline Models: Actual vs Predictions (Test Set)")
plt.tight_layout()
plt.show()

# Zoomed view - first 30 days of test
zoom_days = 30
zoom_index = test.index[:zoom_days]

fig, ax = plt.subplots(figsize=(16, 5))
ax.plot(zoom_index, test['sales'].iloc[:zoom_days], 'ko-', linewidth=2, markersize=4, label='Actual')
for color, name in zip(colors, predictions):
    ax.plot(zoom_index, predictions[name].iloc[:zoom_days], '--', label=name, color=color, **forecast_line)
_finish_forecast_axes(ax, f"Zoomed View: First {zoom_days} Days of Test Set")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 7. Save Results

Saving baseline results for comparison with advanced models in Phase 3.

In [None]:
# Save the metrics table to CSV
results_path = RESULTS_DIR / "baseline_results.csv"
results_df.to_csv(results_path, index=False)
print(f"✅ Saved baseline results to: {results_path}")

# Save predictions for later analysis: one column per model plus the actual
# values and dates, with 'date' and 'actual' moved to the front.
predictions_df = pd.DataFrame(predictions)
predictions_df['actual'] = test['sales'].values
predictions_df['date'] = test.index
predictions_df = predictions_df[['date', 'actual'] + list(predictions)]
predictions_path = RESULTS_DIR / "baseline_predictions.csv"
predictions_df.to_csv(predictions_path, index=False)
print(f"✅ Saved predictions to: {predictions_path}")

# Summary for Phase 3. results_df is sorted by MAE, so row 0 is the best model.
best_row = results_df.iloc[0]
best_mae = best_row['MAE']

print("\n" + "="*60)
print("SUMMARY FOR PHASE 3")
print("="*60)
print(f"""
Best Baseline Model: {best_model}
Best MAE: {best_mae:,.0f} PLN
Best RMSE: {best_row['RMSE']:,.0f} PLN
Best MAPE: {best_row['MAPE']:.1f}%

TARGET FOR ADVANCED MODELS:
- Beat baseline MAE of {best_mae:,.0f} PLN
- Target: >10% improvement = MAE < {best_mae * 0.9:,.0f} PLN
""")