# 07 - Demand Forecasting Experiments

## Purpose
Develop and test demand forecasting models for inventory optimization.

## Models
1. Moving Average
2. Exponential Smoothing
3. Linear Regression
4. Model Comparison

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Plot appearance. The style sheet is applied first and the palette second,
# so the palette is not overwritten by the style sheet's own colour cycle.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Location of the processed CSV files, relative to the working directory
DATA_DIR = Path('../..', 'ml', 'data', 'processed')

# Read the three input tables
orders = pd.read_csv(DATA_DIR / 'orders.csv')
order_items = pd.read_csv(DATA_DIR / 'order_items.csv')
products = pd.read_csv(DATA_DIR / 'products.csv')

# OrderDate arrives as text; convert it so the .dt accessors work later
orders['OrderDate'] = pd.to_datetime(orders['OrderDate'])

# One row per order item: orders -> order_items (OrderID) -> products (ProductID)
full_orders = (
    orders
    .merge(order_items, on='OrderID')
    .merge(products, on='ProductID')
)

print(f"Order items: {len(full_orders):,}")
print(f"Date range: {orders['OrderDate'].min()} to {orders['OrderDate'].max()}")

In [None]:
# Total quantity ordered per calendar day and product
order_day = full_orders['OrderDate'].dt.date
daily_demand = (
    full_orders
    .groupby([order_day, 'ProductID'])['Quantity']
    .sum()
    .reset_index()
)
daily_demand.columns = ['Date', 'ProductID', 'Quantity']
# The grouping key holds plain date objects; turn them back into Timestamps
daily_demand['Date'] = pd.to_datetime(daily_demand['Date'])

# The ten products with the largest total ordered quantity
units_by_product = full_orders.groupby('ProductID')['Quantity'].sum()
top_products = units_by_product.nlargest(10).index.tolist()

print(f"Top 10 products by demand: {top_products}")

## 1. Moving Average Forecasting

In [None]:
def moving_average_forecast(data, window=7, forecast_days=7):
    """
    Simple moving average forecast.

    The forecast is flat: every future day is assigned the most recent
    moving-average value.

    Args:
        data: DataFrame with a 'Date' column and a numeric 'Quantity'
            column. The last row is treated as the most recent observation.
        window: Number of trailing rows averaged by the rolling mean.
        forecast_days: Number of daily forecast rows to generate.

    Returns:
        A tuple ``(forecast, ma)`` where ``forecast`` is a DataFrame with
        columns 'Date' and 'Forecast' (one row per forecast day, starting
        the day after the latest date in ``data``) and ``ma`` is the rolling
        mean of 'Quantity', aligned with ``data`` (NaN where the window is
        incomplete).
    """
    # Calculate moving average
    ma = data['Quantity'].rolling(window=window).mean()

    # Use last MA value for forecast
    last_ma = ma.iloc[-1]
    if pd.isna(last_ma):
        # The rolling mean is NaN when the trailing window is incomplete
        # (fewer than `window` rows) or contains a missing value. Average
        # whatever trailing observations exist instead of forecasting NaN.
        last_ma = data['Quantity'].tail(window).mean()

    # Generate forecast
    last_date = data['Date'].max()
    forecast_dates = [last_date + timedelta(days=i + 1) for i in range(forecast_days)]
    forecast_values = [last_ma] * forecast_days

    return pd.DataFrame({
        'Date': forecast_dates,
        'Forecast': forecast_values
    }), ma

# Work with the highest-demand product
test_product = top_products[0]
is_test_product = daily_demand['ProductID'] == test_product
product_data = daily_demand.loc[is_test_product].sort_values('Date')

# Days with no orders are absent from daily_demand; insert them with quantity 0
date_range = pd.date_range(product_data['Date'].min(), product_data['Date'].max())
product_data = (
    product_data
    .set_index('Date')
    .reindex(date_range, fill_value=0)
    .reset_index()
)
product_data.columns = ['Date', 'ProductID', 'Quantity']
# The reindex also filled ProductID with 0 on inserted days; restore the real id
product_data['ProductID'] = test_product

# Moving averages over three window lengths
forecast_7, ma_7 = moving_average_forecast(product_data, window=7)
forecast_14, ma_14 = moving_average_forecast(product_data, window=14)
forecast_30, ma_30 = moving_average_forecast(product_data, window=30)

# Actual demand overlaid with each moving average
plt.figure(figsize=(14, 6))
plt.plot(product_data['Date'], product_data['Quantity'], alpha=0.5, label='Actual')
for ma_series, ma_label in ((ma_7, '7-day MA'), (ma_14, '14-day MA'), (ma_30, '30-day MA')):
    plt.plot(product_data['Date'], ma_series, label=ma_label, linewidth=2)
plt.title(f'Moving Average Analysis - Product {test_product}')
plt.xlabel('Date')
plt.ylabel('Quantity')
plt.legend()
plt.tight_layout()
plt.show()

## 2. Exponential Smoothing

In [None]:
def exponential_smoothing_forecast(data, alpha=0.3, forecast_days=7):
    """
    Simple exponential smoothing forecast.

    Args:
        data: DataFrame with a 'Date' column and a 'Quantity' column.
        alpha: Weight given to the newest observation at each step; the
            previous smoothed value receives ``1 - alpha``.
        forecast_days: Number of daily forecast rows to generate.

    Returns:
        A tuple ``(forecast, smoothed)`` where ``forecast`` is a DataFrame
        with columns 'Date' and 'Forecast' (a flat forecast equal to the
        final smoothed value) and ``smoothed`` is a list holding one
        smoothed value per row of ``data``.
    """
    observations = data['Quantity'].values

    # Seed with the first observation, then blend each later observation
    # with the running level.
    level = observations[0]
    smoothed = [level]
    for observed in observations[1:]:
        level = alpha * observed + (1 - alpha) * level
        smoothed.append(level)

    # Flat forecast: repeat the final level for each day after the last date
    final_date = data['Date'].max()
    forecast_dates = [
        final_date + timedelta(days=step) for step in range(1, forecast_days + 1)
    ]
    forecast_frame = pd.DataFrame({
        'Date': forecast_dates,
        'Forecast': [level] * forecast_days
    })

    return forecast_frame, smoothed

# Smoothing factors to compare
alphas = [0.1, 0.3, 0.5, 0.7]

demand_dates = product_data['Date']
demand_quantity = product_data['Quantity']

plt.figure(figsize=(14, 6))
plt.plot(demand_dates, demand_quantity, alpha=0.4, label='Actual')

# One smoothed curve per alpha, drawn over the actual demand
for alpha in alphas:
    forecast, smoothed = exponential_smoothing_forecast(product_data, alpha=alpha)
    plt.plot(demand_dates, smoothed, label=f'α={alpha}', linewidth=2)

plt.title(f'Exponential Smoothing Analysis - Product {test_product}')
plt.xlabel('Date')
plt.ylabel('Quantity')
plt.legend()
plt.tight_layout()
plt.show()

## 3. Linear Regression Forecasting

In [None]:
def linear_regression_forecast(data, forecast_days=7):
    """
    Linear regression based forecast.

    Fits an ordinary least-squares model of 'Quantity' on three calendar
    features derived from 'Date': days elapsed since the first date, day of
    week, and month.

    Args:
        data: DataFrame with a datetime 'Date' column and a 'Quantity'
            column. The input is copied, not modified.
        forecast_days: Number of daily forecast rows to generate.

    Returns:
        A tuple ``(forecast, y_pred, model)`` where ``forecast`` is a
        DataFrame with columns 'Date' and 'Forecast' (negative predictions
        clipped to 0), ``y_pred`` is the model's fitted values on ``data``,
        and ``model`` is the fitted ``LinearRegression`` instance.
    """
    frame = data.copy()

    # Calendar features
    first_date = frame['Date'].min()
    frame['DayIndex'] = (frame['Date'] - first_date).dt.days
    frame['DayOfWeek'] = frame['Date'].dt.dayofweek
    frame['Month'] = frame['Date'].dt.month

    X = frame[['DayIndex', 'DayOfWeek', 'Month']].values
    y = frame['Quantity'].values

    # Fit, then compute the in-sample fitted values
    model = LinearRegression()
    model.fit(X, y)
    y_pred = model.predict(X)

    # Build the same three features for each day after the last observed date
    last_date = frame['Date'].max()
    last_day_index = frame['DayIndex'].max()

    forecast_dates = []
    forecast_features = []
    for step in range(1, forecast_days + 1):
        future_date = last_date + timedelta(days=step)
        forecast_dates.append(future_date)
        forecast_features.append(
            [last_day_index + step, future_date.dayofweek, future_date.month]
        )

    raw_forecast = model.predict(np.array(forecast_features))
    # Demand cannot be negative, so clip the linear extrapolation at zero
    forecast_values = np.maximum(raw_forecast, 0)

    forecast_frame = pd.DataFrame({
        'Date': forecast_dates,
        'Forecast': forecast_values
    })
    return forecast_frame, y_pred, model

# Fit the regression on the selected product and forecast the next 7 days
lr_forecast, lr_pred, lr_model = linear_regression_forecast(product_data)

history_dates = product_data['Date']

# Actual demand, in-sample fit, and the forward forecast on one figure
plt.figure(figsize=(14, 6))
plt.plot(history_dates, product_data['Quantity'], alpha=0.5, label='Actual')
plt.plot(history_dates, lr_pred, label='Linear Regression Fit', linewidth=2, color='red')
plt.plot(
    lr_forecast['Date'],
    lr_forecast['Forecast'],
    '--',
    label='Forecast',
    linewidth=2,
    color='green',
)
plt.title(f'Linear Regression Forecast - Product {test_product}')
plt.xlabel('Date')
plt.ylabel('Quantity')
plt.legend()
plt.tight_layout()
plt.show()

# Coefficients are listed in feature order: DayIndex, DayOfWeek, Month
print(f"\nLinear Regression Coefficients:")
print(f"  Intercept: {lr_model.intercept_:.4f}")
print(f"  DayIndex: {lr_model.coef_[0]:.4f}")
print(f"  DayOfWeek: {lr_model.coef_[1]:.4f}")
print(f"  Month: {lr_model.coef_[2]:.4f}")

## 4. Model Comparison

In [None]:
def evaluate_forecast_model(actual, predicted, model_name):
    """
    Calculate forecast accuracy metrics.

    Positions where ``predicted`` is NaN are dropped from both inputs
    before any metric is computed.

    Args:
        actual: Observed values (array-like).
        predicted: Forecast values (array-like), same length as ``actual``.
        model_name: Label stored under the 'Model' key of the result.

    Returns:
        Dict with keys 'Model', 'MAE', 'RMSE', 'MAPE' (in percent) and 'R2'.
    """
    predicted_values = np.array(predicted)
    has_prediction = ~np.isnan(predicted_values)

    observed = np.array(actual)[has_prediction]
    forecasted = predicted_values[has_prediction]

    # The denominator is floored at 1 so that zero-demand days do not cause
    # a division by zero in the percentage error.
    abs_pct_error = np.abs((observed - forecasted) / np.maximum(observed, 1))

    return {
        'Model': model_name,
        'MAE': mean_absolute_error(observed, forecasted),
        'RMSE': np.sqrt(mean_squared_error(observed, forecasted)),
        'MAPE': np.mean(abs_pct_error) * 100,
        'R2': r2_score(observed, forecasted)
    }

# Train-test split (use last 30 days as test)
test_days = 30
if len(product_data) <= test_days:
    raise ValueError(
        f"Need more than {test_days} days of history to hold out a test period; "
        f"got {len(product_data)}"
    )
train_data = product_data.iloc[:-test_days]
test_data = product_data.iloc[-test_days:]
actual_test = test_data['Quantity'].values

# Every model below is fitted on train_data only and asked to forecast the
# whole held-out period, so the metrics measure out-of-sample accuracy.
# (Scoring values smoothed or fitted on the full series would leak the test
# observations into the predictions.)
results = []

# Moving Average: flat forecast at the last 7-day mean of the training data
ma_window = 7
ma_forecast, _ = moving_average_forecast(train_data, window=ma_window, forecast_days=test_days)
ma_pred = ma_forecast['Forecast'].values
results.append(evaluate_forecast_model(actual_test, ma_pred, 'Moving Average (7-day)'))

# Exponential Smoothing: flat forecast at the last smoothed training value
es_forecast, es_smoothed = exponential_smoothing_forecast(train_data, alpha=0.3, forecast_days=test_days)
es_pred = es_forecast['Forecast'].values
results.append(evaluate_forecast_model(actual_test, es_pred, 'Exp. Smoothing (α=0.3)'))

# Linear Regression: fitted on training data, extrapolated over the test dates
lr_holdout_forecast, _, _ = linear_regression_forecast(train_data, forecast_days=test_days)
lr_test_pred = lr_holdout_forecast['Forecast'].values
results.append(evaluate_forecast_model(actual_test, lr_test_pred, 'Linear Regression'))

# Display results
results_df = pd.DataFrame(results)
print("\n=== Model Comparison ===")
display(results_df.round(4))

In [None]:
# 2x2 figure: one actual-vs-predicted panel per model, plus an error bar chart
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Actual vs Predicted plots
panels = (
    (axes[0, 0], ma_pred, 'MA(7)', 'Moving Average Forecast'),
    (axes[0, 1], es_pred, 'ES(0.3)', 'Exponential Smoothing Forecast'),
    (axes[1, 0], lr_test_pred, 'LR', 'Linear Regression Forecast'),
)
for panel, panel_pred, panel_label, panel_title in panels:
    panel.plot(test_data['Date'], test_data['Quantity'], label='Actual', linewidth=2)
    panel.plot(test_data['Date'], panel_pred, label=panel_label, alpha=0.7)
    panel.set_title(panel_title)
    panel.legend()

# Metrics comparison: MAE and RMSE drawn side by side for each model
x = np.arange(len(results_df))
width = 0.35
metrics_ax = axes[1, 1]
metrics_ax.bar(x - width/2, results_df['MAE'], width, label='MAE')
metrics_ax.bar(x + width/2, results_df['RMSE'], width, label='RMSE')
metrics_ax.set_xticks(x)
metrics_ax.set_xticklabels(['MA', 'ES', 'LR'])
metrics_ax.set_title('Error Metrics Comparison')
metrics_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# 7-day linear-regression forecast for each of the five highest-demand products
print("\n=== Forecasting Top 5 Products ===")

for product_id in top_products[:5]:
    matches_product = daily_demand['ProductID'] == product_id
    prod_data = daily_demand.loc[matches_product].sort_values('Date')

    # Skip products with fewer than 30 days on which they were ordered
    if len(prod_data) < 30:
        print(f"\n{product_id}: Insufficient data")
        continue

    # Insert the days without orders as zero-quantity rows
    date_range = pd.date_range(prod_data['Date'].min(), prod_data['Date'].max())
    prod_data = (
        prod_data
        .set_index('Date')
        .reindex(date_range, fill_value=0)
        .reset_index()
    )
    prod_data.columns = ['Date', 'ProductID', 'Quantity']
    # The reindex filled ProductID with 0 on inserted days; restore the real id
    prod_data['ProductID'] = product_id

    # Only the forecast frame is needed here
    forecast, _, _ = linear_regression_forecast(prod_data, forecast_days=7)

    print(f"\n{product_id}:")
    print(f"  Average daily demand: {prod_data['Quantity'].mean():.2f}")
    print(f"  7-day forecast: {forecast['Forecast'].sum():.0f} units")
    print(f"  Forecast dates: {forecast['Date'].min().strftime('%Y-%m-%d')} to {forecast['Date'].max().strftime('%Y-%m-%d')}")

In [None]:
# Banner
divider = "=" * 60
print("\n" + divider)
print("DEMAND FORECASTING SUMMARY")
print(divider)

# The model with the lowest MAE in the comparison table is reported as best
lowest_mae_row = results_df['MAE'].idxmin()
best_model = results_df.loc[lowest_mae_row]
print(f"\n=== Best Performing Model ===")
print(f"Model: {best_model['Model']}")
print(f"MAE: {best_model['MAE']:.4f}")
print(f"RMSE: {best_model['RMSE']:.4f}")
print(f"MAPE: {best_model['MAPE']:.2f}%")

# Fixed guidance text
print(f"\n=== Recommendations ===")
for recommendation in (
    "1. Use Moving Average for stable products with consistent demand",
    "2. Use Exponential Smoothing for products with changing trends",
    "3. Use Linear Regression for products with clear seasonality",
    "4. Consider ensemble methods for critical inventory items",
):
    print(recommendation)