# Hierarchical Sales Forecasting: Overall, Customer Segment & SubCategory

This notebook implements grouped time series forecasting at three hierarchical levels:
1. **Overall** - Total quantity across all segments
2. **Customer Segment** - Aggregated by customer segments
3. **SubCategory** - Aggregated by product subcategories

We'll use multiple forecasting approaches and ensure hierarchical consistency.

In [28]:
# Import required libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime, timedelta
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised while fitting
# models are hidden as well - confirm that is intended.
warnings.filterwarnings('ignore')

# Plot appearance: seaborn-flavoured matplotlib style, then the "husl" palette
# (the palette is set after the style so the style does not override it).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Banner with the wall-clock time of this run
run_timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print("Libraries imported successfully!")
print(f"Analysis run on: {run_timestamp}")

StatementMeta(, b74708c9-a8f3-4dab-9da2-63e6a804dc4a, 30, Finished, Available, Finished)

Libraries imported successfully!
Analysis run on: 2025-07-20 04:40:58


In [3]:
# The presence of the AZURE_SERVICE environment variable is used as the signal
# that the notebook runs in Fabric; otherwise a local run is assumed.
is_fabric = "AZURE_SERVICE" in os.environ

# data_location is the prefix prepended to every data file path below
if is_fabric:
    data_location = "/lakehouse/default/Files/"
    print("Running in Fabric, setting data location to /lakehouse/default/Files/")
else:
    data_location = ""
    print("Running locally, setting data location to current directory")

StatementMeta(, b74708c9-a8f3-4dab-9da2-63e6a804dc4a, 5, Finished, Available, Finished)

Running in Fabric, setting data location to /lakehouse/default/Files/


In [4]:
# Read the combined sales/economic extract and order it chronologically
sales_csv_path = data_location + 'forecasting/userProvidedData/combined_sales_economic_data_segmented.csv'
data = pd.read_csv(sales_csv_path)
data['Date'] = pd.to_datetime(data['Date'])
data = data.sort_values('Date')

# The two grouping dimensions used by the hierarchy
customer_segments = data['CustomerSegment']
subcategories = data['SubCategoryName']

# Size, time span and grouping dimensions of the dataset
print("=== DATASET OVERVIEW ===")
print(f"Dataset shape: {data.shape}")
print(f"Date range: {data['Date'].min()} to {data['Date'].max()}")
print(f"Unique Customer Segments: {customer_segments.nunique()}")
print(f"Unique SubCategories: {subcategories.nunique()}")
print(f"\nCustomer Segments: {sorted(customer_segments.unique())}")
print(f"\nSubCategories: {sorted(subcategories.unique())}")
print("\nData types:")
print(data.dtypes)
print("\nFirst few rows:")
# Last expression of the cell, so the notebook renders it as a table
data.head()
# pd.set_option('display.max_columns', 100)
# print(data.tail())
# pd.reset_option('display.max_columns')

StatementMeta(, b74708c9-a8f3-4dab-9da2-63e6a804dc4a, 6, Finished, Available, Finished)

=== DATASET OVERVIEW ===
Dataset shape: (6347, 48)
Date range: 2015-01-01 00:00:00 to 2024-12-01 00:00:00
Unique Customer Segments: 15
Unique SubCategories: 19

Customer Segments: ['B Customer - Norris', 'Brand Owners', 'Container Manufacturers', 'Distribution', 'Drum & Pail Manufacturers', 'Fillers', 'Global Strategic Accounts', 'Industrial Chemicals', 'InterCompany', 'Other', 'Pharmaceuticals & Nutraceuticals', 'Small Customers < $250k / year', 'Tier 1 Accounts', 'Tier 2 Accounts', 'Unknown Segment']

SubCategories: ['Components', 'Dispensing Pumps (10cc to 30cc)', 'Drum Faucets', 'Freight', 'Gaskets & Vents', 'Non-Product Sales', 'Other', 'Overcaps', 'Plugs', 'Press-Fit Closures', 'Proprietary Molded Solutions', 'Pull Out Spout Closures', 'Purchased', 'Screw Closures', 'Special Caps', 'Specialty Closures', 'Tools', 'Unknown SubCategory', 'Wrenches']

Data types:
Date                            datetime64[ns]
CustomerSegment                         object
SubCategoryName             

Unnamed: 0,Date,CustomerSegment,SubCategoryName,Total_Quantity,Avg_Quantity_Per_Transaction,Transaction_Count,Unique_Customers,Unique_Products,PP_Spot,Resin,...,Total_Quantity_3MA,Total_Quantity_6MA,Total_Quantity_12MA,Month,Quarter,Year,Economic_Momentum,Cost_Efficiency,Monthly_Total_Market,Market_Share_Percent
1879,2015-01-01,Drum & Pail Manufacturers,Specialty Closures,239000.0,119500.0,2,1,2,278.0,8075.0,...,,,,1,1,2015,5998.551387,3.677666,22965664.0,1.040684
2710,2015-01-01,Industrial Chemicals,Press-Fit Closures,344620.0,86155.0,4,4,3,278.0,8075.0,...,,,,1,1,2015,4855.595447,3.677666,22965664.0,1.500588
664,2015-01-01,Container Manufacturers,Specialty Closures,111600.0,55800.0,2,2,1,278.0,8075.0,...,,,,1,1,2015,5945.231637,3.677666,22965664.0,0.485943
779,2015-01-01,Distribution,Gaskets & Vents,220000.0,110000.0,2,2,1,278.0,8075.0,...,,,,1,1,2015,5934.825422,3.677666,22965664.0,0.957952
5932,2015-01-01,Tier 1 Accounts,Pull Out Spout Closures,62200.0,31100.0,2,1,2,278.0,8075.0,...,,,,1,1,2015,5720.00626,3.677666,22965664.0,0.270839


## Data Preparation for Hierarchical Forecasting

We'll create three levels of aggregation:
1. **Level 0 (Overall)**: Total quantity across all segments and subcategories
2. **Level 1 (Customer Segment)**: Aggregated by customer segment
3. **Level 2 (SubCategory)**: Aggregated by product subcategory

In [5]:
# Build the three aggregation levels (overall, customer segment, subcategory)
print("=== CREATING HIERARCHICAL AGGREGATIONS ===")

# Volume / count columns are added up over the rows of each group.
# NOTE(review): summing Unique_Customers / Unique_Products may count the same
# entity once per row it appears in - confirm this is the intended meaning.
summed_columns = [
    'Total_Quantity',
    'Transaction_Count',
    'Unique_Customers',
    'Unique_Products',
]

# Economic indicators are external factors, so they are averaged instead
averaged_columns = [
    'PP_Spot',
    'Resin',
    'WTI_Crude_Oil',
    'Natural_Gas',
    'Electricity Price',
    'Gas Price',
    'Energy_Average',
    'PPI_Freight',
    'PMI_Data',
    'Factory_Utilization',
    'Capacity_Utilization',
    'Beverage',  # Additional economic indicator
    'Household_consumption',
    'packaging',
    'Diesel',
    'PPI_Delivery',
    'Oil-to-resin',
    'Electricity Price (Lag6)',
    'Gas Price (Lag6)',
]

# Column -> aggregation function; summed columns first, then averaged ones,
# which fixes the column order of every aggregated frame below.
column_aggregations = {
    **dict.fromkeys(summed_columns, 'sum'),
    **dict.fromkeys(averaged_columns, 'mean'),
}

# Exogenous regressors handed to the models
exog_vars = [
    'PP_Spot',
    'Resin',
    'PMI_Data',
    'Natural_Gas',
    'WTI_Crude_Oil',
    'Factory_Utilization',
    'packaging',
    'Energy_Average',
    'Electricity Price (Lag6)',
    'Gas Price (Lag6)',
]

# Author's notes on individual indicators (presumably their observed effect
# when included as a regressor - TODO confirm):
#   'PPI_Delivery' slightly negative
#   'PMI_Data', major positive
#   'PPI_Freight', negative
#   'Factory_Utilization',
#   'Capacity_Utilization', negative
#   'Beverage', minor negative
#   'Household_consumption', major negative
#   'packaging' minor positive
#   'Diesel', minor positive
#   'Natural_Gas' major positive
#   'Electricity Price (Lag6)', positive
#   'Gas Price (Lag6)' positive

# Level 0: one row per date, aggregated over all segments and subcategories
overall_ts = (
    data.groupby('Date')
    .agg(column_aggregations)
    .reset_index()
    .assign(Level='Overall', Segment='Total')
)

# Level 1: one row per (date, customer segment)
segment_ts = (
    data.groupby(['Date', 'CustomerSegment'])
    .agg(column_aggregations)
    .reset_index()
    .assign(Level='CustomerSegment', Segment=lambda frame: frame['CustomerSegment'])
)

# Level 2: one row per (date, subcategory)
subcategory_ts = (
    data.groupby(['Date', 'SubCategoryName'])
    .agg(column_aggregations)
    .reset_index()
    .assign(Level='SubCategoryName', Segment=lambda frame: frame['SubCategoryName'])
)

print(f"Overall time series shape: {overall_ts.shape}")
print(f"Customer segment time series shape: {segment_ts.shape}")
print(f"SubCategory time series shape: {subcategory_ts.shape}")

# Quick sanity summary of each level
print("\n=== LEVEL SUMMARY ===")
overall_quantity = overall_ts['Total_Quantity']
print(f"Overall total quantity range: {overall_quantity.min():,.0f} - {overall_quantity.max():,.0f}")
print(f"Customer segments: {segment_ts['CustomerSegment'].unique()}")
print(f"SubCategories: {subcategory_ts['SubCategoryName'].unique()}")

print(overall_ts.head())

StatementMeta(, b74708c9-a8f3-4dab-9da2-63e6a804dc4a, 7, Finished, Available, Finished)

=== CREATING HIERARCHICAL AGGREGATIONS ===
Overall time series shape: (120, 26)
Customer segment time series shape: (1444, 27)
SubCategory time series shape: (1663, 27)

=== LEVEL SUMMARY ===
Overall total quantity range: 12,565,334 - 33,525,018
Customer segments: ['B Customer - Norris' 'Brand Owners' 'Container Manufacturers'
 'Distribution' 'Drum & Pail Manufacturers' 'Fillers'
 'Industrial Chemicals' 'InterCompany' 'Other'
 'Pharmaceuticals & Nutraceuticals' 'Small Customers < $250k / year'
 'Tier 1 Accounts' 'Unknown Segment' 'Global Strategic Accounts'
 'Tier 2 Accounts']
SubCategories: ['Dispensing Pumps (10cc to 30cc)' 'Drum Faucets' 'Freight'
 'Gaskets & Vents' 'Non-Product Sales' 'Other' 'Overcaps' 'Plugs'
 'Press-Fit Closures' 'Pull Out Spout Closures' 'Purchased'
 'Screw Closures' 'Specialty Closures' 'Tools' 'Wrenches' 'Components'
 'Unknown SubCategory' 'Proprietary Molded Solutions' 'Special Caps']
        Date  Total_Quantity  Transaction_Count  Unique_Customers  \
0 201

In [6]:
# Training window: every overall observation dated 2015-01-01 or later
# (only a lower bound is applied; the upper end is whatever the data contains).
forecast_horizon = 12  # 12 months for 2025
full_train = overall_ts[overall_ts['Date'] >= '2015-01-01']

# Forecast window: forecast_horizon consecutive months starting 2025-01-01
# (assumes monthly frequency and consecutive data)
start_forecast = pd.to_datetime('2025-01-01')
end_forecast = start_forecast + pd.DateOffset(months=forecast_horizon - 1)

# Actual span of the training data
start_train, end_train = full_train['Date'].min(), full_train['Date'].max()

# Report all four boundaries as plain dates
for label, boundary in [
    ("Train Start", start_train),
    ("Train End", end_train),
    ("Forecast Start", start_forecast),
    ("Forecast End", end_forecast),
]:
    print(f"{label}: {boundary.date()}")

StatementMeta(, b74708c9-a8f3-4dab-9da2-63e6a804dc4a, 8, Finished, Available, Finished)

Train Start: 2015-01-01
Train End: 2024-12-01
Forecast Start: 2025-01-01
Forecast End: 2025-12-01


# Hierarchical Forecasting Implementation

This section defines the functions used to produce the forecasts.

## Forecasting Models

We'll implement multiple forecasting approaches:
1. **ARIMA** - Auto-regressive Integrated Moving Average
2. **SARIMA** - Seasonal ARIMA with economic indicators
3. **Exponential Smoothing** - Holt-Winters method
4. **Ensemble** - Weighted combination of methods

In [7]:
def forecast_arima(series, steps=12, order=(1,1,1)):
    """
    Forecast ``steps`` periods ahead with ARIMA, degrading gracefully on failure.

    The requested ``order`` is tried first. If fitting or forecasting raises,
    a simpler ARIMA(1,0,1) is tried. If that fails as well, the last observed
    value is repeated for every step (naive forecast). No automatic order
    search is performed.

    Parameters
    ----------
    series : pandas.Series
        Observed values in time order. Must hold at least one observation.
    steps : int, default 12
        Number of periods to forecast.
    order : tuple, default (1, 1, 1)
        ARIMA order passed to the first fit attempt.

    Returns
    -------
    forecast : pandas.Series
        Point forecasts, one per step.
    conf_int : pandas.DataFrame
        Interval bounds. For a fitted model these come from
        ``get_forecast(...).conf_int()``; for the naive fallback they are
        -/+10% of the forecast in columns ``'lower Total_Quantity'`` and
        ``'upper Total_Quantity'``.
    aic : float
        AIC of the fitted model, or ``float('inf')`` for the naive fallback.

    Raises
    ------
    ValueError
        If ``series`` is empty (there is no value to fall back on).
    """
    if len(series) == 0:
        raise ValueError("series must contain at least one observation")

    # Requested order first, then a simpler specification without differencing.
    for candidate_order in (order, (1, 0, 1)):
        try:
            fitted_model = ARIMA(series, order=candidate_order).fit()
            # A single get_forecast call provides both the point forecast and
            # its interval, so the prediction is not computed twice.
            prediction = fitted_model.get_forecast(steps=steps)
            return prediction.predicted_mean, prediction.conf_int(), fitted_model.aic
        except Exception:
            # Best-effort by design: move on to the next, simpler option.
            # Only Exception is caught so that KeyboardInterrupt / SystemExit
            # still stop a long-running notebook.
            continue

    # Last resort - simple naive forecast
    last_value = series.iloc[-1]
    forecast = pd.Series([last_value] * steps)
    conf_int = pd.DataFrame({
        'lower Total_Quantity': forecast * 0.9,
        'upper Total_Quantity': forecast * 1.1
    })
    return forecast, conf_int, float('inf')

def forecast_sarima(series, exog=None, steps=12, order=(1,1,1), seasonal_order=(1,1,1,12)):
    """
    Forecast ``steps`` periods ahead with SARIMAX and optional external regressors.

    Future regressor values are not known, so the last observed row of ``exog``
    is repeated for every forecast step. If fitting or forecasting raises, a
    message is printed and the result of ``forecast_arima(series, steps, order)``
    is returned instead.

    Parameters
    ----------
    series : pandas.Series
        Observed values in time order.
    exog : pandas.DataFrame or None, default None
        External regressors aligned with ``series``.
    steps : int, default 12
        Number of periods to forecast.
    order : tuple, default (1, 1, 1)
        Non-seasonal order; also used by the ARIMA fallback.
    seasonal_order : tuple, default (1, 1, 1, 12)
        Seasonal order passed to SARIMAX.

    Returns
    -------
    forecast : pandas.Series
        Point forecasts, one per step.
    conf_int : pandas.DataFrame
        Interval bounds from ``get_forecast(...).conf_int()``.
    aic : float
        AIC of the fitted model (or whatever the ARIMA fallback returns).
    """
    try:
        model = SARIMAX(series, exog=exog, order=order, seasonal_order=seasonal_order)
        fitted_model = model.fit(disp=False)

        # For forecast, we need future exogenous variables
        # Use last known values as a simple assumption
        if exog is not None:
            future_exog = pd.DataFrame([exog.iloc[-1]] * steps)
            # Label the rows with the following month starts when the history
            # is date-indexed; any other index type is left as is instead of
            # raising and forcing an unnecessary fallback to ARIMA.
            if isinstance(exog.index, pd.DatetimeIndex):
                future_exog.index = pd.date_range(
                    start=exog.index[-1] + pd.DateOffset(months=1), periods=steps, freq='MS'
                )
        else:
            future_exog = None

        # One get_forecast call provides both the point forecast and its interval
        prediction = fitted_model.get_forecast(steps=steps, exog=future_exog)
        return prediction.predicted_mean, prediction.conf_int(), fitted_model.aic
    except Exception as exc:
        # Best-effort fallback, but reported so the caller knows the "SARIMA"
        # result actually came from the plain ARIMA path. Only Exception is
        # caught so KeyboardInterrupt / SystemExit still propagate.
        print(f"SARIMA failed ({type(exc).__name__}: {exc}); falling back to ARIMA")
        return forecast_arima(series, steps, order)

def forecast_exponential_smoothing(series, steps=12, seasonal_periods=12):
    """
    Forecast ``steps`` periods ahead with Holt-Winters exponential smoothing.

    An additive trend is always used. An additive seasonal component is added
    only when the series holds at least two full seasonal cycles
    (``len(series) >= 2 * seasonal_periods``). If fitting or forecasting
    raises, a message is printed and ``forecast_arima(series, steps)`` is
    returned instead.

    Parameters
    ----------
    series : pandas.Series
        Observed values in time order.
    steps : int, default 12
        Number of periods to forecast.
    seasonal_periods : int, default 12
        Length of one seasonal cycle.

    Returns
    -------
    forecast : pandas.Series
        Point forecasts, one per step.
    conf_int : pandas.DataFrame
        Forecast -/+ 1.96 times the standard deviation of the in-sample
        residuals, in columns ``'lower Total_Quantity'`` and
        ``'upper Total_Quantity'``. The band has the same width at every step.
    aic : float
        AIC of the fitted model (or whatever the ARIMA fallback returns).
    """
    try:
        if len(series) >= 2 * seasonal_periods:
            model = ExponentialSmoothing(series, trend='add', seasonal='add', seasonal_periods=seasonal_periods)
        else:
            # Too little history for a seasonal component: trend only
            model = ExponentialSmoothing(series, trend='add', seasonal=None)

        fitted_model = model.fit()
        forecast = fitted_model.forecast(steps=steps)

        # Simple confidence intervals based on residuals
        std_resid = fitted_model.resid.std()
        conf_int = pd.DataFrame({
            'lower Total_Quantity': forecast - 1.96 * std_resid,
            'upper Total_Quantity': forecast + 1.96 * std_resid
        })

        return forecast, conf_int, fitted_model.aic
    except Exception as exc:
        # Best-effort fallback, but reported so the caller knows the result
        # came from ARIMA. Only Exception is caught so KeyboardInterrupt /
        # SystemExit still propagate.
        print(f"Exponential smoothing failed ({type(exc).__name__}: {exc}); falling back to ARIMA")
        return forecast_arima(series, steps)

def ensemble_forecast(forecasts, aics=None):
    """
    Combine several forecasts into one weighted average.

    Without ``aics`` every forecast gets the same weight. With ``aics`` each
    forecast is weighted by ``1 / aic`` and the weights are normalised to sum
    to 1. An AIC that is infinite, NaN, zero or negative gets weight 0, because
    an inverse-AIC weight is undefined or negative there. If no forecast ends
    up with a positive weight, equal weights are used.

    NOTE(review): inverse-AIC weighting presumes the AIC values are comparable
    across the models being combined - confirm this holds for the caller.

    Parameters
    ----------
    forecasts : sequence
        Forecasts to combine (e.g. pandas Series sharing one index). Works for
        any number of forecasts.
    aics : sequence of float or None, default None
        One AIC per forecast, in the same order.

    Returns
    -------
    The weighted sum of the forecasts with non-zero weight.

    Raises
    ------
    ValueError
        If ``forecasts`` is empty or ``aics`` has a different length.
    """
    n_models = len(forecasts)
    if n_models == 0:
        raise ValueError("forecasts must contain at least one forecast")

    equal_weights = [1 / n_models] * n_models

    if aics is None:
        weights = equal_weights
    else:
        if len(aics) != n_models:
            raise ValueError("aics must contain exactly one value per forecast")
        inverse_aics = [1 / aic if np.isfinite(aic) and aic > 0 else 0 for aic in aics]
        total_weight = sum(inverse_aics)
        if total_weight > 0:
            weights = [w / total_weight for w in inverse_aics]
        else:
            weights = equal_weights

    # The usual three-model call keeps its familiar labels; any other number of
    # forecasts gets generic ones instead of failing on a missing index.
    if n_models == 3:
        labels = ['ARIMA', 'SARIMA', 'EXP']
    else:
        labels = [f"Model {position + 1}" for position in range(n_models)]
    print("Model weights - " + ", ".join(f"{label}: {weight:.3f}" for label, weight in zip(labels, weights)))

    # Zero-weight forecasts are left out of the sum: they contribute nothing,
    # and a fallback forecast carrying a different index would otherwise turn
    # the index-aligned sum into NaN.
    ensemble = sum(f * w for f, w in zip(forecasts, weights) if w != 0)
    return ensemble

print("Forecasting functions defined successfully!")

StatementMeta(, b74708c9-a8f3-4dab-9da2-63e6a804dc4a, 9, Finished, Available, Finished)

Forecasting functions defined successfully!


## Hierarchical Forecasting Functions

In [8]:
# def forecast_overall(data, exog_vars, forecast_steps, forecast_dates):
#     """
#     Generate overall forecasts using multiple methods and ensemble approach
    
#     Parameters:
#     - data: DataFrame with overall time series data
#     - exog_vars: List of exogenous variables to use in forecasting
#     - forecast_steps: Number of steps to forecast
#     - forecast_dates: Date range for forecasts
    
#     Returns:
#     - DataFrame with all forecast methods and ensemble result
#     """
#     print("=== LEVEL 0: OVERALL FORECASTING ===")
    
#     # Prepare overall data
#     overall_series = data.set_index('Date')['Total_Quantity']
#     overall_exog = data.set_index('Date')[exog_vars]
    
#     # Generate forecasts using different methods
#     print("Generating ARIMA forecast...")
#     overall_arima_forecast, overall_arima_conf, overall_arima_aic = forecast_arima(overall_series, forecast_steps)
    
#     print("Generating SARIMA forecast...")
#     overall_sarima_forecast, overall_sarima_conf, overall_sarima_aic = forecast_sarima(overall_series, overall_exog, forecast_steps)
    
#     print("Generating Exponential Smoothing forecast...")
#     overall_exp_forecast, overall_exp_conf, overall_exp_aic = forecast_exponential_smoothing(overall_series, forecast_steps)
    
#     # Create ensemble forecast
#     aics = [overall_arima_aic, overall_sarima_aic, overall_exp_aic]    
#     overall_ensemble_forecast = ensemble_forecast(
#         [overall_arima_forecast, overall_sarima_forecast, overall_exp_forecast], 
#         aics
#     )
    
#     # Store overall forecasts
#     overall_forecasts = pd.DataFrame({
#         'Date': forecast_dates,
#         'ARIMA': overall_arima_forecast.values,
#         'SARIMA': overall_sarima_forecast.values,
#         'ExpSmoothing': overall_exp_forecast.values,
#         'Ensemble': overall_ensemble_forecast.values,
#         'Level': 'Overall',
#         'Segment': 'Total'
#     })
    
#     print(f"Overall forecast range: {overall_ensemble_forecast.min():,.0f} - {overall_ensemble_forecast.max():,.0f}")
    
#     return overall_forecasts

StatementMeta(, b74708c9-a8f3-4dab-9da2-63e6a804dc4a, 10, Finished, Available, Finished)

In [9]:
# def forecast_hierarchical_level(data, level_column, level_name, exog_vars, forecast_steps, forecast_dates):
#     """
#     Generic function to forecast at any hierarchical level (segments or subcategories)
    
#     Parameters:
#     - data: DataFrame with the time series data for the level
#     - level_column: Column name that contains the grouping variable (e.g., 'CustomerSegment', 'SubCategoryName')
#     - level_name: Name for the level (e.g., 'CustomerSegment', 'SubCategoryName')
#     - exog_vars: List of exogenous variables to use in forecasting
#     - forecast_steps: Number of steps to forecast
#     - forecast_dates: Date range for forecasts
    
#     Returns:
#     - DataFrame with forecasts for all groups in the level
#     """
#     print(f"\n=== {level_name.upper()} FORECASTING ===")
    
#     forecasts_list = []
    
#     for group in data[level_column].unique():
#         print(f"\nForecasting for {level_name.lower()}: {group}")
        
#         # Filter data for this group
#         group_data = data[data[level_column] == group].set_index('Date')
#         group_series = group_data['Total_Quantity']
#         group_exog = group_data[exog_vars]
        
#         if len(group_series) < 3:  # Need minimum data points
#             print(f"  Insufficient data for {group}, using naive forecast")
#             group_ensemble = pd.Series([group_series.mean()] * forecast_steps)
#         else:
#             # Generate forecasts
#             arima_forecast, _, arima_aic = forecast_arima(group_series, forecast_steps)
#             sarima_forecast, _, sarima_aic = forecast_sarima(group_series, group_exog, forecast_steps)
#             exp_forecast, _, exp_aic = forecast_exponential_smoothing(group_series, forecast_steps)
            
#             # Create ensemble
#             aics = [arima_aic, sarima_aic, exp_aic]
#             group_ensemble = ensemble_forecast(
#                 [arima_forecast, sarima_forecast, exp_forecast],
#                 aics
#             )
        
#         # Store forecast
#         forecast_df = pd.DataFrame({
#             'Date': forecast_dates,
#             'Ensemble': group_ensemble.values,
#             'Level': level_name,
#             'Segment': group,
#             level_column: group
#         })
        
#         forecasts_list.append(forecast_df)
#         print(f"  Forecast range: {group_ensemble.min():,.0f} - {group_ensemble.max():,.0f}")
    
#     # Combine all forecasts
#     combined_forecasts = pd.concat(forecasts_list, ignore_index=True)
    
#     print(f"\nTotal {level_name.lower()} forecasts generated: {len(forecasts_list)}")
#     print(f"{level_name} forecast total range: {combined_forecasts['Ensemble'].min():,.0f} - {combined_forecasts['Ensemble'].max():,.0f}")
    
#     return combined_forecasts

StatementMeta(, b74708c9-a8f3-4dab-9da2-63e6a804dc4a, 11, Finished, Available, Finished)

In [10]:
# def hierarchical_reconciliation(overall_forecasts, segment_forecasts, subcategory_forecasts, 
#                                segment_ts, subcategory_ts, forecast_dates):
#     """
#     Perform top-down hierarchical reconciliation to ensure forecast consistency
    
#     Parameters:
#     - overall_forecasts: DataFrame with overall level forecasts
#     - segment_forecasts: DataFrame with customer segment forecasts
#     - subcategory_forecasts: DataFrame with subcategory forecasts
#     - segment_ts: Historical segment time series data
#     - subcategory_ts: Historical subcategory time series data
#     - forecast_dates: Date range for forecasts
    
#     Returns:
#     - Tuple of (reconciled_segment_forecasts, reconciled_subcategory_forecasts)
#     """
#     print("=== HIERARCHICAL RECONCILIATION ===")
    
#     # Check consistency before reconciliation
#     print("\nBefore Reconciliation:")
#     for date in forecast_dates[:3]:  # Check first 3 dates
#         overall_val = overall_forecasts[overall_forecasts['Date'] == date]['Ensemble'].iloc[0]
#         segment_sum = segment_forecasts[segment_forecasts['Date'] == date]['Ensemble'].sum()
#         subcat_sum = subcategory_forecasts[subcategory_forecasts['Date'] == date]['Ensemble'].sum()
        
#         print(f"  {date.strftime('%Y-%m')}: Overall={overall_val:,.0f}, Segments Sum={segment_sum:,.0f}, SubCats Sum={subcat_sum:,.0f}")
    
#     # Calculate historical proportions for reconciliation
#     print("\nCalculating historical proportions...")
    
#     # Customer segment proportions
#     segment_props = {}
#     for segment in segment_ts['CustomerSegment'].unique():
#         segment_total = segment_ts[segment_ts['CustomerSegment'] == segment]['Total_Quantity'].sum()
#         overall_total = segment_ts['Total_Quantity'].sum()
#         segment_props[segment] = segment_total / overall_total
    
#     # SubCategory proportions
#     subcat_props = {}
#     for subcat in subcategory_ts['SubCategoryName'].unique():
#         subcat_total = subcategory_ts[subcategory_ts['SubCategoryName'] == subcat]['Total_Quantity'].sum()
#         overall_total = subcategory_ts['Total_Quantity'].sum()
#         subcat_props[subcat] = subcat_total / overall_total
    
#     print(f"Customer Segment Proportions: {segment_props}")
#     print(f"SubCategory Proportions: {subcat_props}")
    
#     # Apply top-down reconciliation
#     print("\nApplying top-down reconciliation...")
    
#     # Reconcile segment forecasts
#     segment_forecasts_reconciled = segment_forecasts.copy()
#     for idx, row in segment_forecasts_reconciled.iterrows():
#         overall_val = overall_forecasts[overall_forecasts['Date'] == row['Date']]['Ensemble'].iloc[0]
#         segment_forecasts_reconciled.loc[idx, 'Ensemble_Reconciled'] = overall_val * segment_props[row['CustomerSegment']]
    
#     # Reconcile subcategory forecasts
#     subcategory_forecasts_reconciled = subcategory_forecasts.copy()
#     for idx, row in subcategory_forecasts_reconciled.iterrows():
#         overall_val = overall_forecasts[overall_forecasts['Date'] == row['Date']]['Ensemble'].iloc[0]
#         subcategory_forecasts_reconciled.loc[idx, 'Ensemble_Reconciled'] = overall_val * subcat_props[row['SubCategoryName']]
    
#     # Verify reconciliation
#     print("\nAfter Reconciliation:")
#     for date in forecast_dates[:3]:
#         overall_val = overall_forecasts[overall_forecasts['Date'] == date]['Ensemble'].iloc[0]
#         segment_sum = segment_forecasts_reconciled[segment_forecasts_reconciled['Date'] == date]['Ensemble_Reconciled'].sum()
#         subcat_sum = subcategory_forecasts_reconciled[subcategory_forecasts_reconciled['Date'] == date]['Ensemble_Reconciled'].sum()
        
#         print(f"  {date.strftime('%Y-%m')}: Overall={overall_val:,.0f}, Segments Sum={segment_sum:,.0f}, SubCats Sum={subcat_sum:,.0f}")
    
#     print("\nReconciliation completed!")
    
#     return segment_forecasts_reconciled, subcategory_forecasts_reconciled

StatementMeta(, b74708c9-a8f3-4dab-9da2-63e6a804dc4a, 12, Finished, Available, Finished)

# Running Hierarchical Forecasting

In [11]:
# # Set forecasting parameters
# FORECAST_STEPS = 3
# START_DATE = pd.to_datetime("2024-01-01")
# FORECAST_DATES = pd.date_range(start=START_DATE, periods=FORECAST_STEPS, freq='MS')

StatementMeta(, b74708c9-a8f3-4dab-9da2-63e6a804dc4a, 13, Finished, Available, Finished)

In [12]:
# # Generate overall forecasts using the reusable function
# overall_forecasts = forecast_overall(
#     data=overall_train,
#     exog_vars=exog_vars,
#     forecast_steps=len(overall_test),
#     forecast_dates=overall_test['Date'].unique()
# )

# overall_forecasts.head()

StatementMeta(, b74708c9-a8f3-4dab-9da2-63e6a804dc4a, 14, Finished, Available, Finished)

In [13]:
# # === FORECAST CUSTOMER SEGMENT LEVEL ===
# segment_forecasts = forecast_hierarchical_level(
#     data=segment_train,
#     level_column='CustomerSegment',
#     level_name='CustomerSegment',
#     exog_vars=exog_vars,
#     forecast_steps=len(segment_test['Date'].unique()),
#     forecast_dates=segment_test['Date'].unique()
# )

# segment_forecasts.head()

StatementMeta(, b74708c9-a8f3-4dab-9da2-63e6a804dc4a, 15, Finished, Available, Finished)

In [14]:
# # === FORECAST SUBCATEGORY LEVEL ===
# subcategory_forecasts = forecast_hierarchical_level(
#     data=subcategory_train,
#     level_column='SubCategoryName',
#     level_name='SubCategoryName',
#     exog_vars=exog_vars,
#     forecast_steps=len(subcategory_test['Date'].unique()),
#     forecast_dates=subcategory_test['Date'].unique()
# )

# subcategory_forecasts.head()

StatementMeta(, b74708c9-a8f3-4dab-9da2-63e6a804dc4a, 16, Finished, Available, Finished)

## Hierarchical Consistency & Reconciliation

We need to ensure that the sum of forecasts at lower levels equals the forecast at higher levels. This is called hierarchical reconciliation.

In [15]:
# # Apply hierarchical reconciliation using the reusable function
# segment_forecasts_reconciled, subcategory_forecasts_reconciled = hierarchical_reconciliation(
#     overall_forecasts=overall_forecasts,
#     segment_forecasts=segment_forecasts,
#     subcategory_forecasts=subcategory_forecasts,
#     segment_ts=segment_ts,
#     subcategory_ts=subcategory_ts,
#     forecast_dates=FORECAST_DATES
# )

StatementMeta(, b74708c9-a8f3-4dab-9da2-63e6a804dc4a, 17, Finished, Available, Finished)

In [16]:
# Index the training frame by date once, then split target and regressors
overall_indexed = full_train.set_index('Date')
overall_series = overall_indexed['Total_Quantity']
overall_exog = overall_indexed[exog_vars]

# One forecast per model; the confidence intervals are discarded here
arima_fc, _, aic_arima = forecast_arima(overall_series, steps=forecast_horizon)
sarima_fc, _, aic_sarima = forecast_sarima(overall_series, exog=overall_exog, steps=forecast_horizon)
exp_fc, _, aic_exp = forecast_exponential_smoothing(overall_series, steps=forecast_horizon)

# Blend the three forecasts, weighted by each model's AIC
model_forecasts = [arima_fc, sarima_fc, exp_fc]
model_aics = [aic_arima, aic_sarima, aic_exp]
ensemble_fc = ensemble_forecast(model_forecasts, model_aics)


StatementMeta(, b74708c9-a8f3-4dab-9da2-63e6a804dc4a, 18, Finished, Available, Finished)

Model weights - ARIMA: 0.310, SARIMA: 0.346, EXP: 0.343


In [17]:
# Month-start dates covered by the forecast. Derived from start_forecast and
# forecast_horizon (defined with the train/forecast window) so the index always
# has as many entries as the forecasts, even if the horizon is changed.
forecast_dates = pd.date_range(start=start_forecast, periods=forecast_horizon, freq='MS')

# Give every forecast the same date index
for model_forecast in (arima_fc, sarima_fc, exp_fc, ensemble_fc):
    model_forecast.index = forecast_dates


StatementMeta(, b74708c9-a8f3-4dab-9da2-63e6a804dc4a, 19, Finished, Available, Finished)

In [18]:
# Column name -> forecast, in the column order of the result table
forecast_by_model = {
    'ARIMA': arima_fc,
    'SARIMA': sarima_fc,
    'ExpSmoothing': exp_fc,
    'Ensemble': ensemble_fc,
}

# Numeric table: one row per forecast date, one column per model
forecast_df = pd.DataFrame({
    'Date': forecast_dates,
    **{model_name: model_fc.values for model_name, model_fc in forecast_by_model.items()},
})

# Display copy with thousands separators and two decimals; forecast_df itself
# stays numeric for the comparisons further down.
formatted_df = forecast_df.assign(**{
    model_name: forecast_df[model_name].map('{:,.2f}'.format)
    for model_name in forecast_by_model
})
print(formatted_df)


StatementMeta(, b74708c9-a8f3-4dab-9da2-63e6a804dc4a, 20, Finished, Available, Finished)

         Date          ARIMA         SARIMA   ExpSmoothing       Ensemble
0  2025-01-01  22,131,931.24  27,904,195.97  25,914,249.40  25,429,132.48
1  2025-02-01  23,070,136.48  28,476,466.05  25,011,059.69  25,608,224.05
2  2025-03-01  23,203,122.43  30,600,977.41  27,537,890.36  27,252,783.38
3  2025-04-01  23,221,972.53  27,037,698.86  23,568,943.87  24,661,999.76
4  2025-05-01  23,224,644.43  26,320,494.45  24,084,926.68  24,591,778.77
5  2025-06-01  23,225,023.16  26,066,199.66  25,839,362.09  25,106,443.39
6  2025-07-01  23,225,076.85  32,460,195.60  27,272,689.84  27,812,102.12
7  2025-08-01  23,225,084.46  30,345,826.84  25,462,868.25  26,458,594.64
8  2025-09-01  23,225,085.53  30,538,149.13  26,672,463.93  26,940,614.33
9  2025-10-01  23,225,085.69  28,691,595.97  24,338,146.54  25,499,669.91
10 2025-11-01  23,225,085.71  27,668,164.36  23,756,337.41  24,945,570.61
11 2025-12-01  23,225,085.71  19,615,577.28  17,277,103.35  19,932,733.53


## 2025 Actuals Comparison

In [24]:
# Compute the total Quantity Invoiced for each month of 2025
# 2025 Actuals Comparison so far
print("\n=== 2025 ACTUALS COMPARISON ===")

date_col = 'Fiscal Hierarchy - Full Date'
qty_col = 'Quantity Invoiced'

# Loading Quantity data in pandas DataFrames
# 2015 to 2025 Qty.csv
actuals_raw = pd.read_csv(data_location + "forecasting/userProvidedData/2015-2025 Qty.csv", parse_dates=[date_col])

# Keep 2025 only. The explicit copy makes the column assignment below write to
# an independent frame instead of a slice of actuals_raw.
actuals_2025 = actuals_raw[actuals_raw[date_col].dt.year == 2025].copy()

# Ensure Quantity Invoiced is numeric: strip thousands separators, then parse.
# Values that still cannot be parsed become NaN and are skipped by the sum.
actuals_2025[qty_col] = pd.to_numeric(
    actuals_2025[qty_col].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)

# # Filtering Later Quantity Data to match
# print(f"Before filtering quantity data shape: {actuals_2025.shape}")

# # Filter to InterCompany
# actuals_2025 = actuals_2025[actuals_2025['CustomerSegment'] == 'InterCompany']
# print(f"After filtering to InterCompany segment, quantity data shape: {actuals_2025.shape}")

# Group by month and sum the Quantity Invoiced
actual_quantity_df = (
    actuals_2025
    .groupby(actuals_2025[date_col].dt.to_period('M'))[qty_col]
    .sum()
    .reset_index()
)
# Back from monthly periods to month-start timestamps so the dates can be
# matched against the forecast table
actual_quantity_df[date_col] = actual_quantity_df[date_col].dt.to_timestamp()
# Rename columns for clarity
actual_quantity_df = actual_quantity_df.rename(columns={date_col: 'Date', qty_col: 'Actual_Quantity'})

# Display the monthly actuals for 2025
display(actual_quantity_df.head())


StatementMeta(, b74708c9-a8f3-4dab-9da2-63e6a804dc4a, 26, Finished, Available, Finished)


=== 2025 ACTUALS COMPARISON ===


SynapseWidget(Synapse.DataFrame, 19724b96-3d5d-4977-a37c-b99f7d67327e)

<DatetimeArray>
['2025-01-01 00:00:00', '2025-02-01 00:00:00', '2025-03-01 00:00:00',
 '2025-04-01 00:00:00', '2025-05-01 00:00:00', '2025-06-01 00:00:00']
Length: 6, dtype: datetime64[ns]


In [27]:
model_names = ['ARIMA', 'SARIMA', 'ExpSmoothing', 'Ensemble']

# Step 1: keep only the months that have both a forecast and an actual value
mape_df = pd.merge(forecast_df, actual_quantity_df, on='Date', how='inner')

# Step 2: restrict to Jan-Jun 2025 (both bounds inclusive)
on_or_after_january = mape_df['Date'] >= '2025-01-01'
on_or_before_june = mape_df['Date'] <= '2025-06-01'
mape_df = mape_df[on_or_after_january & on_or_before_june]

# Step 3: absolute percentage error per month and model, rounded to 2 decimals.
# A month whose actual is 0 gets None, since the percentage is undefined there.
mape_per_month = []
for _, row in mape_df.iterrows():
    actual = row['Actual_Quantity']
    monthly_result = {'Date': row['Date'].strftime('%Y-%m')}
    for model in model_names:
        if actual != 0:
            monthly_result[model] = round(abs((actual - row[model]) / actual) * 100, 2)
        else:
            monthly_result[model] = None
    mape_per_month.append(monthly_result)

# One row per month, one column per model
mape_monthly_df = pd.DataFrame(mape_per_month)

# Display results
print("\n=== Monthly MAPE (Jan–Jun 2025) ===")
print(mape_monthly_df)

StatementMeta(, b74708c9-a8f3-4dab-9da2-63e6a804dc4a, 29, Finished, Available, Finished)


=== Monthly MAPE (Jan–Jun 2025) ===
      Date  ARIMA  SARIMA  ExpSmoothing  Ensemble
0  2025-01  13.19    9.45          1.65      0.26
1  2025-02  19.29    0.38         12.50     10.42
2  2025-03   1.93   29.34         16.39     15.19
3  2025-04   3.92   11.87          2.48      2.04
4  2025-05   3.07   16.81          6.89      9.14
5  2025-06   1.49   10.56          9.60      6.49


## Forecast Visualization & Results

Let's visualize the hierarchical forecasts and compare them with historical data.

In [19]:
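# NOTE(review): every line of this cell is commented out, so running it produces no
# figure and no forecast summary. Presumably disabled on purpose — confirm whether it
# should be re-enabled or removed.
# # Create comprehensive forecast visualization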
# fig = make_subplots(
#     rows=3, cols=1,
#     subplot_titles=('Overall Forecast', 'Customer Segment Forecasts', 'SubCategory Forecasts'),
#     vertical_spacing=0.08,
#     specs=[[{"secondary_y": False}],
#            [{"secondary_y": False}],
#            [{"secondary_y": False}]]
# )

# # Overall forecast plot
# fig.add_trace(
#     go.Scatter(x=overall_ts['Date'], y=overall_ts['Total_Quantity'],
#               mode='lines+markers', name='Historical Total',
#               line=dict(color='darkblue', width=2)),
#     row=1, col=1
# )

# fig.add_trace(
#     go.Scatter(x=overall_forecasts['Date'], y=overall_forecasts['Ensemble'],
#               mode='lines+markers', name='Overall Forecast',
#               line=dict(color='red', width=3, dash='dash')),
#     row=1, col=1
# )

# # Customer segment forecasts
# segment_colors = px.colors.qualitative.Set1
# for i, segment in enumerate(segment_ts['CustomerSegment'].unique()):
#     # Historical data
#     segment_hist = segment_ts[segment_ts['CustomerSegment'] == segment]
#     fig.add_trace(
#         go.Scatter(x=segment_hist['Date'], y=segment_hist['Total_Quantity'],
#                   mode='lines', name=f'{segment} (Historical)',
#                   line=dict(color=segment_colors[i % len(segment_colors)], width=1),
#                   showlegend=False),
#         row=2, col=1
#     )
    
#     # Forecast data
#     segment_forecast = segment_forecasts_reconciled[segment_forecasts_reconciled['CustomerSegment'] == segment]
#     fig.add_trace(
#         go.Scatter(x=segment_forecast['Date'], y=segment_forecast['Ensemble_Reconciled'],
#                   mode='lines+markers', name=f'{segment} Forecast',
#                   line=dict(color=segment_colors[i % len(segment_colors)], width=2, dash='dash')),
#         row=2, col=1
#     )

# # SubCategory forecasts
# subcat_colors = px.colors.qualitative.Set2
# for i, subcat in enumerate(subcategory_ts['SubCategoryName'].unique()):
#     # Historical data
#     subcat_hist = subcategory_ts[subcategory_ts['SubCategoryName'] == subcat]
#     fig.add_trace(
#         go.Scatter(x=subcat_hist['Date'], y=subcat_hist['Total_Quantity'],
#                   mode='lines', name=f'{subcat} (Historical)',
#                   line=dict(color=subcat_colors[i % len(subcat_colors)], width=1),
#                   showlegend=False),
#         row=3, col=1
#     )
    
#     # Forecast data
#     subcat_forecast = subcategory_forecasts_reconciled[subcategory_forecasts_reconciled['SubCategoryName'] == subcat]
#     fig.add_trace(
#         go.Scatter(x=subcat_forecast['Date'], y=subcat_forecast['Ensemble_Reconciled'],
#                   mode='lines+markers', name=f'{subcat} Forecast',
#                   line=dict(color=subcat_colors[i % len(subcat_colors)], width=2, dash='dash')),
#         row=3, col=1
#     )

# fig.update_layout(
#     height=1000,
#     title_text="Hierarchical Sales Forecasting: Historical vs Predicted",
#     showlegend=True,
#     legend=dict(orientation="v", yanchor="top", y=1, xanchor="left", x=1.01)
# )

# fig.update_xaxes(title_text="Date")
# fig.update_yaxes(title_text="Total Quantity")

# fig.show()

# # Print forecast summary
# print("\n=== FORECAST SUMMARY ===")
# print(f"Forecast Period: {FORECAST_DATES[0].strftime('%Y-%m')} to {FORECAST_DATES[-1].strftime('%Y-%m')}")
# print(f"\nOverall Forecast Summary:")
# print(f"  Mean Monthly Forecast: {overall_forecasts['Ensemble'].mean():,.0f}")
# print(f"  Total 12-Month Forecast: {overall_forecasts['Ensemble'].sum():,.0f}")
# print(f"  Min-Max Range: {overall_forecasts['Ensemble'].min():,.0f} - {overall_forecasts['Ensemble'].max():,.0f}")

# print(f"\nCustomer Segment Forecast Summary:")
# for segment in segment_forecasts_reconciled['CustomerSegment'].unique():
#     segment_data = segment_forecasts_reconciled[segment_forecasts_reconciled['CustomerSegment'] == segment]
#     total_forecast = segment_data['Ensemble_Reconciled'].sum()
#     print(f"  {segment}: {total_forecast:,.0f} (12-month total)")

# print(f"\nSubCategory Forecast Summary:")
# for subcat in subcategory_forecasts_reconciled['SubCategoryName'].unique():
#     subcat_data = subcategory_forecasts_reconciled[subcategory_forecasts_reconciled['SubCategoryName'] == subcat]
#     total_forecast = subcat_data['Ensemble_Reconciled'].sum()
#     print(f"  {subcat}: {total_forecast:,.0f} (12-month total)")

StatementMeta(, b74708c9-a8f3-4dab-9da2-63e6a804dc4a, 21, Finished, Available, Finished)