# Hierarchical Sales Forecasting: Overall, Customer Segment & SubCategory

This notebook implements grouped time series forecasting at three hierarchical levels:
1. **Overall** - Total quantity across all segments
2. **Customer Segment** - Aggregated by customer segments
3. **SubCategory** - Aggregated by product subcategories

We'll use multiple forecasting approaches and ensure hierarchical consistency.

In [None]:
# Select the MLflow experiment named "Time_Series_V1_DS" as the active experiment for this session.
import mlflow 
mlflow.set_experiment("Time_Series_V1_DS")

In [18]:
# Import required libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime, timedelta
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Silence every Python warning for the rest of the session.
# NOTE(review): this presumably also hides fitting/convergence warnings raised by the
# statsmodels models used later in the notebook — confirm that is intended.
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Confirm the setup cell ran and record when this run was executed.
print("Libraries imported successfully!")
print(f"Analysis run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

StatementMeta(, eb5e2a2e-c1b1-40a6-8722-a155dee1cfe5, 24, Finished, Available, Finished)

Libraries imported successfully!
Analysis run on: 2025-07-12 03:38:53


In [19]:
# Detect the runtime: the AZURE_SERVICE environment variable is treated as the marker for Fabric.
# In Fabric the data lives under the default lakehouse; locally paths are relative to the
# current directory (empty prefix).
is_fabric = "AZURE_SERVICE" in os.environ
data_location = "/lakehouse/default/Files/" if is_fabric else ""
print(
    "Running in Fabric, setting data location to /lakehouse/default/Files/"
    if is_fabric
    else "Running locally, setting data location to current directory"
)

StatementMeta(, eb5e2a2e-c1b1-40a6-8722-a155dee1cfe5, 25, Finished, Available, Finished)

Running in Fabric, setting data location to /lakehouse/default/Files/


In [20]:
# Load the combined sales/economic dataset, parse the dates and order the rows chronologically
source_csv = data_location + 'forecasting/userProvidedData/combined_sales_economic_data_segmented.csv'
data = (
    pd.read_csv(source_csv)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values('Date')
)

# Quick overview: size, time span, and the two grouping dimensions used later
first_date, last_date = data['Date'].min(), data['Date'].max()
segment_names = sorted(data['CustomerSegment'].unique())
subcategory_names = sorted(data['SubCategoryName'].unique())

print("=== DATASET OVERVIEW ===")
print(f"Dataset shape: {data.shape}")
print(f"Date range: {first_date} to {last_date}")
print(f"Unique Customer Segments: {data['CustomerSegment'].nunique()}")
print(f"Unique SubCategories: {data['SubCategoryName'].nunique()}")
print(f"\nCustomer Segments: {segment_names}")
print(f"\nSubCategories: {subcategory_names}")
print(f"\nData types:")
print(data.dtypes)
print(f"\nFirst few rows:")
data.head()

StatementMeta(, eb5e2a2e-c1b1-40a6-8722-a155dee1cfe5, 26, Finished, Available, Finished)

=== DATASET OVERVIEW ===
Dataset shape: (6347, 48)
Date range: 2015-01-01 00:00:00 to 2024-12-01 00:00:00
Unique Customer Segments: 15
Unique SubCategories: 19

Customer Segments: ['B Customer - Norris', 'Brand Owners', 'Container Manufacturers', 'Distribution', 'Drum & Pail Manufacturers', 'Fillers', 'Global Strategic Accounts', 'Industrial Chemicals', 'InterCompany', 'Other', 'Pharmaceuticals & Nutraceuticals', 'Small Customers < $250k / year', 'Tier 1 Accounts', 'Tier 2 Accounts', 'Unknown Segment']

SubCategories: ['Components', 'Dispensing Pumps (10cc to 30cc)', 'Drum Faucets', 'Freight', 'Gaskets & Vents', 'Non-Product Sales', 'Other', 'Overcaps', 'Plugs', 'Press-Fit Closures', 'Proprietary Molded Solutions', 'Pull Out Spout Closures', 'Purchased', 'Screw Closures', 'Special Caps', 'Specialty Closures', 'Tools', 'Unknown SubCategory', 'Wrenches']

Data types:
Date                            datetime64[ns]
CustomerSegment                         object
SubCategoryName             

Unnamed: 0,Date,CustomerSegment,SubCategoryName,Total_Quantity,Avg_Quantity_Per_Transaction,Transaction_Count,Unique_Customers,Unique_Products,PP_Spot,Resin,...,Total_Quantity_3MA,Total_Quantity_6MA,Total_Quantity_12MA,Month,Quarter,Year,Economic_Momentum,Cost_Efficiency,Monthly_Total_Market,Market_Share_Percent
1879,2015-01-01,Drum & Pail Manufacturers,Specialty Closures,239000.0,119500.0,2,1,2,278.0,8075.0,...,,,,1,1,2015,5998.551387,3.677666,22965664.0,1.040684
2710,2015-01-01,Industrial Chemicals,Press-Fit Closures,344620.0,86155.0,4,4,3,278.0,8075.0,...,,,,1,1,2015,4855.595447,3.677666,22965664.0,1.500588
664,2015-01-01,Container Manufacturers,Specialty Closures,111600.0,55800.0,2,2,1,278.0,8075.0,...,,,,1,1,2015,5945.231637,3.677666,22965664.0,0.485943
779,2015-01-01,Distribution,Gaskets & Vents,220000.0,110000.0,2,2,1,278.0,8075.0,...,,,,1,1,2015,5934.825422,3.677666,22965664.0,0.957952
5932,2015-01-01,Tier 1 Accounts,Pull Out Spout Closures,62200.0,31100.0,2,1,2,278.0,8075.0,...,,,,1,1,2015,5720.00626,3.677666,22965664.0,0.270839


## Data Preparation for Hierarchical Forecasting

We'll create three levels of aggregation:
1. **Level 0 (Overall)**: Total quantity across all segments and subcategories
2. **Level 1 (Customer Segment)**: Aggregated by customer segment
3. **Level 2 (SubCategory)**: Aggregated by product subcategory

In [21]:
# Create hierarchical aggregations
print("=== CREATING HIERARCHICAL AGGREGATIONS ===")

# Measures that are totalled within each group.
# NOTE(review): the two Unique_* columns are summed like the volume columns, so a customer
# or product present in several source rows is presumably counted more than once — confirm
# these aggregated "unique" counts are not relied on downstream.
summed_columns = [
    'Total_Quantity',
    'Transaction_Count',
    'Unique_Customers',
    'Unique_Products',
]

# Economic indicators are external factors, so they are averaged rather than summed.
averaged_columns = [
    'PP_Spot',
    'Resin',
    'WTI_Crude_Oil',
    'Natural_Gas',
    'Electricity Price',
    'Gas Price',
    'Energy_Average',
    'PPI_Freight',
    'PMI_Data',
    'Factory_Utilization',
    'Capacity_Utilization',
    'Beverage',  # Additional economic indicator
    'Household_consumption',
    'packaging',
    'Diesel',
    'PPI_Delivery',
    'Oil-to-resin',
    'Electricity Price (Lag6)',
    'Gas Price (Lag6)',
]

# Column -> aggregation function; insertion order fixes the column order of the results.
column_aggregations = {
    **{column: 'sum' for column in summed_columns},
    **{column: 'mean' for column in averaged_columns},
}

# Exogenous regressors handed to the models
exog_vars = [
    'PP_Spot', 'Resin', 'PMI_Data', 'Natural_Gas', 'WTI_Crude_Oil',
    'Factory_Utilization', 'packaging', 'Energy_Average',
    'Electricity Price (Lag6)', 'Gas Price (Lag6)',
]

# Analyst notes on candidate regressors, kept verbatim
# (presumably the effect observed when each was included — confirm with the author):
# 'PPI_Delivery' slightly negative
# 'PMI_Data', major positive
# 'PPI_Freight', negative
# 'Factory_Utilization',
# 'Capacity_Utilization', negative
# 'Beverage', minor negative
# 'Household_consumption', major negative
# 'packaging' minor positive
# 'Diesel', minor positive
# 'Natural_Gas' major positive
# 'Electricity Price (Lag6)', positive
# 'Gas Price (Lag6)' positive

# Level 0: one row per month across all segments and subcategories
overall_ts = (
    data.groupby('Date')
    .agg(column_aggregations)
    .reset_index()
    .assign(Level='Overall', Segment='Total')
)

# Level 1: one row per month and customer segment
segment_ts = (
    data.groupby(['Date', 'CustomerSegment'])
    .agg(column_aggregations)
    .reset_index()
    .assign(Level='CustomerSegment', Segment=lambda frame: frame['CustomerSegment'])
)

# Level 2: one row per month and product subcategory
subcategory_ts = (
    data.groupby(['Date', 'SubCategoryName'])
    .agg(column_aggregations)
    .reset_index()
    .assign(Level='SubCategoryName', Segment=lambda frame: frame['SubCategoryName'])
)

print(f"Overall time series shape: {overall_ts.shape}")
print(f"Customer segment time series shape: {segment_ts.shape}")
print(f"SubCategory time series shape: {subcategory_ts.shape}")

# Summary of what each level contains
overall_quantity = overall_ts['Total_Quantity']
print("\n=== LEVEL SUMMARY ===")
print(f"Overall total quantity range: {overall_quantity.min():,.0f} - {overall_quantity.max():,.0f}")
print(f"Customer segments: {segment_ts['CustomerSegment'].unique()}")
print(f"SubCategories: {subcategory_ts['SubCategoryName'].unique()}")

print(overall_ts.head())


StatementMeta(, eb5e2a2e-c1b1-40a6-8722-a155dee1cfe5, 27, Finished, Available, Finished)

=== CREATING HIERARCHICAL AGGREGATIONS ===
Overall time series shape: (120, 26)
Customer segment time series shape: (1444, 27)
SubCategory time series shape: (1663, 27)

=== LEVEL SUMMARY ===
Overall total quantity range: 12,565,334 - 33,525,018
Customer segments: ['B Customer - Norris' 'Brand Owners' 'Container Manufacturers'
 'Distribution' 'Drum & Pail Manufacturers' 'Fillers'
 'Industrial Chemicals' 'InterCompany' 'Other'
 'Pharmaceuticals & Nutraceuticals' 'Small Customers < $250k / year'
 'Tier 1 Accounts' 'Unknown Segment' 'Global Strategic Accounts'
 'Tier 2 Accounts']
SubCategories: ['Dispensing Pumps (10cc to 30cc)' 'Drum Faucets' 'Freight'
 'Gaskets & Vents' 'Non-Product Sales' 'Other' 'Overcaps' 'Plugs'
 'Press-Fit Closures' 'Pull Out Spout Closures' 'Purchased'
 'Screw Closures' 'Specialty Closures' 'Tools' 'Wrenches' 'Components'
 'Unknown SubCategory' 'Proprietary Molded Solutions' 'Special Caps']
        Date  Total_Quantity  Transaction_Count  Unique_Customers  \
0 201

In [22]:
# === SPLIT DATA INTO TRAIN/TEST BASED ON TIME === USING 10 YEAR'S DATA

# Rows dated on or before the cutoff are training data; the 12 months after it are held out.
cutoff_date = overall_ts['Date'].max() - pd.DateOffset(months=12)


def _split_at_cutoff(frame, cutoff):
    """Return independent (train, test) copies of `frame`, split on its 'Date' column."""
    dates = frame['Date']
    return frame[dates <= cutoff].copy(), frame[dates > cutoff].copy()


overall_train, overall_test = _split_at_cutoff(overall_ts, cutoff_date)
segment_train, segment_test = _split_at_cutoff(segment_ts, cutoff_date)
subcategory_train, subcategory_test = _split_at_cutoff(subcategory_ts, cutoff_date)

print("Training period:", overall_train['Date'].min(), "to", overall_train['Date'].max())
print("Testing period:", overall_test['Date'].min(), "to", overall_test['Date'].max())

StatementMeta(, eb5e2a2e-c1b1-40a6-8722-a155dee1cfe5, 28, Finished, Available, Finished)

Training period: 2015-01-01 00:00:00 to 2023-12-01 00:00:00
Testing period: 2024-01-01 00:00:00 to 2024-12-01 00:00:00


# Hierarchical Forecasting Implementation

This section defines the functions used to generate the forecasts.

## Forecasting Models

We'll implement multiple forecasting approaches:
1. **ARIMA** - Auto-regressive Integrated Moving Average
2. **SARIMA** - Seasonal ARIMA with economic indicators as exogenous regressors (SARIMAX)
3. **Exponential Smoothing** - Holt-Winters method
4. **Ensemble** - Weighted combination of the three methods above (weights proportional to inverse AIC)

In [23]:
def _fit_arima_forecast(series, steps, order):
    """Fit a single ARIMA(order) model and return (point forecast, confidence bounds, AIC)."""
    fitted_model = ARIMA(series, order=order).fit()
    # One get_forecast() call provides both the point forecast and its interval,
    # so the prediction is not computed twice.
    prediction = fitted_model.get_forecast(steps=steps)
    return prediction.predicted_mean, prediction.conf_int(), fitted_model.aic


def forecast_arima(series, steps=12, order=(1,1,1)):
    """
    ARIMA forecasting with progressively simpler fallbacks.

    The requested `order` is tried first, then ARIMA(1,0,1); if both fail, a naive
    forecast repeating the last observed value is returned. No automatic order
    search is performed.

    Parameters:
    - series: pandas Series holding the history to forecast
    - steps: Number of periods to forecast
    - order: (p, d, q) order tried first

    Returns:
    - Tuple (forecast, conf_int, aic). For the naive fallback the forecast has a default
      positional index, conf_int is a +/-10% band with columns 'lower Total_Quantity' /
      'upper Total_Quantity', and aic is float('inf').

    Raises:
    - ValueError: if no model can be fitted and `series` is empty.
    """
    fallback_order = (1, 0, 1)
    candidate_orders = [order] if order == fallback_order else [order, fallback_order]

    last_error = None
    for candidate in candidate_orders:
        try:
            return _fit_arima_forecast(series, steps, candidate)
        except Exception as error:  # not a bare except: Ctrl-C / SystemExit must propagate
            last_error = error

    if len(series) == 0:
        raise ValueError("forecast_arima: cannot forecast an empty series") from last_error

    # Last resort - simple naive forecast; say so instead of failing over silently
    print(f"  ARIMA could not be fitted ({type(last_error).__name__}: {last_error}); using naive last-value forecast")
    last_value = series.iloc[-1]
    forecast = pd.Series([last_value] * steps)

    # +/-10% band. Taking min/max keeps lower <= upper when the last value is negative.
    band = pd.concat([forecast * 0.9, forecast * 1.1], axis=1)
    conf_int = pd.DataFrame({
        'lower Total_Quantity': band.min(axis=1),
        'upper Total_Quantity': band.max(axis=1)
    })
    return forecast, conf_int, float('inf')

def forecast_sarima(series, exog=None, steps=12, order=(1,1,1), seasonal_order=(1,1,1,12)):
    """
    SARIMA forecasting with external regressors (SARIMAX).

    Future values of the regressors are unknown, so the last observed row of `exog`
    is repeated for every forecast step. If the model cannot be fitted or forecast,
    the failure is reported and the function falls back to forecast_arima, so the
    returned AIC is then the ARIMA model's.

    Parameters:
    - series: pandas Series holding the history to forecast
    - exog: DataFrame of exogenous regressors aligned with `series`, or None
    - steps: Number of periods to forecast
    - order: Non-seasonal (p, d, q) order
    - seasonal_order: Seasonal (P, D, Q, s) order

    Returns:
    - Tuple (forecast, conf_int, aic)
    """
    try:
        fitted_model = SARIMAX(
            series, exog=exog, order=order, seasonal_order=seasonal_order
        ).fit(disp=False)

        # For forecast, we need future exogenous variables
        # Use last known values as a simple assumption
        if exog is not None:
            future_exog = pd.DataFrame([exog.iloc[-1]] * steps)
            if isinstance(exog.index, pd.DatetimeIndex):
                future_exog.index = pd.date_range(
                    start=exog.index[-1] + pd.DateOffset(months=1), periods=steps, freq='MS'
                )
            else:
                # No dates to extend: keep a plain positional index rather than raising
                # and discarding a model that fitted successfully.
                future_exog = future_exog.reset_index(drop=True)
        else:
            future_exog = None

        # One get_forecast() call yields both the point forecast and its interval.
        prediction = fitted_model.get_forecast(steps=steps, exog=future_exog)
        return prediction.predicted_mean, prediction.conf_int(), fitted_model.aic
    except Exception as error:  # not a bare except: Ctrl-C / SystemExit must propagate
        # Fallback to simple ARIMA, but make the substitution visible
        print(f"  SARIMA failed ({type(error).__name__}: {error}); falling back to ARIMA")
        return forecast_arima(series, steps, order)

def forecast_exponential_smoothing(series, steps=12, seasonal_periods=12):
    """
    Exponential Smoothing (Holt-Winters) forecasting.

    An additive trend is always used; an additive seasonal component is added only
    when the series covers at least two full seasonal cycles. If fitting fails, the
    failure is reported and the function falls back to forecast_arima, so the
    returned AIC is then the ARIMA model's.

    Parameters:
    - series: pandas Series holding the history to forecast
    - steps: Number of periods to forecast
    - seasonal_periods: Length of one seasonal cycle

    Returns:
    - Tuple (forecast, conf_int, aic); conf_int is forecast +/- 1.96 standard
      deviations of the in-sample residuals, with columns 'lower Total_Quantity' /
      'upper Total_Quantity'.
    """
    try:
        if len(series) >= 2 * seasonal_periods:
            model = ExponentialSmoothing(series, trend='add', seasonal='add', seasonal_periods=seasonal_periods)
        else:
            model = ExponentialSmoothing(series, trend='add', seasonal=None)

        fitted_model = model.fit()
        forecast = fitted_model.forecast(steps=steps)

        # Simple confidence intervals based on residuals (constant width over the horizon)
        half_width = 1.96 * fitted_model.resid.std()
        conf_int = pd.DataFrame({
            'lower Total_Quantity': forecast - half_width,
            'upper Total_Quantity': forecast + half_width
        })

        return forecast, conf_int, fitted_model.aic
    except Exception as error:  # not a bare except: Ctrl-C / SystemExit must propagate
        # Fallback to ARIMA, but make the substitution visible
        print(f"  Exponential Smoothing failed ({type(error).__name__}: {error}); falling back to ARIMA")
        return forecast_arima(series, steps)

def ensemble_forecast(forecasts, aics=None):
    """
    Create ensemble forecast from multiple methods (weighted by inverse AIC).

    Forecasts are combined by position, not by index label, so inputs whose indexes
    differ (for example a naive fallback with a default index next to a date-indexed
    model forecast) are still combined step by step.

    Weighting rules:
    - aics is None: equal weights.
    - all finite AICs are positive: weight proportional to 1/AIC; models with a
      non-finite AIC (inf/NaN/None) get weight 0.
    - some finite AIC is <= 0: inverse-AIC is not meaningful, so the models with a
      finite AIC share the weight equally.
    - no finite AIC at all: equal weights across every model.

    Parameters:
    - forecasts: Sequence of equally long forecasts (pandas Series or array-like)
    - aics: Optional sequence with one AIC per forecast

    Returns:
    - pandas Series indexed like the first Series in `forecasts`, or a numpy array
      when none of the inputs is a Series.

    Raises:
    - ValueError: if `forecasts` is empty, the forecasts differ in length, or `aics`
      does not have one entry per forecast.
    """
    forecasts = list(forecasts)
    n_models = len(forecasts)
    if n_models == 0:
        raise ValueError("ensemble_forecast needs at least one forecast")

    value_arrays = [np.atleast_1d(np.asarray(f, dtype=float)) for f in forecasts]
    if len({len(values) for values in value_arrays}) != 1:
        raise ValueError("ensemble_forecast: all forecasts must have the same length")

    if aics is None:
        raw_weights = np.ones(n_models)
    else:
        aics = list(aics)
        if len(aics) != n_models:
            raise ValueError("ensemble_forecast: need exactly one AIC per forecast")
        aic_values = np.array([np.nan if aic is None else aic for aic in aics], dtype=float)
        usable = np.isfinite(aic_values)
        if usable.any() and (aic_values[usable] > 0).all():
            # 1/inf is 0 and NaN entries are masked out, so no division problems here.
            raw_weights = np.where(usable, 1.0 / aic_values, 0.0)
        elif usable.any():
            raw_weights = usable.astype(float)
        else:
            raw_weights = np.ones(n_models)

    weights = (raw_weights / raw_weights.sum()).tolist()

    # Keep the historical labels for the usual three-model call; otherwise number the models.
    labels = ['ARIMA', 'SARIMA', 'EXP'] if n_models == 3 else [f"Model {i + 1}" for i in range(n_models)]
    print("Model weights - " + ", ".join(f"{label}: {weight:.3f}" for label, weight in zip(labels, weights)))

    combined = sum(values * weight for values, weight in zip(value_arrays, weights))

    template = next((f for f in forecasts if isinstance(f, pd.Series)), None)
    if template is None:
        return combined
    return pd.Series(combined, index=template.index)

# Visible confirmation that this cell ran to completion.
print("Forecasting functions defined successfully!")

StatementMeta(, eb5e2a2e-c1b1-40a6-8722-a155dee1cfe5, 29, Finished, Available, Finished)

Forecasting functions defined successfully!


## Hierarchical Forecasting Functions

In [24]:
def forecast_overall(data, exog_vars, forecast_steps, forecast_dates):
    """
    Forecast the overall (Level 0) series with ARIMA, SARIMA and Exponential
    Smoothing, then blend the three with ensemble_forecast using their AICs.

    Parameters:
    - data: DataFrame with 'Date', 'Total_Quantity' and the columns in exog_vars
    - exog_vars: Names of the exogenous columns passed to the SARIMA model
    - forecast_steps: Number of periods to forecast
    - forecast_dates: Dates written to the 'Date' column of the result

    Returns:
    - DataFrame with columns Date, ARIMA, SARIMA, ExpSmoothing, Ensemble, Level, Segment
    """
    print("=== LEVEL 0: OVERALL FORECASTING ===")

    # Index by date once; both the target and the regressors come from this frame
    history = data.set_index('Date')
    target = history['Total_Quantity']
    regressors = history[exog_vars]

    # Each model returns (forecast, confidence interval, AIC); the intervals are not used here
    print("Generating ARIMA forecast...")
    arima_fc, _, arima_aic = forecast_arima(target, forecast_steps)

    print("Generating SARIMA forecast...")
    sarima_fc, _, sarima_aic = forecast_sarima(target, regressors, forecast_steps)

    print("Generating Exponential Smoothing forecast...")
    smoothing_fc, _, smoothing_aic = forecast_exponential_smoothing(target, forecast_steps)

    # Blend the three point forecasts
    blended = ensemble_forecast(
        [arima_fc, sarima_fc, smoothing_fc],
        [arima_aic, sarima_aic, smoothing_aic],
    )

    # Collect every method side by side; .values drops each forecast's own index
    result_columns = {
        'Date': forecast_dates,
        'ARIMA': arima_fc.values,
        'SARIMA': sarima_fc.values,
        'ExpSmoothing': smoothing_fc.values,
        'Ensemble': blended.values,
        'Level': 'Overall',
        'Segment': 'Total',
    }
    overall_forecasts = pd.DataFrame(result_columns)

    print(f"Overall forecast range: {blended.min():,.0f} - {blended.max():,.0f}")

    return overall_forecasts

StatementMeta(, eb5e2a2e-c1b1-40a6-8722-a155dee1cfe5, 30, Finished, Available, Finished)

In [25]:
def forecast_hierarchical_level(data, level_column, level_name, exog_vars, forecast_steps, forecast_dates):
    """
    Forecast every group of one hierarchy level (customer segments or subcategories).

    Each group with at least 3 observations gets an ARIMA, a SARIMA and an Exponential
    Smoothing forecast blended by ensemble_forecast; smaller groups get a flat forecast
    equal to their historical mean.

    NOTE(review): a group's history is used exactly as found in `data`; months with no
    row for the group are not filled in, so the models presumably treat the remaining
    observations as consecutive periods — confirm this is acceptable for sparse groups.

    Parameters:
    - data: DataFrame with 'Date', 'Total_Quantity', level_column and the exog_vars columns
    - level_column: Column holding the grouping variable (e.g., 'CustomerSegment', 'SubCategoryName')
    - level_name: Label for the level, used in messages and in the 'Level' column
    - exog_vars: Names of the exogenous columns passed to the SARIMA model
    - forecast_steps: Number of periods to forecast
    - forecast_dates: Dates written to the 'Date' column of each group's forecast

    Returns:
    - DataFrame with columns Date, Ensemble, Level, Segment and level_column,
      one block of rows per group
    """
    print(f"\n=== {level_name.upper()} FORECASTING ===")

    label = level_name.lower()
    per_group_frames = []

    for group in data[level_column].unique():
        print(f"\nForecasting for {label}: {group}")

        # History of this group only, indexed by date
        history = data.loc[data[level_column] == group].set_index('Date')
        quantity = history['Total_Quantity']
        regressors = history[exog_vars]

        if len(quantity) >= 3:
            # Each run is (forecast, confidence interval, AIC)
            model_runs = [
                forecast_arima(quantity, forecast_steps),
                forecast_sarima(quantity, regressors, forecast_steps),
                forecast_exponential_smoothing(quantity, forecast_steps),
            ]
            blended = ensemble_forecast(
                [run[0] for run in model_runs],
                [run[2] for run in model_runs],
            )
        else:
            # Too few points to fit anything: repeat the historical mean
            print(f"  Insufficient data for {group}, using naive forecast")
            blended = pd.Series([quantity.mean()] * forecast_steps)

        per_group_frames.append(pd.DataFrame({
            'Date': forecast_dates,
            'Ensemble': blended.values,
            'Level': level_name,
            'Segment': group,
            level_column: group
        }))
        print(f"  Forecast range: {blended.min():,.0f} - {blended.max():,.0f}")

    # Stack the per-group results into one frame with a fresh index
    combined_forecasts = pd.concat(per_group_frames, ignore_index=True)
    ensemble_column = combined_forecasts['Ensemble']

    print(f"\nTotal {label} forecasts generated: {len(per_group_frames)}")
    print(f"{level_name} forecast total range: {ensemble_column.min():,.0f} - {ensemble_column.max():,.0f}")

    return combined_forecasts

StatementMeta(, eb5e2a2e-c1b1-40a6-8722-a155dee1cfe5, 31, Finished, Available, Finished)

In [26]:
def _historical_shares(history, key_column):
    """Return {group: share of the summed 'Total_Quantity'} in order of first appearance."""
    group_totals = history.groupby(key_column, sort=False)['Total_Quantity'].sum()
    return (group_totals / history['Total_Quantity'].sum()).to_dict()


def _allocate_top_down(forecasts, key_column, shares, overall_by_date):
    """Return a copy of `forecasts` with 'Ensemble_Reconciled' = overall forecast x group share."""
    reconciled = forecasts.copy()

    unknown_dates = ~reconciled['Date'].isin(overall_by_date.index)
    if unknown_dates.any():
        missing = sorted(reconciled.loc[unknown_dates, 'Date'].unique())
        raise ValueError(f"No overall forecast for date(s): {missing}")

    unknown_groups = ~reconciled[key_column].isin(list(shares))
    if unknown_groups.any():
        missing = sorted(reconciled.loc[unknown_groups, key_column].astype(str).unique())
        raise KeyError(f"No historical proportion for {key_column} value(s): {missing}")

    # Vectorised lookups replace the per-row filtering of the overall frame
    overall_values = reconciled['Date'].map(overall_by_date).to_numpy(dtype=float)
    group_shares = reconciled[key_column].map(shares).to_numpy(dtype=float)
    reconciled['Ensemble_Reconciled'] = overall_values * group_shares
    return reconciled


def hierarchical_reconciliation(overall_forecasts, segment_forecasts, subcategory_forecasts, 
                               segment_ts, subcategory_ts, forecast_dates):
    """
    Perform top-down hierarchical reconciliation to ensure forecast consistency.

    Each group's reconciled forecast is the overall 'Ensemble' forecast for that date
    multiplied by the group's share of the summed historical 'Total_Quantity'. The
    shares are computed from whatever history is passed in, so pass training data
    when the result is evaluated against a hold-out period.

    Parameters:
    - overall_forecasts: DataFrame with overall level forecasts ('Date', 'Ensemble')
    - segment_forecasts: DataFrame with customer segment forecasts
    - subcategory_forecasts: DataFrame with subcategory forecasts
    - segment_ts: Historical segment time series data
    - subcategory_ts: Historical subcategory time series data
    - forecast_dates: Date range for forecasts (the first 3 are printed as a check)

    Returns:
    - Tuple of (reconciled_segment_forecasts, reconciled_subcategory_forecasts);
      copies of the inputs with an added 'Ensemble_Reconciled' column

    Raises:
    - ValueError: if a forecast row's date has no overall forecast
    - KeyError: if a forecast row's group has no historical proportion
    """
    print("=== HIERARCHICAL RECONCILIATION ===")

    # One overall value per date; the first row wins if a date is repeated
    overall_by_date = (
        overall_forecasts.drop_duplicates(subset='Date', keep='first')
        .set_index('Date')['Ensemble']
    )

    def _print_consistency(segments, subcategories, value_column):
        # Compare the overall forecast with the sum of each lower level for the first 3 dates
        for date in forecast_dates[:3]:
            overall_val = overall_by_date.loc[date]
            segment_sum = segments.loc[segments['Date'] == date, value_column].sum()
            subcat_sum = subcategories.loc[subcategories['Date'] == date, value_column].sum()
            print(f"  {date.strftime('%Y-%m')}: Overall={overall_val:,.0f}, Segments Sum={segment_sum:,.0f}, SubCats Sum={subcat_sum:,.0f}")

    # Check consistency before reconciliation
    print("\nBefore Reconciliation:")
    _print_consistency(segment_forecasts, subcategory_forecasts, 'Ensemble')

    # Calculate historical proportions for reconciliation
    print("\nCalculating historical proportions...")
    segment_props = _historical_shares(segment_ts, 'CustomerSegment')
    subcat_props = _historical_shares(subcategory_ts, 'SubCategoryName')

    print(f"Customer Segment Proportions: {segment_props}")
    print(f"SubCategory Proportions: {subcat_props}")

    # Apply top-down reconciliation
    print("\nApplying top-down reconciliation...")
    segment_forecasts_reconciled = _allocate_top_down(
        segment_forecasts, 'CustomerSegment', segment_props, overall_by_date
    )
    subcategory_forecasts_reconciled = _allocate_top_down(
        subcategory_forecasts, 'SubCategoryName', subcat_props, overall_by_date
    )

    # Verify reconciliation
    print("\nAfter Reconciliation:")
    _print_consistency(segment_forecasts_reconciled, subcategory_forecasts_reconciled, 'Ensemble_Reconciled')

    print("\nReconciliation completed!")

    return segment_forecasts_reconciled, subcategory_forecasts_reconciled

StatementMeta(, eb5e2a2e-c1b1-40a6-8722-a155dee1cfe5, 32, Finished, Available, Finished)

# Running Hierarchical Forecasting

In [27]:
# Forecast horizon: the 12 monthly periods that start one month after the last training date
FORECAST_STEPS = 12
START_DATE = overall_train['Date'].max() + pd.DateOffset(months=1)
FORECAST_DATES = pd.date_range(START_DATE, periods=FORECAST_STEPS, freq='MS')

StatementMeta(, eb5e2a2e-c1b1-40a6-8722-a155dee1cfe5, 33, Finished, Available, Finished)

In [28]:
# Generate overall forecasts using the reusable function.
# The horizon comes from FORECAST_STEPS / FORECAST_DATES so that every level, and the
# reconciliation step, work on exactly the same set of dates.
overall_forecasts = forecast_overall(
    data=overall_train,
    exog_vars=exog_vars,
    forecast_steps=FORECAST_STEPS,
    forecast_dates=FORECAST_DATES
)

overall_forecasts.head()

StatementMeta(, eb5e2a2e-c1b1-40a6-8722-a155dee1cfe5, 34, Finished, Available, Finished)

=== LEVEL 0: OVERALL FORECASTING ===
Generating ARIMA forecast...


Generating SARIMA forecast...


Generating Exponential Smoothing forecast...


Model weights - ARIMA: 0.309, SARIMA: 0.350, EXP: 0.342
Overall forecast range: 15,586,836 - 23,453,575


Unnamed: 0,Date,ARIMA,SARIMA,ExpSmoothing,Ensemble,Level,Segment
0,2024-01-01,19542080.0,25764760.0,20622850.0,22086740.0,Overall,Total
1,2024-02-01,20301010.0,24621880.0,19363040.0,21490910.0,Overall,Total
2,2024-03-01,20383570.0,27196590.0,22397680.0,23453580.0,Overall,Total
3,2024-04-01,20392550.0,22721480.0,17616430.0,20257910.0,Overall,Total
4,2024-05-01,20393520.0,24662110.0,18633400.0,21284170.0,Overall,Total


In [29]:
# === FORECAST CUSTOMER SEGMENT LEVEL ===
# Use the shared horizon (FORECAST_STEPS / FORECAST_DATES) rather than the dates present
# in the segment test rows: a month with no segment rows would otherwise shorten the
# horizon silently and no longer match the overall forecast.
segment_forecasts = forecast_hierarchical_level(
    data=segment_train,
    level_column='CustomerSegment',
    level_name='CustomerSegment',
    exog_vars=exog_vars,
    forecast_steps=FORECAST_STEPS,
    forecast_dates=FORECAST_DATES
)

segment_forecasts.head()

StatementMeta(, eb5e2a2e-c1b1-40a6-8722-a155dee1cfe5, 35, Finished, Available, Finished)


=== CUSTOMERSEGMENT FORECASTING ===

Forecasting for customersegment: B Customer - Norris


Model weights - ARIMA: 0.309, SARIMA: 0.349, EXP: 0.342
  Forecast range: -17,054 - 13,074

Forecasting for customersegment: Brand Owners


Model weights - ARIMA: 0.313, SARIMA: 0.343, EXP: 0.343
  Forecast range: 12,517 - 162,385

Forecasting for customersegment: Container Manufacturers


Model weights - ARIMA: 0.314, SARIMA: 0.344, EXP: 0.342
  Forecast range: 48,072 - 394,032

Forecasting for customersegment: Distribution


Model weights - ARIMA: 0.315, SARIMA: 0.341, EXP: 0.344
  Forecast range: 390,167 - 996,395

Forecasting for customersegment: Drum & Pail Manufacturers


Model weights - ARIMA: 0.315, SARIMA: 0.342, EXP: 0.343
  Forecast range: 70,403 - 1,215,405

Forecasting for customersegment: Fillers


Model weights - ARIMA: 0.314, SARIMA: 0.345, EXP: 0.341
  Forecast range: 123,256 - 737,570

Forecasting for customersegment: Industrial Chemicals


Model weights - ARIMA: 0.312, SARIMA: 0.345, EXP: 0.342
  Forecast range: 2,399,199 - 4,054,639

Forecasting for customersegment: InterCompany


Model weights - ARIMA: 0.314, SARIMA: 0.343, EXP: 0.343
  Forecast range: 6,911,720 - 10,911,907

Forecasting for customersegment: Other


Model weights - ARIMA: 0.308, SARIMA: 0.355, EXP: 0.336
  Forecast range: 225,778 - 469,892

Forecasting for customersegment: Pharmaceuticals & Nutraceuticals


Model weights - ARIMA: 0.314, SARIMA: 0.342, EXP: 0.344
  Forecast range: 80,880 - 189,936

Forecasting for customersegment: Small Customers < $250k / year


Model weights - ARIMA: 0.311, SARIMA: 0.348, EXP: 0.341
  Forecast range: 3,065,560 - 4,156,180

Forecasting for customersegment: Tier 1 Accounts


Model weights - ARIMA: 0.314, SARIMA: 0.345, EXP: 0.341
  Forecast range: 626,427 - 1,019,163

Forecasting for customersegment: Unknown Segment


Model weights - ARIMA: 0.290, SARIMA: 0.395, EXP: 0.315
  Forecast range: -22,441 - 343,332

Forecasting for customersegment: Global Strategic Accounts


Model weights - ARIMA: 0.275, SARIMA: 0.431, EXP: 0.294
  Forecast range: 8,446 - 50,807

Forecasting for customersegment: Tier 2 Accounts


Model weights - ARIMA: 0.095, SARIMA: 0.808, EXP: 0.097
  Forecast range: 353,212 - 920,446

Total customersegment forecasts generated: 15
CustomerSegment forecast total range: -22,441 - 10,911,907


Unnamed: 0,Date,Ensemble,Level,Segment,CustomerSegment
0,2024-01-01,-1949.629735,CustomerSegment,B Customer - Norris,B Customer - Norris
1,2024-02-01,-14912.317284,CustomerSegment,B Customer - Norris,B Customer - Norris
2,2024-03-01,1504.703408,CustomerSegment,B Customer - Norris,B Customer - Norris
3,2024-04-01,-5645.141248,CustomerSegment,B Customer - Norris,B Customer - Norris
4,2024-05-01,-17053.570992,CustomerSegment,B Customer - Norris,B Customer - Norris


In [30]:
# === FORECAST SUBCATEGORY LEVEL ===
# Use the shared horizon (FORECAST_STEPS / FORECAST_DATES) rather than the dates present
# in the subcategory test rows: a month with no subcategory rows would otherwise shorten
# the horizon silently and no longer match the overall forecast.
subcategory_forecasts = forecast_hierarchical_level(
    data=subcategory_train,
    level_column='SubCategoryName',
    level_name='SubCategoryName',
    exog_vars=exog_vars,
    forecast_steps=FORECAST_STEPS,
    forecast_dates=FORECAST_DATES
)

subcategory_forecasts.head()

StatementMeta(, eb5e2a2e-c1b1-40a6-8722-a155dee1cfe5, 36, Finished, Available, Finished)


=== SUBCATEGORYNAME FORECASTING ===

Forecasting for subcategoryname: Dispensing Pumps (10cc to 30cc)


Model weights - ARIMA: 0.314, SARIMA: 0.343, EXP: 0.343
  Forecast range: 6,322,723 - 10,625,641

Forecasting for subcategoryname: Drum Faucets


Model weights - ARIMA: 0.312, SARIMA: 0.331, EXP: 0.356
  Forecast range: -3,088 - 5,786

Forecasting for subcategoryname: Freight


Model weights - ARIMA: 0.302, SARIMA: 0.311, EXP: 0.387
  Forecast range: -17 - 87

Forecasting for subcategoryname: Gaskets & Vents


Model weights - ARIMA: 0.313, SARIMA: 0.344, EXP: 0.343
  Forecast range: -78,455 - 99,277

Forecasting for subcategoryname: Non-Product Sales


Model weights - ARIMA: 0.312, SARIMA: 0.341, EXP: 0.347
  Forecast range: 2,620 - 29,206

Forecasting for subcategoryname: Other


Model weights - ARIMA: 0.304, SARIMA: 0.356, EXP: 0.340
  Forecast range: -10,812 - 10,488

Forecasting for subcategoryname: Overcaps


Model weights - ARIMA: 0.305, SARIMA: 0.352, EXP: 0.342
  Forecast range: 2,904 - 12,470

Forecasting for subcategoryname: Plugs


Model weights - ARIMA: 0.313, SARIMA: 0.342, EXP: 0.345
  Forecast range: -34,843 - 52,824

Forecasting for subcategoryname: Press-Fit Closures


Model weights - ARIMA: 0.311, SARIMA: 0.348, EXP: 0.341
  Forecast range: 2,314,114 - 3,311,389

Forecasting for subcategoryname: Pull Out Spout Closures


Model weights - ARIMA: 0.313, SARIMA: 0.345, EXP: 0.342
  Forecast range: 2,619,187 - 4,233,803

Forecasting for subcategoryname: Purchased


Model weights - ARIMA: 0.311, SARIMA: 0.345, EXP: 0.344
  Forecast range: 4,210 - 84,673

Forecasting for subcategoryname: Screw Closures


Model weights - ARIMA: 0.312, SARIMA: 0.347, EXP: 0.341
  Forecast range: 3,066,033 - 4,986,374

Forecasting for subcategoryname: Specialty Closures


Model weights - ARIMA: 0.312, SARIMA: 0.344, EXP: 0.343
  Forecast range: 898,016 - 1,254,874

Forecasting for subcategoryname: Tools


Model weights - ARIMA: 0.223, SARIMA: 0.214, EXP: 0.563
  Forecast range: 2 - 4

Forecasting for subcategoryname: Wrenches


Model weights - ARIMA: 0.059, SARIMA: 0.845, EXP: 0.096
  Forecast range: -262 - 34

Forecasting for subcategoryname: Components


Model weights - ARIMA: 0.309, SARIMA: 0.349, EXP: 0.342
  Forecast range: -11,978 - 14,561

Forecasting for subcategoryname: Unknown SubCategory


Model weights - ARIMA: 0.141, SARIMA: 0.725, EXP: 0.134
  Forecast range: 1,282 - 95,799

Forecasting for subcategoryname: Proprietary Molded Solutions


Model weights - ARIMA: 0.333, SARIMA: 0.333, EXP: 0.333
  Forecast range: -279,507 - 27,537

Forecasting for subcategoryname: Special Caps


Model weights - ARIMA: 0.093, SARIMA: 0.812, EXP: 0.095
  Forecast range: -17,677 - 200,701

Total subcategoryname forecasts generated: 19
SubCategoryName forecast total range: -279,507 - 10,625,641


Unnamed: 0,Date,Ensemble,Level,Segment,SubCategoryName
0,2024-01-01,9420789.0,SubCategoryName,Dispensing Pumps (10cc to 30cc),Dispensing Pumps (10cc to 30cc)
1,2024-02-01,9654248.0,SubCategoryName,Dispensing Pumps (10cc to 30cc),Dispensing Pumps (10cc to 30cc)
2,2024-03-01,10625640.0,SubCategoryName,Dispensing Pumps (10cc to 30cc),Dispensing Pumps (10cc to 30cc)
3,2024-04-01,9343129.0,SubCategoryName,Dispensing Pumps (10cc to 30cc),Dispensing Pumps (10cc to 30cc)
4,2024-05-01,9516368.0,SubCategoryName,Dispensing Pumps (10cc to 30cc),Dispensing Pumps (10cc to 30cc)


In [31]:
# Sanity check: show the configured forecast horizon next to the dates that are
# actually present in the overall forecast frame, so the two can be compared by eye.
print("Forecast Dates:", FORECAST_DATES, sep="\n")

print("Overall Forecast Dates:", overall_forecasts['Date'].unique(), sep="\n")

StatementMeta(, eb5e2a2e-c1b1-40a6-8722-a155dee1cfe5, 37, Finished, Available, Finished)

Forecast Dates:
DatetimeIndex(['2024-01-01', '2024-02-01', '2024-03-01', '2024-04-01',
               '2024-05-01', '2024-06-01', '2024-07-01', '2024-08-01',
               '2024-09-01', '2024-10-01', '2024-11-01', '2024-12-01'],
              dtype='datetime64[ns]', freq='MS')
Overall Forecast Dates:
<DatetimeArray>
['2024-01-01 00:00:00', '2024-02-01 00:00:00', '2024-03-01 00:00:00',
 '2024-04-01 00:00:00', '2024-05-01 00:00:00', '2024-06-01 00:00:00',
 '2024-07-01 00:00:00', '2024-08-01 00:00:00', '2024-09-01 00:00:00',
 '2024-10-01 00:00:00', '2024-11-01 00:00:00', '2024-12-01 00:00:00']
Length: 12, dtype: datetime64[ns]


## Hierarchical Consistency & Reconciliation

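For every forecast month, the customer-segment forecasts and the subcategory forecasts should each sum to the overall forecast. Independently fitted models do not guarantee this, so the lower-level forecasts are adjusted to match the overall level. This adjustment is called hierarchical reconciliation.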

In [32]:
# Reconcile the segment- and subcategory-level forecasts with the overall forecast.
# `hierarchical_reconciliation` is the reusable helper defined earlier in the notebook;
# it is given the forecasts at all three levels, the historical series for the two
# lower levels, and the forecast dates.
reconciliation_inputs = {
    "overall_forecasts": overall_forecasts,
    "segment_forecasts": segment_forecasts,
    "subcategory_forecasts": subcategory_forecasts,
    "segment_ts": segment_ts,
    "subcategory_ts": subcategory_ts,
    "forecast_dates": FORECAST_DATES,
}

# The helper returns the reconciled segment frame first, then the subcategory frame.
(
    segment_forecasts_reconciled,
    subcategory_forecasts_reconciled,
) = hierarchical_reconciliation(**reconciliation_inputs)

StatementMeta(, eb5e2a2e-c1b1-40a6-8722-a155dee1cfe5, 38, Finished, Available, Finished)

=== HIERARCHICAL RECONCILIATION ===

Before Reconciliation:
  2024-01: Overall=22,086,739, Segments Sum=21,087,147, SubCats Sum=20,630,227
  2024-02: Overall=21,490,913, Segments Sum=20,914,668, SubCats Sum=20,221,589
  2024-03: Overall=23,453,575, Segments Sum=23,549,544, SubCats Sum=22,402,828

Calculating historical proportions...
Customer Segment Proportions: {'B Customer - Norris': 0.0008975761389418995, 'Brand Owners': 0.006056581314054652, 'Container Manufacturers': 0.02173432535814578, 'Distribution': 0.03415475720890335, 'Drum & Pail Manufacturers': 0.05520713124017283, 'Fillers': 0.0412775397666951, 'Industrial Chemicals': 0.1622479288340308, 'InterCompany': 0.39784312281718315, 'Other': 0.011282463729157232, 'Pharmaceuticals & Nutraceuticals': 0.007378470047537353, 'Small Customers < $250k / year': 0.1932159427613124, 'Tier 1 Accounts': 0.06522179239364899, 'Unknown Segment': 0.0018964271929287446, 'Global Strategic Accounts': 0.0004271694579631516, 'Tier 2 Accounts': 0.0011

In [33]:
# Held-out actuals that every overall-level forecast is scored against.
# NOTE(review): the comparison is positional - it assumes overall_test and
# overall_forecasts are in the same date order and of equal length; confirm upstream.
actual = overall_test['Total_Quantity'].values

# One MAPE per forecast column, computed in the order the results are reported below.
mape_arima, mape_sarima, mape_exp, mape_ensemble = (
    mean_absolute_percentage_error(actual, overall_forecasts[model_column].values)
    for model_column in ('ARIMA', 'SARIMA', 'ExpSmoothing', 'Ensemble')
)

print(
    f"MAPE ARIMA: {mape_arima:.2%}",
    f"MAPE SARIMA: {mape_sarima:.2%}",
    f"MAPE Exponential Smoothing: {mape_exp:.2%}",
    f"MAPE Ensemble: {mape_ensemble:.2%}",
    sep="\n",
)


StatementMeta(, eb5e2a2e-c1b1-40a6-8722-a155dee1cfe5, 39, Finished, Available, Finished)

MAPE ARIMA: 19.76%
MAPE SARIMA: 8.31%
MAPE Exponential Smoothing: 24.94%
MAPE Ensemble: 14.45%


## Forecast Visualization & Results

Let's visualize the hierarchical forecasts and compare them with historical data.

In [34]:
# Create comprehensive forecast visualization: one stacked panel per hierarchy level
# (overall, customer segment, subcategory), followed by a printed forecast summary.


def _add_group_traces(figure, history, forecasts, group_col, palette, row):
    """Add a historical line and a reconciled-forecast line for every group to one subplot row.

    Parameters
    ----------
    figure : plotly figure created with ``make_subplots``; modified in place.
    history : DataFrame with ``Date``, ``Total_Quantity`` and ``group_col`` columns.
    forecasts : DataFrame with ``Date``, ``Ensemble_Reconciled`` and ``group_col`` columns.
    group_col : name of the column that identifies the group (e.g. ``'CustomerSegment'``).
    palette : sequence of colours; it is cycled when there are more groups than colours.
    row : 1-based subplot row that receives the traces (column is always 1).
    """
    for idx, group in enumerate(history[group_col].unique()):
        color = palette[idx % len(palette)]
        # The historical line is hidden from the legend, so it shares a legend group
        # with its forecast line: toggling the forecast entry toggles both traces.
        legend_group = f'{group_col}:{group}'

        # Historical data
        group_history = history[history[group_col] == group]
        figure.add_trace(
            go.Scatter(x=group_history['Date'], y=group_history['Total_Quantity'],
                       mode='lines', name=f'{group} (Historical)',
                       line=dict(color=color, width=1),
                       legendgroup=legend_group,
                       showlegend=False),
            row=row, col=1
        )

        # Forecast data
        group_forecast = forecasts[forecasts[group_col] == group]
        figure.add_trace(
            go.Scatter(x=group_forecast['Date'], y=group_forecast['Ensemble_Reconciled'],
                       mode='lines+markers', name=f'{group} Forecast',
                       line=dict(color=color, width=2, dash='dash'),
                       legendgroup=legend_group),
            row=row, col=1
        )


def _print_group_totals(forecasts, group_col):
    """Print the summed ``Ensemble_Reconciled`` forecast of every group, in first-appearance order."""
    # sort=False keeps the groups in the order they first appear in the frame.
    totals = forecasts.groupby(group_col, sort=False, observed=True)['Ensemble_Reconciled'].sum()
    for group, total_forecast in totals.items():
        print(f"  {group}: {total_forecast:,.0f} (12-month total)")


# Three plain x/y panels; the default subplot specs already disable secondary y-axes.
fig = make_subplots(
    rows=3, cols=1,
    subplot_titles=('Overall Forecast', 'Customer Segment Forecasts', 'SubCategory Forecasts'),
    vertical_spacing=0.08,
)

# Overall forecast plot
fig.add_trace(
    go.Scatter(x=overall_ts['Date'], y=overall_ts['Total_Quantity'],
               mode='lines+markers', name='Historical Total',
               line=dict(color='darkblue', width=2)),
    row=1, col=1
)

fig.add_trace(
    go.Scatter(x=overall_forecasts['Date'], y=overall_forecasts['Ensemble'],
               mode='lines+markers', name='Overall Forecast',
               line=dict(color='red', width=3, dash='dash')),
    row=1, col=1
)

# Customer segment forecasts
segment_colors = px.colors.qualitative.Set1
_add_group_traces(fig, segment_ts, segment_forecasts_reconciled,
                  'CustomerSegment', segment_colors, row=2)

# SubCategory forecasts
subcat_colors = px.colors.qualitative.Set2
_add_group_traces(fig, subcategory_ts, subcategory_forecasts_reconciled,
                  'SubCategoryName', subcat_colors, row=3)

fig.update_layout(
    height=1000,
    title_text="Hierarchical Sales Forecasting: Historical vs Predicted",
    showlegend=True,
    # tracegroupgap=0 keeps the legend compact now that every group has its own legend group.
    legend=dict(orientation="v", yanchor="top", y=1, xanchor="left", x=1.01, tracegroupgap=0)
)

fig.update_xaxes(title_text="Date")
fig.update_yaxes(title_text="Total Quantity")

fig.show()

# Print forecast summary
print("\n=== FORECAST SUMMARY ===")
print(f"Forecast Period: {FORECAST_DATES[0].strftime('%Y-%m')} to {FORECAST_DATES[-1].strftime('%Y-%m')}")
print("\nOverall Forecast Summary:")
print(f"  Mean Monthly Forecast: {overall_forecasts['Ensemble'].mean():,.0f}")
print(f"  Total 12-Month Forecast: {overall_forecasts['Ensemble'].sum():,.0f}")
print(f"  Min-Max Range: {overall_forecasts['Ensemble'].min():,.0f} - {overall_forecasts['Ensemble'].max():,.0f}")

print("\nCustomer Segment Forecast Summary:")
_print_group_totals(segment_forecasts_reconciled, 'CustomerSegment')

print("\nSubCategory Forecast Summary:")
_print_group_totals(subcategory_forecasts_reconciled, 'SubCategoryName')

StatementMeta(, eb5e2a2e-c1b1-40a6-8722-a155dee1cfe5, 40, Finished, Available, Finished)


=== FORECAST SUMMARY ===
Forecast Period: 2024-01 to 2024-12

Overall Forecast Summary:
  Mean Monthly Forecast: 20,992,069
  Total 12-Month Forecast: 251,904,825
  Min-Max Range: 15,586,836 - 23,453,575

Customer Segment Forecast Summary:
  B Customer - Norris: 226,104 (12-month total)
  Brand Owners: 1,525,682 (12-month total)
  Container Manufacturers: 5,474,981 (12-month total)
  Distribution: 8,603,748 (12-month total)
  Drum & Pail Manufacturers: 13,906,943 (12-month total)
  Fillers: 10,398,011 (12-month total)
  Industrial Chemicals: 40,871,036 (12-month total)
  InterCompany: 100,218,602 (12-month total)
  Other: 2,842,107 (12-month total)
  Pharmaceuticals & Nutraceuticals: 1,858,672 (12-month total)
  Small Customers < $250k / year: 48,672,028 (12-month total)
  Tier 1 Accounts: 16,429,684 (12-month total)
  Unknown Segment: 477,719 (12-month total)
  Global Strategic Accounts: 107,606 (12-month total)
  Tier 2 Accounts: 291,900 (12-month total)

SubCategory Forecast Summar