# imports

In [None]:
import pysubgroup as ps
import pandas as pd
import numpy as np
import import_ipynb
import pickle
from pmdarima import auto_arima
from statsmodels.tsa.statespace.dynamic_factor import DynamicFactor
import os
import csv
import random
# Render floats with three decimal places whenever pandas displays them.
pd.set_option('display.float_format', lambda value: '%.3f' % value)

In [None]:
# Load the aggregated per-region tables from their pickle files.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
aggregated_data_usa = pd.read_pickle('AggregatedDataUSA.pkl')
aggregated_data_eu = pd.read_pickle('AggregatedDataEU.pkl')
aggregated_data_apa = pd.read_pickle('AggregatedDataAPA.pkl')


In [None]:
# Read the EU subgroup results table from CSV.
# NOTE(review): `subgroups_eu` is not referenced again in the visible part of the notebook.
subgroups_eu = pd.read_csv('ResultsSgDisEU.csv')


In [None]:
# Read the APA and USA subgroup results tables from CSV.
subgroups_apa = pd.read_csv('ResultsSgDisAPA.csv')
subgroups_usa = pd.read_csv('ResultsSgDisUSA.csv')

In [None]:
# Preview the first rows of the USA subgroup table (displayed as the cell output).
subgroups_usa.head()

# Change Data Format

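We reshape the data from a long format (one row per item and time period) to a wide format (one row per time period, one column per item), which is the layout used for the ARIMA models below.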

In [None]:
# Work on deep copies so the reshaping below never touches the loaded tables.
eu_long_df = aggregated_data_eu.copy(deep=True)
usa_long_df = aggregated_data_usa.copy(deep=True)
apa_long_df = aggregated_data_apa.copy(deep=True)

In [None]:
# Duplicate the list-valued elasticity column, then unfold it so that every
# list element gets a row of its own.
eu_long_df = eu_long_df.assign(
    IndividualPE=eu_long_df['8ClippedResampledInflationPE']
)
eu_long_df = eu_long_df.explode('IndividualPE').reset_index(drop=True)

# Number the rows of each item 1, 2, 3, ... in their exploded order.
eu_long_df['TimePeriod'] = 1 + eu_long_df.groupby('ItemNumber').cumcount()

In [None]:
# Duplicate the list-valued elasticity column, then unfold it so that every
# list element gets a row of its own.
usa_long_df = usa_long_df.assign(
    IndividualPE=usa_long_df['8ClippedResampledInflationPE']
)
usa_long_df = usa_long_df.explode('IndividualPE').reset_index(drop=True)

# Number the rows of each item 1, 2, 3, ... in their exploded order.
usa_long_df['TimePeriod'] = 1 + usa_long_df.groupby('ItemNumber').cumcount()

In [None]:
# Duplicate the list-valued elasticity column, then unfold it so that every
# list element gets a row of its own.
apa_long_df = apa_long_df.assign(
    IndividualPE=apa_long_df['8ClippedResampledInflationPE']
)
apa_long_df = apa_long_df.explode('IndividualPE').reset_index(drop=True)

# Number the rows of each item 1, 2, 3, ... in their exploded order.
apa_long_df['TimePeriod'] = 1 + apa_long_df.groupby('ItemNumber').cumcount()

In [None]:
# One row per TimePeriod, one column per ItemNumber; the leftover
# 'ItemNumber' label on the column axis is cleared.
eu_wide_df = (
    eu_long_df
    .pivot(index='TimePeriod', columns='ItemNumber', values='IndividualPE')
    .rename_axis(None, axis='columns')
)

In [None]:
# One row per TimePeriod, one column per ItemNumber; the leftover
# 'ItemNumber' label on the column axis is cleared.
usa_wide_df = (
    usa_long_df
    .pivot(index='TimePeriod', columns='ItemNumber', values='IndividualPE')
    .rename_axis(None, axis='columns')
)

In [None]:
# One row per TimePeriod, one column per ItemNumber; the leftover
# 'ItemNumber' label on the column axis is cleared.
apa_wide_df = (
    apa_long_df
    .pivot(index='TimePeriod', columns='ItemNumber', values='IndividualPE')
    .rename_axis(None, axis='columns')
)

# Split Data into Subgroups

In [None]:
# Load the pickled USA subgroup mapping back into memory.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open('usa_subgroup_dfs_dict_indexed.pkl', 'rb') as f:
    usa_subgroup_dfs_dict_indexed = pickle.load(f)

# Sanity check: print the container type and the size of every subgroup.
print(type(usa_subgroup_dfs_dict_indexed))  # should be <class 'dict'>
for sg_index, df in usa_subgroup_dfs_dict_indexed.items():
    # `df` is the object stored under this subgroup key; len() gives its row count.
    print(f"Subgroup {sg_index!r} → {len(df)} rows")

In [None]:
# Load the pickled EU subgroup mapping back into memory.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open('eu_subgroup_dfs_dict_indexed.pkl', 'rb') as f:
    eu_subgroup_dfs_dict_indexed = pickle.load(f)

# Sanity check: print the container type and the size of every subgroup.
print(type(eu_subgroup_dfs_dict_indexed))  # should be <class 'dict'>
for sg_index, df in eu_subgroup_dfs_dict_indexed.items():
    # `df` is the object stored under this subgroup key; len() gives its row count.
    print(f"Subgroup {sg_index!r} → {len(df)} rows")

In [None]:
# Load the pickled APA subgroup mapping back into memory.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open('apa_subgroup_dfs_dict_indexed.pkl', 'rb') as f:
    apa_subgroup_dfs_dict_indexed = pickle.load(f)

# Sanity check: print the container type and the size of every subgroup.
print(type(apa_subgroup_dfs_dict_indexed))  # should be <class 'dict'>
for sg_index, df in apa_subgroup_dfs_dict_indexed.items():
    # `df` is the object stored under this subgroup key; len() gives its row count.
    print(f"Subgroup {sg_index!r} → {len(df)} rows")

In [None]:
def transform_subgroup_dfs(dfs_dict, long_name_tpl, wide_name_tpl):
    """
    Publish a long-form and a wide-form DataFrame per subgroup as globals.

    For every ``idx -> DataFrame`` entry of ``dfs_dict`` two module-level
    variables are created (or overwritten):

      - ``long_name_tpl.format(idx=idx)``: one row per element of the
        list-valued '8ClippedResampledInflationPE' column, with the element in
        'IndividualPE' and its 1-based position per item in 'TimePeriod'.
      - ``wide_name_tpl.format(idx=idx)``: the long frame pivoted to one row
        per 'TimePeriod' and one column per 'ItemNumber'.

    Parameters:
    -----------
    dfs_dict : dict
        Mapping from subgroup index to a DataFrame containing at least
        ['ItemNumber', '8ClippedResampledInflationPE'].
    long_name_tpl : str
        Name template (with an ``{idx}`` placeholder) for the long-form globals
    wide_name_tpl : str
        Name template (with an ``{idx}`` placeholder) for the wide-form globals

    Returns nothing; the results are only reachable through the module globals.
    """
    pe_column = '8ClippedResampledInflationPE'
    namespace = globals()

    for idx, df in dfs_dict.items():
        long_name = long_name_tpl.format(idx=idx)

        # Long form: keep only the two needed columns, copy the list column
        # and unfold the copy into one row per list element.
        long_frame = df[['ItemNumber', pe_column]].assign(IndividualPE=df[pe_column])
        long_frame = long_frame.explode('IndividualPE').reset_index(drop=True)
        long_frame['TimePeriod'] = long_frame.groupby('ItemNumber').cumcount() + 1
        namespace[long_name] = long_frame

        # Wide form: periods down the rows, items across the columns.
        wide_name = wide_name_tpl.format(idx=idx)
        wide_frame = long_frame.pivot(
            index='TimePeriod', columns='ItemNumber', values='IndividualPE'
        )
        wide_frame.columns.name = None
        namespace[wide_name] = wide_frame

In [None]:
# Build the long/wide globals for every region, in the order EU, APA, USA.
for _region_dfs, _long_tpl, _wide_tpl in (
    (eu_subgroup_dfs_dict_indexed, "eu_sg_{idx}_long", "eu_sg_{idx}_wide"),
    (apa_subgroup_dfs_dict_indexed, "apa_sg_{idx}_long", "apa_sg_{idx}_wide"),
    (usa_subgroup_dfs_dict_indexed, "usa_sg_{idx}_long", "usa_sg_{idx}_wide"),
):
    transform_subgroup_dfs(
        _region_dfs,
        long_name_tpl=_long_tpl,
        wide_name_tpl=_wide_tpl,
    )

# ARIMA Prediction

In [None]:
import warnings
# Suppress every warning from here on.
# NOTE(review): this also hides any warnings raised while fitting the models below.
warnings.filterwarnings('ignore')

In [None]:
def hierarchical_mint_forecast(
    df_wide: pd.DataFrame,
    h: int = 1,
    seasonal: bool = False,
    m: int = 1
) -> pd.DataFrame:
    """
    Forecast every product of a subgroup and reconcile with the subgroup mean.

    One ARIMA model (``auto_arima``) is fitted to the cross-sectional mean of
    the panel and one to each product column.  Every product forecast is then
    shifted by the same per-horizon amount so that the mean of the reconciled
    product forecasts equals the forecast of the mean series (an equal-weight
    adjustment).

    Parameters
    ----------
    df_wide : DataFrame
        Historical panel of shape (T × N): one row per time period and one
        column per product.  The index must be integer-valued, because the
        future index is built as ``index.max() + 1 … index.max() + h``.
    h : int
        Forecast horizon (e.g. 1 for one‐step‐ahead).
    seasonal : bool
        Forwarded to auto_arima as ``seasonal``.
    m : int
        Forwarded to auto_arima as ``m`` (the seasonal period).

    Returns
    -------
    reconciled_df : DataFrame
        Reconciled product forecasts of shape (h × N), with the same columns
        as df_wide and the future periods as index.
    """
    # 1) Top-level series: the cross-sectional mean of all products.
    agg_series = df_wide.mean(axis=1)

    # 2) Fit ARIMA to the aggregate and forecast h steps ahead.
    print("Starting ARIMA - training model on subgroup ...")
    model_agg = auto_arima(
        agg_series,
        seasonal=seasonal,
        m=m,
        error_action='ignore',
        suppress_warnings=True
    )
    print("Model trained on subgroup - staring general prediction...")
    # Plain float array of shape (h,), whatever container predict() returns.
    f_agg = np.asarray(model_agg.predict(n_periods=h), dtype=float)

    # 3) Fit one bottom-level model per product.
    product_ids = df_wide.columns.tolist()
    n_products = len(product_ids)
    print(f"General prediction done! - starting individual product training on {len(product_ids)} products")

    # Progress milestones, reported in ascending order once the share of
    # *fitted* products reaches them.
    milestones = (
        (0.25, "25% of products trained!"),
        (0.5, "50% of products trained!"),
        (0.75, "75% of products trained!"),
    )
    next_milestone = 0

    bottom_forecasts = []
    for count, pid in enumerate(product_ids, start=1):
        m_b = auto_arima(
            df_wide[pid],
            seasonal=seasonal,
            m=m,
            error_action='ignore',
            suppress_warnings=True
        )
        bottom_forecasts.append(
            np.asarray(m_b.predict(n_periods=h), dtype=float)
        )

        while (
            next_milestone < len(milestones)
            and count / n_products >= milestones[next_milestone][0]
        ):
            print(milestones[next_milestone][1])
            next_milestone += 1

    print("Individual product training done! - starting reconciliation...")

    # Stack into an array of shape (N_products × h)
    f_b = np.vstack(bottom_forecasts)

    # 4) Reconciliation.  The aggregate is the MEAN of the products, so the
    #    coherent constraint is mean(bottom) == aggregate.  (Dividing the gap
    #    to the *sum* by N would drag every forecast towards ~1/N of its level.)
    correction = f_agg - f_b.mean(axis=0)                # shape (h,)
    f_b_rec = f_b + correction[np.newaxis, :]            # shape (N, h)

    # 5) Wrap into a DataFrame indexed by the future periods.
    last_period = df_wide.index.max()
    future_index = range(last_period + 1, last_period + 1 + h)
    reconciled_df = pd.DataFrame(
        f_b_rec.T,               # transpose to (h, N)
        index=future_index,
        columns=product_ids
    )
    print("Reconciliation done! Results are ready.")

    return reconciled_df

In [None]:
# Reconciled one-step-ahead forecast of every USA subgroup, keyed by subgroup index.
usa_hierarchical_forecasts = dict()

for idx in usa_subgroup_dfs_dict_indexed.keys():
    print(f"Now predicting subgroup {idx}")

    # The wide panel lives in a module-level global named after the subgroup.
    wide_name = f"usa_sg_{idx}_wide"
    df_wide = globals()[wide_name]

    # h=1 forecasts a single period past the end of the panel; raise h for
    # a longer horizon.
    reconciled_df = hierarchical_mint_forecast(df_wide, seasonal=False, h=1)
    usa_hierarchical_forecasts[idx] = reconciled_df

In [None]:
# Reconciled one-step-ahead forecast of every EU subgroup, keyed by subgroup index.
eu_hierarchical_forecasts = dict()

for idx in eu_subgroup_dfs_dict_indexed.keys():
    print(f"Now predicting subgroup {idx}")

    # The wide panel lives in a module-level global named after the subgroup.
    wide_name = f"eu_sg_{idx}_wide"
    df_wide = globals()[wide_name]

    # h=1 forecasts a single period past the end of the panel; raise h for
    # a longer horizon.
    reconciled_df = hierarchical_mint_forecast(df_wide, seasonal=False, h=1)
    eu_hierarchical_forecasts[idx] = reconciled_df

In [None]:
# Reconciled one-step-ahead forecast of every APA subgroup, keyed by subgroup index.
apa_hierarchical_forecasts = dict()

for idx in apa_subgroup_dfs_dict_indexed.keys():
    print(f"Now predicting subgroup {idx}")

    # The wide panel lives in a module-level global named after the subgroup.
    wide_name = f"apa_sg_{idx}_wide"
    df_wide = globals()[wide_name]

    # h=1 forecasts a single period past the end of the panel; raise h for
    # a longer horizon.
    reconciled_df = hierarchical_mint_forecast(df_wide, seasonal=False, h=1)
    apa_hierarchical_forecasts[idx] = reconciled_df

In [None]:
# Show floats with four decimals from here on (replaces any earlier
# display.float_format setting).
pd.set_option('display.float_format', '{:.4f}'.format)
pd.set_option('display.precision', 4)

# Evaluate Predictions 

In [None]:
# Directional Magnitude Score (DMS): rewards getting the sign right and
# landing close to the true value.
def directional_magnitude_score(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    train: np.ndarray,
    w: float = 0.5,
    R: float = 25.0
) -> float:
    """
    Mean Directional Magnitude Score of the predictions.

    For each (true, predicted) pair:
      D   = 1 when both values have the same sign, otherwise 0
      M   = max(0, 1 - |predicted - true| / R)
      DMS = w * D + (1 - w) * M

    The per-pair scores are averaged; pairs are matched positionally after
    flattening both inputs.

    Returns np.nan when every value of ``train`` is (numerically) equal to its
    first element, i.e. the training series is constant.
    """
    # A constant training series is not scored.
    if np.allclose(train, train[0]):
        return np.nan

    # Flatten so that multi-step forecasts (h > 1) are handled pairwise.
    actuals = np.ravel(y_true)
    predictions = np.ravel(y_pred)

    def _pair_score(actual, predicted):
        direction = 1.0 if np.sign(actual) == np.sign(predicted) else 0.0
        magnitude = max(0.0, 1.0 - abs(predicted - actual) / R)
        return w * direction + (1 - w) * magnitude

    pair_scores = [
        _pair_score(actual, predicted)
        for actual, predicted in zip(actuals, predictions)
    ]
    return float(np.mean(pair_scores))

def evaluate_hierarchical_dms_all(
    subgroup_indices,
    wide_name_tpl,
    folds: tuple = (5, 6, 7),
    h: int = 1,
    w: float = 0.5,
    R: float = 25.0
):
    """
    Back-test the hierarchical forecast on every subgroup with the DMS metric.

    For each subgroup the wide panel is looked up in the module globals under
    ``wide_name_tpl.format(idx=idx)``.  For every fold ``t_end`` the first
    ``t_end`` rows are used for training and the following ``h`` rows are
    scored with ``directional_magnitude_score``.  Scores are averaged over the
    folds per product (ignoring NaN), then over the products per subgroup;
    the subgroup mean is printed.

    Parameters
    ----------
    subgroup_indices : iterable
        Subgroup indices to evaluate.
    wide_name_tpl : str
        Name template (with an ``{idx}`` placeholder) of the wide-panel globals.
    folds : sequence of int
        Training lengths of the rolling-origin folds.  Any iterable of ints is
        accepted; the default is an immutable tuple so that it can never be
        modified between calls.
    h : int
        Forecast horizon per fold.
    w, R : float
        Forwarded to ``directional_magnitude_score``.

    Returns
    -------
    subgroup_scores : dict
        { idx -> mean DMS over products }
    per_product_scores : dict
        { idx -> { product_id -> mean DMS for that product } }
    """
    subgroup_scores = {}
    per_product_scores = {}

    for idx in subgroup_indices:
        # grab the wide panel from globals
        df_wide = globals()[wide_name_tpl.format(idx=idx)]

        # per-fold DMS values, collected per product
        errors = {pid: [] for pid in df_wide.columns}

        # rolling‐origin backtest
        for t_end in folds:
            train = df_wide.iloc[:t_end]
            test = df_wide.iloc[t_end : t_end + h]

            # A fold with no held-out rows can only yield NaN scores, so skip
            # it instead of fitting one model per product for nothing.
            if len(test) == 0:
                continue

            forecast = hierarchical_mint_forecast(train, h=h)

            for pid in df_wide.columns:
                score = directional_magnitude_score(
                    test[pid].values,
                    forecast[pid].values,
                    train[pid].values,
                    w=w,
                    R=R,
                )
                errors[pid].append(score)

        # average over folds for each product (NaN folds are ignored)
        mean_per_product = {
            pid: float(np.nanmean(scores))
            for pid, scores in errors.items()
        }
        per_product_scores[idx] = mean_per_product

        # average those product‐means into one subgroup score
        subgroup_mean = float(np.nanmean(list(mean_per_product.values())))
        subgroup_scores[idx] = subgroup_mean

        print(f"Subgroup {idx}: mean DMS = {subgroup_mean:.3f}")

    return subgroup_scores, per_product_scores




In [None]:
# Back-test every USA subgroup with the default folds, horizon and score settings.
usa_indices = list(usa_subgroup_dfs_dict_indexed)
usa_subgroup_dms, usa_product_dms = evaluate_hierarchical_dms_all(
    usa_indices,
    wide_name_tpl="usa_sg_{idx}_wide",
)

In [None]:
# Back-test every APA subgroup with the default folds, horizon and score settings.
apa_indices = list(apa_subgroup_dfs_dict_indexed)
apa_subgroup_dms, apa_product_dms = evaluate_hierarchical_dms_all(
    apa_indices,
    wide_name_tpl="apa_sg_{idx}_wide",
)

In [None]:
# Back-test every EU subgroup with the default folds, horizon and score settings.
eu_indices = list(eu_subgroup_dfs_dict_indexed)
eu_subgroup_dms, eu_product_dms = evaluate_hierarchical_dms_all(
    eu_indices,
    wide_name_tpl="eu_sg_{idx}_wide",
)

# Format Evaluation Scores

In [None]:
# 1) One record per (subgroup, product) pair, carrying the product's own
#    score next to the mean score of its subgroup.
records = [
    {
        'ItemNumber': item,
        'Sg Index': subgroup_idx,
        'DMS': score,
        'Sg DMS': usa_subgroup_dms[subgroup_idx],
    }
    for subgroup_idx, prod_scores in usa_product_dms.items()
    for item, score in prod_scores.items()
]

# 2) Tabulate the records and preview the result
usa_dms_df = pd.DataFrame(data=records)
print(usa_dms_df.head())

In [None]:
# 1) One record per (subgroup, product) pair, carrying the product's own
#    score next to the mean score of its subgroup.
records = [
    {
        'ItemNumber': item,
        'Sg Index': subgroup_idx,
        'DMS': score,
        'Sg DMS': apa_subgroup_dms[subgroup_idx],
    }
    for subgroup_idx, prod_scores in apa_product_dms.items()
    for item, score in prod_scores.items()
]

# 2) Tabulate the records and preview the result
apa_dms_df = pd.DataFrame(data=records)
print(apa_dms_df.head())

In [None]:
# 1) One record per (subgroup, product) pair, carrying the product's own
#    score next to the mean score of its subgroup.
records = [
    {
        'ItemNumber': item,
        'Sg Index': subgroup_idx,
        'DMS': score,
        'Sg DMS': eu_subgroup_dms[subgroup_idx],
    }
    for subgroup_idx, prod_scores in eu_product_dms.items()
    for item, score in prod_scores.items()
]

# 2) Tabulate the records and preview the result
eu_dms_df = pd.DataFrame(data=records)
print(eu_dms_df.head())

In [None]:
# Products without a score of their own (NaN) inherit the mean score of
# their subgroup.
for _region_scores in (usa_dms_df, apa_dms_df, eu_dms_df):
    _region_scores['DMS'] = _region_scores['DMS'].fillna(_region_scores['Sg DMS'])

In [None]:
# NOTE(review): repeats the USA fill from the previous cell; once that cell has
# run, no NaN that this call could fill remains, so this is a no-op.
usa_dms_df['DMS'] = usa_dms_df['DMS'].fillna(usa_dms_df['Sg DMS'])

# Format Predicted Elasticities

In [None]:
# 1) Flat ItemNumber -> forecast lookup, taken from the first forecast row of
#    every subgroup.
#    NOTE(review): an ItemNumber that occurs in several subgroups keeps the
#    value of the subgroup processed last - confirm this is intended.
predicted_pe_usa = {}
for forecast_df in usa_hierarchical_forecasts.values():
    predicted_pe_usa.update(forecast_df.iloc[0].to_dict())

# 2) Attach the forecast to the score table by ItemNumber
usa_dms_df['Predicted PE'] = usa_dms_df['ItemNumber'].map(predicted_pe_usa)

In [None]:
# 1) Flat ItemNumber -> forecast lookup, taken from the first forecast row of
#    every subgroup.
#    NOTE(review): an ItemNumber that occurs in several subgroups keeps the
#    value of the subgroup processed last - confirm this is intended.
predicted_pe_apa = {}
for forecast_df in apa_hierarchical_forecasts.values():
    predicted_pe_apa.update(forecast_df.iloc[0].to_dict())

# 2) Attach the forecast to the score table by ItemNumber
apa_dms_df['Predicted PE'] = apa_dms_df['ItemNumber'].map(predicted_pe_apa)

In [None]:
# 1) Flat ItemNumber -> forecast lookup, taken from the first forecast row of
#    every subgroup.
#    NOTE(review): an ItemNumber that occurs in several subgroups keeps the
#    value of the subgroup processed last - confirm this is intended.
predicted_pe_eu = {}
for forecast_df in eu_hierarchical_forecasts.values():
    predicted_pe_eu.update(forecast_df.iloc[0].to_dict())

# 2) Attach the forecast to the score table by ItemNumber
eu_dms_df['Predicted PE'] = eu_dms_df['ItemNumber'].map(predicted_pe_eu)

# Save Results

In [None]:
# Persist the USA score/forecast table twice: as a pickle and as a CSV
# written without the row index.
usa_dms_df.to_pickle('prediction_results_usa.pkl')
usa_dms_df.to_csv('prediction_results_usa.csv', index=False)

In [None]:
# Persist the APA score/forecast table twice: as a pickle and as a CSV
# written without the row index.
apa_dms_df.to_pickle('prediction_results_apa.pkl')
apa_dms_df.to_csv('prediction_results_apa.csv', index=False)

In [None]:
# Persist the EU score/forecast table twice: as a pickle and as a CSV
# written without the row index.
eu_dms_df.to_pickle('prediction_results_eu.pkl')
eu_dms_df.to_csv('prediction_results_eu.csv', index=False)