# Parallel Processing Demo 2: Grouped/Panel Modeling

This notebook demonstrates parallel execution for grouped/panel data modeling:
- `Workflow.fit_nested()` - Fit separate models per group in parallel
- Per-group preprocessing with `per_group_prep=True`
- Progress tracking with `verbose=True`
- Per-group results analysis
- Performance comparisons (sequential vs parallel)

**Key Features Demonstrated:**
- ✅ `n_jobs` parameter for grouped modeling
- ✅ CPU warning system for panel data
- ✅ Per-group preprocessing strategies
- ✅ Speedup measurements for nested models
- ✅ Per-group metrics and coefficient comparison

## Setup and Data Loading

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings

# py-tidymodels imports
from py_workflows import workflow
from py_parsnip import linear_reg
from py_recipes import recipe, all_numeric_predictors
from py_yardstick import metric_set, rmse, mae, r_squared
from py_tune.parallel_utils import get_cpu_count

# Use seaborn's "whitegrid" look for every figure drawn in this notebook
sns.set_style(style='whitegrid')

# Reaching this line means every import in this cell resolved
print("All imports successful!")

In [None]:
# Read the CSV once and leave `raw_data` untouched; `df` is the working copy
# whose 'date' column is parsed from text into pandas datetimes.
raw_data = pd.read_csv('_md/__data/preem.csv')
df = raw_data.assign(date=pd.to_datetime(raw_data['date']))

# Quick sanity summary: dimensions, covered period, first rows
first_date = df['date'].min()
last_date = df['date'].max()
print(f"Data shape: {df.shape}")
print(f"Date range: {first_date} to {last_date}")
display(df.head(5))

## Create Grouped Data

We'll create 4 groups by assigning regions to simulate panel/grouped data.

In [None]:
# Build a synthetic panel: tag every row with a region label, cycling through
# the labels in row order so group sizes differ by at most one row.
regions = ['North', 'South', 'East', 'West']
# Derive the group count from the label list so the two can never drift apart
# (a hand-maintained count could index past the list or leave labels unused).
n_groups = len(regions)
df['region'] = [regions[i % n_groups] for i in range(len(df))]

# Check group sizes
group_counts = df['region'].value_counts().sort_index()
# Every row must carry exactly one label
assert group_counts.sum() == len(df), "some rows were not assigned a region"
print("Group sizes:")
print(group_counts)

# Show sample: 12 rows = three full cycles through the four labels
display(df[['date', 'target', 'region']].head(12))

In [None]:
# Model formula shared by every workflow in this notebook.
# NOTE(review): `.` presumably expands to all remaining columns (R-style) —
# confirm how py_workflows treats the 'date' and 'region' columns here.
FORMULA = "target ~ ."

# Summarise the panel layout the formula will be fitted on
group_total = df['region'].nunique()
row_total = len(df)

print(f"Formula: {FORMULA}")
print(f"Groups: {group_total}")
print(f"Total observations: {row_total}")
print(f"Observations per group: ~{row_total // group_total}")

## System Information

In [None]:
# Report how many CPU cores the parallel utilities detect; `cpu_count` is
# reused by later cells for timing labels and the warning demonstration.
cpu_count = get_cpu_count()

# Fix: the check-mark glyphs were mis-encoded and printed as garbage characters
print(f"✓ Detected {cpu_count} CPU cores")
print("✓ Joblib backend: loky (multiprocessing)")
print(f"\nThis system can efficiently run up to {cpu_count} parallel jobs.")
print(f"With {n_groups} groups, parallel execution should provide speedup.")

## Part 1: Basic Nested Modeling (Sequential vs Parallel)

First, we'll compare sequential vs parallel execution for basic grouped modeling.

In [None]:
# Baseline workflow: formula interface plus a linear regression spec (no recipe)
wf = (
    workflow()
    .add_formula(FORMULA)
    .add_model(linear_reg())
)

print("Workflow: Linear Regression")
print(f"Groups: {n_groups} regions")

### Sequential Execution (Baseline)

In [None]:
# Sequential baseline: fit one model per region with a single worker (n_jobs=1).
# The elapsed time `seq_time` is the reference for the later speedup figures.
print("Running SEQUENTIAL fit_nested()...")
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-run.
start = time.perf_counter()
nested_fit_seq = wf.fit_nested(
    df,
    group_col='region',
    n_jobs=1,  # Sequential
    verbose=True
)
seq_time = time.perf_counter() - start

# Fix: the check-mark glyph was mis-encoded in the original message
print(f"\n✓ Sequential execution completed in {seq_time:.2f} seconds")

In [None]:
# Unpack the three frames returned by extract_outputs(); later cells reuse
# `stats_seq` for the consistency check and the per-group plots.
outputs_seq, coeffs_seq, stats_seq = nested_fit_seq.extract_outputs()

print("\nPer-group statistics (sequential):")

# NOTE(review): this cell selects 'rmse_mean'/'mae_mean', while other cells in
# this notebook read 'rmse'/'mae' from the same frame — confirm which column
# names extract_outputs() actually emits.
stats_columns = ['group', 'n', 'split', 'rmse_mean', 'mae_mean', 'r_squared']
stats_seq_view = stats_seq[stats_columns].sort_values('group')
display(stats_seq_view)

### Parallel Execution with All Cores

In [None]:
# Parallel run: same workflow and data as the sequential baseline, but with
# n_jobs=-1 so fit_nested() may use every available core.
print(f"Running PARALLEL fit_nested() (n_jobs=-1, using all {cpu_count} cores)...")
# perf_counter() is the monotonic clock intended for interval measurement
start = time.perf_counter()
nested_fit_par = wf.fit_nested(
    df,
    group_col='region',
    n_jobs=-1,  # Use all cores
    verbose=True
)
par_time = time.perf_counter() - start

# Speedup relative to the sequential baseline measured earlier (`seq_time`)
speedup = seq_time / par_time

# Fix: one model is fitted per group, so at most min(n_groups, cpu_count)
# workers can ever be busy. Dividing by the raw core count under-reports
# efficiency on machines with more cores than groups.
effective_workers = max(1, min(n_groups, cpu_count))
efficiency = (speedup / effective_workers) * 100

# Fix: the check-mark glyphs were mis-encoded in the original messages
print(f"\n✓ Parallel execution completed in {par_time:.2f} seconds")
print(f"✓ Speedup: {speedup:.2f}x")
print(f"✓ Efficiency: {efficiency:.1f}%")

### Results Consistency Check

In [None]:
# Compare the training RMSE of every group between the sequential and the
# parallel fit, and report a verdict that reflects the actual comparison.
outputs_par, coeffs_par, stats_par = nested_fit_par.extract_outputs()

# Filter the training rows once instead of once per group
seq_train = stats_seq[stats_seq['split'] == 'train']
par_train = stats_par[stats_par['split'] == 'train']

print("Consistency Check (per group):")
mismatched_groups = []
for group in sorted(df['region'].unique()):
    rmse_seq = seq_train.loc[seq_train['group'] == group, 'rmse'].values[0]
    rmse_par = par_train.loc[par_train['group'] == group, 'rmse'].values[0]

    # Tight relative tolerance: only floating-point noise is accepted
    match = np.isclose(rmse_seq, rmse_par, rtol=1e-10)
    if not match:
        mismatched_groups.append(group)
    status = "✓ IDENTICAL" if match else "✗ DIFFERENT"
    print(f"  {group}: {status} (RMSE: {rmse_seq:.4f})")

# Fix: the success line used to print unconditionally, even when a group
# had just been reported as DIFFERENT.
if mismatched_groups:
    differing = ", ".join(str(g) for g in mismatched_groups)
    print(f"\n✗ Parallel results differ from sequential for: {differing}")
else:
    print("\n✓ All parallel executions produce identical results to sequential!")

### Performance Comparison

In [None]:
# Tabulate the two timing runs side by side (sequential row is the baseline)
config_labels = ['Sequential', f'Parallel ({cpu_count} cores)']
perf_df = pd.DataFrame({
    'Configuration': config_labels,
    'n_jobs': [1, -1],
    'Time (s)': [seq_time, par_time],
    'Speedup': [1.0, speedup],
    'Efficiency (%)': [100.0, efficiency]
})

display(perf_df)

# Two bar charts sharing the same layout: wall-clock time and speedup
bar_colors = ['gray', 'green']
fig, (ax_time, ax_speedup) = plt.subplots(1, 2, figsize=(14, 5))

chart_specs = [
    (ax_time, 'Time (s)', 'Time (seconds)', 'fit_nested() Execution Time'),
    (ax_speedup, 'Speedup', 'Speedup (x)', 'fit_nested() Speedup vs Sequential'),
]
for chart_ax, column, y_label, chart_title in chart_specs:
    chart_ax.bar(perf_df['Configuration'], perf_df[column], color=bar_colors)
    chart_ax.set_ylabel(y_label)
    chart_ax.set_title(chart_title)
    chart_ax.grid(axis='y', alpha=0.3)

# Reference line at 1x marks "no faster than sequential"
ax_speedup.axhline(y=1, color='r', linestyle='--', label='Baseline')
ax_speedup.legend()

plt.tight_layout()
plt.show()

## Part 2: Per-Group Preprocessing

Demonstrate parallel execution with per-group preprocessing using PCA.

In [None]:
# Recipe: normalize the numeric predictors, then reduce them to 3 principal
# components; built step by step so each stage is easy to inspect.
rec_pca = recipe(df, FORMULA)
rec_pca = rec_pca.step_normalize(all_numeric_predictors())
rec_pca = rec_pca.step_pca(all_numeric_predictors(), num_comp=3)

# Attach the recipe (instead of a bare formula) to a linear regression spec
wf_pca = workflow().add_recipe(rec_pca).add_model(linear_reg())

workflow_summary = [
    "Workflow with PCA:",
    "  - Normalize numeric predictors",
    "  - PCA: Reduce to 3 components",
    "  - Linear regression",
]
print("\n".join(workflow_summary))

### Global Preprocessing (Default)

In [None]:
# Global preprocessing: per_group_prep=False, i.e. the same PCA transformation
# is used for all groups. Timed for comparison with the per-group variant.
print("Running fit_nested() with GLOBAL preprocessing...")
# perf_counter() is the monotonic clock intended for interval measurement
start = time.perf_counter()
nested_fit_global = wf_pca.fit_nested(
    df,
    group_col='region',
    per_group_prep=False,  # Global preprocessing
    n_jobs=-1,
    verbose=True
)
global_time = time.perf_counter() - start

# Fix: the check-mark glyph was mis-encoded in the original message
print(f"\n✓ Global preprocessing completed in {global_time:.2f} seconds")

In [None]:
# Only the statistics frame is needed here; the outputs and coefficient
# frames are bound to throwaway names.
_outputs_global, _coeffs_global, stats_global = nested_fit_global.extract_outputs()

print("\nPer-group statistics (global preprocessing):")

# NOTE(review): 'rmse_mean'/'mae_mean' are selected here, while other cells in
# this notebook read 'rmse'/'mae' from the same frame — confirm the schema.
global_columns = ['group', 'split', 'rmse_mean', 'mae_mean', 'r_squared']
display(stats_global[global_columns].sort_values('group'))

### Per-Group Preprocessing

In [None]:
# Per-group preprocessing: per_group_prep=True, i.e. each group gets its own
# PCA transformation. Timed for comparison with the global variant.
print("Running fit_nested() with PER-GROUP preprocessing...")
# perf_counter() is the monotonic clock intended for interval measurement
start = time.perf_counter()
nested_fit_pergroup = wf_pca.fit_nested(
    df,
    group_col='region',
    per_group_prep=True,  # Per-group preprocessing
    n_jobs=-1,
    verbose=True
)
pergroup_time = time.perf_counter() - start

# Fix: the check-mark glyph was mis-encoded in the original message
print(f"\n✓ Per-group preprocessing completed in {pergroup_time:.2f} seconds")

In [None]:
# Only the statistics frame is needed here; the outputs and coefficient
# frames are bound to throwaway names.
_outputs_pergroup, _coeffs_pergroup, stats_pergroup = nested_fit_pergroup.extract_outputs()

print("\nPer-group statistics (per-group preprocessing):")

# NOTE(review): 'rmse_mean'/'mae_mean' are selected here, while other cells in
# this notebook read 'rmse'/'mae' from the same frame — confirm the schema.
pergroup_columns = ['group', 'split', 'rmse_mean', 'mae_mean', 'r_squared']
display(stats_pergroup[pergroup_columns].sort_values('group'))

In [None]:
# Ask the per-group fit for its feature comparison so differences in the
# preprocessed features between groups can be inspected.
feature_comparison = nested_fit_pergroup.get_feature_comparison()

print()
print("Feature comparison across groups:")
display(feature_comparison)

### Preprocessing Strategy Comparison

In [None]:
# Compare global vs per-group preprocessing: runtime plus training-split
# metrics averaged over the groups.

# Filter the training rows once per strategy instead of once per metric
train_global = stats_global[stats_global['split'] == 'train']
train_pergroup = stats_pergroup[stats_pergroup['split'] == 'train']

# Summary column label -> (stats column, axis label).
# Fix: the superscript in "R²" was mis-encoded in the column key and plot labels.
metric_specs = {
    'Avg RMSE': ('rmse', 'RMSE'),
    'Avg MAE': ('mae', 'MAE'),
    'Avg R²': ('r_squared', 'R²'),
}

comparison_df = pd.DataFrame({
    'Strategy': ['Global Preprocessing', 'Per-Group Preprocessing'],
    'Time (s)': [global_time, pergroup_time],
    **{
        summary_col: [train_global[stats_col].mean(), train_pergroup[stats_col].mean()]
        for summary_col, (stats_col, _) in metric_specs.items()
    },
})

display(comparison_df)

# Visualization: one panel per metric, one bar per strategy
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
colors = ['blue', 'orange']

for panel_ax, (summary_col, (_, metric)) in zip(axes, metric_specs.items()):
    panel_ax.bar(comparison_df['Strategy'], comparison_df[summary_col], color=colors)
    panel_ax.set_ylabel(metric)
    panel_ax.set_title(f'Average {metric} by Strategy')
    panel_ax.tick_params(axis='x', rotation=15)
    panel_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## Part 3: Per-Group Results Analysis

In [None]:
# Pull the coefficient frame from the sequential fit; the other two frames
# are bound to throwaway names.
_outputs_unused, coeffs, _stats_unused = nested_fit_seq.extract_outputs()

# Reshape to a term x group table of estimates, leaving the intercept out
slope_rows = coeffs[coeffs['term'] != 'Intercept']
coeff_pivot = slope_rows.pivot(index='term', columns='group', values='estimate')

print("\nCoefficient comparison across groups:")
display(coeff_pivot.head(10))

# Row-wise variance measures how much each term's estimate differs by group;
# sorting descending puts the most group-sensitive terms first.
per_term_variance = coeff_pivot.var(axis=1)
coeff_variance = per_term_variance.sort_values(ascending=False)
print("\nTop 5 most variable coefficients across groups:")
print(coeff_variance.head(5))

In [None]:
# Visualize coefficient heterogeneity: grouped bars for the six terms whose
# estimates vary most across regions.
top_vars = coeff_variance.head(6).index
coeff_subset = coeff_pivot.loc[top_vars]

fig, ax = plt.subplots(figsize=(12, 6))
# Transpose so regions sit on the x-axis and each term becomes a bar series
coeff_subset.T.plot(kind='bar', ax=ax, width=0.8)
ax.set_title('Coefficient Variation Across Groups (Top 6 Most Variable)')
ax.set_xlabel('Region')
ax.set_ylabel('Coefficient Estimate')
# Place the legend outside the plotting area so it does not cover the bars
ax.legend(title='Feature', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(axis='y', alpha=0.3)
# Explicit axes call instead of the pyplot state machine
ax.tick_params(axis='x', rotation=0)
plt.tight_layout()
plt.show()

# Fix: the light-bulb glyph was mis-encoded in the original message
print("\n💡 Coefficient variation suggests heterogeneity - nested modeling is appropriate!")

In [None]:
# Per-group performance comparison: one bar chart per training metric,
# using the statistics of the sequential fit.
stats_train = stats_seq[stats_seq['split'] == 'train'].sort_values('group')

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# (stats column, display label, bar colour) for each panel.
# Fix: the superscript in "R²" was mis-encoded in the axis label and title.
panel_specs = [
    ('rmse', 'RMSE', 'steelblue'),
    ('mae', 'MAE', 'coral'),
    ('r_squared', 'R²', 'mediumseagreen'),
]

# One loop replaces three copy-pasted plotting sections
for panel_ax, (column, label, bar_color) in zip(axes, panel_specs):
    panel_ax.bar(stats_train['group'], stats_train[column], color=bar_color)
    panel_ax.set_ylabel(label)
    panel_ax.set_title(f'{label} by Group')
    panel_ax.tick_params(axis='x', rotation=45)
    panel_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## Part 4: CPU Warning Demonstrations

### Warning: n_jobs > Number of Groups

In [None]:
# Trigger the inefficiency warning by requesting twice as many workers as
# there are CPU cores, then show whatever warnings were raised.
requested_jobs = cpu_count * 2
print(f"Data has {n_groups} groups. Requesting {requested_jobs} workers...\n")

with warnings.catch_warnings(record=True) as caught:
    # "always" disables once-per-location suppression so the warning is
    # recorded even if this cell is re-run in the same kernel
    warnings.simplefilter("always")

    nested_fit_ineff = wf.fit_nested(
        df,
        group_col='region',
        n_jobs=requested_jobs,  # More workers than groups
        verbose=False
    )

# Fix: show every captured warning with its category (the first entry may be
# an unrelated warning from a dependency), and say so explicitly when nothing
# was captured instead of printing nothing at all.
# Fix: the warning-sign and light-bulb glyphs were mis-encoded.
if caught:
    print("⚠️  WARNING TRIGGERED:")
    for record in caught:
        print(f"    [{record.category.__name__}] {record.message}")
    print(f"\n💡 Recommendation: Use n_jobs={n_groups} (number of groups) instead")
else:
    print("No warning was raised for this configuration.")

## Summary and Recommendations

In [None]:
# Final report: recap the timings measured above and print usage guidance.
# Fixes relative to the original cell:
#   - emoji glyphs were mis-encoded and printed as garbage characters
#   - the tip named `feature_comparison()`, but the method used in this
#     notebook is `get_feature_comparison()`
banner = "=" * 80

print(banner)
print("PARALLEL GROUPED MODELING PERFORMANCE SUMMARY")
print(banner)
print(f"\nSystem: {cpu_count} CPU cores")
print(f"Data: {len(df)} observations, {n_groups} groups")

print("\n1. Basic Nested Modeling")
print(f"   Sequential: {seq_time:.2f}s")
print(f"   Parallel (all cores): {par_time:.2f}s (speedup: {speedup:.2f}x)")

print("\n2. With PCA Preprocessing")
print(f"   Global preprocessing: {global_time:.2f}s")
print(f"   Per-group preprocessing: {pergroup_time:.2f}s")

print("\n" + banner)
print("RECOMMENDATIONS")
print(banner)
print("\n✅ Use parallel nested modeling (n_jobs=-1) when:")
print(f"   - You have multiple groups (>{cpu_count-1})")
print("   - Per-group models are computationally expensive")
print("   - Groups exhibit heterogeneous patterns (different coefficients)")
print("   - You need per-group predictions")

print("\n⚠️  Consider sequential execution (n_jobs=1) when:")
print(f"   - Few groups (< {cpu_count})")
print("   - Simple/fast models")
print("   - Debugging (easier error tracing)")

print("\n💡 Preprocessing Tips:")
print("   - Use per_group_prep=True when groups have different feature distributions")
print("   - Use per_group_prep=False (default) for consistency across groups")
print("   - PCA, feature selection benefit from per-group preprocessing")
print("   - Check get_feature_comparison() to see preprocessing differences")

print("\n💡 Performance Tips:")
print("   - Set n_jobs to min(n_groups, cpu_count) for optimal efficiency")
print("   - Use verbose=True to monitor progress")
print("   - Watch for CPU warnings to avoid inefficiencies")
print(banner)