# Example 25: Refinery Production Forecasting with Diversity Maintenance

This notebook demonstrates using genetic algorithm feature selection with **diversity maintenance** for crude oil production forecasting.

**Dataset**: JODI crude oil refinery production data (2002-2024)

**Key Features**:
- Panel/grouped data (multiple countries)
- Time series production values
- Features: mean_production, pct_zero, category, subcategory
- Target: value (production)
- **Enhancement demonstrated**: Diversity maintenance to avoid premature convergence

**Diversity Maintenance Benefits**:
- Prevents premature convergence to local optima
- Maintains exploration throughout evolution
- Better for complex fitness landscapes with multiple good solutions

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from py_recipes import recipe
from py_recipes.steps import step_select_genetic_algorithm, step_normalize, step_dummy
from py_workflows import workflow
from py_parsnip import linear_reg
from py_yardstick import rmse, mae, r_squared

# Read the JODI refinery CSV and turn the date column into real timestamps
data = pd.read_csv('../_md/__data/jodi_refinery_production_data.csv')
data['date'] = pd.to_datetime(data['date'])

# Quick inventory of the panel: size, countries, time span and groupings
print(f"Data shape: {data.shape}")

first_countries = sorted(data['country'].unique())[:10]  # first 10 only
print(f"\nCountries: {first_countries}...")
print(f"Total countries: {data['country'].nunique()}")

earliest = data['date'].min()
latest = data['date'].max()
print(f"\nDate range: {earliest} to {latest}")

print(f"\nCategories: {data['category'].unique()}")
print(f"\nSubcategories: {data['subcategory'].unique()}")
print("\nFirst few rows:")

# Last expression of the cell: rendered by the notebook as a table
data.head(10)

## 1. Data Preparation

Focus on crude oil refinery intake, then create calendar features (year, month, quarter, days since start), lagged values, and a 3-month rolling mean.

In [None]:
# Keep only the crude oil refinery-intake series
is_crude_intake = (
    (data['category'] == 'CRUDEOIL')
    & (data['subcategory'] == 'Refinery Intake')
)
data_refined = data[is_crude_intake].copy()

print(f"Filtered data shape: {data_refined.shape}")
print(f"Countries: {data_refined['country'].nunique()}")

# Calendar features derived from the observation date
data_refined['year'] = data_refined['date'].dt.year
data_refined['month'] = data_refined['date'].dt.month
data_refined['quarter'] = data_refined['date'].dt.quarter
data_refined['days_since_start'] = (data_refined['date'] - data_refined['date'].min()).dt.days

# Lagged target values (previous 1-3 observations), computed per country.
# The sort is required so that shift() moves along time within each country.
data_refined = data_refined.sort_values(['country', 'date'])
for lag in (1, 2, 3):
    data_refined[f'value_lag{lag}'] = data_refined.groupby('country')['value'].shift(lag)

# Trailing 3-observation mean of PAST values only.
# The window runs over value_lag1 rather than value: rolling over value would
# put the current row's target inside the feature, and together with
# value_lag1/value_lag2 a linear model could then reconstruct the target
# exactly (target leakage).
# Once rows with missing lags are dropped below, this equals the mean of
# value_lag1..value_lag3, i.e. a smoothed summary of the three lag columns.
data_refined['value_rolling_mean_3'] = (
    data_refined.groupby('country')['value_lag1']
    .rolling(window=3, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)  # drop the country level so the index aligns with data_refined
)

# Remove rows with NaN in lagged features (the first three rows of each country)
data_refined = data_refined.dropna(subset=['value_lag1', 'value_lag2', 'value_lag3'])

print(f"\nFinal data shape after feature engineering: {data_refined.shape}")
print(f"\nFeatures: {[col for col in data_refined.columns if col not in ['date', 'country', 'category', 'subcategory', 'unit', 'value']]}")

## 2. Train/Test Split and Country Selection

In [None]:
# Train/test split (80/20 by date): everything up to the 80th-percentile
# date is training data, everything after it is held out for testing.
split_date = data_refined['date'].quantile(0.8)
train = data_refined[data_refined['date'] <= split_date].copy()
test = data_refined[data_refined['date'] > split_date].copy()

print(f"Train: {train.shape[0]} rows, {train['date'].min()} to {train['date'].max()}")
print(f"Test: {test.shape[0]} rows, {test['date'].min()} to {test['date'].max()}")

# Select a major oil producer (USA - large and consistent production).
# The dataset may spell the name either way, so try both before falling back.
preferred_names = ('United States', 'United States of America')
available_countries = set(train['country'].unique())
country = next(
    (name for name in preferred_names if name in available_countries),
    None,
)

if country is None:
    # Fall back to the country with the most training rows, provided it has
    # more than 100 of them (value_counts() sorts in descending order).
    country_counts = train['country'].value_counts()
    eligible = country_counts[country_counts > 100]
    if eligible.empty:
        raise ValueError(
            "No country has more than 100 training rows; "
            "cannot pick a country to model."
        )
    country = eligible.index[0]
    print(f"\nUsing country: {country}")

train_country = train[train['country'] == country].copy()
test_country = test[test['country'] == country].copy()

# Select features for modeling
# NOTE(review): mean_production and pct_zero are not created in this notebook,
# so they presumably come from the CSV; if they are per-series aggregates they
# will be constant within a single country - confirm.
feature_cols = [
    'year', 'month', 'quarter', 'days_since_start',
    'value_lag1', 'value_lag2', 'value_lag3',
    'value_rolling_mean_3', 'mean_production', 'pct_zero'
]

# Prepare modeling data: target first, then the candidate features
train_country_model = train_country[['value'] + feature_cols].copy()
test_country_model = test_country[['value'] + feature_cols].copy()

print(f"\nTraining on {country}")
print(f"Train: {train_country_model.shape}")
print(f"Test: {test_country_model.shape}")
print(f"\nFeatures: {feature_cols}")

## 3. Baseline: All Features (No Selection, No Diversity Maintenance)

In [None]:
# Reference model: plain linear regression on every candidate feature
baseline_wf = (
    workflow()
    .add_formula('value ~ .')
    .add_model(linear_reg())
)
baseline_fit = baseline_wf.fit(train_country_model)

# Score the held-out period; each metric call returns a frame whose first
# row carries the number in its 'value' column.
baseline_preds = baseline_fit.predict(test_country_model)
actual_values = test_country_model['value']
predicted_values = baseline_preds['.pred']

baseline_rmse = rmse(actual_values, predicted_values).iloc[0]['value']
baseline_mae = mae(actual_values, predicted_values).iloc[0]['value']
baseline_r2 = r_squared(actual_values, predicted_values).iloc[0]['value']

n_baseline_features = len(feature_cols)
print(f"=== Baseline (All {n_baseline_features} Features) ===")
print(f"RMSE: {baseline_rmse:,.2f}")
print(f"MAE: {baseline_mae:,.2f}")
print(f"R²: {baseline_r2:.4f}")

## 4. GA Without Diversity Maintenance

First, run a standard GA without diversity maintenance to establish the reference convergence behavior.

In [None]:
# Standard GA (no diversity maintenance)
rec_standard = recipe(train_country_model)
rec_standard = step_select_genetic_algorithm(
    rec_standard,
    outcome='value',
    model=linear_reg(),
    metric='rmse',
    top_n=5,

    # No diversity maintenance
    maintain_diversity=False,

    # GA settings
    population_size=40,
    generations=30,
    cv_folds=3,
    random_state=42,
    verbose=True
)

prepped_standard = rec_standard.prep(train_country_model)
ga_step_standard = prepped_standard.prepared_steps[0]  # the only step in this recipe
selected_standard = ga_step_standard._selected_features

print(f"\n=== GA Without Diversity Maintenance ===")
print(f"Selected features ({len(selected_standard)}): {selected_standard}")
print(f"Converged: {ga_step_standard._converged}")
print(f"Generations: {ga_step_standard._n_generations}")

# Default the metrics to NaN so that later cells (the results summary reads
# standard_rmse and standard_r2) still run if the GA selected no features.
standard_rmse = standard_mae = standard_r2 = np.nan

# Evaluate: refit a linear model on the selected features only and score it
# on the held-out period.
if len(selected_standard) > 0:
    train_selected = prepped_standard.bake(train_country_model)
    test_selected = prepped_standard.bake(test_country_model)

    wf = workflow().add_formula('value ~ .').add_model(linear_reg())
    fit = wf.fit(train_selected)
    preds = fit.predict(test_selected)

    standard_rmse = rmse(test_selected['value'], preds['.pred']).iloc[0]['value']
    standard_mae = mae(test_selected['value'], preds['.pred']).iloc[0]['value']
    standard_r2 = r_squared(test_selected['value'], preds['.pred']).iloc[0]['value']

    print(f"\nTest performance:")
    print(f"  RMSE: {standard_rmse:,.2f} (baseline: {baseline_rmse:,.2f})")
    print(f"  MAE: {standard_mae:,.2f}")
    print(f"  R²: {standard_r2:.4f}")

## 5. GA With Diversity Maintenance

Now enable diversity maintenance to encourage exploration and prevent premature convergence.

**Parameters**:
- `maintain_diversity=True` - Enable diversity tracking and fitness sharing
- `diversity_threshold=0.3` - Apply fitness sharing when diversity drops below 30%
- `fitness_sharing_sigma=0.5` - Width of Gaussian sharing function

In [None]:
# GA with diversity maintenance
# Single source of truth for the threshold: it is passed to the GA, shown in
# the printed label, and used to count low-diversity generations below.
DIVERSITY_THRESHOLD = 0.3

rec_diversity = recipe(train_country_model)
rec_diversity = step_select_genetic_algorithm(
    rec_diversity,
    outcome='value',
    model=linear_reg(),
    metric='rmse',
    top_n=5,

    # Diversity maintenance settings
    maintain_diversity=True,
    diversity_threshold=DIVERSITY_THRESHOLD,  # Trigger sharing below this diversity
    fitness_sharing_sigma=0.5,  # Sharing function width

    # GA settings (same as standard)
    population_size=40,
    generations=30,
    cv_folds=3,
    random_state=42,
    verbose=True
)

prepped_diversity = rec_diversity.prep(train_country_model)
ga_step_diversity = prepped_diversity.prepared_steps[0]  # the only step in this recipe
selected_diversity = ga_step_diversity._selected_features

print(f"\n=== GA With Diversity Maintenance ===")
print(f"Selected features ({len(selected_diversity)}): {selected_diversity}")
print(f"Converged: {ga_step_diversity._converged}")
print(f"Generations: {ga_step_diversity._n_generations}")

# Diversity statistics over the recorded history
diversity_history = ga_step_diversity._diversity_history
n_below_threshold = sum(1 for d in diversity_history if d < DIVERSITY_THRESHOLD)
print(f"\nDiversity statistics:")
print(f"  Initial diversity: {diversity_history[0]:.4f}")
print(f"  Final diversity: {diversity_history[-1]:.4f}")
print(f"  Mean diversity: {np.mean(diversity_history):.4f}")
print(f"  Min diversity: {np.min(diversity_history):.4f}")
print(f"  Generations below threshold ({DIVERSITY_THRESHOLD}): {n_below_threshold}")

# Evaluate: refit a linear model on the selected features only and score it
# on the held-out period.
if len(selected_diversity) > 0:
    train_selected = prepped_diversity.bake(train_country_model)
    test_selected = prepped_diversity.bake(test_country_model)

    wf = workflow().add_formula('value ~ .').add_model(linear_reg())
    fit = wf.fit(train_selected)
    preds = fit.predict(test_selected)

    diversity_rmse = rmse(test_selected['value'], preds['.pred']).iloc[0]['value']
    diversity_mae = mae(test_selected['value'], preds['.pred']).iloc[0]['value']
    diversity_r2 = r_squared(test_selected['value'], preds['.pred']).iloc[0]['value']

    print(f"\nTest performance:")
    print(f"  RMSE: {diversity_rmse:,.2f} (baseline: {baseline_rmse:,.2f})")
    print(f"  MAE: {diversity_mae:,.2f}")
    print(f"  R²: {diversity_r2:.4f}")

## 6. Compare Different Diversity Thresholds

In [None]:
# Sweep the diversity threshold while holding every other GA setting fixed
thresholds = [0.2, 0.3, 0.4, 0.5]
threshold_results = {}

for threshold in thresholds:
    print(f"\nTesting diversity_threshold={threshold}")

    # Same GA configuration on every pass; only diversity_threshold changes
    rec = step_select_genetic_algorithm(
        recipe(train_country_model),
        outcome='value',
        model=linear_reg(),
        metric='rmse',
        top_n=5,
        maintain_diversity=True,
        diversity_threshold=threshold,
        fitness_sharing_sigma=0.5,
        population_size=40,
        generations=30,
        cv_folds=3,
        random_state=42,
        verbose=False
    )

    prepped = rec.prep(train_country_model)
    ga_step = prepped.prepared_steps[0]
    selected = ga_step._selected_features
    diversity_hist = ga_step._diversity_history

    # Nothing to evaluate (and nothing recorded) when no feature survives
    if len(selected) == 0:
        continue

    train_selected = prepped.bake(train_country_model)
    test_selected = prepped.bake(test_country_model)

    # Refit a plain linear model on the selected columns and score the test period
    wf = workflow().add_formula('value ~ .').add_model(linear_reg())
    fit = wf.fit(train_selected)
    preds = fit.predict(test_selected)

    observed = test_selected['value']
    predicted = preds['.pred']
    test_rmse = rmse(observed, predicted).iloc[0]['value']
    test_r2 = r_squared(observed, predicted).iloc[0]['value']

    mean_diversity = np.mean(diversity_hist)
    min_diversity = np.min(diversity_hist)

    threshold_results[threshold] = {
        'n_features': len(selected),
        'features': selected,
        'rmse': test_rmse,
        'r2': test_r2,
        'diversity_history': diversity_hist,
        'mean_diversity': mean_diversity,
        'min_diversity': min_diversity,
        'generations': ga_step._n_generations
    }

    print(f"  Selected {len(selected)} features: {selected}")
    print(f"  RMSE: {test_rmse:,.2f}, R²: {test_r2:.4f}")
    print(f"  Mean diversity: {mean_diversity:.4f}, Min: {min_diversity:.4f}")

## 7. Visualization: Diversity Evolution

In [None]:
# Two stacked panels: diversity trajectories on top, test error vs threshold below
fig, axes = plt.subplots(2, 1, figsize=(14, 10))
ax1, ax2 = axes[0], axes[1]

# --- Panel 1: diversity over generations, one line per threshold ---
colors = plt.cm.viridis(np.linspace(0, 1, len(thresholds)))

for (threshold, results), color in zip(threshold_results.items(), colors):
    diversity_hist = results['diversity_history']
    generations_axis = range(len(diversity_hist))
    ax1.plot(
        generations_axis,
        diversity_hist,
        label=f'Threshold={threshold}',
        linewidth=2,
        color=color,
        alpha=0.8,
    )
    # Dashed guide at the threshold itself, in the same colour as its line
    ax1.axhline(y=threshold, color=color, linestyle='--', alpha=0.4, linewidth=1)

ax1.set_xlabel('Generation', fontsize=11)
ax1.set_ylabel('Population Diversity', fontsize=11)
ax1.set_title('Diversity Evolution with Different Thresholds', fontsize=12, fontweight='bold')
ax1.legend(fontsize=10)
ax1.grid(alpha=0.3)

# --- Panel 2: test RMSE bars with mean diversity on a secondary axis ---
threshold_vals = list(threshold_results.keys())
rmse_vals = []
mean_div_vals = []
for t in threshold_vals:
    rmse_vals.append(threshold_results[t]['rmse'])
    mean_div_vals.append(threshold_results[t]['mean_diversity'])

bar_positions = range(len(threshold_vals))

# RMSE bars
bars = ax2.bar(bar_positions, rmse_vals, color='steelblue', alpha=0.7, label='RMSE')
ax2.set_ylabel('RMSE', fontsize=11, color='steelblue')
ax2.set_xlabel('Diversity Threshold', fontsize=11)
ax2.set_xticks(bar_positions)
ax2.set_xticklabels(list(map(str, threshold_vals)))
ax2.tick_params(axis='y', labelcolor='steelblue')

# All-features baseline drawn as a horizontal reference line
ax2.axhline(
    y=baseline_rmse,
    color='red',
    linestyle='--',
    linewidth=2,
    label='Baseline RMSE',
    alpha=0.7,
)

# Print each bar's RMSE just above it
for bar, val in zip(bars, rmse_vals):
    x_center = bar.get_x() + bar.get_width()/2
    ax2.text(
        x_center,
        bar.get_height(),
        f'{val:,.0f}',
        ha='center',
        va='bottom',
        fontsize=9,
        fontweight='bold',
    )

# Mean diversity shares the x-axis but gets its own y-axis on the right
ax2_twin = ax2.twinx()
ax2_twin.plot(
    bar_positions,
    mean_div_vals,
    'o-',
    color='darkgreen',
    linewidth=2,
    markersize=8,
    label='Mean Diversity',
    alpha=0.8,
)
ax2_twin.set_ylabel('Mean Diversity', fontsize=11, color='darkgreen')
ax2_twin.tick_params(axis='y', labelcolor='darkgreen')

ax2.set_title('Performance vs Diversity Threshold', fontsize=12, fontweight='bold')
ax2.grid(alpha=0.3, axis='y')

# One legend covering the artists of both y-axes
lines1, labels1 = ax2.get_legend_handles_labels()
lines2, labels2 = ax2_twin.get_legend_handles_labels()
ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right', fontsize=10)

plt.tight_layout()
plt.savefig('production_diversity_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nPlot saved as: production_diversity_comparison.png")

## 8. Results Summary

In [None]:
# One row per method: the two reference runs first, then one per threshold
comparison_data = [
    {
        'Method': 'Baseline (All Features)',
        'N Features': len(feature_cols),
        'RMSE': baseline_rmse,
        'R²': baseline_r2,
        'Mean Diversity': np.nan
    },
    {
        'Method': 'GA (No Diversity)',
        'N Features': len(selected_standard),
        'RMSE': standard_rmse,
        'R²': standard_r2,
        'Mean Diversity': np.nan
    },
]
comparison_data.extend(
    {
        'Method': f'GA (Diversity={threshold})',
        'N Features': metrics['n_features'],
        'RMSE': metrics['rmse'],
        'R²': metrics['r2'],
        'Mean Diversity': metrics['mean_diversity']
    }
    for threshold, metrics in threshold_results.items()
)

comparison_df = pd.DataFrame(comparison_data)
print("\n=== Performance Comparison ===")
print(comparison_df.to_string(index=False))

# Best GA variant = lowest test RMSE among the non-baseline rows
ga_rows = comparison_df[comparison_df['Method'] != 'Baseline (All Features)']
best_idx = ga_rows['RMSE'].idxmin()

best_method = comparison_df.loc[best_idx, 'Method']
best_rmse = comparison_df.loc[best_idx, 'RMSE']
best_n_features = comparison_df.loc[best_idx, 'N Features']

print(f"\nBest method: {best_method}")
print(f"  RMSE improvement vs baseline: {(1 - best_rmse / baseline_rmse) * 100:.1f}%")
print(f"  Feature reduction: {len(feature_cols)} → {best_n_features} features")

## 9. Feature Importance Analysis

Compare which features were selected with and without diversity maintenance.

In [None]:
# Side-by-side view of the two GA selections
print("\n=== Feature Selection Comparison ===")
print("\nWithout diversity maintenance:")
print(f"  Features: {selected_standard}")

print("\nWith diversity maintenance (threshold=0.3):")
print(f"  Features: {selected_diversity}")

# Overlap and differences between the two selections
common = set(selected_standard) & set(selected_diversity)
only_standard = set(selected_standard) - set(selected_diversity)
only_diversity = set(selected_diversity) - set(selected_standard)

# An empty set is falsy, so `or` substitutes the 'None' placeholder
print(f"\nCommon features: {common or 'None'}")
print(f"Only in standard GA: {only_standard or 'None'}")
print(f"Only in diversity GA: {only_diversity or 'None'}")

# Count how many threshold runs picked each feature
all_selected = {}
for metrics in threshold_results.values():
    for feat in metrics['features']:
        all_selected.setdefault(feat, 0)
        all_selected[feat] += 1

n_threshold_runs = len(threshold_results)
print("\nFeature frequency across diversity thresholds:")
# Most frequently selected first; ties keep first-seen order (stable sort)
for feat, count in sorted(all_selected.items(), key=lambda item: -item[1]):
    pct = count / n_threshold_runs * 100
    print(f"  {feat}: {count}/{n_threshold_runs} ({pct:.0f}%)")

## Key Takeaways

1. **Diversity Maintenance Prevents Premature Convergence**: Fitness sharing maintains population diversity, allowing continued exploration

2. **Threshold Selection Matters**:
   - Lower threshold (0.2): Fitness sharing triggered less often, similar to standard GA
   - Higher threshold (0.5): More aggressive diversity maintenance, slower convergence but better exploration
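   - Middle range (0.3-0.4): Often a reasonable balance between exploration and exploitation; confirm against the comparison table in Section 8 for your own data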

3. **Different Solutions from Same Problem**: Diversity maintenance can find alternative feature subsets with similar performance

4. **Diversity Monitoring**: Tracking diversity over generations provides insight into GA behavior:
   - Rapid diversity loss → premature convergence risk
   - Maintained diversity → healthy exploration

5. **Production Forecasting Benefits**: Time series production data benefits from maintaining diversity to avoid getting stuck in local optima

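6. **Feature Stability**: Features that are selected across multiple diversity thresholds are more likely to be genuinely important (for example, `value_lag1` or `value_rolling_mean_3`, if they recur in the frequency table in Section 9). Every run here uses the same `random_state`, so vary the seed as well before drawing firm conclusions.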