# 05. A/B Testing Simulation: Naive vs Smart

## Objective
To quantify the "Efficiency Gain" of our **Smart (Unmet Demand)** algorithm against a **Population-Weighted Baseline**.
This simulates the impact of investing in *N* chargers under both strategies.

## Scenarios
1.  **Scenario A (Baseline):** Distribute *N* chargers proportional to **Vehicle Density** (Population Weighted). This mimics a standard policy approach.
2.  **Scenario B (Smart):** Use Weighted K-Means on **Unmet_Demand** to place *N* chargers. This targets underserved high-demand areas.

## Metrics
*   **Population Served (Potential):** Sum of EV/Vehicle density in served areas.
*   **Unmet Demand Covered:** Sum of `Unmet_Demand` score in served areas.
*   **Efficiency Gain:** % Improvement of Smart over Baseline.

In [4]:
import pandas as pd
import geopandas as gpd
import numpy as np
from sklearn.cluster import KMeans
from shapely.geometry import Point
import random
import matplotlib.pyplot as plt
import os

# 1. Load Data
DATA_PATH = '../data/processed/barrios_with_demand.geojson'
if not os.path.exists(DATA_PATH):
    DATA_PATH = 'data/processed/barrios_with_demand.geojson'

gdf = gpd.read_file(DATA_PATH)

# Ensure Centroids and Unmet Demand exist (re-calculate if needed for consistency)
gdf['centroid'] = gdf.geometry.centroid
gdf['lat'] = gdf.centroid.y
gdf['lng'] = gdf.centroid.x

# Recalculate Unmet Demand logic here to be self-contained
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
gdf['Norm_Supply'] = scaler.fit_transform(gdf[['Charger_Count']].fillna(0))
SUPPLY_IMPACT = 80 
gdf['Unmet_Demand'] = gdf['Demand_Score'] - (gdf['Norm_Supply'] * SUPPLY_IMPACT)
gdf['Unmet_Demand'] = gdf['Unmet_Demand'].clip(lower=0)

print(f"Loaded {len(gdf)} neighborhoods.")

Loaded 73 neighborhoods.



  gdf['centroid'] = gdf.geometry.centroid

  gdf['lat'] = gdf.centroid.y

  gdf['lng'] = gdf.centroid.x


## Define Simulation Functions

In [5]:
def generate_population_weighted_locations(n, gdf):
    """
    Generates N locations distributed proportionally to Vehicle Density.
    This mimics a 'Standard Policy' approach (more cars = more chargers).
    """
    points = []
    
    # 1. Select Neighborhoods weighted by Total Vehicles
    # Handle NaNs in weights
    weights = gdf['Total_Vehicles'].fillna(0)
    # Normalize weights to sum to 1
    weights = weights / weights.sum()
    
    # Sample n neighborhoods (indexes) with replacement
    sampled_indices = np.random.choice(gdf.index, size=n, p=weights)
    
    # 2. Place a random point in each selected neighborhood
    for idx in sampled_indices:
        poly = gdf.loc[idx, 'geometry']
        min_x, min_y, max_x, max_y = poly.bounds
        
        while True:
            rand_x = random.uniform(min_x, max_x)
            rand_y = random.uniform(min_y, max_y)
            p = Point(rand_x, rand_y)
            if poly.contains(p):
                points.append([p.y, p.x]) # Lat, Lng
                break
                
    return np.array(points)

def generate_smart_locations(n, df_features, weights):
    """Runs Weighted K-Means to find optimal locations."""
    # Safety Check: Cannot request more clusters than data points (neighborhoods)
    n_samples = len(df_features)
    n_clusters = n
    if n > n_samples:
        print(f"Warning: Requested {n} hubs but only {n_samples} neighborhoods available. Capping optimization at {n_samples}.")
        n_clusters = n_samples
        
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    kmeans.fit(df_features, sample_weight=weights)
    return kmeans.cluster_centers_

def evaluate_coverage(locations, gdf_target, radius_deg=0.005):
    """
    Simple evaluation: Sum metrics of neighborhoods that are 'close' to a charger.
    radius_deg: Approx 500m radius (~0.005 deg)
    """
    # Let's map points to containing polygons
    loc_df = pd.DataFrame(locations, columns=['lat', 'lng'])
    loc_gdf = gpd.GeoDataFrame(loc_df, geometry=gpd.points_from_xy(loc_df.lng, loc_df.lat), crs=gdf_target.crs)
    
    # Spatial Join: Find which barrio each charger falls into
    joined = gpd.sjoin(loc_gdf, gdf_target[['geometry', 'Barri_ID', 'Unmet_Demand', 'EV_Count', 'Total_Vehicles']], how='left', predicate='within')
    
    # Unique Barrios served (a barrio with 2 chargers counts once for coverage, effectively)
    served_stats = joined.drop_duplicates(subset=['Barri_ID'])
    
    total_pop_served = served_stats['Total_Vehicles'].sum()
    total_unmet_covered = served_stats['Unmet_Demand'].sum()
    
    return {
        'served_barrios_count': len(served_stats),
        'population_served': total_pop_served,
        'unmet_demand_captured': total_unmet_covered
    }

## Run Simulation Loop

In [6]:
# Config
SCENARIOS = [10, 25, 50] # Adjusted to stay safely under 73 limit
results = []
locations_export = []

X_coords = gdf[['lat', 'lng']].values
W_weights = gdf['Unmet_Demand'].fillna(0).values

for n in SCENARIOS:
    print(f"--- Simulating N={n} --- ")
    
    # 1. Baseline (Population Weighted)
    base_locs = generate_population_weighted_locations(n, gdf)
    base_metrics = evaluate_coverage(base_locs, gdf)
    
    results.append({
        'N_Chargers': n,
        'Strategy': 'Baseline',
        **base_metrics
    })
    
    # Add to Export List
    for i, loc in enumerate(base_locs):
        locations_export.append({
            'Scenario_ID': f'Baseline_{n}',
            'Type': 'Baseline',
            'N_Chargers': n,
            'Lat': loc[0],
            'Lng': loc[1],
            'Hub_ID': i+1
        })

    # 2. Smart (Unmet Demand)
    smart_locs = generate_smart_locations(n, X_coords, W_weights)
    smart_metrics = evaluate_coverage(smart_locs, gdf)
    
    results.append({
        'N_Chargers': n,
        'Strategy': 'Smart',
        **smart_metrics
    })
    
    # Add to Export List
    for i, loc in enumerate(smart_locs):
        locations_export.append({
            'Scenario_ID': f'Smart_{n}',
            'Type': 'Smart',
            'N_Chargers': n,
            'Lat': loc[0],
            'Lng': loc[1],
            'Hub_ID': i+1
        })
    
print("Simulation Complete.")

--- Simulating N=10 --- 
--- Simulating N=25 --- 
--- Simulating N=50 --- 
Simulation Complete.


## Analysis and KPI Calculation

In [7]:
df_res = pd.DataFrame(results)

# Pivot to compare side-by-side
df_pivot = df_res.pivot(index='N_Chargers', columns='Strategy', values=['population_served', 'unmet_demand_captured'])

# Calculate Efficiency Gains
df_pivot['Pop_Gain_Pct'] = (df_pivot[('population_served', 'Smart')] - df_pivot[('population_served', 'Baseline')]) / df_pivot[('population_served', 'Baseline')] * 100
df_pivot['Demand_Gain_Pct'] = (df_pivot[('unmet_demand_captured', 'Smart')] - df_pivot[('unmet_demand_captured', 'Baseline')]) / df_pivot[('unmet_demand_captured', 'Baseline')] * 100

display(df_pivot)

# Save Results
# 1. Scenarios File
df_locs_export = pd.DataFrame(locations_export)
df_locs_export.to_csv('../data/processed/tableau_scenarios.csv', index=False)

# 2. KPIs File
df_kpis = df_res.copy()
# Just simple flattening for Tableau
df_kpis['Scenario_ID'] = df_kpis['Strategy'] + '_' + df_kpis['N_Chargers'].astype(str)
df_kpis.to_csv('../data/processed/tableau_kpis.csv', index=False)

# 3. Master Barrio File (ensure it has metrics)
gdf_export = gdf.drop(columns=['geometry', 'centroid'], errors='ignore') # simple CSV for data attributes
gdf_export.to_csv('../data/processed/tableau_barrios_master.csv', index=False)

print("Exported CSVs for Tableau.")

Unnamed: 0_level_0,population_served,population_served,unmet_demand_captured,unmet_demand_captured,Pop_Gain_Pct,Demand_Gain_Pct
Strategy,Baseline,Smart,Baseline,Smart,Unnamed: 5_level_1,Unnamed: 6_level_1
N_Chargers,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
10,117510.0,150517.0,205.942598,377.498803,28.088673,83.302923
25,295684.0,372195.0,551.63579,886.587708,25.875935,60.719758
50,485927.0,655681.0,943.27884,1382.646539,34.934054,46.578772


Exported CSVs for Tableau.
