# 01 Data Preprocessing & H3 Aggregation
## Privacy-Preserving Geospatial Processing

This notebook bins trip coordinates into H3 hexagons and then applies k-anonymity filtering on (hexagon, hour) bins, producing a privacy-preserving geospatial aggregation.

In [None]:
# Data handling, H3 geospatial indexing and plotting libraries used by the
# cells in this notebook.
import pandas as pd
import numpy as np
import h3
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
# Suppress every Python warning to keep the notebook output compact.
# Note that this also hides deprecation notices from pandas / h3.
warnings.filterwarnings('ignore')

# Environment sanity check: confirm the imports succeeded and show which h3
# build is installed.
# NOTE(review): the cells below call h3.geo_to_h3, h3.hex_area and
# h3.edge_length, which appear to be h3-py 3.x names — confirm the printed
# major version matches.
print("H3 Preprocessing Pipeline - Ready!")
print(f"H3 version: {h3.__version__}")

### Load Processed Data

In [None]:
# Read the trips written by the previous step and convert the timestamp
# column to pandas datetimes in the same expression.
df = pd.read_csv("../data/processed/01_ingested_trips.csv").assign(
    timestamp=lambda trips: pd.to_datetime(trips['timestamp'])
)

# Quick summary of what was loaded.
print(f"Loaded {len(df):,} trips for H3 processing")
print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")

### H3 Aggregation Functions

In [None]:
def add_h3_hexagons(df: pd.DataFrame, resolution: int = 9) -> pd.DataFrame:
    """Return a copy of ``df`` with an H3 cell ID and coarse time columns added.

    Args:
        df: Trip records. Must contain ``lat`` and ``lon`` columns and a
            datetime-typed ``timestamp`` column (the ``.dt`` accessor is used).
        resolution: H3 resolution passed to ``h3.geo_to_h3``.

    Returns:
        A new DataFrame (the input is not modified) with these columns added:
        ``h3_hex`` (H3 cell ID of each row's coordinates), ``hour``,
        ``day_of_week`` and ``date`` (all derived from ``timestamp``).
    """

    print(f"Converting coordinates to H3 hexagons (resolution {resolution})")

    df_h3 = df.copy()

    # Built with a plain comprehension over the two coordinate columns rather
    # than a row-wise DataFrame.apply: no per-row Series is constructed, and
    # an empty frame simply yields an empty column.
    df_h3['h3_hex'] = [
        h3.geo_to_h3(lat, lon, resolution)
        for lat, lon in zip(df_h3['lat'], df_h3['lon'])
    ]

    # Coarse temporal fields used for binning instead of the exact timestamp
    timestamps = df_h3['timestamp'].dt
    df_h3['hour'] = timestamps.hour
    df_h3['day_of_week'] = timestamps.dayofweek
    df_h3['date'] = timestamps.date

    print(f"Created {df_h3['h3_hex'].nunique():,} unique H3 hexagons")

    return df_h3

def apply_k_anonymity(df: pd.DataFrame, k: int = 5) -> pd.DataFrame:
    """Drop trips whose (hexagon, hour) bin contains fewer than ``k`` trips.

    Args:
        df: Trip records with ``h3_hex`` and ``hour`` columns.
        k: Minimum number of trips a spatio-temporal bin must hold to be kept.

    Returns:
        A new DataFrame holding only the rows from bins with at least ``k``
        trips. The input frame is left untouched, and the result never
        contains an ``st_bin`` helper column.
    """

    print(f"Applying k-anonymity (k={k})")

    # Spatio-temporal bin key, kept as a standalone Series so the caller's
    # DataFrame does not gain a helper column as a side effect.
    st_bin = df['h3_hex'].astype(str) + '_' + df['hour'].astype(str)

    # Count trips per bin
    bin_counts = st_bin.value_counts()

    # Keep only bins with k or more trips
    valid_bins = bin_counts[bin_counts >= k].index
    df_anonymous = df[st_bin.isin(valid_bins)].copy()

    total = len(df)
    removed_count = total - len(df_anonymous)
    # An empty input has nothing to remove; avoid dividing by zero.
    removed_pct = (removed_count / total) * 100 if total else 0.0

    print(f"Removed {removed_count:,} trips ({removed_pct:.1f}%) for k-anonymity")
    print(f"Remaining: {len(df_anonymous):,} trips in {len(valid_bins):,} valid bins")

    # A pre-existing 'st_bin' column is still stripped from the output, so the
    # returned columns match what callers received before.
    return df_anonymous.drop(columns='st_bin', errors='ignore')

# Pipeline parameters and execution: assign every trip to an H3 cell, then
# drop trips whose (hexagon, hour) bin holds fewer than K_ANONYMITY trips.
H3_RESOLUTION = 9  # roughly 0.1 km² per cell, edge a bit under 200 m (see the H3 resolution table)
K_ANONYMITY = 5  # minimum trips a (hexagon, hour) bin must contain to be kept

df_h3 = add_h3_hexagons(df, resolution=H3_RESOLUTION)
df_anonymous = apply_k_anonymity(df_h3, k=K_ANONYMITY)

### Privacy Analysis

In [None]:
def analyze_privacy_metrics(df_original: pd.DataFrame, df_anonymous: pd.DataFrame) -> dict:
    """Print and return summary metrics comparing the original and filtered trips.

    Reads the module-level ``H3_RESOLUTION`` constant for the cell geometry
    figures.

    Args:
        df_original: Trips before k-anonymity filtering. Needs either an
            ``h3_hex`` column or ``lat``/``lon`` columns.
        df_anonymous: Trips after filtering, with ``h3_hex`` and ``hour``
            columns.

    Returns:
        A dict with ``coverage_rate`` (percentage of trips retained),
        ``hex_area_km2``, ``anonymity_stats`` (``describe()`` of the trips per
        (hexagon, hour) bin) and ``spatial_coverage`` (number of distinct
        hexagons left after filtering).
    """

    print("\n=== PRIVACY ANALYSIS ===")

    # Coverage metrics (an empty original is reported as 0% instead of
    # raising ZeroDivisionError)
    original_coverage = len(df_original)
    anonymous_coverage = len(df_anonymous)
    coverage_rate = (
        (anonymous_coverage / original_coverage) * 100 if original_coverage else 0.0
    )

    print(f"Data Coverage: {coverage_rate:.1f}% ({anonymous_coverage:,} / {original_coverage:,} trips)")

    # Spatial resolution
    hex_area_km2 = h3.hex_area(H3_RESOLUTION, unit='km^2')
    hex_edge_length = h3.edge_length(H3_RESOLUTION, unit='m')

    print(f"H3 Resolution {H3_RESOLUTION}: {hex_area_km2:.4f} km² per hexagon")
    print(f"Edge length: {hex_edge_length:.1f}m")

    # Anonymity distribution: trips per (hexagon, hour) bin
    st_bins = df_anonymous['h3_hex'].astype(str) + '_' + df_anonymous['hour'].astype(str)
    bin_sizes = st_bins.value_counts()

    print(f"\nAnonymity Distribution:")
    print(f"Min trips per bin: {bin_sizes.min()}")
    print(f"Max trips per bin: {bin_sizes.max()}")
    print(f"Avg trips per bin: {bin_sizes.mean():.1f}")
    print(f"Median trips per bin: {bin_sizes.median():.1f}")

    # Geographic spread
    unique_hexes = df_anonymous['h3_hex'].nunique()
    if 'h3_hex' in df_original.columns:
        # Reuse the cell IDs already attached to the original trips instead of
        # re-deriving one per row; this also keeps both counts on the same
        # cell assignment.
        total_possible_hexes = df_original['h3_hex'].nunique()
    else:
        total_possible_hexes = len({
            h3.geo_to_h3(lat, lon, H3_RESOLUTION)
            for lat, lon in zip(df_original['lat'], df_original['lon'])
        })
    hex_utilization = (
        (unique_hexes / total_possible_hexes) * 100 if total_possible_hexes else 0.0
    )

    print(f"\nSpatial Coverage:")
    print(f"Active hexagons: {unique_hexes:,}")
    print(f"Total hexagons: {total_possible_hexes:,}")
    print(f"Hex utilization: {hex_utilization:.1f}%")

    return {
        'coverage_rate': coverage_rate,
        'hex_area_km2': hex_area_km2,
        'anonymity_stats': bin_sizes.describe(),
        'spatial_coverage': unique_hexes
    }

# Compare the H3-tagged trips with the k-anonymous subset; the returned dict
# is reused by the export cell.
privacy_metrics = analyze_privacy_metrics(df_h3, df_anonymous)

### H3 Visualization

In [None]:
# Four-panel figure contrasting the H3-tagged trips with the k-anonymous subset
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(ax_coverage, ax_hex_density), (ax_bin_sizes, ax_hourly) = axes
threshold_label = f'K-anonymity threshold ({K_ANONYMITY})'

# Panel 1 (top-left): scatter of up to 1000 sampled trips from each dataset
sample_orig = df_h3.sample(min(1000, len(df_h3)))
sample_anon = df_anonymous.sample(min(1000, len(df_anonymous)))

for sample, opacity, series_name, colour in (
    (sample_orig, 0.5, 'Original', 'red'),
    (sample_anon, 0.7, 'Anonymous', 'green'),
):
    ax_coverage.scatter(sample['lon'], sample['lat'], alpha=opacity, s=2,
                        label=series_name, color=colour)
ax_coverage.set_xlabel('Longitude')
ax_coverage.set_ylabel('Latitude')
ax_coverage.set_title('Data Coverage: Original vs K-Anonymous')

# Panel 2 (top-right): how many trips each remaining hexagon holds
hex_counts = df_anonymous['h3_hex'].value_counts()
ax_hex_density.hist(hex_counts.values, bins=50, alpha=0.7, color='green', edgecolor='black')
ax_hex_density.set_xlabel('Trips per Hexagon')
ax_hex_density.set_ylabel('Number of Hexagons')
ax_hex_density.set_title('Trip Density Distribution by H3 Hexagon')

# Panel 3 (bottom-left): how many trips each (hexagon, hour) bin holds
st_bins = df_anonymous['h3_hex'].astype(str) + '_' + df_anonymous['hour'].astype(str)
bin_sizes = st_bins.value_counts()
ax_bin_sizes.hist(bin_sizes.values, bins=30, alpha=0.7, color='blue', edgecolor='black')
ax_bin_sizes.set_xlabel('Trips per Spatio-Temporal Bin')
ax_bin_sizes.set_ylabel('Number of Bins')
ax_bin_sizes.set_title('K-Anonymity Distribution (Hex + Hour bins)')

# Both histograms get the same dashed marker at the k threshold
for histogram_ax in (ax_hex_density, ax_bin_sizes):
    histogram_ax.axvline(K_ANONYMITY, color='red', linestyle='--', label=threshold_label)

# Panel 4 (bottom-right): trips per hour of day, before and after filtering
hourly_original = df_h3.groupby('hour').size()
hourly_anonymous = df_anonymous.groupby('hour').size()

x = range(24)
for hourly_counts, shift, series_name, colour in (
    (hourly_original, -0.2, 'Original', 'red'),
    (hourly_anonymous, 0.2, 'K-Anonymous', 'green'),
):
    # Hours with no trips are absent from the groupby result; plot them as 0
    ax_hourly.bar([hr + shift for hr in x],
                  [hourly_counts.get(hr, 0) for hr in x],
                  width=0.4, label=series_name, alpha=0.7, color=colour)
ax_hourly.set_xlabel('Hour of Day')
ax_hourly.set_ylabel('Number of Trips')
ax_hourly.set_title('Hourly Trip Distribution: Original vs K-Anonymous')

# Shared finishing touches for every panel
for panel in axes.flat:
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("H3 visualization complete!")

### Export Anonymized Data

In [None]:
# Export the anonymized trips and a JSON summary of the privacy metrics
import json

output_path = "../data/processed/02_h3_anonymous.csv"
df_anonymous.to_csv(output_path, index=False)

metrics_path = "../data/processed/02_privacy_metrics.json"

# describe() reports NaN for statistics that are undefined (e.g. the standard
# deviation when only one bin survived, or most statistics when none did).
# json.dump would write those as bare NaN tokens, which strict JSON parsers
# reject, so they are stored as null instead.
anonymity_stats = {
    stat: (None if pd.isna(value) else float(value))
    for stat, value in privacy_metrics['anonymity_stats'].items()
}

# Convert to serializable format
metrics_export = {
    'h3_resolution': H3_RESOLUTION,
    'k_anonymity': K_ANONYMITY,
    'coverage_rate': privacy_metrics['coverage_rate'],
    'hex_area_km2': privacy_metrics['hex_area_km2'],
    'spatial_coverage': privacy_metrics['spatial_coverage'],
    'anonymity_stats': anonymity_stats,
}

with open(metrics_path, 'w', encoding='utf-8') as f:
    json.dump(metrics_export, f, indent=2)

print("\n=== PREPROCESSING COMPLETE ====")
print(f"✅ {len(df_anonymous):,} trips anonymized successfully")
print(f"✅ Privacy coverage: {privacy_metrics['coverage_rate']:.1f}%")
# hex_area_km2 is in km²; multiply by 1e6 to report m²
print(f"✅ H3 resolution: {H3_RESOLUTION} (~{privacy_metrics['hex_area_km2']*1000000:.0f}m² hexagons)")
print(f"✅ K-anonymity: {K_ANONYMITY} minimum trips per bin")
print(f"✅ Data exported to: {output_path}")
print(f"✅ Metrics saved to: {metrics_path}")
print("\nReady for next step: 02_exploratory.ipynb")