In [None]:
import json
import pandas as pd
import numpy as np
import requests
from pathlib import Path
from datetime import datetime, timedelta
# %% [markdown]
## Step 1: Fetch Subway Data from API

# First, we'll call the API endpoint to export subway stop data to a JSON file.

In [None]:
# Ask the backend to export the subway stop data (the server must be running).
API_BASE_URL = 'http://localhost:8000'
# Seconds to wait for the server. Without an explicit timeout, requests waits
# forever on an unresponsive server and the notebook cell never finishes.
API_TIMEOUT_SECONDS = 60

try:
    response = requests.post(
        f'{API_BASE_URL}/api/mbta/export-subway-data',
        timeout=API_TIMEOUT_SECONDS,
    )
    if response.status_code == 200:
        # The response body is expected to carry 'total_routes' and 'routes'.
        result = response.json()
        print(f"Successfully exported data for {result['total_routes']} routes")
        print(f"Routes: {', '.join(result['routes'])}")
        # NOTE(review): presumably the file the endpoint writes to - confirm
        # against the server implementation.
        data_file = Path('subway_stops_data.json')
    else:
        print(f"Error: {response.status_code}")
except requests.exceptions.ConnectionError:
    print("Server not running. Please start the FastAPI server with: uvicorn main:app --reload")
except requests.exceptions.Timeout:
    # Raised when the server accepted the connection but did not answer in time.
    print(f"Request timed out after {API_TIMEOUT_SECONDS} seconds. Is the server responding?")
# %% [markdown]
## Step 2: Load and Explore Subway Data

In [None]:
# Load the exported subway data.
# JSON interchange files are UTF-8; passing the encoding explicitly keeps
# non-ASCII names from being mis-decoded on platforms whose default text
# encoding is something else (e.g. cp1252 on Windows).
with open('subway_stops_data.json', 'r', encoding='utf-8') as f:
    subway_data = json.load(f)

# subway_data['routes'] is a list of route records with 'id' and 'name';
# subway_data['stops'] maps a route id to that route's list of stops.
print(f"Total routes: {len(subway_data['routes'])}")
print("\nRoutes:")
for route in subway_data['routes']:
    route_id = route['id']
    stop_count = len(subway_data['stops'][route_id])
    print(f"  - {route['name']} ({route_id}): {stop_count} stops")
# %% [markdown]
## Step 3: Generate Synthetic Rider Data

# Generate realistic rider counts for each stop based on:
# - Time of day (rush hour vs off-peak)
# - Day of week (weekday vs weekend)
# - Random variation to simulate real-world patterns

In [None]:
def generate_rider_data(subway_data, hours=24, days=7, seed=None, reference_time=None):
    """Generate synthetic hourly rider counts for every stop of every route.

    Each stop gets a random base ridership, which is then scaled per time slot
    by an hour-of-day multiplier, a day-of-week multiplier and a random factor
    in [0.8, 1.2).

    Time slots are anchored to calendar midnights: the simulated window covers
    the ``days`` complete days that end the day before ``reference_time``, and
    each day contributes slots 0 .. ``hours - 1`` counted from its midnight.
    The ``hour``, ``day_of_week``, ``is_rush_hour`` and ``is_weekend`` columns
    (and the multipliers applied) are derived from the slot's timestamp, so
    they always agree with the ``timestamp`` column. With ``hours`` above 24
    the extra slots roll over into the following calendar day.

    Args:
        subway_data: Mapping with a ``'routes'`` list (records with ``'id'``
            and ``'name'``) and a ``'stops'`` mapping from route id to a list
            of stop records with ``'id'``, ``'name'``, ``'latitude'`` and
            ``'longitude'``.
        hours: Number of hourly slots generated per day.
        days: Number of days generated.
        seed: Optional integer seed. When given, draws come from a private
            ``numpy.random.RandomState(seed)`` so the output is reproducible.
            When ``None``, the global ``numpy.random`` state is used.
        reference_time: Optional ``datetime`` that anchors the window.
            Defaults to ``datetime.now()``, read once per call.

    Returns:
        pandas.DataFrame with one row per (stop, day, hour) slot and the
        columns stop_id, stop_name, route_id, route_name, latitude, longitude,
        timestamp (ISO-8601 string), hour, day_of_week (0=Monday .. 6=Sunday),
        rider_count, is_rush_hour, is_weekend. The columns are present even
        when no rows are generated.

    Raises:
        KeyError: If a route id has no entry in ``subway_data['stops']`` or a
            record lacks one of the expected keys.
    """

    # Time multipliers for different hours (rush hour = higher traffic)
    hour_multipliers = {
        5: 0.6, 6: 0.8,
        7: 1.5, 8: 2.0, 9: 1.5,
        10: 0.8, 11: 0.9, 12: 1.0, 13: 0.9, 14: 0.8, 15: 0.9,
        16: 1.3, 17: 1.8, 18: 2.0, 19: 1.4,
        20: 0.7, 21: 0.5, 22: 0.4, 23: 0.3,
        0: 0.2, 1: 0.1, 2: 0.1, 3: 0.1, 4: 0.3
    }

    # Day of week multipliers (0=Monday, 6=Sunday)
    day_multipliers = [1.0, 1.0, 1.0, 1.0, 1.1, 0.7, 0.6]

    rush_hours = (7, 8, 9, 17, 18, 19)
    weekend_days = (5, 6)

    # Passed to the DataFrame constructor so an empty result still has the
    # schema that downstream groupby/aggregation code expects.
    columns = [
        'stop_id', 'stop_name', 'route_id', 'route_name',
        'latitude', 'longitude', 'timestamp', 'hour', 'day_of_week',
        'rider_count', 'is_rush_hour', 'is_weekend',
    ]

    # RandomState exposes the same randint/uniform API as the numpy.random
    # module, so both sources can be used interchangeably below.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Read the clock once so every row of one call shares the same anchor.
    if reference_time is None:
        reference_time = datetime.now()
    today_midnight = reference_time.replace(hour=0, minute=0, second=0, microsecond=0)
    window_start = today_midnight - timedelta(days=days)

    all_stops_data = []

    for route in subway_data['routes']:
        route_id = route['id']
        route_name = route['name']
        stops = subway_data['stops'][route_id]

        for stop in stops:
            # Per-stop baseline: busier and quieter stations.
            base_riders = rng.randint(50, 300)

            for day in range(days):
                for hour in range(hours):
                    timestamp = window_start + timedelta(days=day, hours=hour)
                    # Derive the calendar fields from the timestamp itself so
                    # the labels can never disagree with it.
                    slot_hour = timestamp.hour
                    weekday = timestamp.weekday()

                    time_mult = hour_multipliers.get(slot_hour, 0.5)
                    day_mult = day_multipliers[weekday]
                    random_factor = rng.uniform(0.8, 1.2)
                    riders = int(base_riders * time_mult * day_mult * random_factor)

                    all_stops_data.append({
                        'stop_id': stop['id'],
                        'stop_name': stop['name'],
                        'route_id': route_id,
                        'route_name': route_name,
                        'latitude': stop['latitude'],
                        'longitude': stop['longitude'],
                        'timestamp': timestamp.isoformat(),
                        'hour': slot_hour,
                        'day_of_week': weekday,
                        'rider_count': riders,
                        'is_rush_hour': slot_hour in rush_hours,
                        'is_weekend': weekday in weekend_days
                    })

    return pd.DataFrame(all_stops_data, columns=columns)

# Build one week of hourly synthetic ridership for every stop, then report
# how large the resulting frame is.
print("Generating rider data...")
rider_df = generate_rider_data(
    subway_data,
    hours=24,
    days=7,
)
print(f"Generated {len(rider_df):,} data points")
print(f"\nDataset shape: {rider_df.shape}")

# Last expression of the cell: rendered by the notebook as a preview table.
rider_df.head(10)
# %% [markdown]
## Step 4: Data Analysis and Statistics

In [None]:
def _mean_riders_by(column):
    """Mean rider_count for each distinct value of `column`, rounded to 0 decimals."""
    return rider_df.groupby(column)['rider_count'].mean().round(0)


# Distribution of the per-slot rider counts across the whole dataset.
print("=== Rider Count Statistics ===")
print(rider_df['rider_count'].describe())

# Average load for each hour of the day.
print("\n=== Average Riders by Hour ===")
hourly_avg = _mean_riders_by('hour')
print(hourly_avg)

# Ten (stop, route) pairs with the highest summed rider counts.
print("\n=== Busiest Stops (Total Riders) ===")
busiest_stops = (
    rider_df
    .groupby(['stop_name', 'route_name'])['rider_count']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
print(busiest_stops)

print("\n=== Rush Hour vs Off-Peak ===")
print(_mean_riders_by('is_rush_hour'))

print("\n=== Weekday vs Weekend ===")
print(_mean_riders_by('is_weekend'))
# %% [markdown]
## Step 5: Export Data

In [None]:
# 1) Full dataset as CSV (no index column).
output_csv = 'mbta_rider_data.csv'
rider_df.to_csv(output_csv, index=False)
print(f"Exported to {output_csv}")

# 2) Full dataset as a JSON array of row records.
output_json = 'mbta_rider_data.json'
rider_df.to_json(output_json, orient='records', indent=2)
print(f"Exported to {output_json}")

# 3) Per-stop summary: the mean rider count of each stop over the dataset.
# The rounding is applied while the stop attributes still form the index, so
# only the average is rounded - latitude and longitude keep full precision.
current_snapshot = (
    rider_df
    .groupby(['stop_id', 'stop_name', 'route_id', 'route_name', 'latitude', 'longitude'])['rider_count']
    .mean()
    .round(0)
    .reset_index(name='avg_riders')
)

output_snapshot = 'mbta_rider_snapshot.json'
snapshot_payload = {
    'timestamp': datetime.now().isoformat(),
    'stops': current_snapshot.to_dict('records'),
}
with open(output_snapshot, 'w') as f:
    json.dump(snapshot_payload, f, indent=2)

print(f"Exported snapshot to {output_snapshot}")
print(f"\nTotal files created: 3")