# Dublin Bus Real-Time Data Analysis

This notebook analyzes real-time bus data collected from Transport for Ireland's GTFS-Realtime API.

**Data Sources:**
- Vehicle positions (GPS coordinates)
- Trip updates (delays)

**Operators covered:** Dublin Bus, Bus Éireann, Go-Ahead Ireland

In [None]:
import sqlite3
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import folium
from folium.plugins import HeatMap, MarkerCluster
from datetime import datetime, timedelta
import sys
sys.path.append('../src')
from config import DATABASE_PATH

# Display settings: lift pandas' limits on column count and display width
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

## 1. Load Data

In [None]:
# Connect to database
conn = sqlite3.connect(DATABASE_PATH)

# Read both tables inside try/finally so the connection is always released,
# even when a query fails (e.g. a table does not exist yet). Note that using
# a sqlite3 connection as a context manager only manages transactions; it
# does not close the connection, hence the explicit close() below.
try:
    # Load vehicle positions, newest rows first
    df_positions = pd.read_sql("""
        SELECT * FROM vehicle_positions
        ORDER BY collected_at DESC
    """, conn)

    # Load trip updates, newest rows first
    df_updates = pd.read_sql("""
        SELECT * FROM trip_updates
        ORDER BY collected_at DESC
    """, conn)
finally:
    conn.close()

print(f"Vehicle positions: {len(df_positions):,} records")
print(f"Trip updates: {len(df_updates):,} records")

In [None]:
# Preview the first five vehicle position rows
df_positions.head(n=5)

In [None]:
# Parse the collection timestamps of both tables into pandas datetimes
for frame in (df_positions, df_updates):
    frame['collected_at'] = pd.to_datetime(frame['collected_at'])

# Headline figures for the vehicle position table
collection_times = df_positions['collected_at']
vehicle_total = df_positions['vehicle_id'].nunique()
route_total = df_positions['route_id'].nunique()

print(f"Data collection period: {collection_times.min()} to {collection_times.max()}")
print(f"Unique vehicles: {vehicle_total}")
print(f"Unique routes: {route_total}")

## 2. Live Bus Map

Interactive map showing the most recent recorded position of each bus.

In [None]:
# Get the latest position row for each vehicle.
# groupby(...).last() is deliberately avoided here: it returns the last
# *non-null* value of every column independently, so a vehicle whose newest
# row has a missing coordinate would get a latitude, longitude or route
# stitched together from different points in time. Keeping the whole most
# recent row per vehicle guarantees all fields belong to the same observation.
latest_positions = (
    df_positions
    .dropna(subset=['vehicle_id'])                      # rows without a vehicle cannot be attributed
    .sort_values('collected_at', kind='stable')         # oldest -> newest
    .drop_duplicates(subset='vehicle_id', keep='last')  # newest row per vehicle
    .sort_values('vehicle_id')
    .reset_index(drop=True)
)

# Create map centered on Dublin
m = folium.Map(location=[53.3498, -6.2603], zoom_start=11, tiles='cartodbpositron')

# Add marker cluster so dense areas collapse into a single expandable marker
marker_cluster = MarkerCluster().add_to(m)

# Add a marker for each bus whose latest observation has valid coordinates
for _, row in latest_positions.iterrows():
    if pd.notna(row['latitude']) and pd.notna(row['longitude']):
        folium.CircleMarker(
            location=[row['latitude'], row['longitude']],
            radius=5,
            color='blue',
            fill=True,
            fill_opacity=0.7,
            popup=f"Vehicle: {row['vehicle_id']}<br>Route: {row['route_id']}"
        ).add_to(marker_cluster)

# Last expression of the cell: render the map inline
m

## 3. Bus Density Heatmap

Shows areas with highest bus activity.

In [None]:
# Collect [latitude, longitude] pairs for the heat layer, skipping rows with a missing coordinate
coordinate_pairs = latest_positions[['latitude', 'longitude']].dropna()
heat_data = coordinate_pairs.to_numpy().tolist()

# Dark basemap centred on Dublin with the heat layer on top
m_heat = folium.Map(
    location=[53.3498, -6.2603],
    zoom_start=11,
    tiles='cartodbdark_matter',
)
heat_layer = HeatMap(heat_data, radius=15, blur=10, max_zoom=13)
heat_layer.add_to(m_heat)

m_heat

## 4. Fleet Activity Over Time

In [None]:
# Number of distinct vehicles seen at each collection timestamp
activity = (
    df_positions
    .groupby('collected_at')['vehicle_id']
    .nunique()
    .rename_axis('time')
    .reset_index(name='active_vehicles')
)

# Line chart of fleet size over the collection period
axis_labels = {'active_vehicles': 'Number of Active Buses', 'time': 'Time'}
fig = px.line(
    activity,
    x='time',
    y='active_vehicles',
    title='Active Buses Over Time',
    labels=axis_labels,
)
fig.update_layout(template='plotly_dark')
fig.show()

## 5. Delay Analysis

In [None]:
if len(df_updates) == 0:
    print("No delay data available yet")
else:
    # Add minute-scaled copies of both delay columns (raw value divided by 60)
    df_updates['arrival_delay_mins'] = df_updates['arrival_delay'].div(60)
    df_updates['departure_delay_mins'] = df_updates['departure_delay'].div(60)

    # Histogram of arrival delays
    fig = px.histogram(
        df_updates,
        x='arrival_delay_mins',
        nbins=50,
        title='Distribution of Arrival Delays',
        labels={'arrival_delay_mins': 'Delay (minutes)'},
    )
    fig.update_layout(template='plotly_dark')
    fig.show()

    # Summary statistics; "on-time" counts rows whose absolute delay is under one minute
    arrival_mins = df_updates['arrival_delay_mins']
    on_time_share = arrival_mins.abs().lt(1).mean() * 100

    print(f"\nDelay Statistics:")
    print(f"Average delay: {arrival_mins.mean():.1f} minutes")
    print(f"Median delay: {arrival_mins.median():.1f} minutes")
    print(f"Max delay: {arrival_mins.max():.1f} minutes")
    print(f"On-time (< 1 min delay): {on_time_share:.1f}%")

## 6. Route Analysis

In [None]:
# Most active routes
route_counts = df_positions.groupby('route_id').size().sort_values(ascending=False).head(20)

fig = px.bar(
    x=route_counts.index, 
    y=route_counts.values,
    title='Top 20 Most Active Routes',
    labels={'x': 'Route ID', 'y': 'Number of Position Updates'}
)
fig.update_layout(template='plotly_dark', xaxis_tickangle=-45)
fig.show()

## 7. Summary Dashboard

In [None]:
# Create summary dashboard
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Active Vehicles Over Time', 'Top Routes', 
                    'Delay Distribution', 'Geographic Coverage'),
    specs=[[{"type": "scatter"}, {"type": "bar"}],
           [{"type": "histogram"}, {"type": "scattergeo"}]]
)

# Active vehicles
fig.add_trace(
    go.Scatter(x=activity['time'], y=activity['active_vehicles'], mode='lines', name='Active Buses'),
    row=1, col=1
)

# Top routes
fig.add_trace(
    go.Bar(x=route_counts.index[:10], y=route_counts.values[:10], name='Route Activity'),
    row=1, col=2
)

# Delay distribution (if available)
if len(df_updates) > 0:
    fig.add_trace(
        go.Histogram(x=df_updates['arrival_delay_mins'], nbinsx=30, name='Delays'),
        row=2, col=1
    )

fig.update_layout(
    height=800, 
    title_text="Dublin Bus Real-Time Analytics Dashboard",
    template='plotly_dark',
    showlegend=False
)

fig.show()

## 8. Export Summary Stats

In [None]:
# Summary statistics
summary = {
    'total_position_records': len(df_positions),
    'total_update_records': len(df_updates),
    'unique_vehicles': df_positions['vehicle_id'].nunique(),
    'unique_routes': df_positions['route_id'].nunique(),
    'data_start': str(df_positions['collected_at'].min()),
    'data_end': str(df_positions['collected_at'].max()),
    'avg_delay_mins': df_updates['arrival_delay_mins'].mean() if len(df_updates) > 0 else None,
    'on_time_percentage': (df_updates['arrival_delay_mins'].abs() < 1).mean()*100 if len(df_updates) > 0 else None
}

print("\n" + "="*50)
print("DUBLIN BUS DATA SUMMARY")
print("="*50)
for key, value in summary.items():
    print(f"{key}: {value}")