# Accra Public Transport Analysis

This notebook demonstrates AI-assisted analysis techniques for optimizing public transport in Accra, Ghana, using a small illustrative sample dataset (10 bus stops and 5 routes).

## Objectives
- Analyze passenger demand patterns
- Optimize route frequencies
- Identify efficiency improvements
- Generate actionable insights for city planners

In [None]:
import warnings

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from ortools.constraint_solver import pywrapcp
from ortools.constraint_solver import routing_enums_pb2
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import LeaveOneOut, cross_val_predict
from sklearn.preprocessing import StandardScaler
# Silence library warnings so they do not clutter the rendered notebook output.
warnings.filterwarnings('ignore')

# The seaborn-derived matplotlib style is called 'seaborn-v0_8' from matplotlib 3.6
# onward and plain 'seaborn' before that. Fall back so the notebook runs on both.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
sns.set_palette("husl")

## 1. Data Loading and Preparation

In [None]:
# Sample GTFS-like data for Accra, written one record per row so that each
# stop's / route's attributes can be read (and edited) together.
stop_columns = ['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'daily_passengers']
stop_records = [
    ('ST001', 'Kwame Nkrumah Circle', 5.5600, -0.2000, 15000),
    ('ST002', 'Kaneshie Market', 5.5450, -0.2300, 12000),
    ('ST003', 'Achimota Station', 5.6050, -0.2200, 8000),
    ('ST004', 'Tema Station', 5.6700, -0.0800, 10000),
    ('ST005', 'Madina Market', 5.6800, -0.1600, 7000),
    ('ST006', 'Lapaz', 5.6000, -0.2500, 6000),
    ('ST007', 'Dansoman', 5.5300, -0.2800, 5000),
    ('ST008', 'Airport', 5.6050, -0.1700, 9000),
    ('ST009', 'University of Ghana', 5.6500, -0.1900, 4000),
    ('ST010', 'Nungua', 5.5900, -0.0600, 3000),
]

# current_frequency and avg_travel_time are expressed in minutes.
route_columns = ['route_id', 'route_name', 'start_stop', 'end_stop',
                 'current_frequency', 'vehicle_capacity', 'avg_travel_time', 'efficiency_score']
route_records = [
    ('R001', 'Circle-Kaneshie', 'ST001', 'ST002', 5, 80, 45, 0.7),
    ('R002', 'Achimota-Tema', 'ST003', 'ST004', 8, 60, 90, 0.6),
    ('R003', 'Madina-Airport', 'ST005', 'ST008', 12, 50, 60, 0.8),
    ('R004', 'Lapaz-Nungua', 'ST006', 'ST010', 15, 40, 75, 0.5),
    ('R005', 'Dansoman-UG', 'ST007', 'ST009', 10, 70, 85, 0.65),
]

# Transpose the records into the column-oriented dicts the DataFrames are built from.
stops_data = {column: list(values) for column, values in zip(stop_columns, zip(*stop_records))}
routes_data = {column: list(values) for column, values in zip(route_columns, zip(*route_records))}

stops_df = pd.DataFrame(stops_data)
routes_df = pd.DataFrame(routes_data)

for heading, frame in (("Bus Stops Data:", stops_df), ("\nRoutes Data:", routes_df)):
    print(heading)
    display(frame)

## 2. Demand Pattern Analysis

In [None]:
# Visualize passenger demand: per-stop bars on the left, a geographic bubble plot on the right.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
title_style = {'fontsize': 14, 'fontweight': 'bold'}
daily_demand = stops_df['daily_passengers']

# Left panel: daily passengers for each stop.
ax1.bar(stops_df['stop_name'], daily_demand, color='skyblue', alpha=0.8)
ax1.set_title('Daily Passengers by Bus Stop', **title_style)
ax1.set_xlabel('Bus Stops')
ax1.set_ylabel('Daily Passengers')
ax1.tick_params(axis='x', rotation=45)

# Right panel: stop coordinates, with marker size and colour both encoding demand.
scatter = ax2.scatter(
    stops_df['stop_lon'],
    stops_df['stop_lat'],
    s=daily_demand / 50,
    c=daily_demand,
    cmap='viridis',
    alpha=0.7,
)
ax2.set_title('Bus Stop Locations by Demand', **title_style)
ax2.set_xlabel('Longitude')
ax2.set_ylabel('Latitude')
fig.colorbar(scatter, ax=ax2, label='Daily Passengers')

fig.tight_layout()
plt.show()

## 3. ML-Based Demand Clustering

In [None]:
# Prepare features for clustering
features = stops_df[['stop_lat', 'stop_lon', 'daily_passengers']].copy()

# Add synthetic time-based features (seeded so every run draws the same values)
np.random.seed(42)
features['hour_peak_factor'] = np.random.uniform(0.8, 1.5, len(features))
features['weekend_factor'] = np.random.uniform(0.6, 1.2, len(features))

# Standardize the two features that drive the clustering so neither dominates by scale
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features[['daily_passengers', 'hour_peak_factor']])

# Perform K-means clustering.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto'
# in version 1.4, so relying on the default can give different clusters per version.
# NOTE: the resulting labels (0, 1, 2) are arbitrary identifiers, not a demand ranking.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
stops_df['demand_cluster'] = kmeans.fit_predict(scaled_features)

# Visualize clusters
fig, (ax_clusters, ax_share) = plt.subplots(1, 2, figsize=(12, 5))

scatter = ax_clusters.scatter(stops_df['daily_passengers'], stops_df.index,
                              c=stops_df['demand_cluster'], cmap='Set1', s=100, alpha=0.8)
ax_clusters.set_title('Demand Clusters', fontsize=14, fontweight='bold')
ax_clusters.set_xlabel('Daily Passengers')
ax_clusters.set_ylabel('Stop Index')
fig.colorbar(scatter, ax=ax_clusters, label='Cluster')

cluster_counts = stops_df['demand_cluster'].value_counts().sort_index()
ax_share.pie(cluster_counts.values, labels=[f'Cluster {i}' for i in cluster_counts.index],
             autopct='%1.1f%%', startangle=90)
ax_share.set_title('Distribution of Bus Stops by Cluster', fontsize=14, fontweight='bold')

fig.tight_layout()
plt.show()

# Text summary: size, mean demand and member stops of each cluster
print("\nCluster Analysis:")
for cluster in sorted(stops_df['demand_cluster'].unique()):
    cluster_stops = stops_df[stops_df['demand_cluster'] == cluster]
    avg_demand = cluster_stops['daily_passengers'].mean()
    print(f"Cluster {cluster}: {len(cluster_stops)} stops, Avg daily passengers: {avg_demand:.0f}")
    print(f"  Stops: {', '.join(cluster_stops['stop_name'].tolist())}")

## 4. Route Optimization Analysis

In [None]:
def optimize_route_frequency(route_data, stops_data, operating_hours=16,
                             target_utilization=0.85, min_frequency=3, max_frequency=20):
    """Recommend a headway for each route from its demand and vehicle capacity.

    "Frequency" throughout is a headway: minutes between departures, so
    ``60 / frequency`` is trips per hour. Route demand is the mean of the daily
    passengers at the route's start and end stops.

    The recommended headway is the whole number of minutes, clamped to
    ``[min_frequency, max_frequency]``, whose daily capacity brings utilization
    closest to ``target_utilization``. Higher demand therefore yields a shorter
    headway (more buses), and larger vehicles allow a longer one.

    Args:
        route_data: DataFrame with columns ``route_name``, ``start_stop``,
            ``end_stop``, ``current_frequency`` and ``vehicle_capacity``.
        stops_data: DataFrame with columns ``stop_id`` and ``daily_passengers``.
        operating_hours: Service hours per day (default 16, i.e. 6 AM to 10 PM).
        target_utilization: Desired load factor as a fraction of capacity.
        min_frequency: Shortest headway, in minutes, that may be recommended.
        max_frequency: Longest headway, in minutes, that may be recommended.

    Returns:
        DataFrame with one row per route and the columns ``route_name``,
        ``current_frequency``, ``optimal_frequency``, ``current_utilization``
        and ``optimal_utilization`` (percent, 1 decimal), ``efficiency_gain``
        and ``avg_demand``. ``efficiency_gain`` is the number of percentage
        points by which the recommended headway moves utilization closer to
        the target; positive means improvement.

    Raises:
        IndexError: If a route references a stop id missing from ``stops_data``.
    """
    optimization_results = []

    for _, route in route_data.iterrows():
        # Get demand for start and end stops
        start_demand = stops_data[stops_data['stop_id'] == route['start_stop']]['daily_passengers'].iloc[0]
        end_demand = stops_data[stops_data['stop_id'] == route['end_stop']]['daily_passengers'].iloc[0]
        avg_demand = (start_demand + end_demand) / 2

        # Daily seats gained for every additional departure per hour.
        seats_per_hourly_trip = route['vehicle_capacity'] * operating_hours

        if avg_demand > 0:
            # Headway at which utilization equals the target exactly.
            ideal_headway = 60 * target_utilization * float(seats_per_hourly_trip) / float(avg_demand)
            clamped_headway = max(min_frequency, min(max_frequency, ideal_headway))
            # Utilization is linear in headway, so the nearest whole minute is
            # also the whole-minute headway closest to the target utilization.
            optimal_frequency = int(round(float(clamped_headway)))
        else:
            # No demand to serve: run as rarely as the limits allow.
            optimal_frequency = max_frequency

        # Calculate capacity utilization
        current_trips_per_hour = 60 / route['current_frequency']
        optimal_trips_per_hour = 60 / optimal_frequency

        current_daily_capacity = seats_per_hourly_trip * current_trips_per_hour
        optimal_daily_capacity = seats_per_hourly_trip * optimal_trips_per_hour

        current_utilization = avg_demand / current_daily_capacity
        optimal_utilization = avg_demand / optimal_daily_capacity

        # Gain = how much closer to the target load factor the route gets.
        # A raw utilization delta would score overloading a route as a "gain".
        efficiency_gain = (
            abs(current_utilization - target_utilization)
            - abs(optimal_utilization - target_utilization)
        ) * 100

        optimization_results.append({
            'route_name': route['route_name'],
            'current_frequency': route['current_frequency'],
            'optimal_frequency': optimal_frequency,
            'current_utilization': round(current_utilization * 100, 1),
            'optimal_utilization': round(optimal_utilization * 100, 1),
            'efficiency_gain': round(efficiency_gain, 1),
            'avg_demand': avg_demand
        })

    return pd.DataFrame(optimization_results)

# Run the optimizer over every route and show the resulting table
optimization_df = optimize_route_frequency(routes_df, stops_df)

print("Route Optimization Results:")
display(optimization_df)

# Two panels: current vs. optimal headway per route, and the gain per route
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
title_style = {'fontsize': 14, 'fontweight': 'bold'}

# Grouped bars: one pair per route, offset half a bar width either side of the tick
x = np.arange(len(optimization_df))
width = 0.35
bar_series = (
    (x - width/2, 'current_frequency', 'Current', 'lightcoral'),
    (x + width/2, 'optimal_frequency', 'Optimal', 'lightblue'),
)
for positions, column, label, bar_color in bar_series:
    ax1.bar(positions, optimization_df[column], width, label=label, alpha=0.8, color=bar_color)
ax1.set_title('Frequency Optimization', **title_style)
ax1.set_xlabel('Routes')
ax1.set_ylabel('Frequency (minutes)')
ax1.set_xticks(x)
ax1.set_xticklabels(optimization_df['route_name'], rotation=45)
ax1.legend()


def _gain_color(gain):
    """Green for a positive gain, red for a negative one, gray otherwise."""
    if gain > 0:
        return 'green'
    if gain < 0:
        return 'red'
    return 'gray'


# Efficiency gains, coloured by sign, with a dashed zero reference line
gains = optimization_df['efficiency_gain']
colors = [_gain_color(gain) for gain in gains]
ax2.bar(optimization_df['route_name'], gains, color=colors, alpha=0.8)
ax2.set_title('Efficiency Gains by Route', **title_style)
ax2.set_xlabel('Routes')
ax2.set_ylabel('Efficiency Gain (%)')
ax2.tick_params(axis='x', rotation=45)
ax2.axhline(y=0, color='black', linestyle='--', alpha=0.5)

fig.tight_layout()
plt.show()

positive_gain_count = int((gains > 0).sum())
print("\nOverall Analysis:")
print(f"Average efficiency gain: {gains.mean():.1f}%")
print(f"Routes with positive gains: {positive_gain_count}")
print(f"Total daily passengers affected: {optimization_df['avg_demand'].sum():.0f}")

## 5. Interactive Network Visualization

In [None]:
# Create interactive map
def create_transport_map(stops_df, routes_df):
    """Build a folium map of the stops (as demand-sized circles) and routes (as lines).

    Stop markers are coloured by demand tier, matching the legend: the cluster
    with the highest mean ``daily_passengers`` is red (High), the next is blue
    (Medium) and any remaining clusters are green (Low). The tier is derived from
    the data because clustering labels are arbitrary ids, not a demand ranking.

    Args:
        stops_df: DataFrame with columns ``stop_id``, ``stop_name``, ``stop_lat``,
            ``stop_lon``, ``daily_passengers`` and ``demand_cluster``.
        routes_df: DataFrame with columns ``route_name``, ``start_stop``,
            ``end_stop``, ``current_frequency``, ``avg_travel_time`` and
            ``efficiency_score``.

    Returns:
        The populated ``folium.Map``.
    """
    # Create base map centered on the mean stop coordinate
    center_lat = stops_df['stop_lat'].mean()
    center_lon = stops_df['stop_lon'].mean()
    m = folium.Map(location=[center_lat, center_lon], zoom_start=11)

    # Rank clusters from highest to lowest mean demand and give each its tier colour.
    # Clusters beyond the third share the last (low-demand) colour instead of
    # raising IndexError.
    tier_colors = ['red', 'blue', 'green']
    clusters_by_demand = (
        stops_df.groupby('demand_cluster')['daily_passengers']
        .mean()
        .sort_values(ascending=False)
        .index
    )
    cluster_colors = {
        cluster: tier_colors[min(rank, len(tier_colors) - 1)]
        for rank, cluster in enumerate(clusters_by_demand)
    }

    # Add stops to map; circle radius scales with daily passengers
    for _, stop in stops_df.iterrows():
        color = cluster_colors[stop['demand_cluster']]

        folium.CircleMarker(
            location=[stop['stop_lat'], stop['stop_lon']],
            radius=stop['daily_passengers'] / 1000,
            popup=f"""
            <b>{stop['stop_name']}</b><br>
            Daily Passengers: {stop['daily_passengers']:,}<br>
            Demand Cluster: {stop['demand_cluster']}
            """,
            color=color,
            fill=True,
            fillColor=color,
            fillOpacity=0.6
        ).add_to(m)

    # Add routes as straight lines between their start and end stops
    for _, route in routes_df.iterrows():
        start_stop = stops_df[stops_df['stop_id'] == route['start_stop']].iloc[0]
        end_stop = stops_df[stops_df['stop_id'] == route['end_stop']].iloc[0]

        folium.PolyLine(
            locations=[[start_stop['stop_lat'], start_stop['stop_lon']],
                      [end_stop['stop_lat'], end_stop['stop_lon']]],
            popup=f"""
            <b>{route['route_name']}</b><br>
            Current Frequency: {route['current_frequency']} min<br>
            Travel Time: {route['avg_travel_time']} min<br>
            Efficiency Score: {route['efficiency_score']:.2f}
            """,
            color='purple',
            weight=3,
            opacity=0.8
        ).add_to(m)

    # Add legend (fixed-position HTML overlay injected into the page)
    legend_html = '''
    <div style="position: fixed; 
                bottom: 50px; left: 50px; width: 150px; height: 90px; 
                background-color: white; border:2px solid grey; z-index:9999; 
                font-size:14px; padding: 10px">
    <h4>Legend</h4>
    <p><i class="fa fa-circle" style="color:red"></i> High Demand</p>
    <p><i class="fa fa-circle" style="color:blue"></i> Medium Demand</p>
    <p><i class="fa fa-circle" style="color:green"></i> Low Demand</p>
    </div>
    '''
    m.get_root().html.add_child(folium.Element(legend_html))

    return m

# Create and display map
transport_map = create_transport_map(stops_df, routes_df)
transport_map

## 6. Predictive Modeling for Demand Forecasting

In [None]:
# Train demand prediction model
features_for_model = features[['stop_lat', 'stop_lon', 'hour_peak_factor', 'weekend_factor']]
target = features['daily_passengers']

# Train Random Forest model on all stops
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(features_for_model, target)

# Feature importance, most important first
feature_importance = pd.DataFrame({
    'feature': features_for_model.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature', palette='viridis')
plt.title('Feature Importance for Demand Prediction', fontsize=14, fontweight='bold')
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.tight_layout()
plt.show()

print("Feature Importance for Demand Prediction:")
display(feature_importance)

# Predict demand for the stops the model was trained on. This measures how well
# the model reproduces its own training data, not how well it predicts unseen stops.
predictions = rf_model.predict(features_for_model)
prediction_accuracy = np.corrcoef(target, predictions)[0, 1]
print(f"\nIn-sample fit (correlation on training data): {prediction_accuracy:.3f}")

# Out-of-sample estimate: each stop is predicted by a model trained on the others.
# Leave-one-out is used because the dataset is too small for a held-out split.
cv_predictions = cross_val_predict(
    RandomForestRegressor(n_estimators=100, random_state=42),
    features_for_model,
    target,
    cv=LeaveOneOut(),
)
cv_correlation = np.corrcoef(target, cv_predictions)[0, 1]
print(f"Leave-one-out cross-validated correlation: {cv_correlation:.3f}")

## 7. Key Insights and Recommendations

In [None]:
def generate_insights(stops_df, optimization_df):
    """Summarize the analysis as a table of insights with recommended actions.

    Args:
        stops_df: DataFrame with columns ``stop_name``, ``daily_passengers``
            and ``demand_cluster``.
        optimization_df: DataFrame with columns ``route_name``,
            ``current_frequency``, ``optimal_frequency``, ``current_utilization``
            and ``efficiency_gain`` (frequencies are headways in minutes).

    Returns:
        DataFrame with the columns ``category``, ``insight``, ``action`` and
        ``impact``, one row per insight. Route insights are omitted when
        ``optimization_df`` is empty, and the high-demand insight is omitted
        when ``stops_df`` is empty.
    """
    insights = []

    if not optimization_df.empty:
        # 1. Route optimization insights
        avg_efficiency_gain = optimization_df['efficiency_gain'].mean()
        insights.append({
            'category': 'Route Optimization',
            'insight': f'Average efficiency gain of {avg_efficiency_gain:.1f}% achievable through frequency optimization',
            'action': 'Implement dynamic frequency adjustment based on demand patterns',
            'impact': 'High'
        })

        # 2. Most underutilized route
        underutilized = optimization_df.loc[optimization_df['current_utilization'].idxmin()]
        current_headway = underutilized['current_frequency']
        optimal_headway = underutilized['optimal_frequency']
        # Word the action according to the direction of the change: a longer
        # headway means fewer buses, a shorter one means more.
        if optimal_headway > current_headway:
            action = f'Reduce frequency: lengthen headway from {current_headway} to {optimal_headway} minutes'
        elif optimal_headway < current_headway:
            action = f'Increase frequency: shorten headway from {current_headway} to {optimal_headway} minutes'
        else:
            action = f'Keep the current headway of {current_headway} minutes'
        insights.append({
            'category': 'Resource Allocation',
            'insight': f'{underutilized["route_name"]} has only {underutilized["current_utilization"]}% capacity utilization',
            'action': action,
            'impact': 'Medium'
        })

    if not stops_df.empty:
        # 3. High demand stops: the cluster with the highest mean demand.
        # Cluster labels are arbitrary ids, so the largest label is not
        # necessarily the busiest cluster.
        mean_demand_by_cluster = stops_df.groupby('demand_cluster')['daily_passengers'].mean()
        busiest_cluster = mean_demand_by_cluster.idxmax()
        high_demand_stops = stops_df[stops_df['demand_cluster'] == busiest_cluster]
        insights.append({
            'category': 'Infrastructure Priority',
            'insight': f'High-demand stops: {", ".join(high_demand_stops["stop_name"].tolist())}',
            'action': 'Prioritize infrastructure improvements and increased service frequency',
            'impact': 'High'
        })

    # 4. Network connectivity
    total_daily_passengers = stops_df['daily_passengers'].sum()
    insights.append({
        'category': 'Network Analysis',
        'insight': f'Network serves {total_daily_passengers:,} daily passengers across {len(stops_df)} stops',
        'action': 'Consider adding express routes between high-demand stops',
        'impact': 'Medium'
    })

    return pd.DataFrame(insights)

# Build the insights table and print it as a numbered report
insights_df = generate_insights(stops_df, optimization_df)

banner = "=" * 60
print(banner)
print("KEY INSIGHTS AND RECOMMENDATIONS")
print(banner)

# One numbered entry per insight; numbering is the row's index label plus one
for position, record in zip(insights_df.index, insights_df.to_dict('records')):
    print(f"\n{position+1}. {record['category']} ({record['impact']} Impact)")
    print(f"   Insight: {record['insight']}")
    print(f"   Action: {record['action']}")

print("\n" + banner)
print("SUMMARY STATISTICS")
print(banner)

daily_passengers = stops_df['daily_passengers']
summary_lines = [
    f"• Total daily passengers: {daily_passengers.sum():,}",
    f"• Average passengers per stop: {daily_passengers.mean():.0f}",
    f"• Routes analyzed: {len(routes_df)}",
    f"• Average potential efficiency gain: {optimization_df['efficiency_gain'].mean():.1f}%",
    f"• Stops requiring immediate attention: {len(stops_df[daily_passengers > 10000])}",
]
print("\n".join(summary_lines))

## 8. Implementation Roadmap

### Phase 1: Immediate Actions (0-3 months)
1. **Frequency Optimization**: Implement optimized frequencies for routes showing >5% efficiency gains
2. **High-Demand Stop Improvements**: Upgrade infrastructure at Circle, Kaneshie, and Tema stations
3. **Data Collection Enhancement**: Deploy passenger counting systems and GPS tracking

### Phase 2: Medium-term Improvements (3-12 months)
1. **Dynamic Scheduling**: Implement AI-driven dynamic frequency adjustment
2. **Route Network Expansion**: Add express routes between high-demand clusters
3. **Real-time Information Systems**: Deploy passenger information displays

### Phase 3: Long-term Vision (1-3 years)
1. **Smart Transport Integration**: Integrate with ride-sharing and last-mile solutions
2. **Predictive Maintenance**: AI-powered vehicle maintenance scheduling
3. **Carbon Footprint Optimization**: Route optimization for minimal emissions

### Success Metrics
- **Efficiency**: >20% improvement in capacity utilization
- **Passenger Satisfaction**: >15% reduction in wait times
- **Environmental Impact**: >10% reduction in carbon emissions per passenger
- **Cost Optimization**: >12% reduction in operational costs