In [12]:
import pandas as pd
import json
from datetime import datetime
import math

# Function to calculate distance between two coordinates (Haversine formula)
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance between two points, in meters.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in decimal degrees.
        lat2, lon2: Latitude and longitude of the second point, in decimal degrees.

    Returns:
        The haversine distance in meters on a sphere of radius 6,371 km.
    """
    earth_radius_m = 6371e3  # Earth's radius in meters

    lat1_rad = math.radians(lat1)
    lat2_rad = math.radians(lat2)
    sin_half_dlat = math.sin(math.radians(lat2 - lat1) / 2)
    sin_half_dlon = math.sin(math.radians(lon2 - lon1) / 2)

    a = sin_half_dlat * sin_half_dlat + \
        math.cos(lat1_rad) * math.cos(lat2_rad) * sin_half_dlon * sin_half_dlon
    # Rounding can push `a` a hair outside [0, 1] for (near-)antipodal points,
    # which would make sqrt(1 - a) raise ValueError; clamp it back into range.
    a = min(1.0, max(0.0, a))

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return earth_radius_m * c  # Distance in meters

# Speed helper: distance covered per unit of time (meters/second)
def calculate_speed(distance, time_diff):
    """Return ``distance / time_diff``, or 0 when ``time_diff`` is zero.

    Args:
        distance: Distance travelled (meters at the call sites in this notebook).
        time_diff: Elapsed time (seconds at the call sites in this notebook).
    """
    return distance / time_diff if time_diff != 0 else 0

# Load all data files.
# stop_id is read as text so that it compares reliably with the stop ids from
# the realtime feeds (handled as strings elsewhere in this notebook), whatever
# dtype pandas would otherwise infer for the column.
stops_df = pd.read_csv('stops.txt', dtype={'stop_id': str})
with open('VehiclePositions.json', 'r') as f:
    vehicle_positions = json.load(f)
with open('trip_updates.json', 'r') as f:
    trip_updates = json.load(f)
boarding_points_df = pd.read_csv('Transit_Bus_Boarding_Points.csv')

# Index trip updates by trip_id once instead of scanning the whole feed for
# every vehicle. setdefault keeps the first update seen for a trip_id, which
# is the one a first-match linear search would return.
trip_update_by_trip_id = {}
for tu_entity in trip_updates['entity']:
    tu = tu_entity.get('trip_update')
    if not tu:
        continue
    tu_trip_id = (tu.get('trip') or {}).get('trip_id')
    if tu_trip_id is not None:
        trip_update_by_trip_id.setdefault(tu_trip_id, tu)

# One dict per vehicle; turned into a DataFrame in a single step at the end.
processed_data = []

# Reference "now": the timestamp in the vehicle positions feed header.
# datetime.fromtimestamp yields naive datetimes in the machine's local timezone.
current_time = datetime.fromtimestamp(vehicle_positions['header']['timestamp'])

# Process each vehicle
for vehicle_entity in vehicle_positions['entity']:
    vehicle = vehicle_entity['vehicle']
    bus_id = vehicle_entity['id']

    # A vehicle without a trip descriptor cannot be matched to a trip update;
    # trip_id is then None and the lookup below simply finds nothing.
    trip_info = vehicle.get('trip') or {}
    trip_id = trip_info.get('trip_id')
    route_id = trip_info.get('route_id')

    current_lat = vehicle['position']['latitude']
    current_lon = vehicle['position']['longitude']
    position_timestamp = datetime.fromtimestamp(vehicle['timestamp'])

    # Find the corresponding trip update; skip vehicles with no update or with
    # a missing/empty stop_time_update list.
    trip_update = trip_update_by_trip_id.get(trip_id)
    stop_updates = trip_update.get('stop_time_update') if trip_update else None
    if not stop_updates:
        continue

    # Assumes the first stop_time_update entry is the vehicle's next stop.
    next_stop_update = stop_updates[0]
    next_stop_id = next_stop_update['stop_id']

    # Check that the stop exists in the static stops table
    stop_match = stops_df[stops_df['stop_id'] == next_stop_id]
    if stop_match.empty:
        print(f"Warning: Stop ID {next_stop_id} not found in stops data. Skipping vehicle {bus_id}")
        continue

    # A stop update may carry no arrival prediction; it cannot be scored then.
    arrival_ts = (next_stop_update.get('arrival') or {}).get('time')
    if arrival_ts is None:
        print(f"Warning: No arrival time for stop ID {next_stop_id}. Skipping vehicle {bus_id}")
        continue

    next_stop = stop_match.iloc[0]
    next_stop_lat = next_stop['stop_lat']
    next_stop_lon = next_stop['stop_lon']
    next_stop_name = next_stop['stop_name']
    expected_arrival = datetime.fromtimestamp(arrival_ts)

    # Seconds from the feed timestamp until the predicted arrival
    # (negative when the predicted arrival is already in the past).
    time_to_arrival = (expected_arrival - current_time).total_seconds()

    # Straight-line distance from the vehicle to its next stop
    distance_to_stop = calculate_distance(current_lat, current_lon,
                                          next_stop_lat, next_stop_lon)

    # Speed needed to cover that distance in the remaining time; the time is
    # floored at 1 second so past/imminent arrivals do not divide by <= 0.
    speed = calculate_speed(distance_to_stop, max(1, time_to_arrival))

    # Determine status from the time remaining until the predicted arrival.
    # NOTE(review): this compares the prediction against the feed time, not
    # against a scheduled time, so 'early'/'late' are not schedule deviations.
    # Confirm the intended meaning before relying on this column.
    if time_to_arrival < -60:  # predicted arrival more than 1 minute in the past
        status = 'early'
    elif time_to_arrival > 60:  # predicted arrival more than 1 minute away
        status = 'late'
    else:
        status = 'on-time'

    # Create record
    processed_data.append({
        'bus_id': bus_id,
        'trip_id': trip_id,
        'route_id': route_id,
        'current_lat': current_lat,
        'current_lon': current_lon,
        'next_stop_id': next_stop_id,
        'next_stop_lat': next_stop_lat,
        'next_stop_lon': next_stop_lon,
        'next_stop_name': next_stop_name,
        'current_time': current_time.isoformat(),
        'position_timestamp': position_timestamp.isoformat(),
        'expected_arrival_time': expected_arrival.isoformat(),
        'time_to_arrival_seconds': time_to_arrival,
        'distance_to_stop_meters': distance_to_stop,
        'speed_m_s': speed,
        'status': status,
        'stop_sequence': next_stop_update.get('stop_sequence'),
        'wheelchair_boarding': next_stop['wheelchair_boarding']
    })

# Create DataFrame and save to CSV
result_df = pd.DataFrame(processed_data)
result_df.to_csv('bus_status_dataset.csv', index=False)

print("Dataset created and saved as 'bus_status_dataset.csv'")
print(f"Number of records: {len(result_df)}")
print("\nColumns in the dataset:")
print(result_df.columns.tolist())

Dataset created and saved as 'bus_status_dataset.csv'
Number of records: 61

Columns in the dataset:
['bus_id', 'trip_id', 'route_id', 'current_lat', 'current_lon', 'next_stop_id', 'next_stop_lat', 'next_stop_lon', 'next_stop_name', 'current_time', 'position_timestamp', 'expected_arrival_time', 'time_to_arrival_seconds', 'distance_to_stop_meters', 'speed_m_s', 'status', 'stop_sequence', 'wheelchair_boarding']


In [13]:
import pandas as pd
import json
from datetime import datetime

# Load all data files.
# stop_id is read as text so it compares reliably with the string stop ids
# coming from the realtime JSON feeds.
stops_df = pd.read_csv('stops.txt', dtype={'stop_id': str})
with open('VehiclePositions.json', 'r') as f:
    vehicle_positions = json.load(f)
with open('trip_updates.json', 'r') as f:
    trip_updates = json.load(f)
boarding_points_df = pd.read_csv('Transit_Bus_Boarding_Points.csv')

# Per-stop counters keyed by stop_id
stop_visits = {}
# trip_id -> trip update, so vehicles can be matched without rescanning the
# feed; setdefault keeps the first update seen for each trip_id.
trip_update_by_trip_id = {}

# Process trip updates to count planned visits
for entity in trip_updates['entity']:
    trip_update = entity['trip_update']
    route_id = trip_update['trip']['route_id']

    indexed_trip_id = trip_update['trip'].get('trip_id')
    if indexed_trip_id is not None:
        trip_update_by_trip_id.setdefault(indexed_trip_id, trip_update)

    # stop_time_update is treated as optional, as in the vehicle loop below.
    for stop_update in trip_update.get('stop_time_update', []):
        stop_id = stop_update['stop_id']
        stats = stop_visits.setdefault(stop_id, {
            'visit_count': 0,
            'routes': set(),
            'arrival_times': []
        })
        stats['visit_count'] += 1
        stats['routes'].add(route_id)

        # A stop update may carry no arrival prediction; the visit still counts.
        arrival_ts = (stop_update.get('arrival') or {}).get('time')
        if arrival_ts is not None:
            stats['arrival_times'].append(
                datetime.fromtimestamp(arrival_ts).isoformat()
            )

# Process vehicle positions to count actual visits
# Note: With current data we don't have explicit visit history,
# so we'll use current positions as a proxy.
# NOTE(review): the next stop of an active vehicle was already counted once
# from its trip update above, so it is counted twice here - confirm intended.
for entity in vehicle_positions['entity']:
    trip_info = entity['vehicle'].get('trip') or {}

    # Find matching trip update to get the vehicle's next stop
    trip_update = trip_update_by_trip_id.get(trip_info.get('trip_id'))
    stop_updates = trip_update.get('stop_time_update') if trip_update else None
    if not stop_updates:
        continue

    next_stop_id = stop_updates[0]['stop_id']
    if next_stop_id in stop_visits:
        stop_visits[next_stop_id]['visit_count'] += 1
        vehicle_route_id = trip_info.get('route_id')
        if vehicle_route_id is not None:
            stop_visits[next_stop_id]['routes'].add(vehicle_route_id)

# Create processed data list
processed_data = []

for stop_id, stats in stop_visits.items():
    # Static stop details, when the stop is present in stops.txt
    stop_info = stops_df[stops_df['stop_id'] == stop_id]
    has_stop = not stop_info.empty
    stop = stop_info.iloc[0] if has_stop else None

    # Sorted so the CSV is identical from run to run (set order is not stable).
    routes = sorted(stats['routes'])
    arrival_times = stats['arrival_times']

    record = {
        'stop_id': stop_id,
        # Fall back to a placeholder name when the stop is not in stops.txt
        'stop_name': stop['stop_name'] if has_stop else f"Stop {stop_id}",
        'latitude': stop['stop_lat'] if has_stop else None,
        'longitude': stop['stop_lon'] if has_stop else None,
        'visit_count': stats['visit_count'],
        'unique_routes': len(routes),
        'route_ids': ','.join(routes),
        'wheelchair_boarding': stop['wheelchair_boarding'] if has_stop else None,
        'location_type': stop['location_type'] if has_stop else None,
        'zone_id': stop['zone_id'] if has_stop else None,
        # ISO-8601 strings of the same format order chronologically as text
        'last_arrival_time': max(arrival_times) if arrival_times else None,
        'first_arrival_time': min(arrival_times) if arrival_times else None,
        'average_visits_per_route': stats['visit_count'] / len(routes) if routes else 0
    }

    # Try to add boarding point info if available. Only purely numeric stop
    # ids can match the integer NUMBER column; anything else is skipped
    # (indexing the frame with a bare False would raise KeyError).
    if stop_id.isdecimal():
        boarding_info = boarding_points_df[boarding_points_df['NUMBER'] == int(stop_id)]
        if not boarding_info.empty:
            boarding = boarding_info.iloc[0]
            record.update({
                'accessible': boarding['ACCESSIBLE'],
                'shelter': boarding['SHELTER'],
                'city': boarding['CITY'],
                'status': boarding['STATUS']
            })

    processed_data.append(record)

# Create DataFrame and save to CSV
result_df = pd.DataFrame(processed_data)
result_df = result_df.sort_values('visit_count', ascending=False)  # Sort by popularity
result_df.to_csv('stop_popularity_dataset.csv', index=False)

print("Dataset created and saved as 'stop_popularity_dataset.csv'")
print(f"Number of stops: {len(result_df)}")
print("\nColumns in the dataset:")
print(result_df.columns.tolist())
print("\nTop 5 most popular stops:")
print(result_df[['stop_id', 'stop_name', 'visit_count', 'unique_routes']].head())

Dataset created and saved as 'stop_popularity_dataset.csv'
Number of stops: 1450

Columns in the dataset:
['stop_id', 'stop_name', 'latitude', 'longitude', 'visit_count', 'unique_routes', 'route_ids', 'wheelchair_boarding', 'location_type', 'zone_id', 'last_arrival_time', 'first_arrival_time', 'average_visits_per_route', 'accessible', 'shelter', 'city', 'status']

Top 5 most popular stops:
    stop_id                                stop_name  visit_count  \
58     2595                   Oshawa Centre Terminal           50   
780    2569                             Ajax Station           23   
15     2576                           Whitby Station           21   
33    93112  Pickering Parkway Terminal Platform B12           20   
57    93026      Simcoe Southbound @ Windfields Farm           20   

     unique_routes  
58               8  
780              4  
15               3  
33               4  
57               5  


In [14]:
import pandas as pd
import json
from datetime import datetime

# Load all data files
stops_df = pd.read_csv('stops.txt')
with open('VehiclePositions.json', 'r') as f:
    vehicle_positions = json.load(f)
with open('trip_updates.json', 'r') as f:
    trip_updates = json.load(f)
boarding_points_df = pd.read_csv('Transit_Bus_Boarding_Points.csv')


def _empty_route_stats():
    """Return a fresh accumulator for one route's statistics."""
    return {
        'trip_count': 0,
        'stop_count': 0,
        'unique_stops': set(),
        'trip_ids': set(),
        'start_dates': set(),
        'arrival_times': []
    }


# Per-route accumulators keyed by route_id
route_stats = {}

# Process trip updates to count trips and stops
for entity in trip_updates['entity']:
    trip_update = entity['trip_update']
    trip = trip_update['trip']
    route_id = trip['route_id']
    # A trip update may have no stop_time_update list at all
    stop_updates = trip_update.get('stop_time_update', [])

    if route_id not in route_stats:
        route_stats[route_id] = _empty_route_stats()
    stats = route_stats[route_id]

    stats['trip_count'] += 1
    stats['trip_ids'].add(trip['trip_id'])
    # start_date is recorded only when the feed provides it
    start_date = trip.get('start_date')
    if start_date is not None:
        stats['start_dates'].add(start_date)
    stats['stop_count'] += len(stop_updates)
    stats['unique_stops'].update(su['stop_id'] for su in stop_updates)

    # Record arrival times; stop updates without an arrival prediction are skipped
    for su in stop_updates:
        arrival_ts = (su.get('arrival') or {}).get('time')
        if arrival_ts is not None:
            stats['arrival_times'].append(datetime.fromtimestamp(arrival_ts))

# Process vehicle positions to count active trips
# NOTE(review): trip_count is incremented for trip updates AND for vehicles, so
# a trip present in both feeds is counted twice (see 'unique_trips' for the
# de-duplicated figure). stops_per_trip divides by this combined count.
# Confirm that this is the intended definition.
for entity in vehicle_positions['entity']:
    trip_info = entity['vehicle'].get('trip') or {}
    route_id = trip_info.get('route_id')
    trip_id = trip_info.get('trip_id')
    if route_id is None or trip_id is None:
        # Vehicle is not assigned to an identifiable trip/route
        continue

    if route_id not in route_stats:
        route_stats[route_id] = _empty_route_stats()

    route_stats[route_id]['trip_count'] += 1
    route_stats[route_id]['trip_ids'].add(trip_id)

# Create processed data list
processed_data = []

for route_id, stats in route_stats.items():
    trip_count = stats['trip_count']
    arrival_times = stats['arrival_times']

    # Span between the earliest and latest predicted arrival on the route
    if arrival_times:
        first_arrival = min(arrival_times)
        last_arrival = max(arrival_times)
        time_span_hours = (last_arrival - first_arrival).total_seconds() / 3600
        # Spans shorter than one hour are treated as one hour
        trips_per_hour = trip_count / max(1, time_span_hours)
    else:
        first_arrival = last_arrival = None
        time_span_hours = 0
        trips_per_hour = 0

    if trip_count > 1 and time_span_hours > 0:
        avg_gap_minutes = time_span_hours * 60 / (trip_count - 1)
    else:
        avg_gap_minutes = 0

    processed_data.append({
        'route_id': route_id,
        'trip_count': trip_count,
        'unique_trips': len(stats['trip_ids']),
        'stop_count': stats['stop_count'],
        'unique_stops': len(stats['unique_stops']),
        'stops_per_trip': stats['stop_count'] / max(1, trip_count),
        # Sorted so the CSV is identical from run to run (set order is not stable)
        'trip_ids': ','.join(sorted(stats['trip_ids'])),
        'start_dates': ','.join(sorted(stats['start_dates'])),
        'time_span_hours': time_span_hours,
        'trips_per_hour': trips_per_hour,
        'first_arrival': first_arrival.isoformat() if first_arrival is not None else None,
        'last_arrival': last_arrival.isoformat() if last_arrival is not None else None,
        'avg_time_between_trips_minutes': avg_gap_minutes
    })

# Create DataFrame and save to CSV
result_df = pd.DataFrame(processed_data)
result_df = result_df.sort_values('trip_count', ascending=False)  # Sort by popularity
result_df.to_csv('route_popularity_dataset.csv', index=False)

print("Dataset created and saved as 'route_popularity_dataset.csv'")
print(f"Number of routes: {len(result_df)}")
print("\nColumns in the dataset:")
print(result_df.columns.tolist())
print("\nTop 5 most popular routes:")
print(result_df[['route_id', 'trip_count', 'unique_trips', 'unique_stops']].head())

Dataset created and saved as 'route_popularity_dataset.csv'
Number of routes: 21

Columns in the dataset:
['route_id', 'trip_count', 'unique_trips', 'stop_count', 'unique_stops', 'stops_per_trip', 'trip_ids', 'start_dates', 'time_span_hours', 'trips_per_hour', 'first_arrival', 'last_arrival', 'avg_time_between_trips_minutes']

Top 5 most popular routes:
   route_id  trip_count  unique_trips  unique_stops
4       900          24            16           121
13      901          18            13            79
7       902          15            11            93
0       302          12             8            96
1       916          12             8           138


In [15]:
import pandas as pd
import json
from datetime import datetime

# Load all data files.
# stop_id is read as text so it compares reliably with the string stop ids
# coming from the realtime JSON feeds.
stops_df = pd.read_csv('stops.txt', dtype={'stop_id': str})
with open('VehiclePositions.json', 'r') as f:
    vehicle_positions = json.load(f)
with open('trip_updates.json', 'r') as f:
    trip_updates = json.load(f)
boarding_points_df = pd.read_csv('Transit_Bus_Boarding_Points.csv')

# Counters keyed by (stop_id, route_id). A tuple key cannot collide the way a
# joined "stop_route" string could if either id contained the separator.
stop_route_visits = {}
# trip_id -> trip update, so vehicles can be matched without rescanning the
# feed; setdefault keeps the first update seen for each trip_id.
trip_update_by_trip_id = {}


def _stop_route_stats(stop_id, route_id):
    """Return the counter dict for a stop/route pair, creating it if needed."""
    return stop_route_visits.setdefault((stop_id, route_id), {
        'stop_id': stop_id,
        'route_id': route_id,
        'visit_count': 0,
        'arrival_times': []
    })


# Process trip updates to count planned visits per route-stop combination
for entity in trip_updates['entity']:
    trip_update = entity['trip_update']
    route_id = trip_update['trip']['route_id']

    indexed_trip_id = trip_update['trip'].get('trip_id')
    if indexed_trip_id is not None:
        trip_update_by_trip_id.setdefault(indexed_trip_id, trip_update)

    # stop_time_update is treated as optional, as in the vehicle loop below.
    for stop_update in trip_update.get('stop_time_update', []):
        stats = _stop_route_stats(stop_update['stop_id'], route_id)
        stats['visit_count'] += 1

        # A stop update may carry no arrival prediction; the visit still counts.
        arrival_ts = (stop_update.get('arrival') or {}).get('time')
        if arrival_ts is not None:
            stats['arrival_times'].append(
                datetime.fromtimestamp(arrival_ts).isoformat()
            )

# Process vehicle positions to count actual visits
# NOTE(review): the next stop of an active vehicle was already counted once
# from its trip update above, so it is counted twice here - confirm intended.
for entity in vehicle_positions['entity']:
    trip_info = entity['vehicle'].get('trip') or {}

    # Find matching trip update to get current/next stop
    trip_update = trip_update_by_trip_id.get(trip_info.get('trip_id'))
    stop_updates = trip_update.get('stop_time_update') if trip_update else None
    if not stop_updates:
        continue

    _stop_route_stats(stop_updates[0]['stop_id'], trip_info.get('route_id'))['visit_count'] += 1

# Create processed data list
processed_data = []

for stats in stop_route_visits.values():
    stop_id = stats['stop_id']
    route_id = stats['route_id']
    arrival_times = stats['arrival_times']

    # Static stop details, when the stop is present in stops.txt
    stop_info = stops_df[stops_df['stop_id'] == stop_id]
    has_stop = not stop_info.empty
    stop = stop_info.iloc[0] if has_stop else None

    record = {
        'stop_id': stop_id,
        'route_id': route_id,
        # Fall back to a placeholder name when the stop is not in stops.txt
        'stop_name': stop['stop_name'] if has_stop else f"Stop {stop_id}",
        'latitude': stop['stop_lat'] if has_stop else None,
        'longitude': stop['stop_lon'] if has_stop else None,
        'visit_count': stats['visit_count'],
        'wheelchair_boarding': stop['wheelchair_boarding'] if has_stop else None,
        'location_type': stop['location_type'] if has_stop else None,
        'zone_id': stop['zone_id'] if has_stop else None,
        # ISO-8601 strings of the same format order chronologically as text
        'last_arrival_time': max(arrival_times) if arrival_times else None,
        'first_arrival_time': min(arrival_times) if arrival_times else None,
    }

    # Try to add boarding point info if available. Only purely numeric stop
    # ids can match the integer NUMBER column; anything else is skipped
    # (indexing the frame with a bare False would raise KeyError).
    if stop_id.isdecimal():
        boarding_info = boarding_points_df[boarding_points_df['NUMBER'] == int(stop_id)]
        if not boarding_info.empty:
            boarding = boarding_info.iloc[0]
            record.update({
                'accessible': boarding['ACCESSIBLE'],
                'shelter': boarding['SHELTER'],
                'city': boarding['CITY'],
                'status': boarding['STATUS']
            })

    processed_data.append(record)

# Create DataFrame and save to CSV
result_df = pd.DataFrame(processed_data)
result_df = result_df.sort_values(['route_id', 'visit_count'], ascending=[True, False])  # Sort by route, then popularity
result_df.to_csv('stop_route_popularity_dataset.csv', index=False)

print("Dataset created and saved as 'stop_route_popularity_dataset.csv'")
print(f"Number of stop-route combinations: {len(result_df)}")
print("\nColumns in the dataset:")
print(result_df.columns.tolist())
print("\nTop 5 most popular stop-route combinations:")
print(result_df[['stop_id', 'route_id', 'stop_name', 'visit_count']].head())

Dataset created and saved as 'stop_route_popularity_dataset.csv'
Number of stop-route combinations: 1675

Columns in the dataset:
['stop_id', 'route_id', 'stop_name', 'latitude', 'longitude', 'visit_count', 'wheelchair_boarding', 'location_type', 'zone_id', 'last_arrival_time', 'first_arrival_time', 'accessible', 'shelter', 'city', 'status']

Top 5 most popular stop-route combinations:
     stop_id route_id                                stop_name  visit_count
646    93112      112  Pickering Parkway Terminal Platform B12            3
1487   93441      112        Brock Road Southbound @ Clearside            3
666     3572      112               Zents Westbound @ Tillings            2
667     3573      112        Tillings Southbound @ Scenic Lane            2
668    93580      112         Burkholder Southbound @ Belcourt            2
