## Libraries - Packages

In [24]:
import json
import pandas as pd
import networkx as nx
import os
from tqdm import tqdm
from os.path import join
from data_handler import FeatureEngineering
import gc
from scipy.stats import norm
from scipy.signal import convolve
import ast
import copy
import pickle

import numpy as np
import matplotlib.pyplot as plt

## Global Variables

In [25]:
input_path = join("io", "input")
output_path = join("io", "output")
experiments_path = join("io", "experiments")
graph_structured_np = join(output_path, "graph_structured_np")
metrics_path = join(experiments_path, "metrics")
plots_path = join(experiments_path, "plots")

## Preprocess ETA Distributions

In [26]:
def calculate_eta_distributions(avg_speed_forecasts_df, edges_df):
    merged_df = pd.merge(avg_speed_forecasts_df, edges_df, on='edge_id', how='left')
    
    # Convert speed mean from km/h to meters/minute (1 km/h = 1000 m/60 min)
    merged_df['speed_m_per_min'] = merged_df['mean'] * 1000 / 60
    # Calculate mean ETA in minutes
    merged_df['mean_eta'] = merged_df['driving_distance'] * 2 / merged_df['speed_m_per_min']

    # Convert variance of ETA to log variance for consistency with other columns
    merged_df['log_var_eta'] = merged_df['log_var'] / 2
    # Create the final DataFrame with selected columns
    eta_distribution_df = merged_df[['edge_id', 'timestamp', 'mean_eta', 'log_var_eta']]
    return eta_distribution_df

In [27]:
def calculate_probabilities(row, minutes_range):
    mean_eta = row['mean_eta']
    std_eta = row['std_eta']
    # Calculate probabilities for each minute
    probabilities = [norm.cdf(minute, mean_eta, std_eta) - norm.cdf(minute - 1, mean_eta, std_eta) for minute in minutes_range]
    
    # Threshold probabilities below 0.001 to zero
    probabilities = [p if p >= 0.01 else 0 for p in probabilities]
    
    # Renormalize probabilities to sum to 1 if not all are zero
    total_prob = sum(probabilities)
    if total_prob > 0:
        probabilities = [p / total_prob for p in probabilities]

    probabilities.insert(0, 0)
    
    # Trim trailing zeros from the list
    while probabilities and probabilities[-1] == 0:
        probabilities.pop()
    
    return probabilities

In [28]:
def conver_distributions_tolist(eta_distribution_df):

    # Calculate standard deviation from log variance
    eta_distribution_df['std_eta'] = np.sqrt(np.exp(eta_distribution_df['log_var_eta']))
    
    # Find the maximum standard deviation to determine the needed range
    max_std_eta = eta_distribution_df['std_eta'].max()
    
    # Define the maximum range of ETA values based on the maximum standard deviation
    # Assuming we consider mean plus/minus 3 standard deviations to cover 99.7% of the data
    max_eta = int(np.ceil(eta_distribution_df['mean_eta'].max() + 3 * max_std_eta))
    min_eta = max(1, int(np.floor(eta_distribution_df['mean_eta'].min() - 3 * max_std_eta)))
    
    # Minutes range based on the dataset's maximum variability
    minutes_range = np.arange(min_eta, max_eta + 1)

    eta_distribution_df['prob_distributions'] = eta_distribution_df.apply(calculate_probabilities, axis=1, args=(minutes_range,))

    return eta_distribution_df

In [29]:
def eta_distributions_creation(avg_speed_forecasts_df, edges_df):

    avg_speed_forecasts_df.rename(columns={'node_id': 'edge_id'}, inplace=True)
    
    eta_distribution_df = calculate_eta_distributions(avg_speed_forecasts_df, edges_df)
    eta_distribution_df = conver_distributions_tolist(eta_distribution_df)
    eta_distribution_df.to_csv(join(output_path, "interm_eta_distributions.csv"), sep=',', encoding='utf-8', index=False)

    eta_distribution_df['minute_of_day'] = pd.to_datetime(eta_distribution_df['timestamp']).dt.hour * 60 + pd.to_datetime(eta_distribution_df['timestamp']).dt.minute
    eta_distribution_df['minute_of_schedule'] = eta_distribution_df['minute_of_day'] - 419
    eta_distribution_df['ETA'] = (eta_distribution_df['mean_eta'].round() + 1).astype(int)
    eta_distribution_df['prob_distributions'] = eta_distribution_df['prob_distributions'].apply(lambda x: [float(i) for i in x])

    final_columns = ['edge_id','minute_of_day', 'minute_of_schedule', 'ETA', 'prob_distributions']
    eta_distribution_df = eta_distribution_df[final_columns]
    eta_distribution_df.to_csv(join(output_path, "stg_eta_distributions.csv"), sep=',', encoding='utf-8', index=False)

    return eta_distribution_df


In [30]:
def normilize_probabilistic_distribution(ls, threshold=0.01):
    """
    Process a probabilistic distribution by:
    1. Setting all probabilities ≤ threshold to zero.
    2. Renormalizing the probabilities to sum to 1, unless all are below threshold (then keep original).
    3. Removing trailing zeros at the end of the list.
    
    Parameters:
    - ls (list): The input probabilistic distribution.
    - threshold (float): The minimum probability value to keep (default is 1%).

    Returns:
    - list: The processed probabilistic distribution.
    """
    # Convert to numpy array for efficient computation
    ls = np.array(ls, dtype=np.float64)
    
    # Step 1: Set probabilities ≤ threshold to zero
    filtered_ls = np.where(ls <= threshold, 0, ls)  # Replace values ≤ threshold with 0

    # Step 2: Check if all probabilities were below threshold before filtering
    if np.sum(filtered_ls) > 0:  # Only normalize if there are values left
        filtered_ls = filtered_ls / np.sum(filtered_ls)
    else:
        filtered_ls = ls  # Keep original if all values were below threshold

    # Step 3: Remove trailing zeros
    filtered_ls = np.trim_zeros(filtered_ls, trim='b')  # 'b' means trim from the end only
    
    return filtered_ls.tolist()


## Create Possbile Bus Schedules Start Times

In [31]:
def create_bus_schedules(network, start_min, end_min, interval, stochastic=False):
    schedules = {}
    
    # Iterate over each route in the network dictionary
    for route, stops in network.items():
        if stops:  # Ensure there are stops in the list
            first_stop = stops[0]
            # Generate start times from start_min to end_min every 'interval' minutes
            start_times = list(range(start_min, end_min + 1, interval))
            
            if not stochastic:
                # Deterministic mode: Return the start times as a simple list
                schedules[route] = {first_stop: start_times}
            else:
                # Stochastic mode: Convert start times into trimmed probabilistic lists
                prob_distributions = []
                for start_time in start_times:
                    prob_list = [0] * start_time  # Create a list only up to the start time
                    prob_list[start_time - 1] = 1  # Set 100% probability at the specific start time
                    prob_distributions.append(prob_list)
                
                schedules[route] = {first_stop: prob_distributions}

    return schedules


## Calculate Buses Arival Time Using ETA Predictions

In [32]:
def compute_probabilistic_bus_schedules(bus_network, bus_schedules, eta_distribution_df):
    # Convert eta_distribution_df to a dictionary for fast lookup
    
    if isinstance(eta_distribution_df['prob_distributions'].iloc[0], str):
        eta_distribution_df['prob_distributions'] = eta_distribution_df['prob_distributions'].apply(ast.literal_eval)

    eta_dict = {(row['edge_id'], row['minute_of_schedule']): row['prob_distributions'] for _, row in eta_distribution_df.iterrows()}
    
    schedules = {}  # Final output structure

    for bus, schedules_dict in tqdm(bus_schedules.items()):
        # Initialize the nested dictionary for each bus
        schedules[bus] = {}
        
        # Get the route stops for this bus
        route_stops = bus_network.get(bus, [])
        
        # Ensure we have a valid route
        if not route_stops:
            continue  

        # Extract the first stop
        first_stop = route_stops[0]

        # Get the list of start time distributions for this bus route
        start_time_distributions = schedules_dict.get(first_stop, [])  # Extract list of probability distributions
        
        for start_time_distribution in start_time_distributions:  # Iterate over each schedule (as a probability list)
            # Initialize arrival distributions for this schedule
            arrival_distributions = {first_stop: start_time_distribution}  # Start stop follows the given probability distribution
            
            # Iterate over the stops in sequence
            for i in range(len(route_stops) - 1):
                current_stop = route_stops[i]
                next_stop = route_stops[i + 1]
                edge_id = f"{current_stop}_{next_stop}"

                # Find the closest ETA entry in the dataset
                current_distribution = arrival_distributions[current_stop]  # Arrival distribution at current stop
                
                # Find the closest available time in eta_distribution_df
                available_times = [minute for (edge, minute) in eta_dict.keys() if edge == edge_id]
                
                if not available_times:
                    continue  # Skip if there are no ETA values for this route
                
                # Find the closest minute available
                closest_minute = min(available_times, key=lambda x: abs(x - np.argmax(current_distribution)))  
                eta_distribution = eta_dict.get((edge_id, closest_minute), [0])  # Default to a zero probability list if not found
                
                # Compute arrival distribution at the next stop using convolution
                arrival_distribution_next_stop = np.convolve(current_distribution, eta_distribution, mode='full')  
                arrival_distribution_next_stop = normilize_probabilistic_distribution(arrival_distribution_next_stop)
                # Store the computed arrival distribution
                arrival_distributions[next_stop] = arrival_distribution_next_stop

            # Store the computed schedule for this departure time
            for stop, arrival_distribution in arrival_distributions.items():
                if stop not in schedules[bus]:
                    schedules[bus][stop] = []
                schedules[bus][stop].append(arrival_distribution)

    return schedules


In [33]:
def move_key_to_front(schedules, foo_line):
    """
    Reorders the schedules dict so that foo_line is the first key.
    """
    if foo_line not in schedules:
        return schedules  # nothing to reorder

    # Reconstruct dict with 'b0_w' first
    reordered = {foo_line: schedules[foo_line]}
    for k, v in schedules.items():
        if k != foo_line:
            reordered[k] = v
    return reordered


In [34]:
def add_schedule_limits(schedules, stops, start, end, foo_line='b00_w'):
    """
    Adds a hardcoded bus line 'b0_w' to the schedules dictionary for each stop,
    inserting two distributions:
    - One as a single value list: [start]
    - One as a probability distribution list where index `end-1` has value 1
    """
    if foo_line not in schedules:
        schedules[foo_line] = {}

    for stop in stops:
        # Create the two probability-style distributions
        first = [start]
        second = [0] * end
        second[end - 1] = 1  # Set 1 at the end-1 index (since list is 0-indexed)

        schedules[foo_line][stop] = [first, second]

    schedules = move_key_to_front(schedules, foo_line)
    
    return schedules

In [35]:
def convert_distributions_to_means(schedules):
    schedules_mean = copy.deepcopy(schedules)
    
    for bus in schedules_mean:
        for stop in schedules_mean[bus]:
            # Compute the mean for each distribution and flatten the result
            mean_list = []
            for prob_dist in schedules_mean[bus][stop]:
                prob_dist = np.array(prob_dist, dtype=np.float64)
                mean = np.sum(prob_dist * np.arange(1, len(prob_dist)+1))
                mean_list.append(round(mean))
            schedules_mean[bus][stop] = mean_list

    return schedules_mean


## Preprocess Passengers Data

In [36]:
def preprocess_passengers_demand(passengers_demand):
    passengers_demand['minute_of_day'] = pd.to_datetime(passengers_demand['timestamp']).dt.hour * 60 + pd.to_datetime(passengers_demand['timestamp']).dt.minute
    passengers_demand['minute_of_schedule'] = passengers_demand['minute_of_day'] - 419
    return passengers_demand

In [37]:
def convert_passenger_arrivals_to_dict(df):
    """
    Convert a DataFrame with stop_id, passenger_demand_distribution, and minute_of_schedule
    into a dictionary format where stop_id is the key, and the value is a list of passenger demand distributions
    indexed by minute_of_schedule.
    """
    passenger_arrivals = {}

    if isinstance(df['passenger_demand_distribution'].iloc[0], str):
        df['passenger_demand_distribution'] = df['passenger_demand_distribution'].apply(ast.literal_eval)

    for _, row in tqdm(df.iterrows()):
        stop = str(row["stop_id"])
        minute = row["minute_of_schedule"]
        demand_distribution = row["passenger_demand_distribution"]
        
        if stop not in passenger_arrivals:
            passenger_arrivals[stop] = []

        # Ensure correct indexing by padding empty lists if necessary
        while len(passenger_arrivals[stop]) < minute:
            passenger_arrivals[stop].append([])  # Fill missing time slots with empty lists
        
        # Insert the distribution at the correct minute index (adjust for 0-based indexing)
        passenger_arrivals[stop][minute - 1] = demand_distribution  

    return passenger_arrivals


## Calculate Stochastic Gap-Based Passenger Demand 

In [38]:
def compute_probabilistic_gaps(stops, schedules, schedules_mean, passenger_arrivals):
    gap_passengers_dict = {}

    for stop in tqdm(stops):  # Loop over each stop
        if stop not in passenger_arrivals:
            continue  # Skip if no passenger data for this stop

        all_eta_distributions = []  # Store all ETA distributions at this stop
        mean_eta_values = []  # Store mean ETA values at this stop

        # Gather all possible ETA distributions for this stop from all bus lines
        for bus in schedules:
            if stop in schedules[bus]:  # Check if this bus stops here
                for eta_dist in schedules[bus][stop]:
                    all_eta_distributions.append(np.array(eta_dist, dtype=np.float64))  # Ensure float type
                for mean_eta in schedules_mean[bus][stop]:
                    mean_eta_values.append(mean_eta)  # Extract the integer mean ETA value

        # Compute gaps between all possible arrival pairs
        for i in range(len(all_eta_distributions) - 1):
            for j in range(i + 1, len(all_eta_distributions)):
                eta_1 = all_eta_distributions[i]
                eta_2 = all_eta_distributions[j]
                mean_eta_1 = mean_eta_values[i]
                mean_eta_2 = mean_eta_values[j]

                # Determine which ETA has the higher mean
                if mean_eta_1 > mean_eta_2:
                    max_eta, min_eta = eta_1, eta_2
                    max_mean, min_mean = mean_eta_1, mean_eta_2
                elif mean_eta_2 > mean_eta_1:
                    max_eta, min_eta = eta_2, eta_1
                    max_mean, min_mean = mean_eta_2, mean_eta_1
                else:
                    gap_passengers_dict[f"gap_{stop}_{i}_{j}_{mean_eta_2}_{mean_eta_1}"] = 0
                    continue  # Skip calculation if means are equal (gap = 0)

                # Compute cumulative sums (CDFs) of both distributions
                max_cdf = np.cumsum(max_eta)
                min_cdf = np.cumsum(min_eta)

                # Ensure both distributions have the same length by padding the smaller one with 1s
                if len(min_cdf) < len(max_cdf):
                    min_cdf = np.pad(min_cdf, (0, len(max_cdf) - len(min_cdf)), mode='constant', constant_values=1)
                elif len(min_cdf) > len(max_cdf):
                    max_cdf = np.pad(max_cdf, (0, len(min_cdf) - len(max_cdf)), mode='constant', constant_values=1)

                # Compute probability mass between these distributions
                prob_between_buses = np.maximum(min_cdf - max_cdf, 0)  # Ensure no negative values
                # Retrieve the correct passenger arrival distributions for this stop
                passengers_at_stop = passenger_arrivals[stop]

                # Step 1: Multiply each `prob_between_buses` value with the passenger probability distribution at the corresponding timestep
                multiplied_distributions = []
                for t in range(len(prob_between_buses)):
                    if prob_between_buses[t] == 0:
                        continue
                    if t < len(passengers_at_stop):
                        passengers_dist = np.array(passengers_at_stop[t], dtype=np.float64)  # Convert to NumPy array
                        multiplied_dist = passengers_dist * prob_between_buses[t]  # Element-wise multiplication
                        multiplied_distributions.append(multiplied_dist)
                
                # Step 2: Convolve all multiplied distributions sequentially
                if multiplied_distributions:
                    convolved_distribution = multiplied_distributions[0]  # Start with the first distribution
                    for k in range(1, len(multiplied_distributions)):
                        convolved_distribution = np.convolve(convolved_distribution, multiplied_distributions[k], mode='full')
                        total = convolved_distribution.sum()
                        if total > 0:
                            convolved_distribution = convolved_distribution / total

                    # Step 3: Compute the expected number of remaining passengers (mean of final convolved distribution)
                    expected_remaining_passengers = np.sum(convolved_distribution * np.arange(1,len(convolved_distribution)+1))
                else:
                    expected_remaining_passengers = 0  # Default if no valid distributions exist

                # Store result in gap_passengers_dict
                gap_key = f"gap_{stop}_{i}_{j}_{mean_eta_1}_{mean_eta_2}"
                gap_passengers_dict[gap_key] = round(float(expected_remaining_passengers), 4)

    return gap_passengers_dict


## Dictionary IO

In [39]:
def save_dict_to_file(dictionary, filepath):
    """
    Save a dictionary to a file using pickle.
    """
    with open(filepath, 'wb') as file:
        pickle.dump(dictionary, file)


# Execution

## Schedule Paramters

In [40]:
schedule_start = 1
scedule_end = 120  #840
bus_schedule_frequency = 15

## Load Appropriate Data

In [41]:
edges_df = pd.read_csv(join(output_path, "interm_network_edges.csv"), encoding='utf-8', sep=',')
avg_speed_forecasts_df = pd.read_csv(join(output_path, "interm_avg_speed_forecasts.csv"), encoding='utf-8', sep=',')
passengers_demand_df = pd.read_csv(join(output_path, "interm_passenger_demand.csv"), encoding='utf-8', sep=',')

with open(join(input_path, "base_bus_network.json"), 'r', encoding='utf-8') as file:
    bus_network = json.load(file)

## Create Probabilistic Schedules 

In [42]:
eta_distribution_df = eta_distributions_creation(avg_speed_forecasts_df, edges_df)
passengers_demand_df = preprocess_passengers_demand(passengers_demand_df)

bus_network = {route: [stop.split('-')[0] for stop in stops] for route, stops in bus_network.items()}
bus_schedules = create_bus_schedules(bus_network, schedule_start, scedule_end, bus_schedule_frequency, True)

schedules = compute_probabilistic_bus_schedules(bus_network, bus_schedules, eta_distribution_df)
schedules = add_schedule_limits(schedules, stops, schedule_start, scedule_end)
schedules_mean = convert_distributions_to_means(schedules)
save_dict_to_file(schedules_mean, join(output_path, "fct_schedules_mean_sub"))

100%|███████████████████████████████████████████| 12/12 [00:02<00:00,  4.36it/s]


## Prepare Probabilistic Passengers Demand

In [43]:
passengers_demand_df = pd.read_csv(join(output_path, "interm_passenger_demand.csv"), encoding='utf-8', sep=',')

passengers_demand_df = preprocess_passengers_demand(passengers_demand_df)
passengers_demand = convert_passenger_arrivals_to_dict(passengers_demand_df)
stops = sorted(list(passengers_demand.keys()))

211050it [00:06, 33685.29it/s]


## Stochastic Gap-Based Passenger Demand 

In [44]:
# Run the function
gap_passengers_dict = compute_probabilistic_gaps(stops, schedules, schedules_mean, passengers_demand)
save_dict_to_file(gap_passengers_dict, join(output_path, "fct_gap_passengers_sub"))


100%|█████████████████████████████████████████| 210/210 [00:50<00:00,  4.14it/s]
