This notebook runs the simulation for multiple scenarios and calculates the completion-time differences between the baseline run and each scenario.

This file was originally called `250414_scenario_planning_v2.ipynb`.

In [2]:
import glob
import os
import sys

# Imported explicitly so `pd` does not depend on what the star import below happens to export.
import pandas as pd

# Make the sibling `code` directory (next to the current working directory) importable.
# The membership check keeps re-running this cell from appending the same entry again.
code_path = os.path.join(os.path.dirname(os.getcwd()), 'code')
if code_path not in sys.path:
    sys.path.append(code_path)
# NOTE(review): the star import hides which names this notebook needs; run_full_loop and
# prepare_departure_time_for_od_matrix are presumably provided here - confirm and import explicitly.
from simulation_v7 import *
pd.set_option('display.max_columns', None)

initial settings

In [None]:
# Resolve data locations relative to the current working directory: base_dir is two
# levels above it and the processed inputs live in <base_dir>/datasets/processed.
base_dir = os.path.dirname(os.path.dirname(os.getcwd()))
processed_dir = os.path.join(base_dir, 'datasets', 'processed')
od_path = os.path.join(processed_dir, '250414_od_matrix_with_hart_designation.csv')

od_matrix = pd.read_csv(od_path)
# origin and destination nodes should be strings not integers
# NOTE(review): astype(str) presumably turns a missing earliest_station_node into the literal
# string 'nan' (pandas-version dependent); adjust_od_matrix_to_scenario filters on that literal.
od_matrix['origin_node'] = od_matrix['origin_node'].astype(str)
od_matrix['destination_node'] = od_matrix['destination_node'].astype(str)
od_matrix['earliest_station_node'] = od_matrix['earliest_station_node'].astype(str)

# Disabled single run on the unadjusted od_matrix, kept for reference.
# for_all_road_types = True
# weight = 0.05
# interval_length='60s'
# min_speed = 1

# edges_df, in_route_veh_df, completed_veh_df = run_full_loop(od_matrix, 
#                             weight=weight, 
#                             min_speed=min_speed, 
#                             for_all_road_types=for_all_road_types, 
#                             interval_length=interval_length,
#                             end_time='5:30:00'
#                             )

In [4]:
def adjust_od_matrix_to_scenario(od_matrix, simulation_scenario):
    """Reroute the first ``simulation_scenario`` trips to their earliest station node.

    Rows are ordered by ``earliest_station_position`` (ascending, missing values last).
    For the first ``simulation_scenario`` rows of that ordering, ``destination_node`` is
    replaced by ``earliest_station_node``. Rows whose destination is missing or the
    literal string ``'nan'`` are then dropped, and the remaining destinations are
    normalised through float -> int -> str (so ``'123.0'`` becomes ``'123'``).

    The replacement is positional, so it selects exactly ``simulation_scenario`` rows
    even when the frame's index contains duplicate labels, matching the positional
    ``iloc`` selection used by ``identify_persons_from_simulation``.

    Args:
        od_matrix: DataFrame with at least ``earliest_station_position``,
            ``earliest_station_node`` and ``destination_node`` columns. Not modified.
        simulation_scenario: number of trips to reroute.

    Returns:
        A new DataFrame in sorted order with the adjusted ``destination_node`` column.

    Raises:
        ValueError: if fewer than ``simulation_scenario`` rows have a non-null
            ``earliest_station_position``.
    """
    if od_matrix['earliest_station_position'].notnull().sum() < simulation_scenario:
        raise ValueError("The number of earliest station positions is less than the simulation scenario.")
    # sort_values returns a new frame, so the caller's od_matrix is left untouched
    od_matrix_adjusted = od_matrix.sort_values(by='earliest_station_position', ascending=True)

    # Work on plain object arrays so the swap is by row position, not by index label.
    destination_nodes = od_matrix_adjusted['destination_node'].to_numpy(dtype=object, copy=True)
    station_nodes = od_matrix_adjusted['earliest_station_node'].to_numpy(dtype=object)
    destination_nodes[:simulation_scenario] = station_nodes[:simulation_scenario]
    od_matrix_adjusted = od_matrix_adjusted.assign(destination_node=destination_nodes)

    # drop rows without a usable destination (real missing values and stringified 'nan')
    has_destination = (
        od_matrix_adjusted['destination_node'].notnull()
        & (od_matrix_adjusted['destination_node'] != 'nan')
    )
    od_matrix_adjusted = od_matrix_adjusted[has_destination]

    # normalise node ids to integer-looking strings
    normalised_nodes = od_matrix_adjusted['destination_node'].astype(float).astype(int).astype(str)
    return od_matrix_adjusted.assign(destination_node=normalised_nodes)

def run_simulation_scenarios(od_matrix, simulation_scenario_list, save_path,
                             weight=0.05, min_speed=1, for_all_road_types=True,
                             interval_length='60s'):
    """Run the simulation once per scenario and save each run's completed vehicles.

    For every entry in ``simulation_scenario_list`` the OD matrix is adjusted with
    ``adjust_od_matrix_to_scenario``, passed to ``run_full_loop``, and the third
    returned frame is written to ``<save_path>/completed_veh_df_<scenario>.csv``.

    Args:
        od_matrix: OD matrix DataFrame accepted by ``adjust_od_matrix_to_scenario``.
        simulation_scenario_list: iterable of scenario sizes (number of rerouted trips).
        save_path: output directory; created if it does not exist so a finished
            simulation is not lost at the save step.
        weight, min_speed, for_all_road_types, interval_length: forwarded unchanged to
            ``run_full_loop``; the defaults are the values previously hard-coded here.

    Returns:
        None. Results are written to disk.

    Raises:
        ValueError: propagated from ``adjust_od_matrix_to_scenario`` when a scenario is
            larger than the number of rows with an earliest station position.
    """
    # create the output folder up front, before any simulation time is spent
    os.makedirs(save_path, exist_ok=True)
    for simulation_scenario in simulation_scenario_list:
        print(f"simulation: {simulation_scenario}")
        # reroute the first `simulation_scenario` trips for this run
        od_matrix_adjusted = adjust_od_matrix_to_scenario(od_matrix, simulation_scenario)
        # only the completed-vehicle frame is persisted; the other two results are unused here
        _edges_df, _in_route_veh_df, completed_veh_df = run_full_loop(
            od_matrix_adjusted,
            weight=weight,
            min_speed=min_speed,
            for_all_road_types=for_all_road_types,
            interval_length=interval_length,
        )
        output_file = os.path.join(save_path, f"completed_veh_df_{simulation_scenario}.csv")
        completed_veh_df.to_csv(output_file, index=False)
        print(f"Simulation scenario {simulation_scenario} completed and saved")

In [None]:
# simulation_scenario_list = [10_000, 20_000, 30_000, 40_000, 50_000]
# save_path = os.path.join(processed_dir, 'simulation_scenarios')

# run_simulation_scenarios(od_matrix, simulation_scenario_list, save_path)

### Calculate Differences

Calculate the difference in completion time between the baseline run and each scenario.

In [None]:
# Results of the baseline run; the settings are encoded in the file name.
base_status_path = os.path.join(processed_dir, 'congestion_iterations_new_equation', 'interval_60_congestion_weight_0.05_for_all_road_types_1mph.csv')
base_status_df = pd.read_csv(base_status_path)

# Results of a single scenario (named completed_veh_df_<scenario>.csv as written by
# run_simulation_scenarios); only referenced by the commented-out test call further down.
test_status_path = os.path.join(processed_dir, 'simulation_scenarios', 'completed_veh_df_10000.csv')
test_status_df = pd.read_csv(test_status_path)

In [7]:
# Tract identifiers treated as "town" in the tract-level comparison. Kept as strings
# because the tract columns are cast to str before being matched against this list.
town_tracts = [
    '15003980200', '15003981400', '15003981300',
    '15003005900', '15003006000', '15003005800', '15003005700', '15003005200',
    '15003004000', '15003003900',
    '15003003801', '15003003802',
    '15003003703', '15003003702',
    '15003003606', '15003003605', '15003003603', '15003003604',
    '15003003701',
    '15003001906', '15003001905', '15003001907', '15003001901',
]

check what the 'Town' Tracts look like

In [8]:
# import geopandas as gpd
# from keplergl import KeplerGl
# from shapely import wkt

In [None]:
# datasets_dir = os.path.join(base_dir, 'datasets')
# tract_path = os.path.join(datasets_dir, '2020_Census_Tracts.geojson')
# tracts_gdf = gpd.read_file(tract_path)

# town_tracts = tracts_gdf[tracts_gdf['geoid20'].isin(town_tracts)]

In [None]:
# map_ = KeplerGl(height=800)

# map_.add_data(data=town_tracts, name='Town Tracts')
# map_.add_data(data=tracts_gdf, name='All Tracts')

# save_path = os.path.join(base_dir, 'maps')
# # map_.save_to_html(file_name= save_path + '250415_town_tracts.html', read_only=True)

In [11]:
def identify_largest_scenario(simulation_status_folder):
    """Return the largest integer scenario suffix among the folder's CSV files.

    The suffix of a file is the text after the last underscore of its base name, up to
    the first dot. Files whose suffix is not an integer are ignored. Returns 0 when no
    file yields an integer.
    """
    def _numeric_suffix(csv_path):
        # e.g. "completed_veh_df_10000.csv" -> 10000; non-numeric suffixes -> None
        suffix = os.path.basename(csv_path).split("_")[-1].split(".")[0]
        try:
            return int(suffix)
        except ValueError:
            return None

    csv_pattern = os.path.join(simulation_status_folder, "*.csv")
    candidates = (_numeric_suffix(csv_path) for csv_path in glob.glob(csv_pattern))
    return max((number for number in candidates if number is not None), default=0)

def identify_persons_from_simulation(od_matrix, simulation_scenario):
    """List the ``person_id`` values of the first ``simulation_scenario`` trips.

    Trips are ordered by ``earliest_station_position`` (ascending) and the first
    ``simulation_scenario`` rows of that ordering are returned as a list. The input
    frame is not modified.

    Raises:
        ValueError: if fewer than ``simulation_scenario`` rows have a non-null
            ``earliest_station_position``.
    """
    positions_available = od_matrix['earliest_station_position'].notnull().sum()
    if positions_available < simulation_scenario:
        raise ValueError("The number of earliest station positions is less than the simulation scenario.")
    # sort_values already returns a new frame, leaving od_matrix as it was
    ranked_trips = od_matrix.sort_values(by='earliest_station_position', ascending=True)
    selected_trips = ranked_trips.iloc[:simulation_scenario]
    return selected_trips['person_id'].tolist()

def calculate_tract_time_difference(base_status_df, simulation_status_df, od_matrix, town_tracts, scenario_name, drop_min_vehicles=20):
    """Aggregate per-person completion-time differences by origin tract and departure hour.

    The difference is ``completion_time_base - completion_time_simulation`` in minutes,
    so a positive value means the person finished earlier in the scenario run. Only
    trips that start outside ``town_tracts``, end inside ``town_tracts`` and were not
    rerouted by the scenario (see ``identify_persons_from_simulation``) are kept.

    Args:
        base_status_df: baseline results with ``person_id`` and ``completion_time``.
        simulation_status_df: scenario results with the same columns.
        od_matrix: OD matrix providing tracts and the ``5min_designation`` column.
        town_tracts: collection of tract id strings.
        scenario_name: scenario size; converted with ``int`` to pick the rerouted persons.
        drop_min_vehicles: groups with fewer vehicles than this are dropped.

    Returns:
        DataFrame with ``origin_tract``, ``departure_hour``, the mean and median
        difference in minutes, and ``total_vehicles``.
    """
    # pair each person's baseline and scenario results; unmatched persons drop out
    paired = pd.merge(base_status_df, simulation_status_df, on='person_id', suffixes=('_base', '_simulation'))
    paired = paired.dropna(subset=['completion_time_base', 'completion_time_simulation'])

    # parse both completion times, then take base minus simulation
    for time_column in ('completion_time_base', 'completion_time_simulation'):
        paired[time_column] = pd.to_datetime(paired[time_column], format='%H:%M:%S')
    paired['time_difference'] = paired['completion_time_base'] - paired['completion_time_simulation']
    paired['time_difference_minutes'] = paired['time_difference'].dt.total_seconds() / 60

    # attach tracts and the departure hour derived from the 5 min designation;
    # the helper receives a copy so the caller's od_matrix is not touched
    od_with_departure = prepare_departure_time_for_od_matrix(od_matrix.copy(), column ='5min_designation')
    od_with_departure['departure_hour'] = od_with_departure['departure_time'].str.split(':').str[0].astype(int)
    trips = pd.merge(
        paired[['person_id', 'time_difference_minutes']],
        od_with_departure[['person_id', 'origin_tract', 'destination_tract', 'departure_hour']],
        on='person_id',
        how='left',
    )

    # tract ids are compared as strings
    trips['origin_tract'] = trips['origin_tract'].astype(str)
    trips['destination_tract'] = trips['destination_tract'].astype(str)

    # keep trips starting outside town ...
    from_outside = trips[~trips['origin_tract'].isin(town_tracts)].copy()
    print(scenario_name)
    print(f"merge_df shape: {from_outside.shape[0]}")
    # ... that end inside town ...
    into_town = from_outside[from_outside['destination_tract'].isin(town_tracts)].copy()
    print(f"merge_df shape: {into_town.shape[0]}")
    # ... and whose destination was not moved to an earlier node by the scenario
    rerouted_person_ids = identify_persons_from_simulation(od_matrix, int(scenario_name))
    unaffected = into_town[~into_town['person_id'].isin(rerouted_person_ids)].copy()
    print(f"merge_df shape: {unaffected.shape[0]}")

    tract_hour_summary = unaffected.groupby(['origin_tract', 'departure_hour']).agg(
        time_difference_minutes_mean=('time_difference_minutes', 'mean'),
        time_difference_minutes_median=('time_difference_minutes', 'median'),
        total_vehicles=('person_id', 'count')
    ).reset_index()
    # discard groups below the vehicle threshold
    enough_vehicles = tract_hour_summary['total_vehicles'] >= drop_min_vehicles
    return tract_hour_summary[enough_vehicles].copy()

def calculate_tract_time_difference_all_scenarios(base_status_df_path, simulation_status_folder, od_matrix, town_tracts):
    """Run ``calculate_tract_time_difference`` for every scenario file in a folder.

    Scenario files are the ``*.csv`` files in ``simulation_status_folder`` whose name
    ends in ``_<integer>.csv``; the integer is the scenario. Files without an integer
    suffix are reported and skipped (the same rule ``identify_largest_scenario``
    applies). Files are processed in ascending scenario order so the output is
    deterministic regardless of directory listing order.

    Args:
        base_status_df_path: path to the baseline results CSV.
        simulation_status_folder: folder holding the per-scenario result CSVs.
        od_matrix: OD matrix passed through to ``calculate_tract_time_difference``.
        town_tracts: tract ids passed through to ``calculate_tract_time_difference``.

    Returns:
        The per-scenario results concatenated, with a ``scenario`` column holding the
        scenario suffix as a string.

    Raises:
        ValueError: if the folder contains no usable scenario file.
    """
    # (1) find the scenario files first, so a wrong folder fails before the baseline is read
    scenario_files = []
    for simulation_status_file in glob.glob(os.path.join(simulation_status_folder, "*.csv")):
        scenario_name = os.path.basename(simulation_status_file).split("_")[-1].split(".")[0]
        try:
            scenario_number = int(scenario_name)
        except ValueError:
            print(f"skipping {simulation_status_file}: scenario suffix is not an integer")
            continue
        scenario_files.append((scenario_number, scenario_name, simulation_status_file))
    if not scenario_files:
        raise ValueError(f"No scenario CSV files with an integer suffix found in {simulation_status_folder}")
    scenario_files.sort()

    # (2) read the base status df once
    base_status_df = pd.read_csv(base_status_df_path)

    # (3) compute the tract-level differences per scenario
    scenario_frames = []
    for _scenario_number, scenario_name, simulation_status_file in scenario_files:
        simulation_status_df = pd.read_csv(simulation_status_file)
        time_difference_df = calculate_tract_time_difference(base_status_df, simulation_status_df, od_matrix, town_tracts, scenario_name)
        time_difference_df['scenario'] = scenario_name
        scenario_frames.append(time_difference_df)

    # (4) concatenate the dataframes
    return pd.concat(scenario_frames, ignore_index=True)

In [12]:
# test = calculate_tract_time_difference(base_status_df, test_status_df, od_matrix, town_tracts)

In [None]:
# Inputs for the tract-level comparison: the baseline results file and the folder
# holding one results CSV per scenario.
base_status_path = os.path.join(processed_dir, 'congestion_iterations_new_equation', 'interval_60_congestion_weight_0.05_for_all_road_types_1mph.csv')
simulation_status_folder = os.path.join(processed_dir, 'simulation_scenarios')
save_path = processed_dir

# One row per (origin tract, departure hour, scenario); the function prints the
# scenario name and three row counts, one after each filter, for every file.
test = calculate_tract_time_difference_all_scenarios(base_status_path, simulation_status_folder, od_matrix, town_tracts)

# test.to_csv(os.path.join(save_path, "250415_simulation_scenarios_time_difference.csv"), index=False)

10000
merge_df shape: 175146
merge_df shape: 67937
merge_df shape: 63608
30000
merge_df shape: 175146
merge_df shape: 67937
merge_df shape: 47667
20000
merge_df shape: 175146
merge_df shape: 67937
merge_df shape: 55591
40000
merge_df shape: 175146
merge_df shape: 67937
merge_df shape: 39724
50000
merge_df shape: 175146
merge_df shape: 67937
merge_df shape: 30807


In [14]:
# Per scenario, add up the median time difference of every (origin tract, departure hour) group.
# NOTE(review): this is a sum of group medians, not total minutes saved - confirm that is intended.
test.groupby(['scenario']).agg(
    time_difference_minutes_sum=('time_difference_minutes_median', 'sum'),
)

Unnamed: 0_level_0,time_difference_minutes_sum
scenario,Unnamed: 1_level_1
10000,1578.0
20000,2642.5
30000,3373.5
40000,3864.5
50000,4241.5


Calculate total time saved for each scenario

In [19]:
def calculate_tract_time_difference_raw(base_status_df, simulation_status_df, scenario_name, od_matrix=None):
    """Return per-person completion-time differences, excluding rerouted persons.

    The difference is ``completion_time_base - completion_time_simulation``; the
    ``time_difference_minutes`` column holds it in minutes, so positive values mean
    the person finished earlier in the scenario run. Persons among the first
    ``int(scenario_name)`` trips of the OD matrix (see
    ``identify_persons_from_simulation``) are removed.

    Args:
        base_status_df: baseline results with ``person_id`` and ``completion_time``.
        simulation_status_df: scenario results with the same columns.
        scenario_name: number of rerouted trips to exclude (int or numeric string).
        od_matrix: OD matrix used to identify the rerouted persons. When omitted, the
            module-level ``od_matrix`` is used, which is what this function always
            read before the parameter existed.

    Returns:
        The merged person-level DataFrame with ``time_difference`` and
        ``time_difference_minutes`` columns added.

    Raises:
        NameError: if ``od_matrix`` is not passed and no module-level one exists.
    """
    if od_matrix is None:
        # backward-compatible fallback to the notebook-level frame; fail before any work is done
        od_matrix = globals().get('od_matrix')
        if od_matrix is None:
            raise NameError("name 'od_matrix' is not defined; pass od_matrix explicitly")

    # merge the dataframes on person_id
    merged_df = pd.merge(base_status_df, simulation_status_df, on='person_id', suffixes=('_base', '_simulation'))

    # Drop rows with NaN values in the relevant columns
    merged_df = merged_df.dropna(subset=['completion_time_base', 'completion_time_simulation'])

    # take the difference of the two columns (base minus simulation)
    merged_df['completion_time_base'] = pd.to_datetime(merged_df['completion_time_base'], format='%H:%M:%S')
    merged_df['completion_time_simulation'] = pd.to_datetime(merged_df['completion_time_simulation'], format='%H:%M:%S')
    merged_df['time_difference'] = merged_df['completion_time_base'] - merged_df['completion_time_simulation']
    merged_df['time_difference_minutes'] = merged_df['time_difference'].dt.total_seconds() / 60

    # filter out the persons who have been moved to an earlier destination node
    print(scenario_name)
    person_id_filter_list = identify_persons_from_simulation(od_matrix, int(scenario_name))
    print(f"merged before filter: {merged_df.shape[0]}")
    merged_df = merged_df[~merged_df['person_id'].isin(person_id_filter_list)].copy()
    print(f"merged after filter: {merged_df.shape[0]}")
    return merged_df

def _sum_time_difference_by_scenario(base_status_df_path, simulation_status_folder, filter_scenario=None):
    """Total the person-level time differences of every scenario file in a folder.

    Scenario files are the ``*.csv`` files whose name ends in ``_<integer>.csv``; other
    CSV files are reported and skipped, and files are processed in ascending scenario
    order. ``filter_scenario`` is the number of rerouted trips excluded from every
    file; when it is None each file excludes its own scenario size.
    """
    # (1) find the scenario files first, so a wrong folder fails before the baseline is read
    scenario_files = []
    for simulation_status_file in glob.glob(os.path.join(simulation_status_folder, "*.csv")):
        scenario_name = os.path.basename(simulation_status_file).split("_")[-1].split(".")[0]
        try:
            scenario_number = int(scenario_name)
        except ValueError:
            print(f"skipping {simulation_status_file}: scenario suffix is not an integer")
            continue
        scenario_files.append((scenario_number, scenario_name, simulation_status_file))
    if not scenario_files:
        raise ValueError(f"No scenario CSV files with an integer suffix found in {simulation_status_folder}")
    scenario_files.sort()

    # (2) read the base status df once
    base_status_df = pd.read_csv(base_status_df_path)

    # (3) person-level differences per scenario file
    scenario_frames = []
    for _scenario_number, scenario_name, simulation_status_file in scenario_files:
        simulation_status_df = pd.read_csv(simulation_status_file)
        excluded_trips = scenario_name if filter_scenario is None else filter_scenario
        time_difference_df = calculate_tract_time_difference_raw(base_status_df, simulation_status_df, excluded_trips)
        time_difference_df['scenario'] = scenario_name
        scenario_frames.append(time_difference_df)

    # (4) concatenate, then (5) aggregate on scenario and sum
    totals_df = pd.concat(scenario_frames, ignore_index=True).groupby(['scenario']).agg(
        time_difference_minutes_sum=('time_difference_minutes', 'sum'),
    ).reset_index()
    totals_df['hours_saved'] = totals_df['time_difference_minutes_sum'] / 60
    return totals_df


def calculate_time_difference_max_scenarios_raw(base_status_df_path, simulation_status_folder):
    """Total time difference per scenario, excluding the largest scenario's rerouted trips everywhere.

    Every scenario file is filtered with the same number of excluded trips, namely the
    largest scenario found by ``identify_largest_scenario``, so the totals of different
    scenarios are computed over the same set of persons.

    Returns:
        DataFrame with ``scenario``, ``time_difference_minutes_sum`` and ``hours_saved``.

    Raises:
        ValueError: if the folder contains no usable scenario file.
    """
    max_scenario = identify_largest_scenario(simulation_status_folder)
    return _sum_time_difference_by_scenario(base_status_df_path, simulation_status_folder, filter_scenario=max_scenario)


def calculate_time_difference_all_scenarios_raw(base_status_df_path, simulation_status_folder):
    """Total time difference per scenario, each excluding only its own rerouted trips.

    Returns:
        DataFrame with ``scenario``, ``time_difference_minutes_sum`` and ``hours_saved``.

    Raises:
        ValueError: if the folder contains no usable scenario file.
    """
    return _sum_time_difference_by_scenario(base_status_df_path, simulation_status_folder)

def create_scenario_summary_table(base_status_df_path, simulation_status_folder):
    """Build a per-scenario table of hours saved under the two exclusion rules.

    Combines ``calculate_time_difference_all_scenarios_raw`` (each scenario excludes its
    own rerouted trips) with ``calculate_time_difference_max_scenarios_raw`` (every
    scenario excludes the largest scenario's rerouted trips), matched on ``scenario``.
    Both hour columns are rounded to whole numbers and returned as integers.

    NOTE(review): the second column is always named ``..._50k_vehicles`` even though
    the excluded count is whatever the largest scenario in the folder is.
    """
    # the largest-scenario variant runs first, then the per-scenario variant
    hours_excluding_largest = calculate_time_difference_max_scenarios_raw(base_status_df_path, simulation_status_folder)
    hours_excluding_own = calculate_time_difference_all_scenarios_raw(base_status_df_path, simulation_status_folder)

    combined = pd.merge(hours_excluding_own, hours_excluding_largest, on='scenario', suffixes=('_all', '_max'))
    summary = combined[['scenario', 'hours_saved_all', 'hours_saved_max']].rename(columns={
        'hours_saved_all': 'hours_saved_dropping_scenario_vehicle_count',
        'hours_saved_max': 'hours_saved_dropping_50k_vehicles',
    })
    # round to nearest int
    for hours_column in ('hours_saved_dropping_scenario_vehicle_count', 'hours_saved_dropping_50k_vehicles'):
        summary[hours_column] = summary[hours_column].round(0).astype(int)
    return summary

In [20]:
# Display hours saved per scenario under both exclusion rules (every scenario file is processed twice).
create_scenario_summary_table(base_status_path, simulation_status_folder)

50000
merged before filter: 191218
merged after filter: 141219
50000
merged before filter: 191218
merged after filter: 141219
50000
merged before filter: 191218
merged after filter: 141219
50000
merged before filter: 191218
merged after filter: 141219
50000
merged before filter: 191218
merged after filter: 141219
10000
merged before filter: 191218
merged after filter: 181219
30000
merged before filter: 191218
merged after filter: 161219
20000
merged before filter: 191218
merged after filter: 171219
40000
merged before filter: 191218
merged after filter: 151219
50000
merged before filter: 191218
merged after filter: 141219


Unnamed: 0,scenario,hours_saved_dropping_scenario_vehicle_count,hours_saved_dropping_50k_vehicles
0,10000,2674,1888
1,20000,2869,2159
2,30000,3996,3402
3,40000,3703,3414
4,50000,3659,3659
