the purpose of this code is to find the best model weight parameters for aligning with the Google Maps data using simulation_v7

this file was originally called 250317_setting_best_weight_v1.ipynb

In [None]:
import sys, os
import glob
code_path = os.path.join(os.path.dirname(os.getcwd()), 'code')
sys.path.append(code_path)
from simulation_v7 import *
pd.set_option('display.max_columns', None)

In [None]:
base_dir = os.path.dirname(os.path.dirname(os.getcwd()))
processed_dir = os.path.join(base_dir, 'datasets', 'processed')

path_to_solution = os.path.join(processed_dir, 'gmaps_congestion_route_times.csv')
path_to_model_output_folder = os.path.join(processed_dir, 'congestion_iterations_new_equation')

solution_df = pd.read_csv(path_to_solution)

In [5]:
def time_diff_in_seconds(row):
    # Convert time objects to seconds since midnight
    dep_seconds = row['departure_time'].hour * 3600 + row['departure_time'].minute * 60 + row['departure_time'].second
    comp_seconds = row['completion_time'].hour * 3600 + row['completion_time'].minute * 60 + row['completion_time'].second
    
    # Handle cases where completion time is on the next day
    if comp_seconds < dep_seconds:
        comp_seconds += 24 * 3600
        
    return comp_seconds - dep_seconds

def determine_second_differences_single_output(solution_df, output_df, filename):
    # (1) merge the solution and output dataframes
    merged_df = solution_df.merge(output_df, on=['person_id'], how='left')
    
    # (2) convert time strings to datetime objects
    merged_df['departure_time'] = pd.to_datetime(merged_df['departure_time'], format='%H:%M:%S').dt.time
    merged_df['completion_time'] = pd.to_datetime(merged_df['completion_time'], format='%H:%M:%S').dt.time
    
    # (3) calculate the time traveled in seconds
    merged_df['duration_in_traffic_seconds_model'] = merged_df.apply(time_diff_in_seconds, axis=1)

    # (4) calculate the difference in durations
    merged_df['duration_difference_seconds'] = merged_df['duration_in_traffic_seconds'] - merged_df['duration_in_traffic_seconds_model']
    merged_df['filename'] = filename
    
    return merged_df[['person_id', 'duration_difference_seconds', 'filename']]

def determine_second_differences_multiple_outputs(solution_df, path_to_model_output_folder):
    # (1) get all the files in the model output folder
    output_files = glob.glob(path_to_model_output_folder + "*.csv")
    
    # (2) read in the files and calculate the differences
    all_differences = []
    for file in output_files:
        output_df = pd.read_csv(file)
        output_df['iteration'] = file.split('_')[-1].split('.')[0]
        filename = os.path.basename(file)
        differences = determine_second_differences_single_output(solution_df, output_df, filename)
        all_differences.append(differences)
        
    return pd.concat(all_differences).reset_index(drop=True)

def create_duration_difference_table(long_df, minute_weight_list):
    # minute_weight_list = [1, 5, 10, 15]
    # output: df with unique row per filename
    # columns: one for every minute weight, counting the number of rows (person_id's) within that category

    # (1) create a new columns for each minute weight
    minute_weight_column_names = []
    for minute_weight in minute_weight_list:
        long_df[f'difference_within_{minute_weight}_min'] = long_df['duration_difference_seconds'].apply(lambda x: 1 if x <= minute_weight * 60 else 0)
        minute_weight_column_names.append(f'difference_within_{minute_weight}_min')
    # (2) group by filename and sum the columns
    grouped_df = long_df.groupby('filename')[minute_weight_column_names].sum().reset_index()

    return grouped_df

In [6]:
test = determine_second_differences_multiple_outputs(solution_df, path_to_model_output_folder)
test2 = create_duration_difference_table(test, [1, 5, 10, 15, 50])

In [7]:
test2.head(50)

Unnamed: 0,filename,difference_within_1_min,difference_within_5_min,difference_within_10_min,difference_within_15_min,difference_within_50_min
0,interval_60_congestion_weight_0.01_1mph.csv,8864,9825,9991,9999,10000
1,interval_60_congestion_weight_0.01_for_all_roa...,9788,9972,9998,10000,10000
2,interval_60_congestion_weight_0.02_1mph.csv,8859,9837,9983,10000,10000
3,interval_60_congestion_weight_0.02_for_all_roa...,9727,9966,9998,10000,10000
4,interval_60_congestion_weight_0.03_1mph.csv,8513,9721,9974,9999,10000
5,interval_60_congestion_weight_0.03_for_all_roa...,9680,9952,9994,9999,10000
6,interval_60_congestion_weight_0.04_1mph.csv,8367,9752,9972,9997,10000
7,interval_60_congestion_weight_0.04_for_all_roa...,9586,9945,9995,9999,10000
8,interval_60_congestion_weight_0.05_1mph.csv,8070,9638,9963,10000,10000
9,interval_60_congestion_weight_0.05_for_all_roa...,9504,9926,9995,10000,10000


In [None]:
# save_path = os.path.dirname(os.path.dirname(os.getcwd()))
# test2.to_csv(save_path + "duration_difference_table.csv", index=False)