In [1]:
%matplotlib inline

# Path management
import os
from pathlib import Path

# Get main project directory.
# The notebook is normally launched from a sub-folder of the project, so the
# project root is the parent of the current working directory. Because this
# cell also changes the working directory, re-running it would climb one more
# level every time. To keep the cell idempotent, stay where we are when the
# current directory already contains the `src` package imported below.
current_dir = Path(os.path.abspath(''))
if (current_dir / 'src').is_dir():
    main_dir = str(current_dir)
else:
    main_dir = str(current_dir.parents[0])
os.chdir(main_dir)
print('main dir:',main_dir)

main dir: /Users/pablo/OneDrive/data-science/github/transportAI


In [2]:
# Internal modules
from src import transportAI as tai

# External modules
import ast
import sys
import pandas as pd

main_dir: /Users/pablo/OneDrive/data-science/github/transportAI


In [3]:
# =============================================================================
# 2) NETWORK FACTORY
# =============================================================================
# Name of the network under study. It is passed as `network_key` to the data
# reader and as `network_name` when the network object is built.
network_name = 'Fresno'

In [4]:
# =============================================================================
# a) READ FRESNO LINK DATA
# =============================================================================

# Reader of geospatial and spatio-temporal data
# (setup_spark=True presumably starts the Spark session seen in the cell log)
data_reader = tai.etl.DataReader(network_key=network_name, setup_spark=True)

# Read the link and node tables of the Fresno network
links_df, nodes_df = tai.reader.read_fresno_network(
    folderpath=tai.dirs['Fresno_network'])

# Persist the node table as csv
nodes_filepath = tai.dirs['output_folder'] + '/network-data/nodes/' + 'fresno-nodes-data.csv'
nodes_df.to_csv(nodes_filepath, sep=',', encoding='utf-8', index=False, float_format='%.3f')

# Add link key in dataframe: (origin node, destination node, '0')
node_pairs = zip(links_df['init_node_key'], links_df['term_node_key'])
links_df['link_key'] = [(int(origin), int(destination), '0') for origin, destination in node_pairs]

22/03/21 13:40:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# =============================================================================
# a) BUILD NETWORK
# =============================================================================

# Factory object used to assemble the network from the link and node tables
network_generator = tai.factory.NetworkGenerator()

# Adjacency matrix generated from the link keys stored in the links table
link_keys = links_df['link_key'].tolist()
A = network_generator.generate_adjacency_matrix(links_keys=link_keys)

# Network object for Fresno
fresno_network = network_generator.build_fresno_network(
    A=A,
    links_df=links_df,
    nodes_df=nodes_df,
    network_name=network_name,
)


Creating Fresno network

Nodes: 1789, Links: 2413


In [6]:
# =============================================================================
# f) OD
# =============================================================================

# - Periods (4 periods of 15 minutes each, i.e. 60 minutes in total)
data_reader.options['od_periods'] = [1, 2, 3, 4]

# Read OD from raw data
od_filepath = tai.dirs['Fresno_network'] + '/SR41.dmd'
Q = tai.reader.read_fresno_dynamic_od(
    network=fresno_network,
    filepath=od_filepath,
    periods=data_reader.options['od_periods'],
)

# Write the OD matrix of the network in sparse format
network_generator.write_OD_matrix(network=fresno_network, sparse=True, overwrite_input=False)


Matrix Q (1789, 1789) read in 0.5[s]
Trips from 6970 o-d pairs were loaded
Matrix Q (1789, 1789) written in 0.0[s] with sparse format


In [7]:
# =============================================================================
# g) PATHS
# =============================================================================

# Create path generator
paths_generator = tai.factory.PathsGenerator()

# Generate and load paths in network (k=4 shortest paths per o-d pair)
paths_generator.load_k_shortest_paths(network = fresno_network, k=4)

# Write paths and incident matrices
paths_generator.write_paths(network=fresno_network, overwrite_input=False)

# Write the C, D and M matrices, all flagged as sparse
network_generator.write_incident_matrices(network = fresno_network,
                                          matrices = {'sparse_C':True, 'sparse_D':True, 'sparse_M':True},
                                          overwrite_input = False)

# Read paths back and update the incidence matrices accordingly.
# NOTE(review): the cell log reports 27774 paths generated/written but only
# 26380 paths read here. Presumably, with overwrite_input=False, the paths are
# written to a different location than the one read_paths reads from, so a
# stale file replaces the freshly generated path set -- confirm before relying
# on the matrices produced by this cell.
paths_generator.read_paths(network=fresno_network, update_incidence_matrices=True)

Generating paths
Generating 4 paths per od
27774 paths were generated among 6970 od pairs
Computation time: 189.1 [s]
27774 paths were loaded
Updating incident matrices
Progress(D): |████████████████████| 100.0% 
Matrix D (2413, 27774) generated in 29.4[s]
Progress(M): |████████████████████| 100.0% 
Matrix M (6970, 27774) generated in 1.2[s]
Matrix C (27774, 27774) generated in 6.2[s]
Progress (paths): |████████████████████| 100.0% 
27774 paths were written in 5.2[s]
Matrix C (27774, 27774) written in 7.7[s] with sparse format
Matrix D (2413, 27774) written in 1.2[s] with sparse format
Matrix M (6970, 27774) written in 2.0[s] with sparse format
26380 paths were read in 7.3[s]| 100.0% 
26380 paths were loaded
Updating incident matrices
Progress(D): |████████████████████| 100.0% 
Matrix D (2413, 26380) generated in 27.8[s]
Progress(M): |████████████████████| 100.0% 
Matrix M (6970, 26380) generated in 1.2[s]
Matrix C (26380, 26380) generated in 6.0[s]


In [8]:
# =============================================================================
# c) LINK FEATURES FROM NETWORK FILE
# =============================================================================

# Extract data on link features from network file.
# An explicit copy is taken so that the column assignments below operate on an
# independent dataframe instead of on a slice of links_df (which triggers
# pandas' chained-assignment warning and has ambiguous write-through semantics).
link_features_df = links_df[['link_key', 'id', 'link_type', 'rhoj', 'lane', 'ff_speed', 'length']].copy()

# Attributes
# - Remove surrounding whitespace from the link type labels
link_features_df['link_type'] = link_features_df['link_type'].str.strip()

# - Cast numeric attributes; values that cannot be parsed become NaN
# NOTE(review): 'ff_speed' is not converted here (it is only cast later, on
# links_df); confirm whether it should also be numeric in the loaded features.
link_features_df['rhoj'] = pd.to_numeric(link_features_df['rhoj'], errors='coerce', downcast='float')
link_features_df['lane'] = pd.to_numeric(link_features_df['lane'], errors='coerce', downcast='integer')
link_features_df['length'] = pd.to_numeric(link_features_df['length'], errors='coerce', downcast='float')

# Load features data
fresno_network.load_features_data(linkdata=link_features_df, link_key = 'link_key')


In [9]:
# =============================================================================
# d) LINK PERFORMANCE FUNCTIONS
# =============================================================================

options = {'tt_units': 'minutes'}

# Factor that converts length / speed into the requested travel time units.
# Speeds are originally in per hour units, so weighting by 60 leaves travel
# times in minutes and weighting by 60 * 60 leaves them in seconds.
tt_factor_by_units = {'minutes': 60, 'seconds': 60 * 60}

# Fail early on unsupported units instead of leaving tt_factor undefined
# (or silently reusing a stale value from a previous run of the cell)
if options['tt_units'] not in tt_factor_by_units:
    raise ValueError("options['tt_units'] must be one of " + str(list(tt_factor_by_units))
                     + ', got ' + repr(options['tt_units']))

tt_factor = tt_factor_by_units[options['tt_units']]

# Create two new features: numeric free flow speed and free flow travel time
links_df['ff_speed'] = pd.to_numeric(links_df['ff_speed'], errors='coerce', downcast='float')
links_df['ff_traveltime'] = tt_factor * links_df['length'] / links_df['ff_speed']

# BPR parameters per link: constant alpha and beta, free flow travel time (tf)
# and capacity (k) taken from the network file
bpr_parameters_df = pd.DataFrame({'link_key': links_df['link_key'],
                                  'alpha': 0.15,
                                  'beta': 4,
                                  'tf': links_df['ff_traveltime'],
                                  'k': pd.to_numeric(links_df['capacity'], errors='coerce', downcast='float')
                                  })

fresno_network.set_bpr_functions(bprdata=bpr_parameters_df, link_key = 'link_key')

In [10]:
# =============================================================================
# d) SPATIO-TEMPORAL LINK FEATURES AND TRAFFIC COUNTS
# =============================================================================

# First Tuesday of October, 2019 (2019-10-01) and of October, 2020 (2020-10-06)
dates = ['2019-10-01','2020-10-06']

options['update_ff_tt_inrix'] = True

# Factor that converts length / speed into the requested travel time units
# (speeds are originally in per hour units). It does not depend on the date,
# so it is computed once and validated before any expensive processing starts.
tt_factor_by_units = {'minutes': 60, 'seconds': 60 * 60}
if options['tt_units'] not in tt_factor_by_units:
    raise ValueError("options['tt_units'] must be one of " + str(list(tt_factor_by_units))
                     + ', got ' + repr(options['tt_units']))
tt_factor = tt_factor_by_units[options['tt_units']]

# Length in minutes of each OD period
od_period_minutes = 15

# Folder where the link level outputs are written. os.path.join makes the
# result independent of whether the configured folder ends with a separator.
links_output_dir = os.path.join(tai.dirs['output_folder'], 'network-data', 'links')

# Features subject to imputation and reset to zero in non-regular links
features_list = ['median_inc', 'intersections', 'incidents', 'bus_stops', 'median_age',
                 'tt_avg', 'tt_sd','tt_var', 'tt_cv',
                 'speed_ref_avg', 'speed_avg','speed_sd','speed_cv']

for date in dates:

    # Select the analysis period once per date (16:00 of the given day)
    data_reader.select_period(date=date, hour=16)

    selected_date = str(data_reader.options['selected_date'])

    # =============================================================================
    # SPATIO-TEMPORAL LINK FEATURES
    # =============================================================================

    features_filepath = os.path.join(links_output_dir,
                                     selected_date + '-fresno-spatiotemporal-link-data.csv')

    spatiotemporal_features_df, spatiotemporal_features_list = data_reader.read_spatiotemporal_data_fresno(
            lwrlk_only=False,
            network=fresno_network,
            selected_period_incidents={'year': [data_reader.options['selected_year']],
                                       'month': [7, 8, 9, 10]},
            data_processing={'inrix_segments': True, 'inrix_data': True, 'census': True, 'incidents': True,
                             'bus_stops': True, 'streets_intersections': True},
            inrix_matching={'census': False, 'incidents': True, 'bus_stops': True, 'streets_intersections': True},
            buffer_size={'inrix': 200, 'bus_stops': 50, 'incidents': 50, 'streets_intersections': 50},
            # Same units as the link performance functions
            tt_units=options['tt_units']
        )

    spatiotemporal_features_df.to_csv(features_filepath, sep=',', encoding='utf-8', index=False,
                                      float_format='%.3f')

    # Test reader: the features loaded in the network are the ones read back from
    # the csv, i.e. rounded to three decimals by the float format used above
    spatiotemporal_features_df = pd.read_csv(features_filepath)

    fresno_network.load_features_data(spatiotemporal_features_df)

    # =============================================================================
    # d) FREE FLOW TRAVEL TIME OF LINK PERFORMANCE FUNCTIONS
    # =============================================================================

    if options['update_ff_tt_inrix']:
        for link in fresno_network.links:
            # Only regular links with INRIX speed data are updated. The maximum
            # speed is the divisor, so it is also required to be non-zero.
            if link.link_type == 'LWRLK' and link.Z_dict['speed_ref_avg'] != 0 \
                    and link.Z_dict['speed_max'] != 0:
                link.bpr.tf = tt_factor * link.Z_dict['length'] / link.Z_dict['speed_max']

        # NOTE(review): presumably this call rebuilds the BPR functions from
        # bpr_parameters_df, whose 'tf' column comes from the network file. If so,
        # it discards the INRIX based free flow travel times assigned just above.
        # Confirm the intended order before relying on link.bpr.tf.
        fresno_network.set_bpr_functions(bprdata=bpr_parameters_df, link_key = 'link_key')

    # =============================================================================
    # 3c) DATA CURATION
    # =============================================================================

    # a) Imputation to correct for outliers and observations with zero values because no GIS matching
    for feature in features_list:
        fresno_network.link_data.feature_imputation(feature=feature, pcts=(2, 98))

    # b) Feature values in "connectors" links
    for link in fresno_network.get_non_regular_links():
        for key in features_list:
            link.Z_dict[key] = 0
    print('Features values of link with types different than "LWRLK" were set to 0')

    # =============================================================================
    # 2.2) TRAFFIC COUNTS
    # =============================================================================

    # ii) Read data from PEMS count and perform matching GIS operations to combine station shapefiles

    date_pathname = selected_date.replace('-', '_')

    path_pems_counts = os.path.join(tai.dirs['input_folder'], 'public', 'pems', 'counts', 'data',
                                    'd06_text_station_5min_' + date_pathname + '.txt.gz')

    # Load pems station ids in links
    tai.etl.load_pems_stations_ids(network=fresno_network)

    # Read and match count data from a given period. The duration (in minutes)
    # spans the same time window as the OD periods loaded in the network.
    count_interval_df = data_reader.read_pems_counts_by_period(
        filepath=path_pems_counts,
        selected_period={'hour': data_reader.options['selected_hour'],
                         'duration': int(len(data_reader.options['od_periods']) * od_period_minutes)})

    # Generate a masked vector that fill out count values with no observations with nan
    counts = tai.etl.generate_fresno_pems_counts(links=fresno_network.links,
                                                 data=count_interval_df,
                                                 flow_attribute='flow_total_lane',
                                                 flow_factor=1)

    # Write counts in csv
    counts_filepath = os.path.join(links_output_dir, selected_date + '-fresno-link-counts.csv')

    # NOTE(review): 'pems_ids' is aligned with the counts by position, which assumes
    # that the counts dictionary is ordered like fresno_network.links -- confirm.
    counts_df = pd.DataFrame({'link_key': list(counts.keys()),
                              'counts': list(counts.values()),
                              'pems_ids': [link.pems_stations_ids for link in fresno_network.links]})
    counts_df.to_csv(counts_filepath, sep=',', encoding='utf-8', index=False, float_format='%.3f')

    # Read counts from csv (link keys are stored as text and parsed back into tuples)
    counts_df = pd.read_csv(counts_filepath, converters={"link_key": ast.literal_eval})

    counts = dict(zip(counts_df['link_key'].values, counts_df['counts'].values))

    # Load counts
    fresno_network.load_traffic_counts(counts=counts)

    # =============================================================================
    # c) WRITE LINK FEATURES AND COUNTS
    # =============================================================================
    summary_table_links_df = tai.descriptive_statistics.summary_table_links(links=fresno_network.links)

    summary_table_links_df.to_csv(os.path.join(links_output_dir, selected_date + '-fresno-link-data.csv'),
                                  sep=',', encoding='utf-8', index=False, float_format='%.3f')


Selected date is 2019-10-01, Tuesday at 16:00

Selected date is 2019-10-01, Tuesday at 16:00
            perc  count
link_types             
LWRLK      71.2%   1717
PQULK      14.4%    348
DMDLK       7.2%    174
DMOLK       7.2%    174

Matching geospatial datasets to all links types" 

Reading network shapefile generated from x,y coordinates and qgis

Reading inrix shapefile of Fresno
Matching INRIX segments (N=13417) with network links
1801 network links were matched (74.6% of links) with a 78.6% confidence

Reading and processing INRIX data with pyspark


22/03/21 13:46:17 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Merging INRIX data of speeds and travel times with network links

Reading census data at the block level
2313 network links were matched (95.9% of links)
100 network links were imputed (4.1% of links)

Reading traffic incidents data


                                                                                

Matching incidents (N=1644) with network links
1861 incidents were matched to 471 links  (19.5% of links)

Matching bus stops (N=1503) with network links
1318 bus stops were matched to 743 links (30.8% of links)

Reading shapefiles with street intersections in Fresno

Matching street intersections (N=17302) with network links
6227 street intersecions were matched to 1727 links (71.6% of links)
Data for feature median_inc was imputed with value 37.2922 among 151 links
Data for feature intersections was imputed with value 3.3928 among 234 links
Data for feature incidents was imputed with value 3.0547 among 1321 links
Data for feature bus_stops was imputed with value 1.5672 among 1107 links
Data for feature median_age was imputed with value 31.5406 among 116 links
Data for feature tt_avg was imputed with value 0.5172 among 309 links
Data for feature tt_sd was imputed with value 0.1251 among 306 links
Data for feature tt_var was imputed with value 0.0258 among 447 links
Data for feature tt

                                                                                


Matching PEMS traffic count measurements in network links
122 links were perfectly matched
19 links counts were imputed using the average traffic counts among lanes

Selected date is 2020-10-06, Tuesday at 16:00

Selected date is 2020-10-06, Tuesday at 16:00
            perc  count
link_types             
LWRLK      71.2%   1717
PQULK      14.4%    348
DMDLK       7.2%    174
DMOLK       7.2%    174

Matching geospatial datasets to all links types" 

Reading network shapefile generated from x,y coordinates and qgis

Reading inrix shapefile of Fresno
Matching INRIX segments (N=13417) with network links
1758 network links were matched (72.9% of links) with a 77.3% confidence

Reading and processing INRIX data with pyspark


                                                                                

Merging INRIX data of speeds and travel times with network links

Reading census data at the block level
2313 network links were matched (95.9% of links)
100 network links were imputed (4.1% of links)

Reading traffic incidents data


                                                                                

Matching incidents (N=1098) with network links
525 incidents were matched to 320 links  (13.3% of links)

Matching bus stops (N=1503) with network links
1357 bus stops were matched to 760 links (31.5% of links)

Reading shapefiles with street intersections in Fresno

Matching street intersections (N=17302) with network links
12590 street intersecions were matched to 1801 links (74.6% of links)
Data for feature median_inc was imputed with value 37.2922 among 151 links
Data for feature intersections was imputed with value 6.6204 among 174 links
Data for feature incidents was imputed with value 1.4964 among 1472 links
Data for feature bus_stops was imputed with value 1.7397 among 1095 links
Data for feature median_age was imputed with value 31.5406 among 116 links
Data for feature tt_avg was imputed with value 0.4586 among 126 links
Data for feature tt_sd was imputed with value 0.0694 among 125 links
Data for feature tt_var was imputed with value 0.0078 among 125 links
Data for feature tt

  mean = diff_sum / diff_num


Data for feature speed_hist_avg was imputed with value nan among 0 links
Data for feature speed_sd was imputed with value 4.0361 among 126 links
Data for feature speed_hist_sd was imputed with value nan among 0 links
Data for feature speed_cv was imputed with value 0.1428 among 125 links
Features values of link with types different than "LWRLK" were set to 0

Reading network shapefile generated from x,y coordinates and qgis
Reading pems counts starting at 16:00 and during 60 minutes


                                                                                


Matching PEMS traffic count measurements in network links
122 links were perfectly matched
19 links counts were imputed using the average traffic counts among lanes
