In [61]:
import numpy as np
import pandas as pd
from shapely import geometry
import shapely.speedups
from tqdm import tqdm
from quetzal.model import stepmodel

# Preparation of the transport network.
## Saves aggregated bus and short-distance rail network.
## Needs PT networks with access and egress.

In [62]:
# Directory prefixes (relative to this notebook) that are prepended
# to the file names read and written in the cells below
input_path = '../input_static/'
output_path = '../output/'
model_path = '../model/'

In [63]:
# Loading StepModel with PT networks...
# `sm` is the model that gets extended and saved at the end of this notebook;
# `bus` holds the bus network that is aggregated and merged into `sm`
sm = stepmodel.read_json(input_path + 'de_pt_network')
bus = stepmodel.read_json(input_path + 'de_pt_network_bus')
# Loading access and egress
# (its `zone_to_transit` and `footpaths` tables are used below to count
# the connectors attached to every stop)
ae = stepmodel.read_json(model_path + 'de_pt_access_egress')

In [64]:
sm.nodes.sample()

Unnamed: 0_level_0,route_type,stop_name,FID,geometry
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
coach_node_FLIXBUS:2855,coach,Potsdam,DE404,POINT (13.06755 52.39055)


In [65]:
# Check nodeset integrity for later steps to work
try:
    sm.integrity_test_nodeset_consistency()
except AssertionError:
    # The failed test leaves the offending node labels in sm.orphan_nodes:
    # report how many there are and remove them from the node table
    print('Found {} orphan nodes'.format(len(sm.orphan_nodes)))
    sm.nodes.drop(sm.orphan_nodes, inplace=True)
    # Test integrity again
    # (not guarded any more: a second failure stops the notebook here)
    sm.integrity_test_nodeset_consistency()

Found 5 orphan nodes
no road_links or road_nodes


In [66]:
# Test sequences
# Use an own function because quetzal's takes ages
def test_sequences(trip):
    assert len(trip)==trip['link_sequence'].max(), \
        'broken sequence in trip {}'.format(trip['trip_id'].unique()[0])

In [67]:
# Fix sequences
# Use an own function because quetzal's takes ages
def fix_sequences(trip):
    """Split a trip at broken link successions and renumber its links.

    The links are ordered by 'link_sequence'. Wherever a link does not
    start at the stop where its predecessor ends, all following links are
    moved to a new trip by appending '_<i>' to their 'trip_id', with i being
    the zero-based position of the link before the break. Suffixes
    accumulate when a trip breaks more than once. Afterwards
    'link_sequence' is rewritten to 1..n within every resulting trip.

    Parameters
    ----------
    trip : pandas.DataFrame
        Links of one trip with the columns 'a', 'b', 'trip_id' and
        'link_sequence'. The passed frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of the links, sorted by the original 'link_sequence', with
        repaired 'trip_id' and 'link_sequence' columns.
    """
    trip = trip.sort_values('link_sequence')
    n_links = len(trip)
    if n_links == 0:
        # Nothing to repair
        return trip

    # Check link succession on plain lists (positional, so it does not
    # depend on unique index labels)
    origins = trip['a'].tolist()
    destinations = trip['b'].tolist()
    trip_ids = trip['trip_id'].tolist()
    was_split = False
    for i in range(n_links - 1):
        if destinations[i] != origins[i + 1]:
            # Stop has no successor link: everything behind it becomes a new trip
            suffix = '_' + str(i)
            trip_ids[i + 1:] = [trip_id + suffix for trip_id in trip_ids[i + 1:]]
            was_split = True
    if was_split:
        trip['trip_id'] = trip_ids

    # Repair sequences. This is also required after every split, because
    # the new sub-trips would otherwise keep the numbering of the old trip
    if was_split or n_links != trip['link_sequence'].max():
        # cumcount numbers the rows of each trip in their current order
        trip['link_sequence'] = trip.groupby('trip_id', sort=False).cumcount() + 1
    return trip

In [68]:
# Test and save broken sequences
def test_sequences_save(trip):
    if len(trip)!=trip['link_sequence'].max():
        return list(trip.index)

In [69]:
# Register tqdm's progress_apply on pandas objects
tqdm.pandas()
try:
    # Raises on the first trip whose link numbering is broken
    sm.links.groupby('trip_id').progress_apply(test_sequences)
except AssertionError:
    # Repair all trips; the apply result carries 'trip_id' as an additional
    # outer index level, which is dropped to get back to plain link labels
    links = sm.links.groupby('trip_id').progress_apply(fix_sequences).reset_index(level=0, drop=True)
    # Verify the repair before replacing the links of the model
    links.groupby('trip_id').progress_apply(test_sequences)
    sm.links = links

100%|██████████████████████████████████████████████████████████████████████████| 21118/21118 [00:07<00:00, 2852.04it/s]


In [70]:
# One entry per bus trip: the list of its link labels if the numbering
# is broken, None (shown as missing) otherwise
broken_seqs = bus.links.groupby('trip_id').progress_apply(test_sequences_save)

100%|████████████████████████████████████████████████████████████████████████| 210028/210028 [01:14<00:00, 2816.06it/s]


In [71]:
broken_seqs.loc[broken_seqs.notna()]

trip_id
bus_1000218    [bus_2798066, bus_2798067, bus_2798068, bus_27...
bus_10004      [bus_134951, bus_134952, bus_134953, bus_13495...
bus_1003106    [bus_2802750, bus_2802751, bus_2802752, bus_28...
bus_1003982    [bus_2804123, bus_2804124, bus_2804125, bus_28...
bus_10068      [bus_135639, bus_135640, bus_135641, bus_13564...
                                     ...                        
bus_99702      [bus_729540, bus_729541, bus_729542, bus_72954...
bus_99879      [bus_730188, bus_730189, bus_730190, bus_73019...
bus_999762     [bus_2797494, bus_2797495, bus_2797496, bus_27...
bus_999928     [bus_2797641, bus_2797642, bus_2797643, bus_27...
bus_99996      [bus_730718, bus_730719, bus_730720, bus_73072...
Length: 2178, dtype: object

In [72]:
# Flatten the per-trip label lists of all broken trips. A comprehension is
# linear in the number of links and yields an empty list when nothing is
# broken (Series.sum() concatenates repeatedly and would return 0 then)
broken_link_ids = [link_id
                   for link_ids in broken_seqs.dropna()
                   for link_id in link_ids]
# Repair only the links of broken trips
links = bus.links.loc[broken_link_ids].groupby('trip_id').progress_apply(fix_sequences)

100%|██████████████████████████████████████████████████████████████████████████████| 2178/2178 [01:01<00:00, 35.13it/s]


In [73]:
links.sample()

Unnamed: 0_level_0,Unnamed: 1_level_0,route_id,route_type,a,b,time,trip_id,link_sequence,geometry
trip_id,index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
bus_453113,bus_1784861,bus_17486,bus,bus_n_146827,bus_n_101299,120.0,bus_453113_0_1_2_3_4_5_6_8,40,"LINESTRING (8.86357 53.20147, 8.84267 53.19446)"


In [74]:
links.shape

(63092, 8)

In [75]:
# The grouped apply added 'trip_id' as outer index level: remove it to get
# back to plain link labels. The guard keeps the cell re-runnable, because a
# second reset on the single remaining level would discard the link labels
if links.index.nlevels > 1:
    links.reset_index(level=0, drop=True, inplace=True)

In [76]:
links.groupby('trip_id').progress_apply(test_sequences)

100%|██████████████████████████████████████████████████████████████████████████| 36574/36574 [00:11<00:00, 3285.25it/s]


In [77]:
# Replace the links of all broken trips by their repaired versions.
# pd.concat is used because DataFrame.append was removed in pandas 2.0
broken_link_ids = [link_id
                   for link_ids in broken_seqs.dropna()
                   for link_id in link_ids]
bus.links = pd.concat([bus.links.drop(broken_link_ids), links])

In [78]:
bus.links.sample()

Unnamed: 0_level_0,route_id,route_type,a,b,time,trip_id,link_sequence,geometry
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bus_2495760,bus_12218,bus,bus_n_402830,bus_n_207384,180.0,bus_815375,20,"LINESTRING (12.54127 50.84612, 12.54136 50.83693)"


In [79]:
bus.nodes.sample()

Unnamed: 0_level_0,stop_name,route_type,FID,geometry
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bus_n_172631,Reddebeitz,bus,DE934,POINT (11.18059 52.94834)


In [80]:
# Stops without a zone ID ('FID') lie outside of all zones: discard them
if 'FID' in sm.nodes.columns:
    networks = (sm, bus)
    for network in networks:
        inside_a_zone = network.nodes['FID'].notna()
        network.nodes = network.nodes.loc[inside_a_zone]
    # Report the remaining table sizes (main model first, then bus)
    for network in networks:
        print(network.nodes.shape)

(15376, 4)
(413611, 4)


In [81]:
sm.links.loc[sm.links.duplicated(['a', 'b', 'trip_id'], keep=False)]

Unnamed: 0_level_0,a,b,link_sequence,route_id,route_type,time,trip_id,geometry
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1


In [82]:
# Divide nodes
print(sm.nodes.shape)
# Disabled variant that also moves the short-distance rail stops into the
# set that gets aggregated:
#disagg_nodes = sm.nodes.loc[sm.nodes['route_type']=='rail_short_distance'].append(bus.nodes)
#sm.nodes = sm.nodes.loc[sm.nodes['route_type']!='rail_short_distance']
# Work on an independent copy: helper columns are added to this table
# below, which must neither alter the loaded bus model nor write into a
# filtered slice of it
disagg_nodes = bus.nodes.copy()
print(disagg_nodes.shape)

(15376, 4)
(413611, 4)


In [83]:
# Divide links
print(sm.links.shape)
# Disabled variant that also moves the short-distance rail links into the
# set that gets aggregated:
#disagg_links = sm.links.loc[sm.links['route_type']=='rail_short_distance'].append(bus.links)
#sm.links = sm.links.loc[sm.links['route_type']!='rail_short_distance']
# NOTE: this is a second name for the same table, not a copy. Columns added
# to disagg_links below therefore show up in bus.links as well
disagg_links = bus.links
print(disagg_links.shape)

(213129, 8)
(3210535, 8)


In [84]:
# Number of trips
len(disagg_links['trip_id'].unique())

244424

## Remove unnecessary stops
Bus and short-distance rail services in the GTFS feeds contain trips with intermediate stops that have no further connection to other services, zones or footpaths. Such stops can be removed, so the PT network graph can be reduced without loss of information.

In [85]:
sm.links.sample()

Unnamed: 0_level_0,a,b,link_sequence,route_id,route_type,time,trip_id,geometry
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
rail_short_126705,rail_short_node_12415,rail_short_node_4987,1.0,rail_short_17,rail_short_distance,660.0,rail_short_32958,"LINESTRING (11.09701 47.49145, 11.13927 47.55927)"


In [86]:
# Count the number of links to/from each node
n_links_dict = disagg_links[['a', 'b']].stack().value_counts().to_dict()
# Same count for access/egress links and footpaths
# (pd.concat because DataFrame.append was removed in pandas 2.0)
connector_set = pd.concat([ae.zone_to_transit[['a', 'b']],
                           ae.footpaths[['a', 'b']]])
n_connectors_dict = connector_set.stack().value_counts().to_dict()
# Stops that never occur in a link or connector get a count of zero
# instead of raising a KeyError / leaving NaN behind
disagg_nodes['n_links'] = [n_links_dict.get(node, 0)
                           for node in disagg_nodes.index]
# Kept as float, like the former NaN-filled column
disagg_nodes['n_connectors'] = [float(n_connectors_dict.get(node, 0))
                                for node in disagg_nodes.index]

In [87]:
disagg_nodes.loc[disagg_nodes.isna().any(axis=1)]

Unnamed: 0_level_0,stop_name,route_type,FID,geometry,n_links,n_connectors
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [88]:
print(disagg_nodes.loc[disagg_nodes['route_type']=='bus'].shape)
print(disagg_nodes.loc[disagg_nodes['route_type']!='bus'].shape)

(413611, 6)
(0, 6)


In [89]:
disagg_nodes.sample(2)

Unnamed: 0_level_0,stop_name,route_type,FID,geometry,n_links,n_connectors
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bus_n_366710,Stuttgart Nägelestraße,bus,DE111,POINT (9.17357 48.75149),4,0.0
bus_n_176065,"Bausendorf, Parkanlage",bus,DEB22,POINT (6.99489 50.01281),16,0.0


In [90]:
# Stops that survive the aggregation:
# - non-bus stops with more than three links or with at least one connector
# - bus stops only if at least one connector (access/egress link or
#   footpath) attaches to them
is_bus_stop = disagg_nodes['route_type'] == 'bus'
has_connector = disagg_nodes['n_connectors'] > 0
has_many_links = disagg_nodes['n_links'] > 3
keep_non_bus = ~is_bus_stop & (has_many_links | has_connector)
keep_bus = is_bus_stop & has_connector
agg_nodes = disagg_nodes.loc[keep_non_bus | keep_bus]

In [91]:
print(agg_nodes.loc[agg_nodes['route_type']=='bus'].shape)
print(agg_nodes.loc[agg_nodes['route_type']!='bus'].shape)

(12202, 6)
(0, 6)


### Aggregate links within the broken trips

In [92]:
# Flag every link that touches at least one retained stop;
# links between two dropped stops are not relevant
retained_stops = agg_nodes.index
starts_at_retained = disagg_links['a'].isin(retained_stops)
ends_at_retained = disagg_links['b'].isin(retained_stops)
disagg_links['relevant'] = starts_at_retained | ends_at_retained

In [93]:
disagg_links.loc[disagg_links['relevant']].shape

(205415, 9)

In [94]:
disagg_links.sample()

Unnamed: 0_level_0,route_id,route_type,a,b,time,trip_id,link_sequence,geometry,relevant
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
bus_1531628,bus_5420,bus,bus_n_343308,bus_n_63069,60.0,bus_346828,9,"LINESTRING (7.70648 52.29541, 7.71183 52.29407)",False


In [95]:
# Function for aggregating links
def agg_trips(trip, nodes=None):
    """Reduce one trip to the links between retained stops.

    Links running through dropped stops are merged into one direct link
    between the neighbouring retained stops. The merged link gets a straight
    line geometry and the summed travel time of all original links it
    replaces, including links in between that touch no retained stop at all.

    Parameters
    ----------
    trip : pandas.DataFrame
        Links of a single trip with the columns 'a', 'b', 'time',
        'link_sequence', 'geometry', 'trip_id' and the boolean column
        'relevant' (True if the link touches a retained stop).
    nodes : pandas.DataFrame, optional
        Retained stops, indexed by stop label, with a 'geometry' column.
        Defaults to the notebook-level `agg_nodes` table.

    Returns
    -------
    pandas.DataFrame or None
        The unchanged `trip` if it only uses retained stops; a reduced and
        renumbered copy of its links otherwise; None if no link remains, a
        merged link has an end without geometry, or the remaining links do
        not form a continuous chain.
    """
    if nodes is None:
        # Fall back to the table of retained stops defined in the notebook
        nodes = agg_nodes
    retained = nodes.index

    # Drop links with missing nodes
    trip_agg = trip.loc[trip['relevant']].sort_values('link_sequence')
    if len(trip_agg.index) == 0:
        # Trip is fully irrelevant
        return None

    # Membership tests against the index avoid rebuilding a set of all
    # retained stops for every single trip
    stops = set(trip_agg['a']).union(trip_agg['b'])
    missing_nodes = {stop for stop in stops if stop not in retained}
    if len(missing_nodes) == 0 and len(trip.index) == len(trip_agg.index):
        # This trip is not affected
        return trip

    # Repair link succession: a link ending at a dropped stop is merged
    # into its successor, which then starts where the merged link started
    ind = list(trip_agg.index)
    # label of a merged link -> 'link_sequence' of the first link it replaces
    span_start = {}
    for i in range(len(ind) - 1):
        current, following = ind[i], ind[i + 1]
        if trip_agg.loc[current, 'b'] not in missing_nodes:
            continue
        new_a = trip_agg.loc[current, 'a']
        new_b = trip_agg.loc[following, 'b']
        try:
            merged_geometry = geometry.LineString(
                [nodes.loc[new_a, 'geometry'], nodes.loc[new_b, 'geometry']])
        except KeyError:
            # One end of the merged link is no retained stop: drop this trip
            return None
        trip_agg.loc[following, 'a'] = new_a
        trip_agg.loc[following, 'geometry'] = merged_geometry
        # Chained merges keep the start of the very first merged link
        span_start[following] = span_start.pop(
            current, trip_agg.loc[current, 'link_sequence'])
        trip_agg = trip_agg.drop(current)

    if len(trip_agg.index) > 0 and trip_agg['a'].iloc[0] in missing_nodes:
        # Drop unused first link
        trip_agg = trip_agg.iloc[1:]
    if len(trip_agg.index) > 0 and trip_agg['b'].iloc[-1] in missing_nodes:
        # Drop unused last link
        trip_agg = trip_agg.iloc[:-1]
    ind = list(trip_agg.index)
    if len(ind) == 0:
        return None
    # Detach from the slices above before writing columns
    trip_agg = trip_agg.copy()

    # Every link must start where its predecessor ends
    starts = trip_agg['a'].tolist()
    ends = trip_agg['b'].tolist()
    for i in range(len(ind) - 1):
        if ends[i] != starts[i + 1]:
            # Drop this trip
            return None

    # Aggregate travel time: a merged link takes the time of every original
    # link of the trip from the first one it replaces up to itself. The
    # selection goes by sequence number, so the row order of `trip` is irrelevant
    if span_start:
        all_sequences = trip['link_sequence'].to_numpy()
        all_times = trip['time'].to_numpy()
        for label in ind:
            if label not in span_start:
                continue
            in_span = (all_sequences >= span_start[label]) & \
                (all_sequences <= trip_agg.loc[label, 'link_sequence'])
            trip_agg.loc[label, 'time'] = all_times[in_span].sum()

    # Reindex the sequence numbers
    trip_agg['link_sequence'] = list(range(1, len(ind) + 1))

    return trip_agg

In [96]:
# Faster variant for using multiple cores
# NOTE: the code below is disabled by wrapping it in a string literal, so
# this cell computes nothing and only echoes the string as its output
'''import multiprocessing as mp
with mp.Pool(processes=10) as p:
    agg_links = pd.concat(p.map(
        agg_trips, [g for _, g in disagg_links.groupby('trip_id')]))'''

"import multiprocessing as mp\nwith mp.Pool(processes=10) as p:\n    agg_links = pd.concat(p.map(\n        agg_trips, [g for _, g in disagg_links.groupby('trip_id')]))"

In [97]:
# Register tqdm's progress_apply on pandas objects
tqdm.pandas()
# Aggregate trip by trip; the apply result carries 'trip_id' as an additional
# outer index level, which is dropped to get back to plain link labels
agg_links = disagg_links.groupby('trip_id').progress_apply(
    agg_trips).reset_index(level=0, drop=True)

100%|█████████████████████████████████████████████████████████████████████████| 244424/244424 [11:55<00:00, 341.53it/s]


In [98]:
# 'relevant' was only a helper flag for the aggregation.
# errors='ignore' keeps this cell re-runnable once the column is gone
agg_links = agg_links.drop(columns='relevant', errors='ignore')
agg_links.shape

(89761, 8)

In [99]:
agg_links.loc[~(agg_links['a'].isin(list(agg_nodes.index))) |
    ~(agg_links['b'].isin(list(agg_nodes.index)))]

Unnamed: 0_level_0,route_id,route_type,a,b,time,trip_id,link_sequence,geometry
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bus_x_39855,bus_7850,bus,bus_n_x_219536,bus_n_x_218102,300.0,bus_166565,1,"LINESTRING (12.06916 50.89061, 12.07802 50.88273)"
bus_1005821,bus_4388,bus,bus_n_325822,bus_n_325591,120.0,bus_169264,1,"LINESTRING (12.83847 54.20343, 12.82188 54.20661)"
bus_1273803,bus_7445,bus,bus_n_27146,bus_n_341553,120.0,bus_252428,1,"LINESTRING (9.00301 49.74322, 9.00161 49.73786)"
bus_x_10041,bus_7850,bus,bus_n_x_219536,bus_n_x_218102,300.0,bus_29142,1,"LINESTRING (12.06916 50.89061, 12.07802 50.88273)"
bus_x_60054,bus_7850,bus,bus_n_x_219536,bus_n_x_218102,300.0,bus_301648,1,"LINESTRING (12.06916 50.89061, 12.07802 50.88273)"
bus_1821917,bus_16726,bus,bus_n_225734,bus_n_116236,240.0,bus_469669,1,"LINESTRING (7.71711 47.95744, 7.75401 47.95625)"
bus_x_85938,bus_9107,bus,bus_n_x_219536,bus_n_x_218102,300.0,bus_523235,1,"LINESTRING (12.06916 50.89061, 12.07802 50.88273)"
bus_x_97194,bus_7850,bus,bus_n_x_426751,bus_n_x_325249,240.0,bus_636300,1,"LINESTRING (12.07247 50.90391, 12.08129 50.89410)"
bus_2310087,bus_12805,bus,bus_n_117738,bus_n_263121,120.0,bus_712299,1,"LINESTRING (11.75354 51.79189, 11.75072 51.79629)"
bus_2477006,bus_5783,bus,bus_n_222182,bus_n_85142,180.0,bus_805166,1,"LINESTRING (7.38907 53.08167, 7.38792 53.08975)"


In [100]:
# Drop these erroneous trips
retained_stops = list(agg_nodes.index)
has_unknown_end = ~agg_links['a'].isin(retained_stops) | \
    ~agg_links['b'].isin(retained_stops)
ids = agg_links.loc[has_unknown_end].index
# Remove whole trips, not just the offending links
erroneous_trips = list(agg_links.loc[ids, 'trip_id'])
agg_links = agg_links.loc[~agg_links['trip_id'].isin(erroneous_trips)]

### Merge aggregated links and nodes with the model

In [101]:
# Re-add links to model
# (pd.concat because DataFrame.append was removed in pandas 2.0)
sm.links = pd.concat([sm.links, agg_links])
sm.links.shape

(302878, 8)

In [102]:
# Re-add nodes to the model
# (pd.concat because DataFrame.append was removed in pandas 2.0)
sm.nodes = pd.concat([sm.nodes, agg_nodes])
sm.nodes.shape

(27578, 6)

In [103]:
# Re-check the merged network. A failure is only reported here, not repaired
try:
    sm.integrity_test_nodeset_consistency()
except AssertionError:
    print('Number of orphan nodes: {}'.format(
        len(sm.orphan_nodes)))
    print('Number of missing nodes: {}'.format(
        len(sm.missing_nodes)))

Number of orphan nodes: 0
Number of missing nodes: 18


In [104]:
# Orphan nodes may stay in the model, provided that each of them has more
# than one footpath or access/egress link
orphans = sm.orphan_nodes
orphan_rows = agg_nodes.loc[orphans]
well_connected = orphan_rows.loc[agg_nodes['n_connectors'] > 1]
assert len(well_connected.index) == len(orphans)

In [105]:
# The link and connector counts were only needed to select the stops.
# NOTE: in-place drop, so this cell fails when it is run a second time
sm.nodes.drop(['n_links', 'n_connectors'], axis=1, inplace=True)

In [106]:
sm.links.groupby('trip_id').progress_apply(test_sequences)

100%|██████████████████████████████████████████████████████████████████████████| 66674/66674 [00:21<00:00, 3140.20it/s]


In [107]:
# Drop frequencies of unused nodes and merge the bus frequencies into the
# model (pd.concat because DataFrame.append was removed in pandas 2.0)
sm.frequencies = pd.concat([
    sm.frequencies.loc[sm.frequencies['stop_id'].isin(sm.nodes.index)],
    bus.frequencies.loc[bus.frequencies['stop_id'].isin(sm.nodes.index)],
]).reset_index()
sm.frequencies[['trip_id', 'hour']] = sm.frequencies[['trip_id', 'hour']].astype(int)

In [108]:
sm.frequencies.shape

(186256, 4)

## Map nodes to zones
Needed for the accessibility calculation

In [109]:
# Nodes must be a GeoDataFrame
# This cell only does something if the nodes carry no zone ID ('FID') yet
if 'FID' not in sm.nodes.columns:
    import geopandas as gpd
    sm.nodes = gpd.GeoDataFrame(sm.nodes, crs=sm.epsg)
    # NOTE(review): shapely.speedups looks deprecated in newer Shapely
    # releases - confirm against the pinned version
    shapely.speedups.enable()
    sm.nodes['FID'] = np.nan
    # Assign to every node the ID of the zone it lies within; nodes outside
    # of all zones keep NaN
    for _, zone in tqdm(sm.zones.iterrows(), total=sm.zones.shape[0]):
        sm.nodes.loc[sm.nodes['geometry'].within(zone['geometry']), 'FID'] = zone['FID']

In [110]:
sm.nodes[sm.nodes['FID'].isna()]

Unnamed: 0_level_0,FID,geometry,route_type,stop_name
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


## Save model


In [111]:
# Add bus service to the ancillary tables
# (pd.concat because DataFrame.append was removed in pandas 2.0)
sm.agencies = pd.concat([sm.agencies, bus.agencies])
sm.pt_routes = pd.concat([sm.pt_routes, bus.pt_routes])

In [112]:
# Now, we have bus services in the same tables
# (guarded, so that re-running the cell does not list 'bus' twice)
if 'bus' not in sm.pt_route_types:
    sm.pt_route_types.append('bus')

In [113]:
# Abbreviate the names of short-distance rail nodes to keep the saved
# files small: first in the node index ...
short_node_ids = []
for node_id in sm.nodes.index:
    short_node_ids.append(node_id.replace('rail_short_node', 'r_s_n'))
sm.nodes['index'] = short_node_ids
sm.nodes.set_index('index', drop=True, inplace=True)
# ... then in both link ends that refer to it
for link_end in ('a', 'b'):
    sm.links[link_end] = sm.links[link_end].map(
        lambda n: n.replace('rail_short_node', 'r_s_n'))

In [114]:
# Same abbreviation for long-distance rail nodes: first in the node index ...
short_node_ids = []
for node_id in sm.nodes.index:
    short_node_ids.append(node_id.replace('rail_long_node', 'r_l_n'))
sm.nodes['index'] = short_node_ids
sm.nodes.set_index('index', drop=True, inplace=True)
# ... then in both link ends that refer to it
for link_end in ('a', 'b'):
    sm.links[link_end] = sm.links[link_end].map(
        lambda n: n.replace('rail_long_node', 'r_l_n'))

In [115]:
# Abbreviate the rail prefixes in the link labels as well
short_link_ids = []
for link_id in sm.links.index:
    link_id = link_id.replace('rail_long', 'r_l')
    short_link_ids.append(link_id.replace('rail_short', 'r_s'))
sm.links['index'] = short_link_ids
sm.links.set_index('index', drop=True, inplace=True)

In [116]:
# Strip the '_distance' suffix from the rail route types in the link
# table, the node table and the list of route types
type_dict = {'rail_short_distance': 'rail_short', 'rail_long_distance': 'rail_long'}
for table in (sm.links, sm.nodes):
    table['route_type'] = table['route_type'].replace(type_dict)
short_route_types = []
for route_type in sm.pt_route_types:
    short_route_types.append(route_type.replace('_distance', ''))
sm.pt_route_types = short_route_types

In [117]:
sm.links.loc[sm.links['route_type']=='rail_short'].sample()

Unnamed: 0_level_0,a,b,geometry,link_sequence,route_id,route_type,time,trip_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
r_s_59470,r_s_n_547,r_s_n_13044,"LINESTRING (10.02529 47.55421, 9.95354 47.62062)",10.0,rail_short_33,rail_short,540.0,rail_short_10474


In [118]:
# Split links in graph and auxiliary information
# for file sizes being compatible with github's size limit
cols = ['link_sequence', 'route_id', 'time', 'trip_id']
auxiliary = sm.links[cols]
sm.links.drop(cols, axis=1, inplace=True)

In [119]:
sm.links.shape

(302878, 4)

In [120]:
# Saving model...
# The network graph and the ancillary tables go into two separate folders
sm.to_json(model_path + 'de_pt_network_agg',
           only_attributes=['zones', 'links', 'nodes', 'pt_route_types'],
           encoding='utf-8')
sm.to_json(model_path + 'de_pt_network_ancillary',
           only_attributes=['agencies', 'pt_routes', 'frequencies'],
           encoding='utf-8')

to_hdf(overwriting): 100%|█████████████████████████████████████████████████████████████| 39/39 [00:46<00:00,  1.19s/it]
to_hdf(overwriting): 100%|█████████████████████████████████████████████████████████████| 39/39 [00:00<00:00, 43.87it/s]


In [121]:
# Save auxiliary information separately
# The link labels are stored as a regular 'index' column. This is done on a
# derived table, so `auxiliary` keeps its labels and the cell can be re-run
# without overwriting them with a running number
auxiliary.assign(index=auxiliary.index).reset_index(drop=True).to_json(
    model_path + 'de_pt_network_agg/links_quetzaldata.json')