In [1]:
import sys # for automation and parallelisation
# Scenario selection: inside a Jupyter kernel (sys.argv[0] contains 'ipykernel')
# the notebook runs manually on the 'base' scenario; when executed as a script
# the scenario name is read from the first command line argument.
# A script call without any argument falls back to 'base' instead of failing
# with an IndexError.
manual = 'ipykernel' in sys.argv[0]
scenario = 'base' if manual or len(sys.argv) < 2 else sys.argv[1]

In [2]:
import json
import pandas as pd
import numpy as np
from quetzal.model import stepmodel
from quetzal.engine.engine import od_volume_from_zones
from syspy.skims import skims
from quetzal.io import excel

In [3]:
# Relative folders used throughout the notebook
input_path, input_static_path, output_path = '../input/', '../input_static/', '../output/'
# Model files are kept in one sub-folder per scenario
model_path = '../model/' + scenario + '/'

In [4]:
# Load scenario parameters
# The workbook lives in the input folder; `input_path` is used instead of
# repeating the hard-coded '../input/' prefix so the location is configured in
# one place only. Later cells read values as params[<group>][<name>].
params = excel.read_var(file=input_path + 'parameters.xls', scenario=scenario)

# OD matrix generation

## Needs zone attributes and generation / destination choice / distribution model results

## Saves volumes table

In [5]:
# Transport demand segments
# Segment names are stored as one ';'-separated string in the parameters
raw_segments = params['general']['demand_segments']
segments = [name.strip() for name in raw_segments.split(';')]
purposes_compulsory = ['commuting', 'business', 'education']
purposes_optional = ['buy/execute', 'leisure', 'accompany']
# The text before the first '_' of a segment name is its trip purpose
segments_compulsory = [seg for seg in segments
                       if seg.partition('_')[0] in purposes_compulsory]
segments_optional = [seg for seg in segments
                     if seg.partition('_')[0] in purposes_optional]

In [6]:
# Load zones
# Read the zone table of the scenario model and order it by its index
sm = stepmodel.read_json(model_path + 'de_zones')
sm.zones = sm.zones.sort_index()

In [7]:
# Fill POI values
pois = pd.read_csv(input_path + 'spatial_num_pois_raw.csv', index_col='index')
cats = pd.read_excel(input_path + 'spatial_OSM_POI_list.xlsx', sheet_name='categories')
# A label is "<key> <value>", or just "<key>" when no value is given;
# the labels are used as column names of the POI count table
cats['label'] = (cats['key'] + ' ' + cats['value'].fillna('')).str.strip()
# List of labels per category, ignoring rows without a category
labels_by_category = cats.dropna(subset=['category']).groupby('category')['label'].agg(list)
for category, columns in labels_by_category.items():
    # POI count of the category per zone (0 where a zone has no entry) ...
    poi_count = sm.zones['FID'].map(pois[columns].sum(axis=1)).fillna(0)
    # ... scaled with the scenario's change factor of the zone's urbanisation value
    change_factor = sm.zones['urbanisation'].map(params['poi_change']).astype(float)
    sm.zones[category] = poi_count * change_factor

In [8]:
# Load data for education
# Age group table: the first 5 lines and the last 4 lines of the file are
# skipped, '-' marks missing values and incomplete rows are dropped.
# `skipfooter` is not supported by the default C parser; the python engine is
# requested explicitly so pandas does not have to fall back with a ParserWarning.
age_groups = pd.read_csv(input_static_path + 'spatial_Zensus_ages_2017_GENESIS.csv',
                         encoding='latin-1', sep=';', skiprows=5, skipfooter=4, na_values='-',
                         engine='python').dropna()
age_groups.rename(columns={'Unnamed: 1': 'lau_id'}, inplace=True)
age_groups.loc[age_groups['lau_id']==16056, 'lau_id'] = 16063 # Update Eisenach
# Several rows can share one lau_id after the update above; add them up
age_groups = age_groups.groupby('lau_id').sum()
# The lau_id of a zone is its FID without the last four digits
sm.zones['lau_id'] = (sm.zones['FID'].astype(str).str[:-4]).astype(int)
# Urban zones: urbanisation value 1 and more than 100,000 inhabitants
sm.zones['is_urban'] = (sm.zones['urbanisation']==1) & (sm.zones['population']>100000)

  age_groups = pd.read_csv(input_static_path + 'spatial_Zensus_ages_2017_GENESIS.csv',


In [9]:
# Load inter-zonal composite cost from mode choice step
# Use the scenario's own results if present, otherwise those of the base scenario
cc_file = '/mode_choice_od_composite_cost.csv'
try:
    cc = pd.read_csv(output_path + scenario + cc_file)
except FileNotFoundError:
    cc = pd.read_csv(output_path + 'base' + cc_file)
# Index by OD pair, sort, and bound all values from below at 0.01
cc = cc.set_index(['origin', 'destination']).sort_index().clip(lower=0.01)

In [10]:
# inner-zonal cc
# First file column becomes the index; name both axes for later lookups
inner_cc = (
    pd.read_csv(input_path + 'inner-zonal_cc.csv', index_col=0)
    .rename_axis(index='origin', columns='segment')
)

In [11]:
# Compute distances
distance = skims.euclidean(sm.zones, coordinates_unit=sm.coordinates_unit, intrazonal=False)
# Square origin x destination matrix of the euclidean distance
dist_matrix = distance.set_index(['origin', 'destination'])['euclidean_distance']
dist_matrix = dist_matrix.unstack('destination') / 1000  # presumably metres -> kilometres; verify
# Pairs without a distance get the value 1e3
dist_matrix = dist_matrix.fillna(1e3)

In [12]:
# Inner-zonal distance per demand segment, listed per purpose as
# (value with car, value without car). Written onto the diagonal of the
# distance matrix in compute_volumes.
_inner_dist_pairs = [
    ('accompany',   (5.657834, 6.435424)),
    ('business',    (7.322597, 7.872050)),
    ('buy/execute', (5.389849, 6.193252)),
    ('commuting',   (8.194012, 8.247224)),
    ('education',   (6.131394, 6.441189)),
    ('leisure',     (6.212798, 6.814558)),
]
inner_dists = {
    purpose + suffix: value
    for purpose, values in _inner_dist_pairs
    for suffix, value in zip(('_car', '_no_car'), values)
}

In [13]:
# Define mathematical formulations for the cost term
def power_function(CC_D, a, b, c):
    """Power-law cost term: a * x^(-b).

    x is the weighted mix ``cc*c + dist*(1-c)`` of the two entries of CC_D.

    Args:
        CC_D: pair (composite cost, distance).
        a: scale factor.
        b: exponent (applied with negative sign).
        c: weight of the composite cost; the distance gets weight 1-c.
    """
    composite_cost, distance = CC_D
    impedance = composite_cost * c + distance * (1 - c)
    return a * np.power(impedance, -b)
def exponential_function(CC_D, a, b, c):
    """Exponential cost term: a * exp(-b * x).

    x is the weighted mix ``cc*c + dist*(1-c)`` of the two entries of CC_D.

    Args:
        CC_D: pair (composite cost, distance).
        a: scale factor.
        b: decay rate.
        c: weight of the composite cost; the distance gets weight 1-c.
    """
    composite_cost, distance = CC_D
    impedance = composite_cost * c + distance * (1 - c)
    return a * np.exp(-b * impedance)
def lognormal_function(CC_D, a, b, c):
    """Log-normal shaped cost term: a * exp(-b * ln(x + 1)^2).

    x is the weighted mix ``cc*c + dist*(1-c)`` of the two entries of CC_D.

    Args:
        CC_D: pair (composite cost, distance).
        a: scale factor.
        b: factor applied to the squared logarithm.
        c: weight of the composite cost; the distance gets weight 1-c.
    """
    composite_cost, distance = CC_D
    impedance = composite_cost * c + distance * (1 - c)
    log_term = np.log(impedance + 1)
    return a * np.exp(-b * np.power(log_term, 2))

In [65]:
# Load estimated function parameters
# The first two file columns form the index; compute_volumes looks rows up by
# (segment, function name) and reads the columns 'a', 'b' and 'c'.
abc = pd.read_csv(input_path + 'estimation_results_distribution.csv', index_col=[0,1])

In [66]:
# Compute volumes from emission and attraction for a given segment
def compute_volumes(segment, emission, attraction, f=lognormal_function):
    """Distribute the trips of one demand segment between zones.

    Builds a deterrence matrix from the composite cost and the distance
    between zones and passes it, together with the zone table, to
    ``od_volume_from_zones`` (doubly constrained distribution).

    Args:
        segment: demand segment name; selects the column of ``cc`` and
            ``inner_cc``, the entry of ``inner_dists`` and the row of ``abc``.
        emission: values written to ``sm.zones['emission']``.
        attraction: values written to ``sm.zones['attraction']``.
        f: cost function called as ``f((cc, dist), a, b, c)``; its
            ``__name__`` selects the parameter row in ``abc``.

    Returns:
        Result of ``od_volume_from_zones`` with NaN replaced by 0, negative
        values clipped to 0 and indexed by (origin, destination).

    Side effects:
        Overwrites the columns 'emission' and 'attraction' of ``sm.zones``.
        The module-level ``dist_matrix`` is left untouched; inner-zonal
        distances are only set on a local copy.
    """
    sm.zones['emission'] = emission
    sm.zones['attraction'] = attraction
    # Create deterrence matrix from CC
    # Only zones that appear in the zone table, on both levels of the cost
    # index and on both axes of the distance matrix can be distributed
    zone_set = sorted(
        set(sm.zones['FID'])
        .intersection(cc.index.get_level_values(0))
        .intersection(cc.index.get_level_values(1))
        .intersection(dist_matrix.index)
        .intersection(dist_matrix.columns))
    cc_matrix = cc[segment].unstack('destination').loc[zone_set, zone_set]
    # Pairs without cost get 1.5 times the highest cost found
    cc_matrix = cc_matrix.fillna(cc_matrix.max().max() * 1.5)
    dist_sub = dist_matrix.loc[zone_set, zone_set]

    # Add inner-zonal resistance
    # Zones without a positive inner-zonal cost use the segment's mean instead
    inner_vals = inner_cc[segment].reindex(zone_set)
    inner_vals = inner_vals.where(inner_vals > 0, inner_cc[segment].mean())
    # Fill both diagonals on array copies (one vectorised write each instead
    # of one .loc assignment per zone) and rebuild the labelled matrices
    cc_values = cc_matrix.to_numpy(dtype=float, copy=True)
    np.fill_diagonal(cc_values, inner_vals.to_numpy(dtype=float))
    cc_matrix = pd.DataFrame(cc_values, index=cc_matrix.index, columns=cc_matrix.columns)
    dist_values = dist_sub.to_numpy(dtype=float, copy=True)
    np.fill_diagonal(dist_values, inner_dists[segment])
    dist_sub = pd.DataFrame(dist_values, index=dist_sub.index, columns=dist_sub.columns)

    # convert CC to deterrence
    key = (segment, f.__name__)
    deterr = f((cc_matrix, dist_sub),
               abc.loc[key, 'a'],
               abc.loc[key, 'b'],
               abc.loc[key, 'c'])
    # Run doubly constrained distribution
    vol = od_volume_from_zones(sm.zones.loc[sm.zones['FID'].isin(zone_set)], deterr)
    return vol.fillna(0).clip(lower=0).set_index(['origin', 'destination'])

In [67]:
# Table collecting the OD volumes; the following cells add one column per demand segment
volumes = pd.DataFrame()

## Compulsory trips

The doubly constrained gravity model is chosen as the distribution method because logit models do not perform well for the purposes commuting, business and education.

In [68]:
# Volumes for commuting
# Emission is based on the 'employed' column, attraction on 'employment'.
# Both are scaled by the household car availability (or its complement), the
# number of working days and a constant factor of 2.
share_car = sm.zones['car_avail_hh']
share_no_car = 1 - sm.zones['car_avail_hh']

days = params['generation']['days_at_work_with_car']
volumes['commuting_car'] = compute_volumes(
    segment='commuting_car',
    emission=sm.zones['employed'] * share_car * days * 2,
    attraction=sm.zones['employment'] * share_car * days * 2)

# Without car: own number of working days and the exponential cost function
days = params['generation']['days_at_work_without_car']
volumes['commuting_no_car'] = compute_volumes(
    segment='commuting_no_car',
    emission=sm.zones['employed'] * share_no_car * days * 2,
    attraction=sm.zones['employment'] * share_no_car * days * 2,
    f=exponential_function)

In [69]:
# Volumes for business
# Average business trips per day from employed persons
share_car = sm.zones['car_avail_hh']
share_no_car = 1 - sm.zones['car_avail_hh']
days = params['generation']['days_at_work_with_car']

# With car available
factor = params['generation']['business_trip_factor_with_car']
volumes['business_car'] = compute_volumes(
    segment='business_car',
    emission=sm.zones['employed'] * share_car * days * 2 * factor,
    attraction=sm.zones['employment'] * share_car * days * 2 * factor)

# Without car available
# NOTE(review): `days` still holds 'days_at_work_with_car' here, and the
# attraction is based on 'population' while the car case uses 'employment'.
# Confirm that both are intended.
factor = params['generation']['business_trip_factor_without_car']
volumes['business_no_car'] = compute_volumes(
    segment='business_no_car',
    emission=sm.zones['employed'] * share_no_car * days * 2 * factor,
    attraction=sm.zones['population'] * share_no_car * days * 2 * factor)

In [70]:
# Volumes for education
# Emissions are people in education (school, apprenticeship, higher education)
# Attractions are corresponding institutions
gen = params['generation']


def _persons_in_age_groups(*age_columns):
    """Sum the given age group columns and map the result to zones via lau_id."""
    total = age_groups[age_columns[0]]
    for column in age_columns[1:]:
        total = total + age_groups[column]
    return sm.zones['lau_id'].map(total)


def _rate_by_urbanity(urban_key, non_urban_key):
    """Per-zone rate: the urban parameter for urban zones, else the non-urban one."""
    return sm.zones['is_urban'].map(
        lambda urban: gen[urban_key] if urban else gen[non_urban_key])


# Persons of the relevant age, scaled by the zone's population share and by
# the share of that age group attending the institution type
emission = {
    'higher_education':
        _persons_in_age_groups('18 bis unter 20 Jahre', '20 bis unter 25 Jahre')
        * sm.zones['pop_share']
        * _rate_by_urbanity('students_per_18-25yo_urban', 'students_per_18-25yo_non-urban'),
    'school':
        _persons_in_age_groups('6 bis unter 10 Jahre', '10 bis unter 15 Jahre',
                               '15 bis unter 18 Jahre')
        * sm.zones['pop_share']
        * _rate_by_urbanity('pupils_per_6-18yo_urban', 'pupils_per_6-18yo_non-urban'),
    'employment':
        _persons_in_age_groups('18 bis unter 20 Jahre')
        * sm.zones['pop_share']
        * _rate_by_urbanity('apprentices_per_18-20yo_urban', 'apprentices_per_18-20yo_non-urban'),
}

vol_df = pd.DataFrame(index=volumes.index)
institutions = ['higher_education', 'school', 'employment']
no_car_share = gen['edu_share_without_car']
with_car = sm.zones['car_avail_hh']
without_car = 1 - sm.zones['car_avail_hh']
for institution in institutions:
    days = gen['edu_days_in_' + institution]
    # The zone column named like the institution serves as attraction
    # With car available
    vol_df[institution + '_car'] = compute_volumes(
        segment='education_car',
        emission=emission[institution] * with_car * days * 2,
        attraction=sm.zones[institution] * with_car * days * 2)
    # Without car available
    vol_df[institution + '_no_car'] = compute_volumes(
        segment='education_no_car',
        emission=emission[institution] * without_car * days * 2 * no_car_share,
        attraction=sm.zones[institution] * without_car * days * 2 * no_car_share)

# Sum up all these education groups
car_columns = [name + '_car' for name in institutions]
no_car_columns = [name + '_no_car' for name in institutions]
volumes['education_car'] = vol_df[car_columns].sum(axis=1)
volumes['education_no_car'] = vol_df[no_car_columns].sum(axis=1)

## Volumes for non-compulsory trips

Optional. It is recommended to use a choice model instead.

In [20]:
# Load trip generation
#try:
#    generation = pd.read_csv(model_path + 'generation_volumes.csv', index_col=0)
#except FileNotFoundError:
#    generation = pd.read_csv(model_path.replace(scenario, 'base') + 'generation_volumes.csv', index_col=0)

In [21]:
# Calculate number of trips
#for s in segments_optional:
#    volumes[s] = compute_volumes(s, sm.zones['FID'].map(generation[s]), sm.zones['population'])

## Inner-inter-zonal shares

Only applied if inner-zonal volumes were not already computed within the distribution.

In [22]:
# Take inner-zonal probabilities from MiD2017 data by urbanisation degree
# Rows are looked up by the zones' 'urbanisation' value and columns by segment
# name when the inner-zonal volumes are computed.
inner_probs = pd.read_csv(input_path + 'inner-zonal_probabilities_agg_urban.csv', index_col=0)

In [23]:
# Compute inner-zonal volumes
# Columns of the volumes table that are demand segments
seg_cols = [col for col in volumes.columns if col in segments]
# True for rows whose origin equals their destination (inner-zonal pairs)
in_mask = volumes.index.get_level_values('origin')==volumes.index.get_level_values('destination')
urban_dict = sm.zones['urbanisation'].to_dict()
# Only split off inner-zonal trips if the table holds no inner-zonal rows yet
if len(volumes.loc[in_mask]) == 0:
    # Total trips per origin zone
    sums = volumes.groupby(level='origin').sum()
    # Inner-zonal probability per origin zone and segment, selected by the
    # zone's urbanisation value
    inner_probs_zones = pd.Series(sums.index, index=sums.index).apply(
        lambda z: inner_probs.loc[urban_dict[z], seg_cols])
    # Inner-zonal trips, re-indexed as (zone, zone) pairs
    inner = sums * inner_probs_zones
    inner.index = pd.MultiIndex.from_tuples([(z,z) for z in inner.index], names=['origin', 'destination'])
    # The remaining share stays on the existing pairs.
    # NOTE(review): relies on pandas aligning the origin-indexed probabilities
    # with the (origin, destination) index of volumes - confirm.
    volumes = pd.concat([inner, volumes * (1 - inner_probs_zones)]).sort_index()

In [24]:
# Display one randomly drawn row of the volumes table
volumes.sample()

Unnamed: 0_level_0,Unnamed: 1_level_0,commuting_car,commuting_no_car,business_car,business_no_car,education_car,education_no_car
origin,destination,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
59780004,95715501,6.276234e-24,3.367828e-35,7.900459e-07,1e-05,2.608682e-10,1.029182e-13


In [25]:
# Drop rows with NaN
# Keep only OD pairs that have a value in every column, then show how many remain
volumes = volumes.dropna(axis=0, how='any')
len(volumes)

21160000

In [26]:
# Total over all segment columns, in millions and divided by 365
# (presumably the volumes are yearly totals - verify)
million_trips_per_day = volumes[seg_cols].sum().sum() / 1e6 / 365
print('Million trips per day: {}'.format(million_trips_per_day))

Million trips per day: 80.22899460603382


## Sparsify

In [67]:
# Load OD set
# Restrict the volumes table to the OD pairs stored with the model, if such a
# file exists; otherwise the full table is kept.
try:
    # The context manager closes the file handle again after reading
    with open(model_path + 'od_set.json') as od_file:
        od_pairs = json.load(od_file)
except FileNotFoundError:
    print('Saving the full OD set')
else:
    od_set = [tuple(pair) for pair in od_pairs]
    print('Found OD set with {} pairs'.format(len(od_set)))
    # Always keep the inner-zonal pair of every origin in the set
    od_set = list(set((od[0], od[0]) for od in od_set).union(od_set))
    # Pairs of the set that are not in the table yet are added with zero volume
    missing = set(od_set) - set(volumes.index)
    if len(missing) > 0:
        # Build the zero rows as floats right away instead of creating an
        # object-dtype frame of NaN and relying on fillna to downcast it
        zeros = pd.DataFrame(0.0,
                             index=pd.MultiIndex.from_tuples(list(missing), names=['origin', 'destination']),
                             columns=volumes.columns)
        volumes = pd.concat([volumes, zeros])
    volumes = volumes.loc[od_set]
    print('Million trips per day after sparsification: {}'.format(
        volumes[seg_cols].sum().sum() / 1e6 / 365))

Found OD set with 263004 pairs
Million trips per day after sparsification: 49.27574571325191


## Policies

In [27]:
# Multi-purpose densified quarters in agglomeration areas around large cities
# Trips starting in zones with urbanisation value 2 whose centroid lies within
# `max_dist` of a large city are reduced by a segment-specific share.
max_dist = params['trip_reduction_suburban_quarters']['max_dist_to_city']
if max_dist > 0:
    # geopandas is not part of the notebook's import cell; import it here so
    # the policy does not fail with a NameError when it is switched on
    import geopandas as gpd
    # Find quarters
    # NOTE(review): EPSG:5234 does not look like a projected CRS for Germany
    # (EPSG:5243 is ETRS89 / LCC Germany) - confirm the intended code.
    zones = gpd.GeoDataFrame(sm.zones, crs=sm.epsg).to_crs('EPSG:5234') # distance to meter
    # Large cities: urbanisation value 1 and more than 200,000 inhabitants
    cities = zones.loc[(zones['urbanisation']==1) & (zones['population']>200000)]
    centroids = gpd.GeoDataFrame(index=zones.loc[zones['urbanisation']==2].index,
                                 geometry=zones.loc[zones['urbanisation']==2, 'geometry'].centroid)
    quarters = gpd.sjoin_nearest(centroids, cities[['geometry']], max_distance=max_dist)
    # Reduce trips
    # Build the row mask from the index itself: a mask taken from
    # reset_index() carries a RangeIndex and cannot be aligned with the
    # (origin, destination) index of volumes in .loc
    from_quarter = volumes.index.get_level_values('origin').isin(quarters.index)
    for seg in seg_cols:
        volumes.loc[from_quarter, seg] *= \
            1 - params['trip_reduction_suburban_quarters'][seg]

## Save

In [71]:
# Save volumes table
# The (origin, destination) index is turned back into regular columns; the
# export is restricted to the 'volumes' attribute via `only_attributes`.
sm.volumes = volumes.reset_index(names=['origin', 'destination'])
sm.to_zippedpickles(model_path + 'de_volumes_distribution', only_attributes=['volumes'])

volumes: 100%|███████████████████████████████████████████████████████████████████████████| 9/9 [01:22<00:00,  9.16s/it]


In [29]:
# Plot composite cost distribution
# Disabled by default (`if False`); switch on manually to create the figure.
# NOTE(review): `dist` is not defined in the visible cells and is expected to
# provide a 'length' column indexed like `cc` - define it before enabling.
if False:
    import matplotlib.pyplot as plt
    # Bin edges for the 'length' column
    bins = [-1] + list(range(5, 100, 5)) + list(range(100, 300, 50)) + [300, 400, 500, 750, 1000]
    ccd = cc.merge(dist, right_index=True, left_index=True, how='left')
    # Label every bin with its upper edge and drop pairs outside of all bins
    ccd['bin'] = pd.cut(ccd['length'], bins=bins, labels=bins[1:])
    ccd = ccd.loc[ccd['bin'].notna()]
    ccd['bin'] = ccd['bin'].astype(int)
    purposes = purposes_compulsory+purposes_optional
    # One column of axes per purpose; upper row without car, lower row with car
    fig, ax = plt.subplots(nrows=2, ncols=len(purposes), figsize=(18,6), sharex='all', sharey=False)
    for i,car in {0: '_no_car', 1: '_car'}.items():
        for j in range(len(purposes)):
            # Mean composite cost per bin as line, single OD pairs as red dots
            cc_means = ccd.groupby('bin')[purposes[j]+car].mean()
            ax[i,j].plot(bins[1:], cc_means)
            ax[i,j].scatter(ccd['length'], ccd[purposes[j]+car], color='red', s=.1, alpha=.2)
            ax[0,j].set_title(purposes[j])
            ax[i,j].set_xscale('log')
    ax[0,0].set_ylabel('no car')
    ax[1,0].set_ylabel('with car')
    # Save the figure in two resolutions
    plt.savefig(output_path + 'CC_distribution_segmented_600dpi.png', dpi=600)
    plt.savefig(output_path + 'CC_distribution_segmented_300dpi.png', dpi=300)