In [1]:
import sys # for automation and parallelisation
manual, scenario = (True, 'base') if 'ipykernel' in sys.argv[0] else (False, sys.argv[1])
if manual:
    %matplotlib inline

In [2]:
import pandas as pd
import numpy as np
import geopandas as gpd
from quetzal.model import stepmodel
from quetzal.engine.engine import od_volume_from_zones
from quetzal.io import excel
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme import expressions as ex

PyTables is not installed. No support for HDF output.
SQLalchemy is not installed. No support for SQL output.


In [3]:
model_path = '../model/' + scenario + '/'
input_path = '../input/'
input_static_path = '../input_static/'
output_path = '../output/'

In [4]:
# Load scenario parameters
params = excel.read_var(file='../input/parameters.xls', scenario=scenario)

# OD matrix generation

## Needs zone attributes and destination choice model results

## Saves volumes table

In [5]:
# Transport demand segments
segments = [s.strip() for s in params['general']['demand_segments'].split(';')]
purposes_compulsory = ['commuting', 'business', 'education']
purposes_optional = ['buy/execute', 'leisure', 'accompany']
segments_compulsory = [s for s in segments if s.split('_')[0] in purposes_compulsory]
segments_optional = [s for s in segments if s.split('_')[0] in purposes_optional]

In [6]:
# Load zones
sm = stepmodel.read_json(model_path + 'de_zones')

## Calculate probabilities of inner/inter-zonal choice for optional trips

In [7]:
# Fill POI values
pois = pd.read_csv(input_path + 'spatial_num_pois_raw.csv', index_col='index')
cats = pd.read_excel(input_path + 'spatial_OSM_POI_list.xlsx', sheet_name='categories')
cats['label'] = (cats['key'] + ' ' + cats['value'].fillna('')).str.strip()
for category, columns in cats.loc[cats['category'].notna()
                                 ].groupby('category').agg(
                                {'label': list})['label'].items():
    sm.zones[category] = sm.zones['FID'].map(pois[columns].sum(axis=1)).fillna(0)

In [8]:
# Load inter-zonal composite cost from mode choice step
cc = pd.read_csv(output_path + scenario + '/mode_choice_od_composite_cost.csv')
cc.set_index(['origin', 'destination'], inplace=True)
# Rename segments to integer values
cc.columns = pd.MultiIndex.from_tuples(
    [(seg.split('_')[0], {'no': 0, 'car': 1}[seg.split('_')[1]])
     for seg in cc.columns],
    names=['purpose_model', 'car_av'])
# Reshape the table into a mergable format
# Use the mean CC to all destinations as accessibility
cc = cc.unstack('destination').stack('purpose_model').stack('car_av').mean(axis=1)
cc = cc.unstack('purpose_model')

In [9]:
# Load inner/inter choice model results
betas = pd.read_csv(input_path + 'inner-inter_betas.csv', index_col=0)

In [10]:
# Simulate the probabilities from the zones table using biogeme
inner_probs = pd.DataFrame(index=sm.zones.index)
car_names = {0: '_no_car', 1: '_car'}
for car in [0,1]:
    for p in purposes_optional:
        # Add accessibility for this segment to database
        sm.zones['acc_'+p] = sm.zones.merge(cc.xs(car, level='car_av'),
                                            how='left', left_index=True, right_index=True
                                           )[p]
        # Create the database
        zones_db = db.Database('zones', sm.zones.loc[
            sm.zones['acc_'+p].notna(), # might be NaN from previous sampling in mode choice step
            ['employment', 'population', 'area', 'acc_'+p]
            + list(cats.loc[cats['category'].notna(), 'category'].unique())])
        globals().update(zones_db.variables)
        
        # Define utility formulations as in cal22
        # Define parameters
        ASC_0 = ex.Beta('ASC_0', 0, None, None, 1)
        ASC_1 = ex.Beta('ASC_1', 0, None, None, 0)
        b_pop = ex.Beta('b_pop', 0, None, None, 0)
        b_attr = ex.Beta('b_attr', 0, None, None, 0)
        b_acc = ex.Beta('b_acc', 0, None, None, 0)

        # Define the utility formulation by purpose
        if p == 'commuting':
            V= {0:
                ASC_0
                + ex.log(1+globals()['employment'])*b_attr
                + ex.log((1+globals()['population'])/globals()['area'])*b_pop,
                1:
                ASC_1
                + globals()['acc_commuting'] * b_acc}
        elif p == 'business':
            V= {0:
                ASC_0
                + ex.log(1+globals()['employment'])*b_attr
                + ex.log((1+globals()['population'])/globals()['area'])*b_pop,
                1:
                ASC_1
                + globals()['acc_business'] * b_acc}
        elif p == 'education':
            V= {0:
                ASC_0
                + ex.log(1+globals()['childcare'] + globals()['school'] + globals()['higher_education'])*b_attr
                + ex.log((1+globals()['population'])/globals()['area'])*b_pop,
                1:
                ASC_1
                + globals()['acc_education'] * b_acc}
        elif p == 'buy/execute':
            V= {0:
                ASC_0
                + ex.log(1+globals()['shop'] + globals()['medical'] + globals()['special_shop'])*b_attr
                + ex.log((1+globals()['population'])/globals()['area'])*b_pop,
                1:
                ASC_1
                + globals()['acc_buy/execute'] * b_acc}
        elif p == 'leisure':
            V= {0:
                ASC_0
                + ex.log(1+globals()['daily_leisure'] + globals()['holiday'] + globals()['occasional_leisure'])*b_attr
                + ex.log((1+globals()['population'])/globals()['area'])*b_pop,
                1:
                ASC_1
                + globals()['acc_leisure'] * b_acc}
        elif p == 'accompany':
            V= {0:
                ASC_0
                + ex.log(1+globals()['medical'] + globals()['school'] + globals()['childcare'])*b_attr
                + ex.log((1+globals()['population'])/globals()['area'])*b_pop,
                1:
                ASC_1
                + globals()['acc_accompany'] * b_acc}
        
        # Simulate
        simulate = {j: models.logit(V, None, int(j)) for j in [0,1]}
        model = bio.BIOGEME(zones_db, simulate, numberOfThreads=1)
        probs = model.simulate(theBetaValues=betas[p+car_names[car]].to_dict())
        
        # Calculate generation volumes
        inner_probs[p+car_names[car]] = probs[0]

In [11]:
# If all that doesn't make sense:
# Take inner-zonal probabilities from MiD2017 data by urbanisation degree
#inner_probs = pd.read_csv(input_path + 'inner-zonal_probabilities_agg_urban.csv', index_col=0)
#inner_probs

## Compute volumes (OD matrix) for optional trips

$F_{ods} = N_{os} * p^{dest}_{ods} * (1 - p^{inner}_{os})$

Indecies:
* o: origin zone
* d: destination zone
* s: demand segment

In [12]:
# Load probability results from logit step
dm = stepmodel.read_zippedpickles(model_path + 'de_destination_choice')

utility_values: 100%|████████████████████████████████████████████████████████████████████| 7/7 [00:03<00:00,  2.28it/s]


In [13]:
# Extract probability values and create new table from it
prob = dm.probabilities.set_index(['origin', 'segment']).drop(columns=['destination', 'root'])
prob.columns.name = 'destination'
volumes = prob.stack('destination').unstack('segment').reset_index()
volumes.columns.name = None
# Fill 0 for segments that were not covered by logit models
for seg in set(segments) - set(volumes.columns):
    volumes[seg] = 0
volumes.sample(2)

Unnamed: 0,origin,destination,accompany_car,accompany_no_car,buy/execute_car,buy/execute_no_car,leisure_car,leisure_no_car,business_car,education_no_car,commuting_car,education_car,commuting_no_car,business_no_car
1993453,DE406_2,DE133_1,1.108507e-07,2e-06,8.238047e-26,8.89979e-13,5e-06,5.745226e-09,0,0,0,0,0,0
3651911,DEA47_1,DEA29_2,6.177531e-07,2.3e-05,7.556094e-09,2.408245e-05,2.4e-05,0.0002702592,0,0,0,0,0,0


In [14]:
# Create inner-zonal probabilities
in_mask = volumes['origin']==volumes['destination']
zone_list = list(set(volumes['origin']))
if len(volumes.loc[in_mask]) == 0:
    volumes = volumes.append(pd.DataFrame({'origin': zone_list,
                                           'destination': zone_list})).reset_index()
    in_mask = volumes['origin']==volumes['destination']
urban_dict = sm.zones['urbanisation'].to_dict()
for seg in segments_optional:
    probs = inner_probs[seg]
    # Correct probabilities in volumes
    if len(probs) < len(zone_list): # aggregated probabilities by urbanisation degree
        volumes.loc[in_mask, seg] = volumes.loc[in_mask, 'origin'].map(urban_dict).map(probs)
        volumes.loc[~in_mask, seg] = volumes.loc[~in_mask, seg] * \
                                     volumes.loc[~in_mask, 'origin'].map(urban_dict).map(1 - probs)
    else:
        volumes.loc[in_mask, seg] = volumes.loc[in_mask, 'origin'].map(probs)
        volumes.loc[~in_mask, seg] = volumes.loc[~in_mask, seg] * \
                                     volumes.loc[~in_mask, 'origin'].map(1 - probs)

In [15]:
# make sure all probabilities sum up to 1 for each zone in each segments
#volumes.groupby('origin')[segments_optional].sum().describe()

In [16]:
# calculate number of trips
generation = pd.read_csv(input_path + 'generation_volumes.csv', index_col=0)
# Manually correct the choice model
correct = {s: 1.2 for s in segments}
correct['buy/execute_no_car'] *= 0.2
correct['leisure_no_car'] *= 0.2
correct['accompany_no_car'] *= 0.2
for s in segments_optional:
    volumes[s] = volumes[s] * volumes['origin'].map(generation[s]) * correct[s]

In [17]:
# check the sum (million trips per day)
volumes[segments_optional].sum().sum() / 1e6 / 365

127.2037539287193

In [18]:
# inter-zonal trips (billion per year)
volumes.loc[volumes['origin']!=volumes['destination'], segments_optional].sum().sum() / 1e9

16.335694337327656

## Compulsory trips

Choose the doubly constrained gravity model as distribution method because logit models don't perform well for purposes commuting, business and education.

In [19]:
sm.zones.sort_index(inplace=True)

In [20]:
# Load inter-zonal composite cost from mode choice step
cc = pd.read_csv(output_path + scenario + '/mode_choice_od_composite_cost.csv')
cc.set_index(['origin', 'destination'], inplace=True)
cc.sort_index(inplace=True)

In [21]:
# Get the resistance to move (composite cost from mode choice)
distances = pd.read_csv(output_path + 'distances_centroids.csv')
distances.set_index(['origin', 'destination'], inplace=True)
def get_deterrence(segment, exp=1):
    # Restrict to a distance threshold
    merged = pd.merge(cc[segment], distances.fillna(1000), how='left',
                      left_index=True, right_index=True)
    merged.loc[merged['length']>params['distribution_cutoff'][segment], segment] = 1e6
    deterrence_matrix = merged[segment].unstack('destination')
    # Add inner-zonal resistance: the minimum
    for z in deterrence_matrix.index:
        deterrence_matrix.loc[z,z] = deterrence_matrix.min(axis=1)[z]
    return np.power(deterrence_matrix.fillna(deterrence_matrix.max() * 10), -exp)

In [22]:
# Compute volumes from emission and attraction for a given segment
def compute_volumes(segment, emission, attraction, exp=1):
    sm.zones['emission'] = emission
    sm.zones['attraction'] = attraction
    vol = od_volume_from_zones(sm.zones, get_deterrence(segment, exp))
    volumes[segment] = volumes.merge(vol, how='left', on=['origin', 'destination']
                                    )['volume'].fillna(0)

In [23]:
# Volumes for commuting
days = params['generation']['days_at_work_with_car']
compute_volumes('commuting_car',
                sm.zones['employed'] * sm.zones['car_avail_hh'] * days * 2,
                sm.zones['employment'] * sm.zones['car_avail_hh'] * days * 2,
                params['distribution_exponent']['commuting_car']
               )
days = params['generation']['days_at_work_without_car']
compute_volumes('commuting_no_car',
                sm.zones['employed'] * (1 - sm.zones['car_avail_hh']) * days * 2,
                sm.zones['employment'] * (1 - sm.zones['car_avail_hh']) * days * 2,
                params['distribution_exponent']['commuting_no_car']
               )

In [24]:
# Volumes for business
# Average business trips per day from employed persons
days = params['generation']['days_at_work_with_car']
factor = params['generation']['business_trip_factor_with_car']
compute_volumes('business_car',
                sm.zones['employed'] * sm.zones['car_avail_hh'] * days * 2 * factor,
                sm.zones['employment'] * sm.zones['car_avail_hh'] * days * 2 * factor,
                params['distribution_exponent']['business_car']
               )
factor = params['generation']['business_trip_factor_without_car']
compute_volumes('business_no_car',
                sm.zones['employed'] * (1 - sm.zones['car_avail_hh']) * days * 2 * factor,
                sm.zones['employment'] * (1 - sm.zones['car_avail_hh']) * days * 2 * factor,
                params['distribution_exponent']['business_car']
               )

In [25]:
# Load data for education
age_groups = pd.read_csv(input_static_path + 'spatial_Zensus_ages_2017_GENESIS.csv',
                         encoding='latin-1', sep=';', skiprows=5, skipfooter=4, na_values='-').dropna()
age_groups.rename(columns={'Unnamed: 1': 'lau_id'}, inplace=True)
age_groups.loc[age_groups['lau_id']==16056, 'lau_id'] = 16063 # Update Eisenach
age_groups = age_groups.groupby('lau_id').sum()
if not 'lau_id' in sm.zones.columns:
    sm.zones['lau_id'] = sm.zones['ARS'].apply(lambda l: int(str(l[0])[:-4]))
sm.zones['is_urban'] = (sm.zones['urbanisation']==1) & (sm.zones['population']>100000)

In [26]:
# Volumes for education
# Emissions are people in education (school, apprenticeship, higher education)
# Attractions are corresponding institutions
emission = \
    {'higher_education':
        sm.zones['lau_id'].map(
            age_groups['18 bis unter 20 Jahre']
            + age_groups['20 bis unter 25 Jahre']
        ) * sm.zones['pop_share']
        * sm.zones['is_urban'].map(lambda urban: params['generation']['students_per_18-25yo_urban']
                                   if urban else params['generation']['students_per_18-25yo_non-urban']),
    'school':
        sm.zones['lau_id'].map(
            age_groups['6 bis unter 10 Jahre']
            + age_groups['10 bis unter 15 Jahre']
            + age_groups['15 bis unter 18 Jahre']
        ) * sm.zones['pop_share']
        * sm.zones['is_urban'].map(lambda urban: params['generation']['pupils_per_6-18yo_urban']
                                   if urban else params['generation']['pupils_per_6-18yo_non-urban']),
    'employment':
        sm.zones['lau_id'].map(
            age_groups['18 bis unter 20 Jahre']
        ) * sm.zones['pop_share']
        * sm.zones['is_urban'].map(lambda urban: params['generation']['apprentices_per_18-20yo_urban']
                                   if urban else params['generation']['apprentices_per_18-20yo_non-urban'])
   }

vol_df = volumes[['origin', 'destination']].copy()
institutions = ['higher_education', 'school', 'employment']
for institution in institutions:
    days = params['generation']['edu_days_in_'+institution]
    no_car_share = params['generation']['edu_share_without_car']
    
    # With car available
    sm.zones['emission'] = emission[institution] * sm.zones['car_avail_hh'] * days * 2
    sm.zones['attraction'] = sm.zones[institution] * sm.zones['car_avail_hh'] * days * 2
    vol_df[institution+'_car'] = vol_df.merge(
        od_volume_from_zones(sm.zones,
                             get_deterrence('education_car',
                                            params['distribution_exponent']['education_car']
                                           )),
        how='left', on=['origin', 'destination']
    )['volume'].fillna(0)
    
    # Without car available
    sm.zones['emission'] = emission[institution] * (1 - sm.zones['car_avail_hh']) * days * 2 * no_car_share
    sm.zones['attraction'] = sm.zones[institution] * (1 - sm.zones['car_avail_hh']) * days * 2 * no_car_share
    vol_df[institution+'_no_car'] = vol_df.merge(
        od_volume_from_zones(sm.zones,
                             get_deterrence('education_no_car',
                                            params['distribution_exponent']['education_no_car']
                                           )),
        how='left', on=['origin', 'destination']
    )['volume'].fillna(0)
    
# Sum up all these education groups
volumes['education_car'] = vol_df[[i+'_car' for i in institutions]].sum(axis=1)
volumes['education_no_car'] = vol_df[[i+'_no_car' for i in institutions]].sum(axis=1)

In [27]:
volumes.sample()

Unnamed: 0,origin,destination,accompany_car,accompany_no_car,buy/execute_car,buy/execute_no_car,leisure_car,leisure_no_car,business_car,education_no_car,commuting_car,education_car,commuting_no_car,business_no_car
1093188,DE22C_2,DEA1F_5,0.007291,0.003571,3.05486e-16,1e-06,2.586243,0.004358,1.560596,5.918683e-207,1.583719e-91,2.5851260000000002e-207,8.206345e-146,0.073071


In [28]:
# Drop rows with NaN
volumes = volumes.loc[~volumes.isna().any(axis=1)]

In [29]:
# check the sum (million trips per day)
print('Million trips per day: {}'.format(volumes[segments].sum().sum() / 1e6 / 365))

Million trips per day: 195.72639454341723


## Sparsify the OD set

Reduce the number of OD pairs to a sample while keeping the number of trips constant.

In [30]:
# Only sample, if the parameter is specified
# and there has been no sampling in previous modelling steps
sample_size = params['general']['od_sparse_sample']
if sample_size > 0 and len(volumes) > sample_size and not scenario.endswith('equilibrium'):
    # Set a stable seed, generating the same OD set in every scenario run
    np.random.seed(42)
    # Choice probabilities of OD pairs weighted by trip volumes
    # That ensures having the most important/intensive OD pairs in the sample
    original_sum = volumes[segments].sum().sum()
    od_probabilities = volumes[segments].sum(axis=1) / original_sum
    sample = np.random.choice(a=volumes.index,
                              size=sample_size,
                              p=od_probabilities)
    # Add a sample without volume weighting for the right distance distribution
    # (high volumes are usually short-distance connections)
    sample2 = np.random.choice(a=volumes.index, size=sample_size)
    '''# Define distance classes to keep the segment-specific distribution
    merged = pd.merge(volumes, distances.fillna(1000), how='left',
                      left_on=['origin', 'destination'], right_index=True)
    bins = list(range(0, 1011, 10))
    merged['bin'] = pd.cut(merged['length'], bins=bins, labels=bins[1:-1])
    for seg in segments:
        # Reduce the volumes matrix
        expansion = merged.groupby('bin')[seg].sum() / sample_size
        sparse = pd.Series(sample).value_counts().rename(seg)
        sparse = pd.merge(sparse, merged['bin'], how='left',
                          left_index=True, right_index=True)
        #volumes[seg] =  * expansion'''
    volumes = volumes.loc[set(sample).union(set(sample2))]
    volumes[segments] *= original_sum / volumes[segments].sum().sum()

In [31]:
# Check the sum (million trips per day)
print('Million trips per day after sampling: {}'.format(volumes[segments].sum().sum() / 1e6 / 365))

Million trips per day after sampling: 195.72639454341723


In [32]:
volumes = volumes.loc[~volumes.isna().all(axis=1)]
print('Number of OD pairs: {}'.format(len(volumes)))

Number of OD pairs: 4930600


## Save

In [33]:
# inter-zonal trips (billion per year)
volumes.loc[volumes['origin']!=volumes['destination'], segments].sum().sum() / 1e9

26.293549330886528

In [34]:
# Save volumes table
sm.volumes = volumes.reset_index(drop=True)
sm.to_zippedpickles(model_path + 'de_volumes', only_attributes=['volumes'])

volumes: 100%|███████████████████████████████████████████████████████████████████████████| 9/9 [00:24<00:00,  2.67s/it]
