In [1]:
import sys # for automation and parallelisation
# Inside an IPython kernel the notebook runs manually with the 'base' scenario;
# otherwise the scenario name is taken from the first command line argument
if 'ipykernel' in sys.argv[0]:
    manual, scenario = True, 'base'
else:
    manual, scenario = False, sys.argv[1]

In [2]:
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
from biogeme import expressions as ex
import pandas as pd
import numpy as np
from quetzal.io import excel

# Calibration
## Estimation of the model parameters
quetzal_germany is being estimated using [PandasBiogeme](https://biogeme.epfl.ch/). This notebook estimates calibration parameters for the model's utility functions.
- Documentation and reference: [Bierlaire, M. (2020). A short introduction to PandasBiogeme. Technical report TRANSP-OR 200605. Transport and Mobility Laboratory, ENAC, EPFL.](https://transp-or.epfl.ch/documents/technicalReports/Bier20.pdf)
- Tutorial: https://www.youtube.com/watch?v=OiM94B8WayA

### Model formulation
The model consists of systematic utility functions, one for each mode. They comprise an alternative-specific constant (ASC), a distance-dependent part with travel time and cost, and a cost damping function F.

$V_{im} = ASC_{im} + F(T_m, b_{t_i}) + C_{m_i} b_{c_i}$

Index $i$ denotes the demand group and index $m$ the mode.

In [3]:
# Relative directory prefixes used to build file paths
model_path = '../model/'
input_path = '../input/'

In [4]:
# Load parameters for settings
# The workbook path is built from input_path (like every other file access
# in this notebook), so relocating the input folder needs a single change
params = excel.read_var(file=input_path + 'parameters.xls', scenario=scenario)

### Prepare the database

In [5]:
# Survey columns required for the estimation, mapped to the short names
# that the utility functions refer to (C_ = cost, T_ = time)
column_names = {
    'cost_rail_short': 'C_RAIL_S',
    'cost_rail_long': 'C_RAIL_L',
    'cost_car': 'C_CAR',
    'cost_coach': 'C_COACH',
    'cost_bus': 'C_BUS',
    'cost_walk': 'C_NON_MOTOR',
    'cost_air': 'C_AIR',
    'time_rail_short': 'T_RAIL_S',
    'time_rail_long': 'T_RAIL_L',
    'time_car': 'T_CAR',
    'time_coach': 'T_COACH',
    'time_bus': 'T_BUS',
    'time_walk': 'T_NON_MOTOR',
    'time_air': 'T_AIR',
    'mode_model': 'MODE',
    'purpose_model': 'PURPOSE',
    'purpose2': 'P2',
    'car_avail': 'CAR_AV',
    'length': 'DIST',
    'W_GEW': 'W_GEW',
    'origin': 'O',
    'destination': 'D',
}
df = pd.read_csv(input_path + 'transport_demand/calibration_all_trips_MiD2017.csv')
# Select in mapping order, then rename in one step
df = df[list(column_names)].rename(columns=column_names)

In [6]:
# Keep only trips whose origin and destination differ (inter-zonal trips)
is_inter_zonal = df['O'].ne(df['D'])
df = df.loc[is_inter_zonal]
len(df)

104689

In [7]:
# Drop return trips: rows whose P2 code is 8 or 9
return_codes = [8, 9]
is_return_trip = df['P2'].isin(return_codes)
df = df.loc[~is_return_trip]
len(df)

62197

In [9]:
# The estimation works on numerical purpose values, so purpose labels
# are translated to integer codes
p_dict = {'commuting':1, 'business':2, 'education':3, 'shopping':4, 'errands':5, 'leisure':6, 'accompany':7}

def _purpose_code(label):
    """Return the code in p_dict for the part of `label` before the first underscore."""
    return p_dict[label.split('_')[0]]

# Skip the conversion when the column is already numeric
if not pd.api.types.is_numeric_dtype(df['PURPOSE']):
    df['PURPOSE'] = df['PURPOSE'].apply(_purpose_code)

In [10]:
# Finite sentinel that stands in for positive infinity in the data;
# the availability flags below compare travel times against it
inf = 99999
df = df.replace(to_replace=np.inf, value=inf)

In [11]:
# Scale time to hours: every T_<mode> column gets a T_<mode>_S
# counterpart holding the value divided by 60
for mode in ['RAIL_S', 'RAIL_L', 'CAR', 'COACH', 'BUS', 'AIR', 'NON_MOTOR']:
    df['T_' + mode + '_S'] = df['T_' + mode] / 60

In [12]:
# Add availabilities
# Car: code 9 in the survey column is recoded to 0
df['CAR_AV'] = df['CAR_AV'].replace(9, 0)
# Other modes: flagged available (1) unless the travel time equals the
# `inf` sentinel
for mode in ['RAIL_S', 'RAIL_L', 'COACH', 'BUS', 'AIR', 'NON_MOTOR']:
    df[mode + '_AV'] = df['T_' + mode].ne(inf).astype(int)

In [13]:
# Remove observations whose chosen mode is flagged as unavailable,
# because such rows disturb the maximum likelihood estimation
chosen_unavailable = np.zeros(len(df), dtype=bool)
for i, mode in [(1, 'RAIL_S'),(2, 'RAIL_L'),(3, 'COACH'),(4, 'BUS'),(5, 'AIR'),(6, 'CAR'),(7, 'NON_MOTOR')]:
    # Accumulate one mask instead of filtering the frame seven times
    chosen_unavailable |= ((df['MODE']==i) & (df[mode+'_AV']==0)).values
df = df.loc[~chosen_unavailable]
print(len(df))

61581


### Model variables
All columns are variables. DefineVariable creates a new column in the database.

#### Generalised cost

Both travel time and monetary cost should be affected by cost damping measures. It is logical to define a generalised cost term `GC` with dimension of time units. This requires definition or estimation of values of time, in order to rescale monetary units, for all segments. Usually, the value of time (VoT) is distance-dependent. In the case of Germany, VoT can be taken from research undertaken within the Federal Government's transport study "Bundesverkehrswegeplan 2030": Axhausen et al. 2015. Ermittlung von Bewertungsansätzen für Reisezeiten und Zuverlässigkeit auf der Basis eines Modells für modale Verlagerungen im nicht-gewerblichen und gewerblichen Personenverkehr für die Bundesverkehrswegeplanung.

In [19]:
# VoT from literature, distance-dependent, see cal19
# Maps each numeric trip purpose to the first-level column label ("Fz" group)
# of the VoT table that is used for that purpose
p_Fz_dict = {1:'Fz1', 3:'Fz2', 4:'Fz3', 5:'Fz3', 2:'Fz4', 6:'Fz6', 7:'Fz6'}
# The first two rows of the file form a two-level column header and the
# first column is used as the row index
vot_file = input_path + 'vot.csv'
VoT = pd.read_csv(vot_file, header=[0, 1], index_col=0)
# Show two random rows as a sanity check
VoT.sample(2)

Unnamed: 0_level_0,root,Fz1,Fz2,Fz3,Fz4,Fz6,root,Fz1,Fz2,Fz3,...,Fz2,Fz3,Fz4,Fz6,root,Fz1,Fz2,Fz3,Fz4,Fz6
Unnamed: 0_level_1,all,all,all,all,all,all,PT,PT,PT,PT,...,air,air,air,air,car,car,car,car,car,car
699,20.6463,19.7471,9.18,11.9,27.2028,15.5409,15.5401,11.8834,13.48,16.3134,...,72.6955,72.6955,176.7397,40.8806,20.4167,19.3677,12.1631,25.8141,22.6035,15.5409
67,10.449,10.168,9.18,11.9,11.618,8.797,8.188,6.954,11.316,11.421,...,19.16,19.16,24.47,15.83,10.555,10.234,6.998,13.985,11.661,8.797


In [15]:
# Make distance integer: int() is applied to every value, so DIST can be
# used as a lookup key in the VoT table
df['DIST'] = df['DIST'].map(int)

In [16]:
# Keep only trips with a distance of at most 1000
# NOTE(review): presumably the upper end of the VoT table's distance index - confirm
max_dist = 1000
df = df.loc[df['DIST'].le(max_dist)]
len(df)

61581

In [20]:
# Generate generalised cost
# Convert the VoT table to nested dicts {(Fz group, mode group): {distance: VoT}}.
# The type guard keeps this cell re-runnable: once VoT is a dict, a second
# call of .to_dict() would raise an AttributeError.
if isinstance(VoT, pd.DataFrame):
    VoT = VoT.to_dict()

def _generalised_cost(mode, vot_mode):
    """Return scaled travel time plus monetary cost divided by the value of time.

    mode:     mode suffix of the columns 'C_<mode>' and 'T_<mode>_S' in df
    vot_mode: second-level VoT column label ('PT', 'air' or 'car')

    The VoT is looked up per trip by purpose (via p_Fz_dict) and by the
    integer distance of the trip.
    """
    cost_in_time = [cost / VoT[(p_Fz_dict[purpose], vot_mode)][dist]
                    for cost, dist, purpose in zip(df['C_' + mode], df['DIST'], df['PURPOSE'])]
    return df['T_' + mode + '_S'] + cost_in_time

# All public transport modes share the 'PT' values of time
for mode, vot_mode in [('RAIL_S', 'PT'), ('RAIL_L', 'PT'), ('COACH', 'PT'),
                       ('BUS', 'PT'), ('AIR', 'air'), ('CAR', 'car')]:
    df['GC_' + mode] = _generalised_cost(mode, vot_mode)
# Non-motorised trips: generalised cost is the scaled travel time only
df['GC_NON_MOTOR'] = df['T_NON_MOTOR_S']

#### Cost damping

Many modelling studies have shown that cost damping is required in order to flatten the tail of time/cost elasticities, i.e. to decrease the impact of long distances on choice results and thereby prevent overestimation of time/cost parameters. Cost damping represents the property of decreasing marginal utility. It is commonly approached with Box-Cox transformations of generalised cost (usually defined as the sum of travel time and travel expenditures divided by the value of time). Daly (2010) proposes a hybrid function as the sum of a linear term and the natural logarithm of the same. However, the linear term still dominates cost on long distances. Rich (2020), main developer of the Danish National Transport Model, proposes a more complex spline function which successfully manages cost damping and even outperforms the Box-Cox transformation in terms of stability of elasticities.

In [36]:
# The cost damping function from Rich (2020)
# `c` is the 'estimation' entry of the scenario parameters; the utility
# functions read the spline knot points from it (keys starting with
# 'c1_time_' and 'c2_time_')
c = params['estimation']
def spline(x, beta, c1, c2, Q=3):
    """Piecewise cost damping function after Rich (2020) with knots c1 < c2.

    The result is the sum of three segments, each multiplied by an
    indicator term so that only one segment is active for a given x:

    - x < c1:        beta * log(x)**3
    - c1 <= x < c2:  beta * 3/2 * log(c1) * log(x)**2 + alpha[1]
    - x >= c2:       beta * 3 * log(c1) * log(c2) * log(x) + alpha[2]

    The alpha offsets make neighbouring segments take the same value at
    the knots.

    x:    quantity to damp (a Biogeme expression in this notebook)
    beta: coefficient of the damped term
    c1:   first knot point
    c2:   second knot point
    Q:    number of segments; only 3 is supported because the alpha and
          theta coefficients below are written out for Q=3

    Raises ValueError if Q is not 3. Before this check any other Q silently
    combined exponents for Q with coefficients for 3.
    """
    if Q != 3:
        raise ValueError('spline() only provides coefficients for Q=3, got Q={}'.format(Q))
    alpha = [0, -beta/2*(ex.log(c1)**3),
             -beta/2*ex.log(c1)*(3*(ex.log(c2)**2)+(ex.log(c1)**2))] # for Q=3
    theta = [1, 3/2*ex.log(c1), 3*ex.log(c1)*ex.log(c2)] # for Q=3
    return (beta*theta[0]*(ex.log(x)**(Q-1+1)) + alpha[0]) * (x<c1) \
    + (beta*theta[1]*(ex.log(x)**(Q-2+1)) + alpha[1]) * (x>=c1)*(x<c2) \
    + (beta*theta[2]*(ex.log(x)**(Q-3+1)) + alpha[2]) * (x>=c2)

In [23]:
# A Box-Cox transformation can be applied with the following code.
# However, it performs worse than other estimations
#tao_t = {}
#for m in ['CAR', 'RAIL', 'ROAD', 'AIR', 'NON_MOTOR']:
#    assert len(df.loc[df['T_'+m+'_S']<=0]) == 0
#    transformed_data, best_tao = stats.boxcox(df.loc[df[m+'_AV']==1, 'T_'+m+'_S'])
#    tao_t[m] = best_tao
#    df.loc[df[m+'_AV']==1, 'T_'+m+'_D'] = transformed_data
#    df.loc[df[m+'_AV']==0, 'T_'+m+'_D'] = inf

### Database creation

In [24]:
# Create the initial database and make columns global variables
# The database is built from a copy of df, so df itself stays untouched.
# globals().update(...) injects every entry of database.variables into the
# notebook namespace; the utility functions and the estimation loop rely on
# the resulting bare names (e.g. T_CAR, C_CAR, MODE, W_GEW)
# NOTE(review): presumably database.variables is keyed by column name - confirm
database = db.Database('MiD2017', df.copy())
globals().update(database.variables)
database.getSampleSize()

61581

### Estimation parameters

In [25]:
# Alternative-specific constants, one per mode
# All start at 0 with both bounds set to None. asc_car is the only one
# created with a last argument of 1.
# NOTE(review): presumably the last argument is Biogeme's status flag
# (1 = fixed), which would make car the reference alternative - confirm
asc_rail_s = ex.Beta('asc_rail_s', 0, None, None, 0)
asc_rail_l = ex.Beta('asc_rail_l', 0, None, None, 0)
asc_coach = ex.Beta('asc_coach', 0, None, None, 0)
asc_bus = ex.Beta('asc_bus', 0, None, None, 0)
asc_air = ex.Beta('asc_air', 0, None, None, 0)
asc_car = ex.Beta('asc_car', 0, None, None, 1)
asc_non_motor = ex.Beta('asc_non_motor', 0, None, None, 0)

In [26]:
# Coefficients of the utility functions
# b_t is passed to spline() as the coefficient of the damped time term and
# b_c multiplies the monetary cost columns. b_gc is declared here but not
# referenced by the utility functions in this notebook.
b_t = ex.Beta('b_t', 0, None, None, 0)
b_c = ex.Beta('b_c', 0, None, None, 0)
b_gc = ex.Beta('b_gc', 0, None, None, 0)

In [27]:
# Parameters for the nested logit structure
# They are used as the first element of the nest tuples `pt` and `rail`.
# NOTE(review): the arguments 1, 1, 10 are presumably start value, lower
# bound and upper bound - confirm against the Biogeme Beta documentation
mu_pt = ex.Beta('mu_pt', 1, 1, 10, 0)
mu_rail = ex.Beta('mu_rail', 1, 1, 10, 0)

### Utility functions

In [37]:
# Model specification with spline function transformation - disaggregated
# Every V_<MODE> is a dict {purpose code: utility expression}
p_dict_rev = {code: name for name, code in p_dict.items()}
car = '_car' # take the knot points for car available
_purposes = df['PURPOSE'].unique()

def _damped_time(time_var, purpose_name, suffix=car):
    """Return the spline-damped travel time term for one purpose.

    The two knot points are read from `c` under the keys
    'c1_time_<purpose><suffix>' and 'c2_time_<purpose><suffix>' and are
    multiplied by 60 before they are passed to spline().
    """
    return spline(time_var, b_t,
                  c['c1_time_' + purpose_name + suffix] * 60,
                  c['c2_time_' + purpose_name + suffix] * 60)

def _utilities(asc, time_var, cost_var=None):
    """Build {purpose: utility} for one mode.

    The utility is ASC + cost * b_c + damped time; the cost term is left
    out when no cost variable is given.
    """
    if cost_var is None:
        return {p: asc + _damped_time(time_var, p_dict_rev[p]) for p in _purposes}
    return {p: asc + cost_var * b_c + _damped_time(time_var, p_dict_rev[p])
            for p in _purposes}

V_RAIL_S = _utilities(asc_rail_s, T_RAIL_S, C_RAIL_S)
V_RAIL_L = _utilities(asc_rail_l, T_RAIL_L, C_RAIL_L)
V_COACH = _utilities(asc_coach, T_COACH, C_COACH)
V_BUS = _utilities(asc_bus, T_BUS, C_BUS)
V_AIR = _utilities(asc_air, T_AIR, C_AIR)
V_CAR = _utilities(asc_car, T_CAR, C_CAR)
# Non-motorised travel carries no monetary cost term
V_NON_MOTOR = _utilities(asc_non_motor, T_NON_MOTOR)

# Model specification for business trips
# Purpose 2 is overwritten for every mode with a utility without cost term
V_RAIL_S[2] = asc_rail_s + _damped_time(T_RAIL_S, 'business')
V_RAIL_L[2] = asc_rail_l + _damped_time(T_RAIL_L, 'business')
V_COACH[2] = asc_coach + _damped_time(T_COACH, 'business')
V_BUS[2] = asc_bus + _damped_time(T_BUS, 'business')
V_AIR[2] = asc_air + _damped_time(T_AIR, 'business')
V_CAR[2] = asc_car + _damped_time(T_CAR, 'business')
V_NON_MOTOR[2] = asc_non_motor + _damped_time(T_NON_MOTOR, 'business')

### Run the estimation

In [39]:
# Map modes to utility functions
# Result: V[purpose][mode ID] -> utility expression
_mode_utilities = {1: V_RAIL_S,
                   2: V_RAIL_L,
                   3: V_COACH,
                   4: V_BUS,
                   5: V_AIR,
                   6: V_CAR,
                   7: V_NON_MOTOR}
V = {p: {mode_id: by_purpose[p] for mode_id, by_purpose in _mode_utilities.items()}
     for p in df['PURPOSE'].unique()}

In [52]:
# Map the availability of alternatives with MODE as key
# Result: av[purpose][mode ID] -> availability expression
# PT modes and car are not restricted by purpose: their availability is taken
# from the *_AV variables as is. Air (5) is additionally switched off for
# purposes 1, 3, 4 and 7 (commuting, education, shopping, accompany in p_dict),
# because the matching int(p!=k) factor is 0 for these purposes
av = {p: {1:RAIL_S_AV,
          2:RAIL_L_AV,
          3:COACH_AV,# * int(p!=1) * int(p!=3),
          4:BUS_AV,
          5:AIR_AV * int(p!=1) * int(p!=4) * int(p!=7) * int(p!=3),
          6:CAR_AV,
          7:NON_MOTOR_AV}
      for p in df['PURPOSE'].unique()}

In [41]:
# Mode nests: each nest is a tuple (nest parameter, list of mode IDs).
# Nests holding a single mode use the constant 1 as nest parameter.
rail = (mu_rail, [1, 2])
coach = (1, [3])
bus = (1, [4])
air = (1, [5])
car = (1, [6])
walk = (1, [7])
# Broader public transport nest; defined for the alternative structure
# "nests = pt, air, car, walk" and not part of `nests` below
pt = (mu_pt, [1, 2, 3, 4])
nests = (rail, bus, coach, car, walk, air)

In [54]:
# Run the estimation by purpose and car availability
# Write results to an Excel file (one sheet per estimated model)
car_str = ['_no_car', '_car']
results = []
# The context manager closes the workbook even when an estimation raises,
# so the file handle is never left open
with pd.ExcelWriter(input_path + 'estimation_results_mode.xlsx') as writer:
    for p in df['PURPOSE'].unique():
        # NOTE: this loop variable rebinds the notebook-level name `car`
        for car in [0,1]:
            # One model per segment: trip purpose x car availability
            mask = (df['PURPOSE']==p) & (df['CAR_AV']==car)
            database = db.Database('MiD2017', df.loc[mask].copy())
            print('Sample size for purpose {}, car {}: {}'.format(p, car, database.getSampleSize()))
            mnl = models.loglogit(V[p], av[p], MODE) # Choose utility functions and availabilities
            #nl = models.lognested(V[p], av[p], nests, MODE) # Choose utility functions and availabilities
            formulas = {'loglike': mnl, 'weight': W_GEW} # give weights to the estimator
            model = bio.BIOGEME(database, formulas)#, numberOfThreads=10)
            # Results are collected in the workbook only; no extra output files
            model.generate_html = False
            model.generate_pickle = False
            model.saveIterations = False
            model.modelName = p_dict_rev[p].replace('/', '-') + car_str[car] # Name it
            results.append(model.estimate()) # Estimation
            output = results[-1].getEstimatedParameters()
            # Append the general statistics as extra rows: the first two
            # columns take the first two entries of each statistic, the
            # remaining columns are padded with empty strings
            for key, val in results[-1].getGeneralStatistics().items():
                output.loc[key] = [val[0], val[1]] + ['' for i in range(len(output.columns)-2)]
            output.to_excel(writer, sheet_name=model.modelName)

The sum of the weights (539.8209388816687) is different from the sample size (382). Multiply the weights by 0.7076420577374755 to reconcile the two.


Sample size for purpose 6, car 0: 382
Sample size for purpose 6, car 1: 16534


The sum of the weights (15294.611741017605) is different from the sample size (16534). Multiply the weights by 1.081034306719834 to reconcile the two.
The sum of the weights (648.3248618976085) is different from the sample size (364). Multiply the weights by 0.5614469248249921 to reconcile the two.


Sample size for purpose 1, car 0: 364
Sample size for purpose 1, car 1: 14584


The sum of the weights (14222.60012610091) is different from the sample size (14584). Multiply the weights by 1.0254102534483733 to reconcile the two.
The sum of the weights (414.61901963966716) is different from the sample size (260). Multiply the weights by 0.6270817007525562 to reconcile the two.


Sample size for purpose 4, car 0: 260
Sample size for purpose 4, car 1: 18879


The sum of the weights (16665.387505945546) is different from the sample size (18879). Multiply the weights by 1.1328269440638403 to reconcile the two.
The sum of the weights (49.03024842422116) is different from the sample size (38). Multiply the weights by 0.7750317655177906 to reconcile the two.


Sample size for purpose 2, car 0: 38


The sum of the weights (2385.1240052385974) is different from the sample size (2843). Multiply the weights by 1.1919715678328426 to reconcile the two.


Sample size for purpose 2, car 1: 2843


The sum of the weights (22.23208426725315) is different from the sample size (18). Multiply the weights by 0.8096406879184594 to reconcile the two.


Sample size for purpose 7, car 0: 18
Sample size for purpose 7, car 1: 5093


The sum of the weights (5157.548863796919) is different from the sample size (5093). Multiply the weights by 0.9874845851194904 to reconcile the two.
The sum of the weights (179.85020327081645) is different from the sample size (67). Multiply the weights by 0.3725322450657014 to reconcile the two.


Sample size for purpose 3, car 0: 67


The sum of the weights (2622.4353532777272) is different from the sample size (2519). Multiply the weights by 0.9605575202651057 to reconcile the two.


Sample size for purpose 3, car 1: 2519
