In [1]:
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
import biogeme.messaging as message
from biogeme.expressions import Beta
import pandas as pd
import numpy as np
import xlsxwriter

# Calibration
## Estimation of the model parameters
quetzal_germany is being estimated using [PandasBiogeme](https://biogeme.epfl.ch/). This notebook estimates calibration parameters for the model's utility functions.
- Documentation and reference: [Bierlaire, M. (2020). A short introduction to PandasBiogeme. Technical report TRANSP-OR 200605. Transport and Mobility Laboratory, ENAC, EPFL.](https://transp-or.epfl.ch/documents/technicalReports/Bier20.pdf)
- Tutorial: https://www.youtube.com/watch?v=OiM94B8WayA

### Model formulation
The model consists of systematic utility functions, one for each mode.

> V_i = ASC + T • b_t_i + C • b_c_i + AC • b_ac_i

Index i marks the demand group. I = {'commuting' (1), 'education' (2), 'shopping/medical' (3), 'business' (4), 'private' (6)}

Note: The cost variable already includes subscriptions

In [2]:
input_path = '../input/'
model_path = '../model/'

### Prepare the database

In [3]:
df = pd.read_csv(input_path + 'transport_demand/calibration_inter-cellular_trips_MiD2017.csv')
df = df[['cost_rail', 'cost_car', 'cost_coach', 'cost_bus', 'cost_walk', 'cost_air',
         'time_rail', 'time_car', 'time_coach', 'time_bus', 'time_walk', 'time_air',
         'accessibility_rail', 'accessibility_car', 'accessibility_coach', 'accessibility_bus',
         'accessibility_walk', 'accessibility_air',
         'mode_model', 'purpose_vp', 'car_avail', 'distance']]
df.columns = ['C_RAIL', 'C_CAR', 'C_COACH', 'C_BUS', 'C_NON_MOTOR', 'C_AIR',
              'T_RAIL', 'T_CAR', 'T_COACH', 'T_BUS', 'T_NON_MOTOR', 'T_AIR',
              'AC_RAIL', 'AC_CAR', 'AC_COACH', 'AC_BUS', 'AC_NON_MOTOR', 'AC_AIR',
              'MODE', 'PURPOSE', 'CAR_AV', 'DIST']

In [4]:
inf = 1e4
df = df.replace({np.inf:inf})

In [5]:
df.describe()

Unnamed: 0,C_RAIL,C_CAR,C_COACH,C_BUS,C_NON_MOTOR,C_AIR,T_RAIL,T_CAR,T_COACH,T_BUS,...,AC_RAIL,AC_CAR,AC_COACH,AC_BUS,AC_NON_MOTOR,AC_AIR,MODE,PURPOSE,CAR_AV,DIST
count,100273.0,100273.0,100273.0,100273.0,100273.0,100273.0,100273.0,100273.0,100273.0,100273.0,...,100273.0,100273.0,100273.0,100273.0,100273.0,100273.0,100273.0,100273.0,100273.0,100273.0
mean,18.481664,3.320918,3.769617,2.387125,0.0,9654.609025,2916.07795,43.830356,6067.586882,6642.653924,...,0.635343,0.967728,0.635343,0.747655,1.0,0.006943,5.578999,3.639863,0.967908,35.230069
std,185.425367,4.843416,6.01179,0.8594,0.0,1811.117309,4464.763289,48.537441,4815.547333,4692.208473,...,0.280959,0.176722,0.280959,0.299401,0.0,0.058529,1.293119,2.038412,0.180274,56.184954
min,0.0,0.136217,0.32528,0.0,0.0,45.0,3.0,2.0,10.0,5.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,3.04
25%,4.351601,1.081714,1.11815,2.74,0.0,10000.0,69.0,20.0,165.933333,105.566667,...,0.6,1.0,0.6,0.75,1.0,0.0,6.0,1.0,1.0,10.45
50%,7.854662,2.025078,2.033,2.74,0.0,10000.0,105.7,30.0,10000.0,10000.0,...,0.75,1.0,0.75,0.9,1.0,0.0,6.0,3.0,1.0,19.0
75%,12.759937,3.686855,3.8627,2.74,0.0,10000.0,10000.0,45.0,10000.0,10000.0,...,0.85,1.0,0.85,0.9,1.0,0.0,6.0,6.0,1.0,36.1
max,10000.0,106.812632,96.5675,2.74,0.0,10000.0,10000.0,578.166667,10000.0,10000.0,...,0.9,1.0,0.9,0.9,1.0,0.916161,7.0,6.0,9.0,902.5


In [6]:
# Scale time to hours
df['T_RAIL_S'] = df['T_RAIL'] / 60
df['T_CAR_S'] = df['T_CAR'] / 60
df['T_COACH_S'] = df['T_COACH'] / 60
df['T_BUS_S'] = df['T_BUS'] / 60
df['T_AIR_S'] = df['T_AIR'] / 60
df['T_NON_MOTOR_S'] = df['T_NON_MOTOR'] / 60

In [7]:
# Make car availability binary
df['CAR_AV'] = df['CAR_AV'].replace({9:0})

In [8]:
# Remove trips where mode is car but the car availability is zero
# because it irritates the MLE algorithm
mask = ((df['MODE']==6) & (df['CAR_AV']==0))
print('Share of car trips dropped: {}. New number of observations is {}'.format(
    len(df.loc[mask])/len(df.loc[df['MODE']==6]), len(df)))
df = df.loc[~mask]

Share of car trips dropped: 0.012054751023463104. New number of observations is 100273


In [9]:
'''# Create availability column for air transport (drops 50% of air trips)
# All PT modes are accessible by assumption
df['AIR_AV'] = df['AC_AIR'] > 0
df['AIR_AV'] = df['AIR_AV'].astype(int)
# Remove air trips with 0 availability
df = df.loc[~((df['MODE']==5) & (df['AIR_AV']==0))]'''
len(df.index)

99210

In [10]:
# Remove trips where cost are infinity
# Share of drops per mode must be equal, otherwise the calibration is skewed
# Ignore air trips because this dataset has too few observations anyways
max_drop_ratio = 0
lengths = []
for mode, col in zip(range(1,5), ['C_RAIL', 'C_RAIL', 'C_COACH', 'C_BUS']):
    drops = df.loc[((df['MODE']==mode) & (df[col]==inf))].index
    lengths = lengths + list(df.loc[drops, 'DIST'])
    if len(drops) > 0: print('mode ' + str(mode) + ': ' + str(len(drops)) + ' drops')
    max_drop_ratio = max(len(drops) / len(df.loc[df['MODE']==mode]), max_drop_ratio)
print('max_drop_ratio: ' + str(max_drop_ratio))
print('Dropped trips length mean: {}; min: {}; max: {}'.format(
    sum(lengths)/len(lengths), min(lengths), max(lengths)))

mode 2: 34 drops
max_drop_ratio: 0.03850509626274066
Dropped trips length mean: 185.9558823529412; min: 9.0; max: 495.0


In [11]:
# Drop trips with infinite cost
df = df.loc[~((df['MODE']==2) & (df['C_RAIL']==inf))]
# Don't drop coach trips because there are only a few
for m in [1, 4, 6]:
    n_drops = int(max_drop_ratio * len(df.loc[df['MODE']==m]))
    df = df.drop(df.loc[(df['MODE']==m) & (df['DIST']<max(lengths)) &
                        (df['DIST']>min(lengths))].sample(n_drops).index)
    print('mode ' + str(m) + ': ' + str(n_drops) + ' drops')
print('New number of observations: ' + str(len(df.index)))

mode 1: 233 drops
mode 4: 159 drops
mode 6: 3354 drops
New number of observations: 95430


### Model variables
All columns are variables. DefineVariable creates a new column in the database.

In [12]:
database = db.Database('MiD2017', df.copy())

In [13]:
globals().update(database.variables)

In [14]:
database.getSampleSize()

95430

### Estimation parameters

In [37]:
asc_rail = Beta('asc_rail', 0, None, None, 0)
asc_coach = Beta('asc_coach', 0, None, None, 0)
asc_bus = Beta('asc_bus', 0, None, None, 0)
asc_air = Beta('asc_air', 0, None, None, 0)
asc_car = Beta('asc_car', 0, None, None, 1)
asc_non_motor = Beta('asc_non_motor', 0, None, None, 0)

In [16]:
b_t = Beta('b_t', 0, None, None, 0)
b_c = Beta('b_c', 0, None, None, 0)
b_ac = Beta('b_ac', 0, None, None, 0)

In [17]:
# non-linear time component
b_t2 = Beta('b_t2', 0, None, None, 0)

In [18]:
b_t_rail = Beta('b_t_rail', 0, None, None, 0)
b_c_rail = Beta('b_c_rail', 0, None, None, 0)
b_ac_rail = Beta('b_ac_rail', 0, None, None, 0)

In [19]:
b_t_coach = Beta('b_t_coach', 0, None, None, 0)
b_c_coach = Beta('b_c_coach', 0, None, None, 0)
b_ac_coach = Beta('b_ac_coach', 0, None, None, 0)

In [20]:
b_t_bus = Beta('b_t_bus', 0, None, None, 0)
b_c_bus = Beta('b_c_bus', 0, None, None, 0)
b_ac_bus = Beta('b_ac_bus', 0, None, None, 0)

In [21]:
b_t_air = Beta('b_t_air', 0, None, None, 0)
b_c_air = Beta('b_c_air', 0, None, None, 0)
b_ac_air = Beta('b_ac_air', 0, None, None, 0)

In [22]:
b_t_car = Beta('b_t_car', 0, None, None, 0)
b_c_car = Beta('b_c_car', 0, None, None, 0)

In [23]:
b_t_non_motor = Beta('b_t_non_motor', 0, None, None, 0)

In [17]:
# Parameters for the nested logit structure
mu_pt = Beta('mu_pt', 1, 1, 10, 0)

### Utility functions

In [25]:
# Disaggregated formulation - computational expensive
# Not applicable in quetzal as is
V_RAIL = asc_rail + b_t * b_c_rail * T_RAIL_S + b_c_rail * C_RAIL + b_ac_rail * AC_RAIL
V_COACH = asc_coach + b_t * b_c_coach * T_COACH_S + b_c_coach * C_COACH + b_ac_coach * AC_COACH
V_BUS = asc_bus + b_t * b_c_bus * T_BUS_S + b_c_bus * C_BUS + b_ac_bus * AC_BUS
V_AIR = asc_air + b_t * b_c_air * T_AIR_S + b_c_air * C_AIR + b_ac_air * AC_AIR
V_CAR = asc_car + b_t * b_c_car * T_CAR_S + b_c_car * C_CAR
V_NON_MOTOR = asc_non_motor + b_t * T_NON_MOTOR_S

In [26]:
# Aggregated formulation with non-linear perception of travel time
V_RAIL = asc_rail + b_t * T_RAIL_S + b_t2 * T_RAIL_S*T_RAIL_S + b_c * C_RAIL + b_ac * AC_RAIL
V_COACH = asc_coach + b_t * T_COACH_S + b_t2 * T_COACH_S*T_COACH_S + b_c * C_COACH + b_ac * AC_COACH
V_BUS = asc_bus + b_t * T_BUS_S + b_t2 * T_BUS_S*T_BUS_S + b_c * C_BUS + b_ac * AC_BUS
V_AIR = asc_air + b_t * T_AIR_S + b_t2 * T_AIR_S*T_AIR_S + b_c * C_AIR + b_ac * AC_AIR
V_CAR = asc_car + b_t * T_CAR_S + b_t2 * T_CAR_S*T_CAR_S + b_c * C_CAR
V_NON_MOTOR = asc_non_motor + b_t * T_NON_MOTOR_S + b_t2 * T_NON_MOTOR_S*T_NON_MOTOR_S

In [38]:
# Aggregated formulation
V_RAIL = asc_rail + b_t * T_RAIL_S + b_c * C_RAIL + b_ac * AC_RAIL
V_COACH = asc_coach + b_t * T_COACH_S + b_c * C_COACH + b_ac * AC_COACH
V_BUS = asc_bus + b_t * T_BUS_S + b_c * C_BUS + b_ac * AC_BUS
V_AIR = asc_air + b_t * T_AIR_S + b_c * C_AIR + b_ac * AC_AIR
V_CAR = asc_car + b_t * T_CAR_S + b_c * C_CAR
V_NON_MOTOR = asc_non_motor + b_t * T_NON_MOTOR_S

### Run the estimation

In [19]:
# Define level of verbosity
logger = message.bioMessage()
#logger.setSilent()
logger.setWarning()
#logger.setGeneral()
#logger.setDetailed()

In [39]:
# Map modes to utility functions
V = {1:V_RAIL,
     2:V_RAIL,
     3:V_COACH,
     4:V_BUS,
     5:V_AIR,
     6:V_CAR,
     7:V_NON_MOTOR}

In [40]:
# Map the availability of alternatives with MODE as key
# PT is always available
av = {1:1,
      2:1,
      3:1,
      4:1,
      5:1,
      6:CAR_AV,
      7:1}

In [41]:
# Mode nests as tuples with nest name and dictionary where
# alternative IDs are mapped to alpha values. Missing ID's alpha is zero
# Alternatively use lists with mode ID without alpha
nests = ((mu_pt, [1, 2, 3, 4]), # PT
         (1, [5]), # Air is seperated
         (1, [6]), # Car
         (1, [7])) # Non-motorised

In [42]:
# Choose the multinomial logit model
mnl = models.loglogit(V, av, MODE)

In [43]:
model_mnl = bio.BIOGEME(database, mnl)
model_mnl.modelName = 'MNL'

In [44]:
results_mnl = model_mnl.estimate()

In [45]:
results = results_mnl.getEstimatedParameters()
for key, val in results_mnl.getGeneralStatistics().items():
    results.loc[key] = [val[0], val[1]] + ['' for i in range(len(results.columns)-2)]
results

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
asc_air,-0.012297,0.216835,-0.0567092,0.954777,3.84626,-0.00319702,0.997449
asc_car,1.685716,0.064895,25.976,0.0,0.0774748,21.7583,0.0
asc_coach,-0.849664,0.0537009,-15.8221,0.0,0.0429061,-19.8029,0.0
asc_non_motor,-0.575053,0.0734418,-7.83004,4.88498e-15,0.0848443,-6.77774,1.22073e-11
asc_rail,0.599818,0.113528,5.28345,1.26773e-07,0.864855,0.693548,0.487966
b_ac,-0.611921,0.0782185,-7.82322,5.10703e-15,0.0920951,-6.64444,3.04374e-11
b_c,-0.003915,0.00149974,-2.61028,0.00904677,0.0314229,-0.124583,0.900854
b_t,-0.159371,0.0170039,-9.37262,0.0,0.0748356,-2.12962,0.0332033
Number of estimated parameters,8.0,,,,,,
Sample size,36224.0,,,,,,


In [46]:
# Write results to a file
writer = pd.ExcelWriter(input_path + 'estimation_results.xlsx', engine='xlsxwriter')

In [47]:
results.to_excel(writer, sheet_name=model_mnl.modelName)

In [48]:
# Choose the logarithmic nested logit model
nl = models.lognested(V, av, nests, MODE)

In [49]:
# Nested Logit
model_nl = bio.BIOGEME(database, nl)
model_nl.modelName = 'NL'

In [50]:
results_nl = model_nl.estimate()

In [51]:
results = results_nl.getEstimatedParameters()
for key, val in results_nl.getGeneralStatistics().items():
    results.loc[key] = [val[0], val[1]] + ['' for i in range(len(results.columns)-2)]
results

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
asc_air,-3.329158,0.409352,-8.13275,4.44089e-16,0.382284,-8.7086,0.0
asc_car,2.459396,0.0727216,33.8193,0.0,0.0851354,28.8881,0.0
asc_coach,-1.12,0.0762405,-14.6904,0.0,0.0762344,-14.6915,0.0
asc_non_motor,-0.565105,0.0912279,-6.19443,5.84977e-10,0.102236,-5.52744,3.24943e-08
asc_rail,-0.458487,0.0389351,-11.7757,0.0,0.0409267,-11.2026,0.0
b_ac,0.61078,0.0836899,7.29814,2.91767e-13,0.0988782,6.1771,6.52902e-10
b_c,0.000755,0.000160318,4.70985,2.47901e-06,0.000137574,5.48848,4.05409e-08
b_t,-0.734379,0.0230305,-31.8872,0.0,0.0286621,-25.6219,0.0
mu_pt,1.791108,0.0777139,23.0475,0.0,0.078477,22.8233,0.0
Number of estimated parameters,9.0,,,,,,


In [52]:
results.to_excel(writer, sheet_name=model_nl.modelName)

In [53]:
# Run the estimation by purpose
results = []
for p in [1,2,3,4,6]:
    database = db.Database('MiD2017', df.copy())
    database.remove(PURPOSE!=p)
    print('Sample size for purpose {}: {}'.format(p, database.getSampleSize()))
    model = bio.BIOGEME(database, nl) # Choose the model formulation
    model.modelName = 'NL_Fz' + str(p) # Name it
    results.append(model.estimate()) # Estimation
    output = results[-1].getEstimatedParameters()
    # Add results to the Excel file
    for key, val in results[-1].getGeneralStatistics().items():
        output.loc[key] = [val[0], val[1]] + ['' for i in range(len(output.columns)-2)]
    output.to_excel(writer, sheet_name=model.modelName)

Sample size for purpose 1: 24798
Sample size for purpose 2: 3995
Sample size for purpose 3: 25270
Sample size for purpose 4: 5143
Sample size for purpose 6: 36224


In [54]:
writer.save()

In [55]:
# Generate LaTeX code
file = open(input_path + 'estimation_results_LaTeX_code.txt', 'w')
for r in results:
    file.write(r.getLaTeX())
file.close()