In [1]:
import sys # for automation and parallelisation
manual, scenario = (True, 'base') if 'ipykernel' in sys.argv[0] else (False, sys.argv[1])

In [2]:
import os
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
import biogeme.optimization as opt
import biogeme.messaging as message
from biogeme import expressions as ex
import pandas as pd
import numpy as np
import xlsxwriter
from tqdm import tqdm
from quetzal.io import excel

# Calibration
## Estimation of the model parameters
quetzal_germany is being estimated using [PandasBiogeme](https://biogeme.epfl.ch/). This notebook estimates calibration parameters for the model's utility functions.
- Documentation and reference: [Bierlaire, M. (2020). A short introduction to PandasBiogeme. Technical report TRANSP-OR 200605. Transport and Mobility Laboratory, ENAC, EPFL.](https://transp-or.epfl.ch/documents/technicalReports/Bier20.pdf)
- Tutorial: https://www.youtube.com/watch?v=OiM94B8WayA

In [3]:
input_path = '../input/'
model_path = '../model/'
output_path = '../output/'

In [4]:
# Load parameters for settings
params = excel.read_var(file='../input/parameters.xls', scenario=scenario)

### Prepare the database

In [5]:
df = pd.read_csv(input_path + 'transport_demand/calibration_all_trips_MiD2017.csv')
df = df[['cost_rail_short', 'cost_rail_long', 'cost_car', 'cost_coach', 'cost_bus', 'cost_walk', 'cost_air',
         'time_rail_short', 'time_rail_long', 'time_car', 'time_coach', 'time_bus', 'time_walk', 'time_air',
         'cost_rail', 'cost_road', 'time_rail', 'time_road',
         'mode_model', 'purpose_model', 'purpose2', 'car_avail', 'distance', 'origin', 'destination', 'P_ID', 'W_GEW',
         'urbanisation', 'dest_urban']]
df.columns = ['C_RAIL_S', 'C_RAIL_L', 'C_CAR', 'C_COACH', 'C_BUS', 'C_NON_MOTOR', 'C_AIR',
              'T_RAIL_S', 'T_RAIL_L', 'T_CAR', 'T_COACH', 'T_BUS', 'T_NON_MOTOR', 'T_AIR',
              'C_RAIL', 'C_ROAD', 'T_RAIL', 'T_ROAD',
              'MODE', 'PURPOSE', 'P2', 'CAR_AV', 'DIST', 'O', 'D', 'P_ID', 'W_GEW',
              'O_URBAN', 'D_URBAN']

In [6]:
# The estimation requires numerical purpose values
p_model_dict = {'commuting':1, 'business':2, 'education':3,
                'buy/execute':4, 'leisure':6, 'accompany':7}
df['PURPOSE'] = df['PURPOSE'].apply(lambda s: p_model_dict[s.split('_')[0]])

In [7]:
inf = 10000
df = df.replace({np.inf:inf})

In [8]:
# Scale time to hours
df['T_RAIL'] = df['T_RAIL'] / 60
df['T_RAIL_S'] = df['T_RAIL_S'] / 60
df['T_RAIL_L'] = df['T_RAIL_L'] / 60
df['T_CAR'] = df['T_CAR'] / 60
df['T_COACH'] = df['T_COACH'] / 60
df['T_BUS'] = df['T_BUS'] / 60
df['T_ROAD'] = df['T_ROAD'] / 60
df['T_AIR'] = df['T_AIR'] / 60
df['T_NON_MOTOR'] = df['T_NON_MOTOR'] / 60

In [9]:
# Make car availability binary
df['CAR_AV'] = df['CAR_AV'].replace({9:0})

In [10]:
# Add PT availabilities
df['RAIL_AV'] = (df['T_RAIL']!=inf).astype(int)
df['RAIL_S_AV'] = (df['T_RAIL_S']!=inf).astype(int)
df['RAIL_L_AV'] = (df['T_RAIL_L']!=inf).astype(int)
df['COACH_AV'] = (df['T_COACH']!=inf).astype(int)
df['BUS_AV'] = (df['T_BUS']!=inf).astype(int)
df['ROAD_AV'] = (df['T_ROAD']!=inf).astype(int)
df['AIR_AV'] = (df['T_AIR']!=inf).astype(int)
df['NON_MOTOR_AV'] = (df['DIST']<100).astype(int)

In [11]:
# Clean distances
df = df.loc[df['DIST']<=1000]
df['DIST'] = df['DIST'].astype(int)
len(df)

260723

In [12]:
# Filter for urban trips
df = df.loc[((df['O_URBAN']==1) | (df['D_URBAN']==1)) & (df['DIST']<50)]
df['MODE'].value_counts()

6    94298
4    21594
1     6268
7     3587
3      245
2       52
Name: MODE, dtype: int64

In [13]:
# Merge long- and short-distance
df['MODE'] = df['MODE'].replace({2:1, 3:4})
# drop air
df = df.loc[df['MODE']!=5]

In [14]:
# Remove trips where mode is car but the car availability is zero
# because it irritates the MLE algorithm
mask = ((df['MODE']==6) & (df['CAR_AV']==0))
drop_ratio = len(df.loc[mask])/len(df.loc[df['MODE']==6])
print('Share of car trips dropped: {}'.format(drop_ratio))
df = df.loc[~mask]

Share of car trips dropped: 0.0180597679696282


### Model variables
All columns are variables. DefineVariable creates a new column in the database.

### Create database

This makes all columns become global variables.

In [15]:
df.drop(['O', 'D'], axis=1, inplace=True)

In [16]:
# Create the initial database and make columns global variables
database = db.Database('MiD2017', df.copy())
globals().update(database.variables)
database.getSampleSize()

124341

### Estimation parameters

In [17]:
# Destination choice / attraction
b_time = ex.Beta('b_time', 0, None, None, 0)
b_price = ex.Beta('b_price', 0, None, None, 0)

In [18]:
# ASCs
asc_rail = ex.Beta('asc_rail', 0, None, None, 0)
asc_road = ex.Beta('asc_road', 0, None, None, 0)
asc_car = ex.Beta('asc_car', 0, None, None, 0)
asc_non_motor = ex.Beta('asc_non_motor', 0, None, None, 1) # fixed to 0

In [19]:
# Parameters for the nested logit structure
mu_pt = ex.Beta('mu_pt', 1, 1, 10, 0)

### Utility functions

In [20]:
# Linear-in-the-parameters formulation for every trip purpose
V_RAIL = asc_rail + b_time * T_RAIL + b_price * C_RAIL
V_ROAD = asc_road + b_time * T_ROAD + b_price * C_ROAD
V_CAR = asc_car + b_time * T_CAR + b_price * C_CAR
V_NON_MOTOR = asc_non_motor + b_time * T_NON_MOTOR

# Map utility functions to alternative numbers
V = {1: V_RAIL,
     4: V_ROAD,
     6: V_CAR,
     7: V_NON_MOTOR}

### Run the estimation

In [21]:
# Define level of verbosity
logger = message.bioMessage()
#logger.setSilent()
logger.setWarning()
#logger.setGeneral()
#logger.setDetailed()

In [22]:
# Map the availability of alternatives with MODE as key
av = {1:RAIL_AV,
      4:ROAD_AV,
      6:CAR_AV,
      7:NON_MOTOR_AV}

In [23]:
# Mode nests as tuples with nest name and dictionary where
# alternative IDs are mapped to alpha values. Missing ID's alpha is zero
# Alternatively use lists with mode ID without alpha
pt = mu_pt, [1, 4]#[1, 2, 3, 4]
car = 1, [6]
walk = 1, [7]
nests = pt, car, walk

In [24]:
# Write results to an Excel file
writer = pd.ExcelWriter(input_path + 'estimation_results_urban.xls', engine='xlsxwriter')

In [25]:
# Run the estimation for the whole population
# with and without car
car_name = ['_no_car', '_car']
for car in [0,1]:
    database = db.Database('MiD2017', df.loc[df['CAR_AV']==car].sample(9000))
    # Choose the multinomial logit model
    mnl = models.loglogit(V, av, MODE)
    formulas = {'loglike': mnl, 'weight': W_GEW} # give weights to the estimator
    model_mnl = bio.BIOGEME(database, formulas)
    model_mnl.modelName = 'MNL' + car_name[car]
    model_mnl.generateHtml = False
    model_mnl.generatePickle = False
    results_mnl = model_mnl.estimate()
    results = results_mnl.getEstimatedParameters()
    for key, val in results_mnl.getGeneralStatistics().items():
        results.loc[key] = [val[0], val[1]] + ['' for i in range(len(results.columns)-2)]
    results.to_excel(writer, sheet_name=model_mnl.modelName)
results

  self.data.conditionNumber = self.data.largestEigenValue / self.data.smallestEigenValue


Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
asc_car,4.854284,0.107784,45.036999,0.0,0.11337,42.81824,0.0
asc_rail,1.055257,0.104385,10.109291,0.0,0.107523,9.81423,0.0
asc_road,2.616859,0.092968,28.148002,0.0,0.096082,27.23576,0.0
b_price,-0.196307,0.00921,-21.314226,0.0,0.010198,-19.249328,0.0
b_time,-0.136237,0.017033,-7.99846,0.0,0.017737,-7.68083,0.0
Number of estimated parameters,5.0,,,,,,
Sample size,9000.0,,,,,,
Excluded observations,0.0,,,,,,
Init log likelihood,-10633.002769,.7g,,,,,
Final log likelihood,-4414.48028,.7g,,,,,


In [26]:
# Run the estimation for the whole population
# with and without car
car_name = ['_no_car', '_car']
for car in [0,1]:
    database = db.Database('MiD2017', df.loc[df['CAR_AV']==car].sample(9000))
    # Choose the nested logit model
    nl = models.lognested(V, av, nests, MODE)
    formulas = {'loglike': nl, 'weight': W_GEW} # give weights to the estimator
    model_nl = bio.BIOGEME(database, formulas)
    model_nl.modelName = 'NL' + car_name[car]
    model_nl.generateHtml = False
    model_nl.generatePickle = False
    results_nl = model_nl.estimate()
    results = results_nl.getEstimatedParameters()
    for key, val in results_nl.getGeneralStatistics().items():
        results.loc[key] = [val[0], val[1]] + ['' for i in range(len(results.columns)-2)]
    results.to_excel(writer, sheet_name=model_nl.modelName)
results

Unnamed: 0,Value,Active bound,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
asc_car,5.201893,0.0,0.11773,44.185051,0.0,0.167102,31.130068,0.0
asc_rail,1.265503,0.0,0.717209,1.764484,0.077651,2.481738,0.509926,0.610103
asc_road,2.851503,0.0,0.105187,27.108791,0.0,0.17446,16.344777,0.0
b_price,-0.226124,0.0,0.020424,-11.071515,0.0,0.068631,-3.294761,0.000985
b_time,-0.182437,0.0,0.019597,-9.309299,0.0,0.042618,-4.280788,1.9e-05
mu_pt,1.0,1.0,0.487816,2.049954,0.040369,1.737384,0.575578,0.564901
Number of estimated parameters,6.0,,,,,,,
Sample size,9000.0,,,,,,,
Excluded observations,0.0,,,,,,,
Init log likelihood,-10695.498865,.7g,,,,,,


In [27]:
writer.save()