In [1]:
import pandas as pd
import pickle
from urllib.request import urlopen
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.results as res
from biogeme.expressions import (
    Beta,
    bioDraws,
    MonteCarlo,
    log,
    PanelLikelihoodTrajectory,
    DefineVariable,
)
import biogeme.messaging as msg
from biogeme import models

The estimation of these models may take a while. If you want to run the notebook using pre-estimated results saved on file, set the number of draws below to `None`.

In [2]:
# Number of Monte-Carlo draws used for the estimation. Set it to an integer
# (e.g. 50000) to re-estimate the model, or leave it as None to load the
# pre-estimated results from file instead.
numberOfDraws = None

In [3]:
def get_results_from_url(file):
    """Download a pickled estimation result and wrap it in ``bioResults``.

    :param file: name of the pickle file, appended as-is to ``url_root``.
    :return: the ``res.bioResults`` object built from the unpickled content.
    """
    # SECURITY: unpickling can execute arbitrary code. Only use this helper
    # with files coming from a trusted location.
    with urlopen(f'{url_root}{file}') as response:
        raw_results = pickle.load(response)
    return res.bioResults(raw_results)

In [4]:
# Common prefix of every remote asset (data file and pickled results).
# File names are appended directly to it, without any separator.
url_root = 'https://courses.edx.org/asset-v1:EPFLx+ChoiceModels2x+3T2021+type@asset+block@'

# Read the data

In [5]:
# Download the data file (pd.read_table expects tab-separated values by
# default) and wrap the resulting data frame in a Biogeme database.
theData = pd.read_table(f'{url_root}swissmetro.dat')
database = db.Database('swissmetro', theData)
# Expose every variable of the database (PURPOSE, CHOICE, ...) as a
# notebook-level name, so that the cells below can use them directly
# when writing expressions.
globals().update(database.variables)


Exclude several observations

In [6]:
# A row is excluded when PURPOSE is neither 1 nor 3, or when CHOICE is 0.
# The comparisons evaluate to 0/1, so `*` plays the role of a logical "and"
# and `+ ... > 0` the role of a logical "or".
exclude = ((PURPOSE != 1) * (PURPOSE != 3) + (CHOICE == 0)) > 0
database.remove(exclude)

Declare that the data is organized as panel data. It means that the observations for each individual span several rows. The identifier for each individual is found in column ``ID``.

In [7]:
# Rows sharing the same value in column 'ID' belong to the same individual.
database.panel('ID')

# Model specification

Parameters to be estimated.

In [8]:
# Beta arguments: name, starting value, lower bound, upper bound, status.
# No bounds are imposed (None, None); status 0 means that the parameter is
# estimated rather than fixed to its starting value.
ASC_CAR = Beta('ASC_CAR', 0.0, None, None, 0)
ASC_SM = Beta('ASC_SM', 0.0, None, None, 0)
BETA_CAR_COST = Beta('BETA_CAR_COST', 0.0, None, None, 0)
BETA_HE = Beta('BETA_HE', 0.0, None, None, 0)
BETA_SM_COST = Beta('BETA_SM_COST', 0.0, None, None, 0)
BETA_TIME = Beta('BETA_TIME', 0.0, None, None, 0)
BETA_TRAIN_COST = Beta('BETA_TRAIN_COST', 0.0, None, None, 0)

The next set of parameters are the scale parameters of the agent (or panel) effect.

In [9]:
# One scale parameter per alternative, unbounded and starting at 1.0
# (instead of 0.0 for the other parameters).
SIGMA_PANEL_CAR = Beta('SIGMA_PANEL_CAR', 1.0, None, None, 0)
SIGMA_PANEL_TRAIN = Beta('SIGMA_PANEL_TRAIN', 1.0, None, None, 0)
SIGMA_PANEL_SM = Beta('SIGMA_PANEL_SM', 1.0, None, None, 0)

Define the random parameters for the agent effect.

In [10]:
# Each agent effect is a zero-mean random term: a scale parameter multiplied
# by a draw of type 'NORMAL'. The first argument of bioDraws is the name
# under which the draws are identified.
# NOTE(review): the pre-estimated results report draws of type
# NORMAL_MLHS_ANTI, not NORMAL -- confirm which type is intended if the
# saved results must be reproduced.
ZERO_SIGMA_PANEL_CAR = SIGMA_PANEL_CAR * bioDraws(
    'ZERO_SIGMA_PANEL_CAR', 'NORMAL'
)
ZERO_SIGMA_PANEL_TRAIN = SIGMA_PANEL_TRAIN * bioDraws(
    'ZERO_SIGMA_PANEL_TRAIN', 'NORMAL'
)
ZERO_SIGMA_PANEL_SM = SIGMA_PANEL_SM * bioDraws(
    'ZERO_SIGMA_PANEL_SM', 'NORMAL'
)


Define new variables. In order to speed up the estimation process, the new variables are stored in new columns of the database, using the expression ``DefineVariable``. This avoids recalculating them at each iteration of the algorithm.

In [11]:
# Availabilities are switched off when SP == 0; costs are set to zero when
# GA != 0. The order of the statements matters: SM_COST and TRAIN_COST must
# exist before their scaled versions are defined.
CAR_AV_SP = DefineVariable('CAR_AV_SP', CAR_AV * (SP != 0), database)
SM_COST = DefineVariable('SM_COST', SM_CO * (GA == 0), database)
TRAIN_AV_SP = DefineVariable('TRAIN_AV_SP', TRAIN_AV * (SP != 0), database)
TRAIN_COST = DefineVariable('TRAIN_COST', TRAIN_CO * (GA == 0), database)

# Attributes divided by 100.
TRAIN_TT_SCALED = DefineVariable('TRAIN_TT_SCALED', TRAIN_TT / 100.0, database)
TRAIN_COST_SCALED = DefineVariable('TRAIN_COST_SCALED', TRAIN_COST / 100, database)
SM_TT_SCALED = DefineVariable('SM_TT_SCALED', SM_TT / 100.0, database)
SM_COST_SCALED = DefineVariable('SM_COST_SCALED', SM_COST / 100, database)
CAR_TT_SCALED = DefineVariable('CAR_TT_SCALED', CAR_TT / 100, database)
CAR_CO_SCALED = DefineVariable('CAR_CO_SCALED', CAR_CO / 100, database)
TRAIN_HE_SCALED = DefineVariable('TRAIN_HE_SCALED', TRAIN_HE / 100, database)
SM_HE_SCALED = DefineVariable('SM_HE_SCALED', SM_HE / 100, database)


Utility functions

In [12]:
# Utility of the car alternative: constant, travel time, cost and the
# car-specific agent effect.
V_Car_SP = (
    ASC_CAR
    + BETA_TIME * CAR_TT_SCALED
    + BETA_CAR_COST * CAR_CO_SCALED
    + ZERO_SIGMA_PANEL_CAR
)
# Utility of the train alternative. It has no alternative specific constant.
V_SBB_SP = (
    BETA_TIME * TRAIN_TT_SCALED
    + BETA_TRAIN_COST * TRAIN_COST_SCALED
    + BETA_HE * TRAIN_HE_SCALED
    + ZERO_SIGMA_PANEL_TRAIN
)
# Utility of the Swissmetro alternative. BETA_TIME and BETA_HE are shared
# with the other alternatives; the cost coefficient is alternative specific.
V_SM_SP = (
    ASC_SM
    + BETA_TIME * SM_TT_SCALED
    + BETA_SM_COST * SM_COST_SCALED
    + BETA_HE * SM_HE_SCALED
    + ZERO_SIGMA_PANEL_SM
)
# Map each alternative id (1: train, 2: Swissmetro, 3: car) to its utility
# function and to its availability condition.
V = {3: V_Car_SP, 1: V_SBB_SP, 2: V_SM_SP}
av = {3: CAR_AV_SP, 1: TRAIN_AV_SP, 2: SM_AV}


The choice probability for each observation is given by a logit model, conditional on the agent effect.

In [13]:
# Arguments: utilities by alternative id, availabilities by alternative id,
# and the expression giving the id of the chosen alternative.
obsprob = models.logit(V, av, CHOICE)

The probability for the sequence of choices (aka trajectory) of an individual, conditional on the agent effect. 

In [14]:
# Combines the per-observation probabilities over all the rows of the same
# individual; the agent effects are still fixed at this stage.
condprobIndiv = PanelLikelihoodTrajectory(obsprob)


We integrate using Monte-Carlo simulation, and take the log, to obtain the contribution of each individual to the likelihood function. 

In [15]:
# MonteCarlo approximates the integral over the agent effects using the
# draws; the log is taken after the integration, not before.
logprob = log(MonteCarlo(condprobIndiv))

Activate messages to see the progress of the algorithm

In [16]:
# Obtain the Biogeme message handler and select the "general" level of
# messages (presumably progress information without debugging details --
# TODO confirm against the Biogeme version in use).
logger = msg.bioMessage()
logger.setGeneral()

In [17]:
%%time
if numberOfDraws is None:
    results = get_results_from_url('Mixture_SM_panel.pickle')
else:
    biogeme = bio.BIOGEME(database, logprob, numberOfDraws=numberOfDraws)
    biogeme.modelName = 'Mixture_SM_panel'
    results = biogeme.estimate()


CPU times: user 41.5 ms, sys: 8.55 ms, total: 50 ms
Wall time: 312 ms


In [18]:
# printGeneralStatistics returns a formatted string: print it so that the
# tabs and line breaks are rendered.
print(results.printGeneralStatistics())

Number of estimated parameters:	10
Sample size:	752
Observations:	6768
Excluded observations:	3960
Init log likelihood:	-3736.254
Final log likelihood:	-3713.511
Likelihood ratio test for the init. model:	45.48769
Rho-square for the init. model:	0.00609
Rho-square-bar for the init. model:	0.00341
Akaike Information Criterion:	7447.021
Bayesian Information Criterion:	7493.249
Final gradient norm:	3.6287E-03
Number of draws:	50000
Draws generation time:	0:01:29.418267
Types of draws:	['ZERO_SIGMA_PANEL_CAR: NORMAL_MLHS_ANTI', 'ZERO_SIGMA_PANEL_SM: NORMAL_MLHS_ANTI', 'ZERO_SIGMA_PANEL_TRAIN: NORMAL_MLHS_ANTI']
Nbr of threads:	28



We observe that, compared to logit, the final log likelihood has increased from $-5068.559$ to $-3713.511$.

In [19]:
# Table of estimates (one row per parameter), displayed as the cell output.
results.getEstimatedParameters()

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ASC_CAR,-1.294416,0.413941,-3.127051,0.001765691,0.828677,-1.562028,0.1182814
ASC_SM,-0.095552,0.275704,-0.346576,0.7289097,0.529687,-0.180394,0.8568434
BETA_CAR_COST,-2.841743,0.227724,-12.478889,0.0,0.285195,-9.964201,0.0
BETA_HE,-0.989065,0.152939,-6.467064,9.992518e-11,0.162208,-6.097524,1.077238e-09
BETA_SM_COST,-3.203048,0.182669,-17.534725,0.0,0.353632,-9.05757,0.0
BETA_TIME,-3.413484,0.154682,-22.06769,0.0,0.54588,-6.253176,4.021896e-10
BETA_TRAIN_COST,-5.210875,0.311331,-16.737397,0.0,0.933147,-5.584196,2.347836e-08
SIGMA_PANEL_CAR,3.998267,0.222662,17.956648,0.0,0.500403,7.990097,1.332268e-15
SIGMA_PANEL_SM,0.774382,0.244444,3.167924,0.001535316,0.21191,3.654296,0.0002578888
SIGMA_PANEL_TRAIN,3.164597,0.20874,15.160508,0.0,0.300256,10.539652,0.0


We observe that the three parameters associated with the agent effects are significantly different from zero. Note that the sign of these parameters is irrelevant, as each of them multiplies a draw from a normal distribution, which is symmetric around zero.