In [1]:
import pandas as pd
import pickle
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.results as res
import biogeme.models as models
from biogeme.expressions import Beta, log, bioDraws, MonteCarlo

# The data

In [2]:
# Read the raw data file into a pandas DataFrame
# (pd.read_table parses tab-separated text by default).
data = pd.read_table('airline.dat')

In [3]:
# Wrap the DataFrame in a Biogeme database named 'airline'.
database = db.Database('airline',data)
# Publish every entry of database.variables as a global name, so that the
# columns can be referenced directly in the cells below
# (e.g. ArrivalTimeHours_1 on the next line).
# NOTE(review): this overwrites any existing global that has the same name.
globals().update(database.variables)
# Remove the observations for which ArrivalTimeHours_1 equals -1
# (presumably a missing-value code -- TODO confirm). The return value of
# remove() is not used, so the database is presumably modified in place.
exclude = (ArrivalTimeHours_1 == -1)
database.remove(exclude)

# The model 

Variables

In [4]:
# Id of the chosen alternative, combined from the three BestAlternative_*
# columns (presumably 0/1 indicators -- TODO confirm).
chosenAlternative = (
    BestAlternative_1 * 1 + BestAlternative_2 * 2 + BestAlternative_3 * 3
)

# Indicators derived from question q11: which of the two times matters to
# the respondent (code 1: departure, code 2: arrival).
DepartureTimeSensitive = q11_DepartureOrArrivalIsImportant == 1
ArrivalTimeSensitive = q11_DepartureOrArrivalIsImportant == 2
# Indicator that q11 is neither 1 nor 2 (not used in the cells visible here).
Missing = (
    (q11_DepartureOrArrivalIsImportant != 1) *
    (q11_DepartureOrArrivalIsImportant != 2)
)

# Times desired by the respondent.
DesiredDepartureTime = q12_IdealDepTime
DesiredArrivalTime = q13_IdealArrTime


def _scheduled_delay(departure_mins, arrival_mins):
    """Gap between the offered and the desired time of one alternative.

    The departure gap is used for departure-sensitive respondents and the
    arrival gap for arrival-sensitive ones; both indicators are zero
    otherwise, which yields a delay of zero.
    """
    return (
        DepartureTimeSensitive *
        (departure_mins - DesiredDepartureTime) +
        ArrivalTimeSensitive *
        (arrival_mins - DesiredArrivalTime)
    )


def _early_part(delay):
    """Magnitude of a negative delay divided by 60, zero otherwise.

    The division by 60 presumably converts minutes to hours, given the
    *Mins column names -- TODO confirm.
    """
    return -delay * (delay < 0) / 60


def _late_part(delay):
    """Positive delay divided by 60, zero otherwise (same scaling as above)."""
    return delay * (delay > 0) / 60


# Signed schedule delay of each alternative.
ScheduledDelay_1 = _scheduled_delay(DepartureTimeMins_1, ArrivalTimeMins_1)
ScheduledDelay_2 = _scheduled_delay(DepartureTimeMins_2, ArrivalTimeMins_2)
ScheduledDelay_3 = _scheduled_delay(DepartureTimeMins_3, ArrivalTimeMins_3)

# Early component of the schedule delay.
Opt1_SchedDelayEarly = _early_part(ScheduledDelay_1)
Opt2_SchedDelayEarly = _early_part(ScheduledDelay_2)
Opt3_SchedDelayEarly = _early_part(ScheduledDelay_3)

# Late component of the schedule delay.
Opt1_SchedDelayLate = _late_part(ScheduledDelay_1)
Opt2_SchedDelayLate = _late_part(ScheduledDelay_2)
Opt3_SchedDelayLate = _late_part(ScheduledDelay_3)

Parameters

In [5]:
# Coefficients of the utility functions. All are created with the same
# positional arguments (0, None, None, 0): presumably starting value 0, no
# lower bound, no upper bound, and a status flag of 0 meaning "to be
# estimated" -- TODO confirm against the Biogeme documentation of Beta.

# Constants of alternatives 2 and 3 (the utility of alternative 1 has none).
Constant2 = Beta('Constant2', 0, None, None, 0)
Constant3 = Beta('Constant3', 0, None, None, 0)
# Coefficients shared by the three alternatives.
Fare = Beta('Fare', 0, None, None, 0)
Legroom = Beta('Legroom', 0, None, None, 0)
SchedDE = Beta('SchedDE', 0, None, None, 0)
SchedDL = Beta('SchedDL', 0, None, None, 0)
# One trip-time coefficient per alternative.
Total_TT1 = Beta('Total_TT1', 0, None, None, 0)
Total_TT2 = Beta('Total_TT2', 0, None, None, 0)
Total_TT3 = Beta('Total_TT3', 0, None, None, 0)

Error components

In [6]:
# One error component per alternative: a scale parameter SIGMA_i multiplied
# by a set of draws declared with type 'NORMAL'. The second argument of Beta
# (1 here) is presumably the starting value -- TODO confirm.
SIGMA_1 = Beta('SIGMA_1', 1, None, None, 0)
EC_1 = SIGMA_1 * bioDraws('EC_1', 'NORMAL')
SIGMA_2 = Beta('SIGMA_2', 1, None, None, 0)
EC_2 = SIGMA_2 * bioDraws('EC_2', 'NORMAL')
SIGMA_3 = Beta('SIGMA_3', 1, None, None, 0)
EC_3 = SIGMA_3 * bioDraws('EC_3', 'NORMAL')

Utility functions. The argument `normalization` is the id of the alternative whose error component is left out of the utility, which is equivalent to normalizing its scale parameter $\sigma$ to zero. 

In [7]:
def utility(normalization):
    """Return the utility expressions of the three alternatives.

    normalization: id (1, 2 or 3) of the alternative whose error component
        is left out of its utility. With any other value (e.g. None), all
        three error components are included.

    Returns a dict mapping each alternative id to its utility expression.
    """
    # Part of each utility without the error component; alternative 1
    # carries no constant.
    systematic = {
        1: (Fare * Fare_1 +
            Legroom * Legroom_1 +
            SchedDE * Opt1_SchedDelayEarly +
            SchedDL * Opt1_SchedDelayLate +
            Total_TT1 * TripTimeHours_1),
        2: (Constant2 +
            Fare * Fare_2 +
            Legroom * Legroom_2 +
            SchedDE * Opt2_SchedDelayEarly +
            SchedDL * Opt2_SchedDelayLate +
            Total_TT2 * TripTimeHours_2),
        3: (Constant3 +
            Fare * Fare_3 +
            Legroom * Legroom_3 +
            SchedDE * Opt3_SchedDelayEarly +
            SchedDL * Opt3_SchedDelayLate +
            Total_TT3 * TripTimeHours_3),
    }
    error_components = {1: EC_1, 2: EC_2, 3: EC_3}
    # Add the error component to every alternative except the normalized one.
    return {
        alt: (value + error_components[alt] if normalization != alt else value)
        for alt, value in systematic.items()
    }

# Estimation without normalization

For the sake of saving estimation time, we first estimate the model using only 100 draws to illustrate the process. Afterwards, we use pre-calculated results using 10000 draws.

In [8]:
# Logit expression built on the utilities with no alternative normalized:
# utility(None) keeps all three error components. The second argument (None)
# is presumably the availability specification -- TODO confirm.
prob = models.logit(utility(None), None, chosenAlternative)
# Log of the Monte-Carlo expression wrapping the logit expression.
logprob = log(MonteCarlo(prob))
# Only 100 draws here, to keep the estimation time short.
# NOTE(review): no random seed is set in the cells visible here, so the
# estimates may not be exactly reproducible from run to run -- confirm.
biogeme  = bio.BIOGEME(database, logprob, numberOfDraws=100)

In [9]:
%%time
# Run the estimation with 100 draws; the cell magic reports its run time.
results_100draws = biogeme.estimate()

CPU times: user 11min 26s, sys: 2.52 s, total: 11min 29s
Wall time: 46.7 s


In [10]:
# Final log likelihood of the 100-draw estimation: element 0 of the
# 'Final log likelihood' entry of the general statistics.
general_100draws = results_100draws.getGeneralStatistics()
LL_100draws = general_100draws['Final log likelihood'][0]
LL_100draws

-2299.363434912883

In [11]:
# Table of the parameters estimated with 100 draws.
betas_100draws = results_100draws.getEstimatedParameters()
betas_100draws

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
Constant2,-2.255699,0.318151,-7.090035,1.340705e-12,0.329155,-6.853003,7.231549e-12
Constant3,-2.521511,0.334979,-7.527374,5.173639e-14,0.356123,-7.08045,1.436851e-12
Fare,-0.028892,0.002047,-14.117188,0.0,0.002445,-11.818815,0.0
Legroom,0.313564,0.038756,8.090722,6.661338e-16,0.040791,7.687143,1.509903e-14
SIGMA_1,2.406643,0.312902,7.691367,1.465494e-14,0.38879,6.190079,6.013416e-10
SIGMA_2,0.149366,0.356873,0.41854,0.6755522,0.279686,0.534048,0.5933083
SIGMA_3,-0.231739,0.423809,-0.5468,0.5845159,0.364353,-0.636028,0.5247585
SchedDE,-0.210243,0.025978,-8.093192,6.661338e-16,0.02865,-7.338315,2.162714e-13
SchedDL,-0.138759,0.019017,-7.296501,2.953193e-13,0.019715,-7.038263,1.946443e-12
Total_TT1,-0.503314,0.10414,-4.833064,1.344477e-06,0.106551,-4.723685,2.316096e-06


We now load the estimation results from file

In [12]:
# Load the pre-calculated estimation results of the unnormalized model.
# NOTE(review): unpickling can execute arbitrary code; only load pickle
# files that come from a trusted source.
results = res.bioResults(pickleFile='02asv.pickle')

In [13]:
# General statistics of the loaded results.
general = results.getGeneralStatistics()

Check the number of draws

In [14]:
# Element 0 of the 'Number of draws' entry of the loaded statistics.
general['Number of draws'][0]

10000

Final log likelihood

In [15]:
# Final log likelihood of the loaded (unnormalized) results.
LL = general['Final log likelihood'][0]
LL

-2294.2515456698898

Estimated parameters

In [16]:
# Table of the estimated parameters of the loaded (unnormalized) results.
betas = results.getEstimatedParameters()
betas

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
Constant2,-2.344128,0.329602,-7.111989,1.143752e-12,0.326193,-7.186327,6.654677e-13
Constant3,-2.615196,0.380049,-6.881209,5.934586e-12,0.357343,-7.318457,2.509104e-13
Fare,-0.029853,0.002412,-12.376691,0.0,0.002419,-12.341563,0.0
Legroom,0.31845,0.042213,7.543928,4.551914e-14,0.042414,7.508044,5.995204e-14
SIGMA_1,2.562056,0.324017,7.907154,2.664535e-15,0.346842,7.386817,1.503242e-13
SIGMA_2,0.001777,0.607809,0.002923,0.9976676,0.087294,0.020354,0.9837613
SIGMA_3,0.156254,1.50539,0.103796,0.9173312,1.049248,0.14892,0.8816171
SchedDE,-0.217721,0.028056,-7.760199,8.437695e-15,0.029267,-7.439081,1.014744e-13
SchedDL,-0.143123,0.020728,-6.904671,5.031975e-12,0.02062,-6.940881,3.896661e-12
Total_TT1,-0.52893,0.10973,-4.820296,1.433456e-06,0.109138,-4.846438,1.256981e-06


From the values of the $\sigma$ parameters, we see that $\sigma_2^2$ is smaller than $\sigma_1^2$ and $\sigma_3^2$. Therefore, the correct normalization consists in setting $\sigma_2 = 0$. Note that this conclusion could not be reached with the results obtained with 100 draws. 

We extract the sigmas

In [17]:
# Estimated scale parameters of the unnormalized model, read from the
# 'Value' column of the parameter table.
sigma1 = betas.loc['SIGMA_1', 'Value']
sigma2 = betas.loc['SIGMA_2', 'Value']
sigma3 = betas.loc['SIGMA_3', 'Value']

Relevant quantities from the variance-covariance matrix: \begin{align}s_3 &= \sigma_1^2 + \sigma_2^2 \\ s_2 &= \sigma_1^2 + \sigma_3^2 \\ s_1 &= \sigma_2^2 + \sigma_3^2\end{align}

In [18]:
# s3: sum of the squared scales of alternatives 1 and 2.
s3 = sigma1**2 + sigma2**2

In [19]:
# s2: sum of the squared scales of alternatives 1 and 3.
s2 = sigma1**2 + sigma3**2

In [20]:
# s1: sum of the squared scales of alternatives 2 and 3.
s1 = sigma2**2 + sigma3**2

In [21]:
# Reference tuple (s1, s2, s3) of the unnormalized model, in the same order
# as the tuple returned by relevantQuantities.
relevant = s1, s2, s3

Results with normalization

In [22]:
# Pre-calculated results of the three normalized models; judging by the
# values extracted further down, position i corresponds to sigma_(i+1) = 0.
# NOTE(review): unpickling can execute arbitrary code; only load pickle
# files that come from a trusted source.
files = ['03asv_1.pickle', '04asv_2.pickle', '05asv_3.pickle']
normalized_results = [res.bioResults(pickleFile=f) for f in files]

We extract the final log likelihood and the three scale parameters

In [23]:
def extract(r):
    """Return the final log likelihood and the three scale parameters.

    r: estimation results object providing getGeneralStatistics() and
        getEstimatedParameters().

    Returns the tuple (LL, sigma1, sigma2, sigma3). A scale parameter that
    is absent from the table of estimates (the normalized one) is reported
    as 0.
    """
    LL = r.getGeneralStatistics()['Final log likelihood'][0]
    # Fetch the parameter table once instead of once per parameter.
    estimates = r.getEstimatedParameters()
    sigmas = []
    for name in ('SIGMA_1', 'SIGMA_2', 'SIGMA_3'):
        try:
            sigmas.append(estimates.loc[name, 'Value'])
        except KeyError:
            # Parameter not estimated in this model: its scale is zero.
            sigmas.append(0)
    return (LL, *sigmas)
        

In [24]:
# One tuple (log likelihood, sigma1, sigma2, sigma3) per normalized model.
allresults = [extract(r) for r in normalized_results]

In [25]:
# Display the extracted tuples.
allresults

[(-2320.4361905355304, 0, 0.005253134534250563, 0.2978510108887043),
 (-2294.972049849141, 2.559972474696234, 0, 0.2679546413379093),
 (-2294.308619390231, 2.5465965295475197, 0.043493143419558705, 0)]

In [26]:
def relevantQuantities(normalization):
    """Return (s1, s2, s3) for the model normalized on one alternative.

    normalization: id of the alternative whose scale is set to zero,
        starting at 1; it selects entry normalization - 1 of allresults.

    Returns the tuple (s1, s2, s3), where each s_i is the sum of the squared
    scale parameters of the two alternatives other than i.

    Raises IndexError if normalization is outside 1..len(allresults). Values
    below 1 are rejected explicitly, because negative indexing would
    otherwise silently return the quantities of another model.
    """
    if not 1 <= normalization <= len(allresults):
        raise IndexError(
            f'normalization must be between 1 and {len(allresults)}, '
            f'got {normalization}'
        )
    # Entry layout: (log likelihood, sigma1, sigma2, sigma3).
    row = allresults[normalization - 1]
    sigma1, sigma2, sigma3 = row[1], row[2], row[3]
    s1 = sigma2**2 + sigma3**2
    s2 = sigma1**2 + sigma3**2
    s3 = sigma1**2 + sigma2**2
    return s1, s2, s3

We now compare the relevant quantities of the variance-covariance matrix

Correct normalization: $\sigma_2 = 0$.

In [27]:
# Quantities (s1, s2, s3) of the model normalized with sigma_2 = 0.
relevantQuantities(2)

(0.07179968981452761, 6.625258761016888, 6.553459071202361)

Same quantities from the unnormalized model

In [28]:
# Quantities (s1, s2, s3) computed from the unnormalized estimates.
relevant

(0.02441832171963788, 6.588544393247788, 6.564132385224736)

Difference

In [29]:
# Element-wise difference: normalized (sigma_2 = 0) minus unnormalized.
tuple(a - b for a, b in zip(relevantQuantities(2), relevant))

(0.04738136809488973, 0.03671436776910042, -0.010673314022375813)

The differences are due to simulation errors.

Incorrect normalization: $\sigma_1 = 0$

In [30]:
# Quantities (s1, s2, s3) of the model normalized with sigma_1 = 0.
relevantQuantities(1)

(0.08874282010985798, 0.08871522468742304, 2.7595422434935877e-05)

Difference with unnormalized model

In [31]:
# Element-wise difference: normalized (sigma_1 = 0) minus unnormalized.
tuple(a - b for a, b in zip(relevantQuantities(1), relevant))

(0.0643244983902201, -6.499829168560365, -6.564104789802301)

The order of magnitude of the differences is higher than for the correct normalization, and cannot be attributed to simulation error.

Incorrect normalization: $\sigma_3=0$

In [32]:
# Quantities (s1, s2, s3) of the model normalized with sigma_3 = 0.
relevantQuantities(3)

(0.0018916535245143027, 6.485153884303472, 6.487045537827986)

Difference with unnormalized model

In [33]:
# Element-wise difference: normalized (sigma_3 = 0) minus unnormalized.
tuple(a - b for a, b in zip(relevantQuantities(3), relevant))

(-0.022526668195123577, -0.10339050894431612, -0.07708684739675054)

In this case, the differences are smaller, and it is difficult to say whether they are due to simulation error or to the normalization. 