# The cross-nested logit model

## Specification of the utility functions

In [1]:
import pandas as pd
import pickle
from urllib.request import urlopen
import biogeme.biogeme as bio
import biogeme.database as db
import biogeme.models as models
import biogeme.optimization as opt
import biogeme.version as ver
import biogeme.results as res
from biogeme.expressions import Beta, log, DefineVariable
from scipy.stats import chi2

Version of Biogeme

In [2]:
# Print the version banner of the installed Biogeme package, so that the
# recorded outputs below can be tied to a specific release.
print(ver.getText())

biogeme 3.2.8 [2021-09-01]
Version entirely written in Python
Home page: http://biogeme.epfl.ch
Submit questions to https://groups.google.com/d/forum/biogeme
Michel Bierlaire, Transport and Mobility Laboratory, Ecole Polytechnique Fédérale de Lausanne (EPFL)



The estimation of the models may take some time. If you prefer to read the estimation results from a file, set the following variable to `True`

In [3]:
# Switch read by every estimation cell below: when True, the pickled
# estimation results are downloaded instead of re-estimating the model;
# when False, each model is estimated from the data.
read_results_from_file = True

In [4]:
# Common URL prefix of every remote asset used by this notebook (the data
# file and the pickled estimation results). File names are appended to it.
url_root = (
    'https://courses.edx.org/'
    'asset-v1:EPFLx+ChoiceModels2x+3T2021+type@asset+block@'
)

In [5]:
# Download the tab-separated airline data set and wrap it in a Biogeme
# database named 'airline'.
df = pd.read_csv(f'{url_root}airline.dat', sep='\t')
database = db.Database('airline', df)

# Inject every database variable into the notebook namespace, so that the
# column names (e.g. ArrivalTimeHours_1) can be used below as Python
# variables.
# NOTE(review): this silently overwrites any existing global that has the
# same name as a column.
globals().update(database.variables)

# Remove the observations for which ArrivalTimeHours_1 equals -1.
# NOTE(review): presumably -1 codes a missing value -- confirm against the
# description of the data set.
exclude = ArrivalTimeHours_1 == -1
database.remove(exclude)

# Definition of new variables

# Index (1, 2 or 3) of the chosen alternative, obtained by weighting each
# BestAlternative_i column by its alternative number.
# NOTE(review): assumes the BestAlternative_i columns are 0/1 indicators
# with exactly one of them equal to 1 per observation -- confirm.
chosenAlternative = (
    (BestAlternative_1 * 1) +
    (BestAlternative_2 * 2) +
    (BestAlternative_3 * 3)
)

# Indicators derived from question q11: 1 when the answer is 1 (departure
# time matters), respectively 2 (arrival time matters). Each DefineVariable
# call also registers the new variable in the database under the given name.
DepartureTimeSensitive = DefineVariable(
    'DepartureTimeSensitive',
    q11_DepartureOrArrivalIsImportant == 1,
    database
)
ArrivalTimeSensitive = DefineVariable(
    'ArrivalTimeSensitive',
    q11_DepartureOrArrivalIsImportant == 2,
    database
)
# Indicator of any other answer to q11 (neither 1 nor 2). It is not used in
# the utility functions of this notebook.
Missing = DefineVariable(
    'Missing',
    (q11_DepartureOrArrivalIsImportant != 1)
    * (q11_DepartureOrArrivalIsImportant != 2),
    database,
)

# Desired departure and arrival times, copied from questions q12 and q13.
# NOTE(review): presumably expressed in minutes, like the *TimeMins_i
# columns they are compared with below -- confirm.
DesiredDepartureTime = DefineVariable(
    'DesiredDepartureTime',
    q12_IdealDepTime,
    database
)
DesiredArrivalTime = DefineVariable(
    'DesiredArrivalTime',
    q13_IdealArrTime,
    database
)
# Signed schedule delay of each alternative: actual minus desired time,
# measured on the departure time for departure-sensitive respondents and on
# the arrival time for arrival-sensitive ones. It is 0 when neither
# indicator is 1.
ScheduledDelay_1 = DefineVariable(
    'ScheduledDelay_1',
    (DepartureTimeSensitive * (DepartureTimeMins_1 - DesiredDepartureTime))
    + (ArrivalTimeSensitive * (ArrivalTimeMins_1 - DesiredArrivalTime)),
    database,
)

ScheduledDelay_2 = DefineVariable(
    'ScheduledDelay_2',
    (DepartureTimeSensitive * (DepartureTimeMins_2 - DesiredDepartureTime))
    + (ArrivalTimeSensitive * (ArrivalTimeMins_2 - DesiredArrivalTime)),
    database,
)

ScheduledDelay_3 = DefineVariable(
    'ScheduledDelay_3',
    (DepartureTimeSensitive * (DepartureTimeMins_3 - DesiredDepartureTime))
    + (ArrivalTimeSensitive * (ArrivalTimeMins_3 - DesiredArrivalTime)),
    database,
)

# "Early" part of the delay: magnitude of a negative delay, 0 otherwise.
# The division by 60 presumably converts minutes into hours -- confirm the
# unit of the underlying columns.
Opt1_SchedDelayEarly = DefineVariable(
    'Opt1_SchedDelayEarly',
    (-(ScheduledDelay_1) * (ScheduledDelay_1 < 0)) / 60,
    database,
)
Opt2_SchedDelayEarly = DefineVariable(
    'Opt2_SchedDelayEarly',
    (-(ScheduledDelay_2) * (ScheduledDelay_2 < 0)) / 60,
    database,
)
Opt3_SchedDelayEarly = DefineVariable(
    'Opt3_SchedDelayEarly',
    (-(ScheduledDelay_3) * (ScheduledDelay_3 < 0)) / 60,
    database,
)

# "Late" part of the delay: value of a positive delay, 0 otherwise, with the
# same division by 60.
Opt1_SchedDelayLate = DefineVariable(
    'Opt1_SchedDelayLate',
    (ScheduledDelay_1 * (ScheduledDelay_1 > 0)) / 60,
    database,
)
Opt2_SchedDelayLate = DefineVariable(
    'Opt2_SchedDelayLate',
    (ScheduledDelay_2 * (ScheduledDelay_2 > 0)) / 60,
    database,
)
Opt3_SchedDelayLate = DefineVariable(
    'Opt3_SchedDelayLate',
    (ScheduledDelay_3 * (ScheduledDelay_3 > 0)) / 60,
    database,
)

# Trip purpose indicators, one per value of q02_TripPurpose:
# 1 -> business, 2 -> leisure, 3 -> conf, 4 -> business_leisure,
# 0 -> unknown. They interact with the fare coefficient below.
purpose_business = DefineVariable(
    'purpose_business', q02_TripPurpose == 1, database
)

purpose_leisure = DefineVariable(
    'purpose_leisure', q02_TripPurpose == 2, database
)

purpose_conf = DefineVariable('purpose_conf', q02_TripPurpose == 3, database)

purpose_business_leisure = DefineVariable(
    'purpose_business_leisure', q02_TripPurpose == 4, database
)

purpose_unknown = DefineVariable(
    'purpose_unknown', q02_TripPurpose == 0, database
)

# Parameters to be estimated
# Beta arguments, per the Biogeme API: name, starting value, lower bound,
# upper bound, status (0 = estimated). All parameters below start at 0 and
# are unbounded.

# Alternative specific constants of alternatives 2 and 3 (alternative 1 has
# no constant in its utility function below).
ASC_SAME = Beta('ASC_SAME', 0, None, None, 0)
ASC_MULTIPLE = Beta('ASC_MULTIPLE', 0, None, None, 0)

# One fare coefficient per trip purpose.
FARE_business = Beta('FARE_business', 0, None, None, 0)
FARE_leisure = Beta('FARE_leisure', 0, None, None, 0)
FARE_conf = Beta('FARE_conf', 0, None, None, 0)
FARE_business_leisure = Beta('FARE_business_leisure', 0, None, None, 0)
FARE_unknown = Beta('FARE_unknown', 0, None, None, 0)

# Fare coefficient actually applied to an observation: each purpose
# indicator selects the coefficient of the corresponding trip purpose.
FARE = (
    FARE_business * purpose_business
    + FARE_leisure * purpose_leisure
    + FARE_conf * purpose_conf
    + FARE_business_leisure * purpose_business_leisure
    + FARE_unknown * purpose_unknown
)

# Generic coefficients shared by the three alternatives.
LEGROOM = Beta('LEGROOM', 0, None, None, 0)
EARLY = Beta('EARLY', 0, None, None, 0)
LATE = Beta('LATE', 0, None, None, 0)
TIME = Beta('TIME', 0, None, None, 0)


# Definition of the utility functions
# Every alternative uses the same generic coefficients applied to the square
# root of the fare, the logarithm of the legroom, the early and late schedule
# delays, and the square root of the trip time. Note that ** binds tighter
# than *, so each coefficient multiplies the transformed attribute.
# Alternative 1 has no constant; it is the reference alternative.
V1 = (
    FARE * Fare_1 ** 0.5
    + LEGROOM * log(Legroom_1)
    + EARLY * Opt1_SchedDelayEarly
    + LATE * Opt1_SchedDelayLate
    + TIME * TripTimeHours_1 ** 0.5
)

V2 = (
    ASC_SAME
    + FARE * Fare_2 ** 0.5
    + LEGROOM * log(Legroom_2)
    + EARLY * Opt2_SchedDelayEarly
    + LATE * Opt2_SchedDelayLate
    + TIME * TripTimeHours_2 ** 0.5
)

V3 = (
    ASC_MULTIPLE
    + FARE * Fare_3 ** 0.5
    + LEGROOM * log(Legroom_3)
    + EARLY * Opt3_SchedDelayEarly
    + LATE * Opt3_SchedDelayLate
    + TIME * TripTimeHours_3 ** 0.5
)


# Associate utility functions with the numbering of alternatives
# (the keys match the values taken by chosenAlternative).
V = {1: V1, 2: V2, 3: V3}


## Logit model

In [6]:
# Log of the logit choice probability of the chosen alternative. None is
# passed as the availability argument.
logprob = models.loglogit(V, None, chosenAlternative)
if read_results_from_file:
    # Download the pickled estimation results instead of re-estimating.
    # NOTE(review): pickle.load executes arbitrary code embedded in the file.
    # Only keep this shortcut while url_root points to a trusted server.
    pickle_file = f'{url_root}logit16b.pickle'
    with urlopen(pickle_file) as p:
        data = pickle.load(p)
    logit_results = res.bioResults(data)
else:
    # The BIOGEME object is only needed for an actual estimation, so it is
    # created in this branch, like in the other estimation cells of the
    # notebook.
    biogeme = bio.BIOGEME(database, logprob)
    biogeme.modelName = 'logit'
    logit_results = biogeme.estimate(algorithm=opt.bioNewton)

In [7]:
# Print the general statistics of the logit model (number of parameters,
# sample size, final log likelihood, information criteria).
print(logit_results.shortSummary())

Results for model logit
Nbr of parameters:		11
Sample size:			3609
Excluded data:			0
Final log likelihood:		-2240.898
Akaike Information Criterion:	4503.795
Bayesian Information Criterion:	4571.898



In [8]:
# Table of estimated parameters of the logit model, indexed by parameter
# name. The bare last expression displays it.
logit_table = logit_results.getEstimatedParameters()
logit_table

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ASC_MULTIPLE,-1.501675,0.1249,-12.022971,0.0,0.123471,-12.162153,0.0
ASC_SAME,-1.290909,0.124351,-10.38116,0.0,0.124518,-10.367286,0.0
EARLY,-0.145136,0.016,-9.071224,0.0,0.017022,-8.526447,0.0
FARE_business,-0.500072,0.057478,-8.700167,0.0,0.057607,-8.680717,0.0
FARE_business_leisure,-0.768413,0.081292,-9.452525,0.0,0.083804,-9.169114,0.0
FARE_conf,-1.061178,0.181203,-5.85628,4.7335e-09,0.190717,-5.564154,2.634273e-08
FARE_leisure,-0.991822,0.035474,-27.959228,0.0,0.039596,-25.04857,0.0
FARE_unknown,-1.742987,0.973286,-1.790828,0.07332091,0.800198,-2.178196,0.02939147
LATE,-0.107473,0.01301,-8.261048,2.220446e-16,0.014046,-7.651616,1.976197e-14
LEGROOM,0.495089,0.054564,9.073475,0.0,0.058027,8.532109,0.0


## Nested logit

There are three possibilities to partition the choice set:

- [Non stop, One stop-same airline] and [One stop-multiple airlines],
- [Non stop] and [One stop-same airline, One stop-multiple airlines],
- [Non stop, One stop-multiple airlines] and [One stop-same airline].

The first groups the alternatives corresponding to the same airline. The second groups the alternatives with one stop. Since the third one is less intuitive, we consider only the first two specifications.
 

### Nested logit: one stop

We specify a nested logit model where the two alternatives corresponding to "one stop" are in the same nest

In [9]:
# Nest parameter of the "one stop" nest: starts at 1, no bounds, estimated.
MU = Beta('MU', 1, None, None, 0)

# Each nest is a pair (nest parameter, list of alternatives). Alternative 1
# is alone in a nest whose parameter is fixed to 1.0; alternatives 2 and 3
# share the nest governed by MU.
nonstop = (1.0, [1])
onestop = (MU, [2, 3])
nests = (nonstop, onestop)
logprob = models.lognested(V, None, nests, chosenAlternative)

if not read_results_from_file:
    # Estimate the model from the data.
    biogeme = bio.BIOGEME(database, logprob)
    biogeme.modelName = 'nested_onestop'
    nested_onestop_results = biogeme.estimate(algorithm=opt.bioNewton)
else:
    # Download the pickled estimation results instead.
    # NOTE(review): pickle.load executes arbitrary code embedded in the file;
    # only use it with a trusted url_root.
    pickle_file = f'{url_root}nested_onestop16b.pickle'
    with urlopen(pickle_file) as p:
        data = pickle.load(p)
    nested_onestop_results = res.bioResults(data)

In [10]:
# Print the general statistics of the "one stop" nested logit model.
print(nested_onestop_results.shortSummary())

Results for model nested_onestop
Nbr of parameters:		12
Sample size:			3609
Excluded data:			0
Final log likelihood:		-2221.146
Akaike Information Criterion:	4466.293
Bayesian Information Criterion:	4540.587



In [11]:
# Table of estimated parameters of the "one stop" nested logit model,
# indexed by parameter name. The bare last expression displays it.
nested_onestop_table = nested_onestop_results.getEstimatedParameters()
nested_onestop_table

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ASC_MULTIPLE,-1.257888,0.109933,-11.442325,0.0,0.107684,-11.681344,0.0
ASC_SAME,-1.087574,0.108056,-10.064936,0.0,0.10712,-10.152897,0.0
EARLY,-0.129336,0.014151,-9.139677,0.0,0.014885,-8.689168,0.0
FARE_business,-0.441798,0.05098,-8.666121,0.0,0.049646,-8.898983,0.0
FARE_business_leisure,-0.691903,0.075362,-9.181093,0.0,0.076961,-8.99027,0.0
FARE_conf,-0.96476,0.17049,-5.658742,1.524867e-08,0.169072,-5.70621,1.155193e-08
FARE_leisure,-0.870038,0.038328,-22.699596,0.0,0.041581,-20.924117,0.0
FARE_unknown,-1.525166,0.92085,-1.656259,0.09766941,0.736409,-2.071087,0.0383507
LATE,-0.088153,0.011492,-7.670926,1.709743e-14,0.012388,-7.115916,1.111777e-12
LEGROOM,0.41998,0.04889,8.590295,0.0,0.052119,8.058023,6.661338e-16


The nest parameter is larger than one, which is consistent with the theory.

In [12]:
# Estimated value of the nest parameter MU of the "one stop" nested logit.
mu = nested_onestop_table.loc['MU', 'Value']
mu

1.6002251614912375

If we test the null hypothesis that the true value of MU is 1, we use a $t$-test:

In [13]:
# t-statistic for the null hypothesis MU = 1, based on the robust standard
# error. Sign convention: (tested value - estimate), so an estimate above 1
# gives a negative statistic; only its absolute value matters for a
# two-sided test.
mu_stderr = nested_onestop_table.loc['MU', 'Rob. Std err']
tested_value = 1
ttest = (tested_value - mu) / mu_stderr
ttest

-4.625747500955246

Therefore, we can reject the null hypothesis at the 5% level. It means that we reject logit.

We can also test the null hypothesis that the two models are equivalent using a likelihood ratio test: 

In [14]:
# Likelihood ratio statistic: twice the gain in final log likelihood of the
# unrestricted model (nested logit, one stop) over the restricted one (logit).
LL_nested_onestop = nested_onestop_results.data.logLike
LL_logit = logit_results.data.logLike
LR = 2 * (LL_nested_onestop - LL_logit)
LR

39.5027841566216

Number of degrees of freedom:

In [15]:
# Degrees of freedom of the test: difference between the numbers of
# estimated parameters of the nested logit and of the logit model.
dof = nested_onestop_results.data.nparam - logit_results.data.nparam
dof

1

The threshold value of the $\chi^2$ test at the 5% level is:

In [16]:
# Critical value of the test: inverse survival function of the chi-square
# distribution with dof degrees of freedom, i.e. the value exceeded with
# probability 0.05.
chi2.isf(0.05, dof)

3.8414588206941285

Therefore, the null hypothesis can be rejected, and the nested logit model is preferred. 

### Nested logit: same airline

We specify a nested logit model where the two alternatives corresponding to "same airline" are in the same nest.

In [17]:
# Nest parameter of the "same airline" nest: starts at 1, no bounds,
# estimated.
MU = Beta('MU', 1, None, None, 0)

# Each nest is a pair (nest parameter, list of alternatives). Alternatives 1
# and 2 share the nest governed by MU; alternative 3 is alone in a nest whose
# parameter is fixed to 1.0.
multiple = (1.0, [3])
same = (MU, [1, 2])
nests = (same, multiple)
logprob = models.lognested(V, None, nests, chosenAlternative)

if not read_results_from_file:
    # Estimate the model from the data.
    biogeme = bio.BIOGEME(database, logprob)
    biogeme.modelName = 'nested_same'
    nested_same_results = biogeme.estimate(algorithm=opt.bioNewton)
else:
    # Download the pickled estimation results instead.
    # NOTE(review): pickle.load executes arbitrary code embedded in the file;
    # only use it with a trusted url_root.
    pickle_file = f'{url_root}nested_same16b.pickle'
    with urlopen(pickle_file) as p:
        data = pickle.load(p)
    nested_same_results = res.bioResults(data)

In [18]:
# Print the general statistics of the unconstrained "same airline" nested
# logit model.
print(nested_same_results.shortSummary())

Results for model nested_same
Nbr of parameters:		12
Sample size:			3609
Excluded data:			0
Final log likelihood:		-2239.591
Akaike Information Criterion:	4503.183
Bayesian Information Criterion:	4577.477



In [19]:
# Table of estimated parameters of the unconstrained "same airline" nested
# logit model, indexed by parameter name. The bare last expression displays
# it.
nested_same_table = nested_same_results.getEstimatedParameters()
nested_same_table

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ASC_MULTIPLE,-1.494931,0.130743,-11.434153,0.0,0.12978,-11.518927,0.0
ASC_SAME,-1.405625,0.15106,-9.305079,0.0,0.151512,-9.277309,0.0
EARLY,-0.154015,0.017739,-8.682418,0.0,0.018907,-8.145877,4.440892e-16
FARE_business,-0.528442,0.062934,-8.396797,0.0,0.062968,-8.392248,0.0
FARE_business_leisure,-0.81489,0.090209,-9.033311,0.0,0.093356,-8.728858,0.0
FARE_conf,-1.123253,0.194778,-5.766838,8.077261e-09,0.19785,-5.677293,1.36843e-08
FARE_leisure,-1.046398,0.049973,-20.939274,0.0,0.052825,-19.80874,0.0
FARE_unknown,-1.788409,1.034386,-1.728957,0.08381672,0.897125,-1.993489,0.04620794
LATE,-0.11287,0.014089,-8.011422,1.110223e-15,0.01503,-7.509832,5.928591e-14
LEGROOM,0.524989,0.060546,8.670972,0.0,0.063975,8.206165,2.220446e-16


The nest parameter is less than one, and the model is invalid.

In [20]:
# Estimated value of the nest parameter MU of the unconstrained
# "same airline" nested logit.
mu = nested_same_table.loc['MU', 'Value']
mu

0.9031669052663268

We re-estimate the model while constraining the nest parameter to take only valid values, that is, values greater than or equal to 1.

In [21]:
# Same nest parameter as before, now with a lower bound of 1 (third
# argument), so that only valid values can be estimated.
MU = Beta('MU', 1, 1, None, 0)

# Each nest is a pair (nest parameter, list of alternatives). Alternatives 1
# and 2 share the nest governed by MU; alternative 3 is alone in a nest whose
# parameter is fixed to 1.0.
multiple = (1.0, [3])
same = (MU, [1, 2])
nests = (same, multiple)
logprob = models.lognested(V, None, nests, chosenAlternative)

# Note: from here on, nested_same_results refers to the constrained model.
if not read_results_from_file:
    # Estimate the model from the data.
    biogeme = bio.BIOGEME(database, logprob)
    biogeme.modelName = 'nested_same_constrained'
    nested_same_results = biogeme.estimate(algorithm=opt.bioNewton)
else:
    # Download the pickled estimation results instead.
    # NOTE(review): pickle.load executes arbitrary code embedded in the file;
    # only use it with a trusted url_root.
    pickle_file = f'{url_root}nested_same_constrained16b.pickle'
    with urlopen(pickle_file) as p:
        data = pickle.load(p)
    nested_same_results = res.bioResults(data)

In [22]:
# Print the general statistics of the constrained "same airline" nested
# logit model.
print(nested_same_results.shortSummary())

Results for model nested_same_constrained
Nbr of parameters:		12
Sample size:			3609
Excluded data:			0
Final log likelihood:		-2240.898
Akaike Information Criterion:	4505.795
Bayesian Information Criterion:	4580.089



In [23]:
# Table of estimated parameters of the constrained "same airline" nested
# logit model. It replaces the table of the unconstrained model under the
# same name.
nested_same_table = nested_same_results.getEstimatedParameters()
nested_same_table

Unnamed: 0,Value,Active bound,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ASC_MULTIPLE,-1.501663,0.0,0.124962,-12.016921,0.0,0.123767,-12.132969,0.0
ASC_SAME,-1.290907,0.0,0.143745,-8.980557,0.0,0.146204,-8.829468,0.0
EARLY,-0.145137,0.0,0.017035,-8.519894,0.0,0.018212,-7.969505,1.554312e-15
FARE_business,-0.500041,0.0,0.060479,-8.268032,2.220446e-16,0.060808,-8.223247,2.220446e-16
FARE_business_leisure,-0.768375,0.0,0.086937,-8.838351,0.0,0.090859,-8.45681,0.0
FARE_conf,-1.061405,0.0,0.185874,-5.710357,1.127396e-08,0.193487,-5.485658,4.119337e-08
FARE_leisure,-0.991824,0.0,0.050792,-19.527322,0.0,0.054628,-18.155824,0.0
FARE_unknown,-1.748582,0.0,0.974559,-1.794229,0.07277669,0.800052,-2.185584,0.02884605
LATE,-0.107473,0.0,0.013503,-7.958942,1.776357e-15,0.01444,-7.442942,9.836576e-14
LEGROOM,0.495055,0.0,0.058056,8.527262,0.0,0.061703,8.023224,1.110223e-15


The nest parameter is exactly one, and the model is equivalent to the logit model.

In [24]:
# Estimated value of the nest parameter MU of the constrained
# "same airline" nested logit.
mu = nested_same_table.loc['MU', 'Value']
mu

1.0

## Cross-nested logit

We estimate a cross-nested logit model with two nests: one with the alternatives corresponding to "one stop", and one with the alternatives corresponding to "same airline". Note that alternative 2 belongs to both. We constrain the nest parameters to take only valid values.

In [25]:
# Nest parameters of the two nests: start at 1, lower bound 1, estimated.
MU_SAME = Beta('MU_SAME', 1, 1, None, 0)
MU_ONESTOP = Beta('MU_ONESTOP', 1, 1, None, 0)

# Degree of membership of alternative 2 to the "one stop" nest: starts at
# 0.5 and is bounded between 0 and 1. Its membership to the "same airline"
# nest is the complement, so that the two memberships sum to one.
ALPHA_ONESTOP = Beta('ALPHA_ONESTOP', 0.5, 0, 1, 0)
ALPHA_SAME = 1 - ALPHA_ONESTOP

# Membership of each alternative to each nest. Alternative 1 belongs only to
# "same airline", alternative 3 only to "one stop", alternative 2 to both.
alpha_same = {1: 1.0, 2: ALPHA_SAME, 3: 0.0}
alpha_onestop = {1: 0.0, 2: ALPHA_ONESTOP, 3: 1.0}

# Each nest is a pair (nest parameter, membership dictionary).
nest_onestop = (MU_ONESTOP, alpha_onestop)
nest_same = (MU_SAME, alpha_same)
nests = (nest_same, nest_onestop)

logprob = models.logcnl(V, None, nests, chosenAlternative)

if not read_results_from_file:
    # Estimate the model from the data.
    biogeme = bio.BIOGEME(database, logprob)
    biogeme.modelName = 'cnl'
    cnl_results = biogeme.estimate(algorithm=opt.bioNewton)
else:
    # Download the pickled estimation results instead.
    # NOTE(review): pickle.load executes arbitrary code embedded in the file;
    # only use it with a trusted url_root.
    pickle_file = f'{url_root}cnl16b.pickle'
    with urlopen(pickle_file) as p:
        data = pickle.load(p)
    cnl_results = res.bioResults(data)

In [26]:
# Print the general statistics of the cross-nested logit model.
print(cnl_results.shortSummary())

Results for model cnl
Nbr of parameters:		14
Sample size:			3609
Excluded data:			0
Final log likelihood:		-2219.47
Akaike Information Criterion:	4466.941
Bayesian Information Criterion:	4553.617



In [27]:
# Table of estimated parameters of the cross-nested logit model, indexed by
# parameter name. The bare last expression displays it.
cnl_table = cnl_results.getEstimatedParameters()
cnl_table

Unnamed: 0,Value,Active bound,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ALPHA_ONESTOP,0.753586,0.0,0.148678,5.068571,4.008141e-07,0.129215,5.83202,5.476021e-09
ASC_MULTIPLE,-1.272131,0.0,0.109733,-11.592914,0.0,0.106537,-11.940709,0.0
ASC_SAME,-1.077414,0.0,0.140819,-7.651036,1.998401e-14,0.14775,-7.292126,3.050893e-13
EARLY,-0.129156,0.0,0.015115,-8.544744,0.0,0.016081,-8.031806,8.881784e-16
FARE_business,-0.436846,0.0,0.053477,-8.168873,2.220446e-16,0.052386,-8.338936,0.0
FARE_business_leisure,-0.685015,0.0,0.081274,-8.428497,0.0,0.085448,-8.016748,1.110223e-15
FARE_conf,-0.965359,0.0,0.175928,-5.487238,4.08268e-08,0.17468,-5.526432,3.268096e-08
FARE_leisure,-0.865615,0.0,0.052757,-16.407647,0.0,0.057357,-15.091675,0.0
FARE_unknown,-1.504891,0.0,0.917015,-1.641075,0.1007818,0.72547,-2.074366,0.03804531
LATE,-0.086158,0.0,0.011905,-7.237016,4.587442e-13,0.012894,-6.682002,2.357003e-11


The $\mu$ parameter of each nest is greater than or equal to 1, as imposed by the bounds above.

In [28]:
# Estimated nest parameter of the "same airline" nest.
mu_same = cnl_table.loc['MU_SAME', 'Value']
mu_same

1.0

In [29]:
# Estimated nest parameter of the "one stop" nest.
mu_onestop = cnl_table.loc['MU_ONESTOP', 'Value']
mu_onestop

1.858714007881218

The $\alpha$ parameter is different from 0 and 1.

In [30]:
# Estimated membership of alternative 2 to the "one stop" nest.
alpha = cnl_table.loc['ALPHA_ONESTOP', 'Value']
alpha

0.7535855174193948

It is significantly different from 0.

In [31]:
# Robust t-statistic reported by Biogeme for ALPHA_ONESTOP, used here to
# test the null hypothesis that the parameter is 0.
cnl_table.loc['ALPHA_ONESTOP', 'Rob. t-test']

5.8320204246609535

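It is different from 1, but only marginally so: the $t$-statistic below (1.91) is smaller than the two-sided critical value at the 5% level (1.96), and larger than the one at the 10% level (1.645).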

In [32]:
# t-statistic for the null hypothesis ALPHA_ONESTOP = 1, based on the robust
# standard error. Sign convention: (tested value - estimate), so an estimate
# below 1 gives a positive statistic.
alpha_stderr = cnl_table.loc['ALPHA_ONESTOP', 'Rob. Std err']
tested_value = 1
ttest = (tested_value - alpha) / alpha_stderr
ttest

1.907008908907362

We test the null hypothesis that the model is equivalent to the nested logit model estimated before. 

In [33]:
# Likelihood ratio statistic: twice the gain in final log likelihood of the
# unrestricted model (cross-nested logit) over the restricted one (nested
# logit, one stop).
LL_cnl = cnl_results.data.logLike
LL_nested_onestop = nested_onestop_results.data.logLike
LR = 2 * (LL_cnl - LL_nested_onestop)
LR

3.3519699385424246

Number of degrees of freedom:

In [34]:
# Degrees of freedom of the test: difference between the numbers of
# estimated parameters of the cross-nested and of the nested logit model.
dof = cnl_results.data.nparam - nested_onestop_results.data.nparam 
dof

2

The threshold value of the $\chi^2$ test at the 5% level is:

In [35]:
# Critical value of the test: inverse survival function of the chi-square
# distribution with dof degrees of freedom, i.e. the value exceeded with
# probability 0.05.
chi2.isf(0.05, dof)

5.991464547107983

Therefore, the null hypothesis cannot be rejected, and the nested logit model is preferred. 