# The cross-nested logit model

## Specification of the utility functions

In [1]:
import pandas as pd
import biogeme.biogeme as bio
import biogeme.database as db
import biogeme.models as models
import biogeme.optimization as opt
from biogeme.expressions import Beta, log
from scipy.stats import chi2
import biogeme.version as ver

Version of Biogeme

In [2]:
# Print the version banner of the installed Biogeme package.
print(ver.getText())

biogeme 3.2.8 [2021-07-26]
Version entirely written in Python
Home page: http://biogeme.epfl.ch
Submit questions to https://groups.google.com/d/forum/biogeme
Michel Bierlaire, Transport and Mobility Laboratory, Ecole Polytechnique Fédérale de Lausanne (EPFL)



In [3]:
# Location of the course copy of the Swissmetro data file.  It is only
# used when no local copy is available (see below).
url = (
    'https://courses.edx.org/asset-v1:EPFLx+ChoiceModels2x+3T2021+type@asset+block@'
    'swissmetro.dat'
)

# Prefer a local 'swissmetro.dat' in the working directory; if there is
# none, read the tab-separated file from the URL instead, so that the
# notebook also runs from a fresh checkout.
try:
    df = pd.read_csv('swissmetro.dat', sep='\t')
except FileNotFoundError:
    df = pd.read_csv(url, sep='\t')
database = db.Database('swissmetro', df)

# The following statement makes every entry of database.variables
# available as a Python variable of the same name (CHOICE, AGE, GA, ...),
# so that it can be used directly in the expressions below.
globals().update(database.variables)

# Remove the observations for which CHOICE is 0.
exclude = CHOICE == 0
database.remove(exclude)

# Dummy variables for segmentation by age: one indicator per value of AGE
# (1 to 5).
age_00_24 = AGE == 1
age_25_39 = AGE == 2
age_40_54 = AGE == 3
age_55_65 = AGE == 4
age_65_plus = AGE == 5


# Dummy variables for segmentation by gender and by GA (GA == 0 -> noGA).
female = 1 - MALE
male = MALE
noGA = GA == 0

# Dummy variables for segmentation by the value of FIRST.
FIRST_CLASS = FIRST
SECOND_CLASS = FIRST == 0

# Parameters to be estimated.
# Beta arguments: name, starting value, lower bound, upper bound, status.
# Each segmented coefficient is the sum of one Beta per segment multiplied
# by the dummy variable of that segment.

# Alternative specific constants, segmented by gender
ASC_CAR_MALE = Beta('ASC_CAR_MALE', 0, None, None, 0)
ASC_CAR_FEMALE = Beta('ASC_CAR_FEMALE', 0, None, None, 0)
ASC_CAR = (
    ASC_CAR_MALE * male
    + ASC_CAR_FEMALE * female
)

ASC_TRAIN_MALE = Beta('ASC_TRAIN_MALE', 0, None, None, 0)
ASC_TRAIN_FEMALE = Beta('ASC_TRAIN_FEMALE', 0, None, None, 0)
ASC_TRAIN = (
    ASC_TRAIN_MALE * male
    + ASC_TRAIN_FEMALE * female
)

# Travel time coefficients (train and Swissmetro segmented by GA)
B_TIME_CAR = Beta('B_TIME_CAR', 0, None, None, 0)

B_TIME_TRAIN_GA = Beta('B_TIME_TRAIN_GA', 0, None, None, 0)
B_TIME_TRAIN_noGA = Beta('B_TIME_TRAIN_noGA', 0, None, None, 0)
B_TIME_TRAIN = (
    B_TIME_TRAIN_GA * GA
    + B_TIME_TRAIN_noGA * noGA
)

B_TIME_SM_GA = Beta('B_TIME_SM_GA', 0, None, None, 0)
B_TIME_SM_noGA = Beta('B_TIME_SM_noGA', 0, None, None, 0)
B_TIME_SM = (
    B_TIME_SM_GA * GA
    + B_TIME_SM_noGA * noGA
)

# Cost coefficients, segmented by FIRST_CLASS / SECOND_CLASS
B_COST_CAR_FIRST = Beta('B_COST_CAR_FIRST', 0, None, None, 0)
B_COST_CAR_SECOND = Beta('B_COST_CAR_SECOND', 0, None, None, 0)
B_COST_CAR = (
    B_COST_CAR_FIRST * FIRST_CLASS
    + B_COST_CAR_SECOND * SECOND_CLASS
)

B_COST_TRAIN_FIRST = Beta('B_COST_TRAIN_FIRST', 0, None, None, 0)
B_COST_TRAIN_SECOND = Beta('B_COST_TRAIN_SECOND', 0, None, None, 0)
B_COST_TRAIN = (
    B_COST_TRAIN_FIRST * FIRST_CLASS
    + B_COST_TRAIN_SECOND * SECOND_CLASS
)

B_COST_SM_FIRST = Beta('B_COST_SM_FIRST', 0, None, None, 0)
B_COST_SM_SECOND = Beta('B_COST_SM_SECOND', 0, None, None, 0)
B_COST_SM = (
    B_COST_SM_FIRST * FIRST_CLASS
    + B_COST_SM_SECOND * SECOND_CLASS
)

# Headway coefficients, segmented by age category
B_HEADWAY_TRAIN_00_24 = Beta('B_HEADWAY_TRAIN_00_24', 0, None, None, 0)
B_HEADWAY_TRAIN_25_39 = Beta('B_HEADWAY_TRAIN_25_39', 0, None, None, 0)
B_HEADWAY_TRAIN_40_54 = Beta('B_HEADWAY_TRAIN_40_54', 0, None, None, 0)
B_HEADWAY_TRAIN_55_65 = Beta('B_HEADWAY_TRAIN_55_65', 0, None, None, 0)
B_HEADWAY_TRAIN_65_plus = Beta('B_HEADWAY_TRAIN_65_plus', 0, None, None, 0)

B_HEADWAY_TRAIN = (
    B_HEADWAY_TRAIN_00_24 * age_00_24
    + B_HEADWAY_TRAIN_25_39 * age_25_39
    + B_HEADWAY_TRAIN_40_54 * age_40_54
    + B_HEADWAY_TRAIN_55_65 * age_55_65
    + B_HEADWAY_TRAIN_65_plus * age_65_plus
)

B_HEADWAY_SM_00_24 = Beta('B_HEADWAY_SM_00_24', 0, None, None, 0)
B_HEADWAY_SM_25_39 = Beta('B_HEADWAY_SM_25_39', 0, None, None, 0)
B_HEADWAY_SM_40_54 = Beta('B_HEADWAY_SM_40_54', 0, None, None, 0)
B_HEADWAY_SM_55_65 = Beta('B_HEADWAY_SM_55_65', 0, None, None, 0)
B_HEADWAY_SM_65_plus = Beta('B_HEADWAY_SM_65_plus', 0, None, None, 0)

B_HEADWAY_SM = (
    B_HEADWAY_SM_00_24 * age_00_24
    + B_HEADWAY_SM_25_39 * age_25_39
    + B_HEADWAY_SM_40_54 * age_40_54
    + B_HEADWAY_SM_55_65 * age_55_65
    + B_HEADWAY_SM_65_plus * age_65_plus
)

# Definition of new variables

# Costs of Swissmetro and train are set to zero when GA is not 0.
SM_COST = SM_CO * (GA == 0)
TRAIN_COST = TRAIN_CO * (GA == 0)

# Rescaled travel times (divided by 60) and costs (divided by 100).
TRAIN_TT_SCALED = TRAIN_TT / 60
TRAIN_COST_SCALED = TRAIN_COST / 100
SM_TT_SCALED = SM_TT / 60
SM_COST_SCALED = SM_COST / 100
CAR_TT_SCALED = CAR_TT / 60
CAR_COST_SCALED = CAR_CO / 100

# Rescaled headways (divided by 60).
# NOTE(review): the utility functions of this notebook use TRAIN_HE and
# SM_HE directly, not these scaled versions -- confirm this is intended.
TRAIN_HE_SCALED = TRAIN_HE / 60
SM_HE_SCALED = SM_HE / 60

def piecewise_cost(x):
    """
    Piecewise linear transformation of the variable.

    The variable is split by ``models.piecewiseVariables`` using the
    thresholds 0, 0.5, 1 and 1.75 (no upper limit on the last piece).
    The first piece enters with a coefficient of 1; each of the three
    other pieces is multiplied by its own parameter.

    The parameter names are fixed ('pw_cost_0.5_1', 'pw_cost_1_1.75',
    'pw_cost_1.75_more'), so every call produces an expression that
    refers to parameters with these same names.

    :param x: expression of the variable to transform.
    :return: expression of the piecewise linear specification.
    """
    thresholds = [0, 0.5, 1, 1.75, None]
    pieces = models.piecewiseVariables(x, thresholds)

    slope_05_1 = Beta('pw_cost_0.5_1', 0, None, None, 0)
    slope_1_175 = Beta('pw_cost_1_1.75', 0, None, None, 0)
    slope_175_more = Beta('pw_cost_1.75_more', 0, None, None, 0)

    return (
        pieces[0]
        + slope_05_1 * pieces[1]
        + slope_1_175 * pieces[2]
        + slope_175_more * pieces[3]
    )

# Definition of the utility functions.
# Travel time enters in logarithm, cost through the piecewise linear
# specification, and headway through its square root.

# Alternative 1: train
V1 = (
    ASC_TRAIN
    + B_TIME_TRAIN * log(TRAIN_TT_SCALED)
    + B_COST_TRAIN * piecewise_cost(TRAIN_COST_SCALED)
    + B_HEADWAY_TRAIN * TRAIN_HE**0.5
)

# Alternative 2: Swissmetro (no alternative specific constant)
V2 = (
    B_TIME_SM * log(SM_TT_SCALED)
    + B_COST_SM * piecewise_cost(SM_COST_SCALED)
    + B_HEADWAY_SM * SM_HE**0.5
)

# Alternative 3: car (no headway term)
V3 = (
    ASC_CAR
    + B_TIME_CAR * log(CAR_TT_SCALED)
    + B_COST_CAR * piecewise_cost(CAR_COST_SCALED)
)

# Associate utility functions with the numbering of alternatives
V = {1: V1, 2: V2, 3: V3}

# Associate the availability conditions with the alternatives
av = {1: TRAIN_AV, 2: SM_AV, 3: CAR_AV}

## Nested logit: existing alternatives

In [4]:
# Nest parameter of the nest of existing alternatives (lower bound: 0)
MU = Beta('MU', 1, 0, None, 0)

# Each nest is a pair: (nest parameter, list of alternatives in the nest)
existing = (MU, [1, 3])
future = (1.0, [2])
nests = (existing, future)

# The choice model is a nested logit, with availability conditions
logprob = models.lognested(V, av, nests, CHOICE)

# Create the Biogeme object
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = 'nested_existing'

# Estimate the parameters
nested_existing_results = biogeme.estimate(algorithm=opt.bioNewton)

In [5]:
# Summary statistics of the nested logit model (number of parameters,
# sample size, final log likelihood, AIC, BIC).
print(nested_existing_results.shortSummary())

Results for model nested_existing
Nbr of parameters:		29
Sample size:			10719
Excluded data:			9
Final log likelihood:		-7640.153
Akaike Information Criterion:	15338.31
Bayesian Information Criterion:	15549.42



In [6]:
# Table of estimates: value, standard error, t-test and p-value (classical
# and robust) for each parameter.
nested_existing_table = nested_existing_results.getEstimatedParameters()
nested_existing_table

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ASC_CAR_FEMALE,-0.990121,0.145137,-6.821987,8.97904e-12,0.156784,-6.315207,2.698004e-10
ASC_CAR_MALE,-0.796609,0.13487,-5.906476,3.495036e-09,0.142695,-5.582607,2.369403e-08
ASC_TRAIN_FEMALE,1.29704,0.175501,7.390483,1.463274e-13,0.179868,7.211056,5.551115e-13
ASC_TRAIN_MALE,0.704092,0.166885,4.219017,2.453701e-05,0.170425,4.131379,3.60593e-05
B_COST_CAR_FIRST,-1.321786,0.216569,-6.103293,1.039053e-09,0.220225,-6.001993,1.949101e-09
B_COST_CAR_SECOND,-0.722385,0.146241,-4.939677,7.825193e-07,0.143127,-5.047164,4.484165e-07
B_COST_SM_FIRST,-1.810188,0.301324,-6.007444,1.884709e-09,0.325671,-5.558334,2.723615e-08
B_COST_SM_SECOND,-1.693399,0.264201,-6.409523,1.459761e-10,0.285987,-5.921243,3.19518e-09
B_COST_TRAIN_FIRST,-1.901102,0.327365,-5.807288,6.349294e-09,0.359676,-5.285603,1.252917e-07
B_COST_TRAIN_SECOND,-1.4846,0.255305,-5.815001,6.063368e-09,0.278476,-5.331156,9.758927e-08


## Cross-nested logit

In [7]:
# Nest parameters (no bounds imposed)
MU_EXISTING = Beta('MU_EXISTING', 1, None, None, 0)
MU_PUBLIC = Beta('MU_PUBLIC', 1, None, None, 0)

# Membership of alternative 1 in the two nests: ALPHA_EXISTING is
# estimated between 0 and 1, and the two memberships sum up to one.
ALPHA_EXISTING = Beta('ALPHA_EXISTING', 0.5, 0, 1, 0)
ALPHA_PUBLIC = 1 - ALPHA_EXISTING

# Membership parameters of each alternative in each nest
alpha_existing = {1: ALPHA_EXISTING, 2: 0.0, 3: 1.0}
alpha_public = {1: ALPHA_PUBLIC, 2: 1.0, 3: 0.0}

# Each nest is a pair: (nest parameter, dict of membership parameters)
nest_existing = (MU_EXISTING, alpha_existing)
nest_public = (MU_PUBLIC, alpha_public)
nests = (nest_existing, nest_public)

# The choice model is a cross-nested logit, with availability conditions
logprob = models.logcnl_avail(V, av, nests, CHOICE)

# Create the Biogeme object
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = 'cnl'

# Estimate the parameters
cnl_results = biogeme.estimate(algorithm=opt.bioNewton)

In [8]:
# Summary statistics of the cross-nested logit model (number of parameters,
# sample size, final log likelihood, AIC, BIC).
print(cnl_results.shortSummary())

Results for model cnl
Nbr of parameters:		31
Sample size:			10719
Excluded data:			9
Final log likelihood:		-7631.125
Akaike Information Criterion:	15324.25
Bayesian Information Criterion:	15549.92



In [9]:
# Table of estimates: value, standard error, t-test and p-value (classical
# and robust) for each parameter.  Reused below to extract single values.
cnl_table = cnl_results.getEstimatedParameters()
cnl_table

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ALPHA_EXISTING,0.763518,0.025495,29.948041,0.0,0.024495,31.169756,0.0
ASC_CAR_FEMALE,-0.988875,0.142107,-6.95868,3.434808e-12,0.155128,-6.374564,1.834839e-10
ASC_CAR_MALE,-0.789837,0.132081,-5.979954,2.232e-09,0.141674,-5.575049,2.474604e-08
ASC_TRAIN_FEMALE,1.42804,0.157701,9.055368,0.0,0.160774,8.882262,0.0
ASC_TRAIN_MALE,0.880835,0.151338,5.820306,5.873993e-09,0.152131,5.789972,7.039819e-09
B_COST_CAR_FIRST,-1.327071,0.214919,-6.174748,6.626915e-10,0.219074,-6.057631,1.381409e-09
B_COST_CAR_SECOND,-0.729112,0.146069,-4.991573,5.988954e-07,0.14365,-5.075614,3.862457e-07
B_COST_SM_FIRST,-1.820551,0.298962,-6.089568,1.132159e-09,0.32423,-5.614992,1.965713e-08
B_COST_SM_SECOND,-1.70624,0.262387,-6.502772,7.885292e-11,0.28552,-5.975895,2.2883e-09
B_COST_TRAIN_FIRST,-1.89286,0.321373,-5.889912,3.864004e-09,0.353599,-5.353126,8.644768e-08


The $\mu$ parameter of each nest is larger than 1, as required by the theory.

In [10]:
# Estimated value of the nest parameter MU_EXISTING.
mu_existing = cnl_table.loc['MU_EXISTING','Value']
mu_existing

1.211788102150362

In [11]:
# Estimated value of the nest parameter MU_PUBLIC.
mu_public = cnl_table.loc['MU_PUBLIC','Value']
mu_public

9.373390171190906

The $\alpha$ parameter is different from 0 and 1.

In [12]:
# Estimated value of ALPHA_EXISTING, the membership parameter of
# alternative 1 in the nest of existing alternatives.
alpha = cnl_table.loc['ALPHA_EXISTING','Value']
alpha

0.7635177712297777

It is significantly different from 0.

In [13]:
# Robust t-test of ALPHA_EXISTING as reported in the table of estimates.
cnl_table.loc['ALPHA_EXISTING','Rob. t-test']

31.169755993488

It is significantly different from 1.

In [14]:
# t statistic for the null hypothesis ALPHA_EXISTING = 1, based on the
# robust standard error.  It is computed as (tested value - estimate) /
# std err, so it is positive when the estimate is below the tested value.
alpha_stderr = cnl_table.loc['ALPHA_EXISTING', 'Rob. Std err']
tested_value = 1
ttest = (tested_value - alpha) / alpha_stderr
ttest

9.65412154807034

We test the null hypothesis that the two models are equivalent using a likelihood ratio test: 

In [15]:
# Likelihood ratio statistic: -2 times the difference between the final
# log likelihood of the nested logit and that of the cross-nested logit.
LL_nested_existing = nested_existing_results.data.logLike
LL_cnl = cnl_results.data.logLike
LR = -2 * (LL_nested_existing - LL_cnl)
LR

18.054196605786274

Number of degrees of freedom:

In [16]:
# Difference between the numbers of estimated parameters of the two models.
dof = cnl_results.data.nparam - nested_existing_results.data.nparam
dof

2

The threshold value of the $\chi^2$ test at the 5% level is:

In [17]:
# Inverse survival function: the value that a chi-square variable with
# `dof` degrees of freedom exceeds with probability 0.05.
chi2.isf(0.05, dof)

5.991464547107983

Since the test statistic (18.05) exceeds the threshold (5.99), the null hypothesis can be rejected at the 5% level, and the cross-nested logit model is preferred.