In [1]:
import pandas as pd
import pickle
from urllib.request import urlopen
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
import biogeme.results as res
from biogeme.expressions import Beta, log, exp, bioDraws, MonteCarlo

The estimation may take a significant amount of time. Therefore, this notebook can also be executed using models that have been previously estimated: the results are read from file if the variable `numberOfDraws` below is set to `None`.

In [2]:
# Switch between loading stored results and estimating in this session:
#   - None: previously estimated results are read from file;
#   - any other value (e.g. 10000): the model is estimated again
#     (see the latent class estimation cell, which tests this variable).
# numberOfDraws = 10000
numberOfDraws = None

The following cells define the common root of the course file URLs, and a function that reads pickled estimation results from a file available online.

In [3]:
# Common prefix of the course asset URLs. File names are appended to it to
# build the full address of the data file and of the pickled results.
url_root = (
    'https://courses.edx.org/'
    'asset-v1:EPFLx+ChoiceModels2x+3T2021+type@asset+block@'
)

In [4]:
def get_results_from_url(file):
    """Load pickled estimation results stored under ``url_root``.

    :param file: name of the pickle file; it is appended to ``url_root``
        to form the address that is opened.
    :return: a ``res.bioResults`` object built from the unpickled content.

    NOTE(review): ``pickle.load`` can execute arbitrary code while
    unpickling. Only use this function with files from a trusted source.
    """
    with urlopen(f'{url_root}{file}') as response:
        raw_results = pickle.load(response)
    return res.bioResults(raw_results)

# Data

In [5]:
# Read the Swissmetro data file from the course server into a DataFrame.
# NOTE(review): the DataFrame is bound to the name `pandas`. The pandas
# library itself is imported as `pd`, so the two names do not clash, but
# a more descriptive name would be less confusing.
pandas = pd.read_table(f'{url_root}swissmetro.dat')
# Wrap the DataFrame in a Biogeme database named 'swissmetro'.
database = db.Database('swissmetro', pandas)

The following statement allows you to use the names of the variables in the database as Python variables.

In [6]:
# Add every entry of `database.variables` to the notebook's global
# namespace. Names such as CHOICE, PURPOSE or TRAIN_TT used in the cells
# below are not assigned anywhere else, so they come from this statement.
globals().update(database.variables)

We exclude some observations: those where `PURPOSE` is neither 1 nor 3, and those where `CHOICE` is 0.

In [7]:
# Flag an observation when PURPOSE is neither 1 nor 3, or when CHOICE is 0.
# The product of the two PURPOSE comparisons acts as a logical "and", the
# sum with the CHOICE comparison acts as a logical "or", and `> 0` turns
# the sum back into a condition.
exclude = ((PURPOSE != 1) * (PURPOSE != 3) + (CHOICE == 0)) > 0
# The return value is not used: the call is relied on to update `database`
# itself, so all later cells work on the filtered data.
database.remove(exclude)

# Parameters

In [8]:
# Unknown parameters of the models. Per the Biogeme `Beta` signature, the
# arguments are: name, starting value, lower bound, upper bound and status
# (0 = to be estimated) -- TODO confirm against the installed Biogeme version.
# Here every parameter starts at 0 and has no bounds.
ASC_CAR = Beta('ASC_CAR', 0, None, None, 0)
ASC_TRAIN = Beta('ASC_TRAIN', 0, None, None, 0)
B_TIME = Beta('B_TIME', 0, None, None, 0)
B_COST = Beta('B_COST', 0, None, None, 0)
B_FR = Beta('B_FR', 0, None, None, 0)

# Variables

In [9]:
# Swissmetro and train costs count only when GA == 0; otherwise they are 0.
SM_COST = SM_CO * (GA == 0)
TRAIN_COST = TRAIN_CO * (GA == 0)
# Car and train availabilities are set to 0 when SP == 0.
CAR_AV_SP = CAR_AV * (SP != 0)
TRAIN_AV_SP = TRAIN_AV * (SP != 0)
# Rescaled attributes: travel times and costs are divided by 100,
# headways by 1000.
TRAIN_TT_SCALED = TRAIN_TT / 100
TRAIN_COST_SCALED = TRAIN_COST / 100
SM_TT_SCALED = SM_TT / 100
SM_COST_SCALED = SM_COST / 100
CAR_TT_SCALED = CAR_TT / 100
CAR_CO_SCALED = CAR_CO / 100
TRAIN_HE_SCALED = TRAIN_HE / 1000
SM_HE_SCALED = SM_HE / 1000
# Indicator of INCOME <= 1. It is not used by any of the models specified
# in the cells visible here.
LOW_INC = INCOME <= 1

# Availability conditions

In [10]:
# Availability condition of each alternative, keyed by alternative number:
# 1 = train, 2 = Swissmetro, 3 = car. The same keys are used for the
# utility functions below.
av = {1: TRAIN_AV_SP,
      2: SM_AV,
      3: CAR_AV_SP}

# Logit model

## Utility functions

In [11]:
# Train: alternative specific constant, travel time, cost and headway.
V1 = (ASC_TRAIN +
      B_TIME * TRAIN_TT_SCALED +
      B_COST * TRAIN_COST_SCALED +
      B_FR * TRAIN_HE_SCALED)
# Swissmetro: travel time, cost and headway; no constant, so this
# alternative serves as the reference for ASC_TRAIN and ASC_CAR.
V2 = (B_TIME * SM_TT_SCALED +
      B_COST * SM_COST_SCALED +
      B_FR * SM_HE_SCALED)
# Car: alternative specific constant, travel time and cost; no headway.
V3 = (ASC_CAR +
      B_TIME * CAR_TT_SCALED +
      B_COST * CAR_CO_SCALED)
# Utility functions keyed by alternative number, with the same keys as `av`.
V = {1: V1,
     2: V2,
     3: V3}

## Model

In [12]:
# Log of the logit probability of the alternative reported in CHOICE, given
# the utilities V and the availability conditions av.
# NOTE: the name `logprob` is assigned again in the latent class section.
logprob = models.loglogit(V, av, CHOICE)

## Estimation

In [13]:
# Estimate the logit model on the filtered database. This cell always runs
# the estimation; it does not depend on `numberOfDraws`.
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = '01logit'
results_logit = biogeme.estimate()

## Results

In [14]:
# Keep the general statistics: the final log likelihood is reused in the
# comparison table at the end of the notebook.
stats_logit = results_logit.getGeneralStatistics()
# Display the formatted statistics.
print(results_logit.printGeneralStatistics())

Number of estimated parameters:	5
Sample size:	6768
Excluded observations:	3960
Init log likelihood:	-5315.386
Final log likelihood:	-5315.386
Likelihood ratio test for the init. model:	-0
Rho-square for the init. model:	0
Rho-square-bar for the init. model:	-0.000941
Akaike Information Criterion:	10640.77
Bayesian Information Criterion:	10674.87
Final gradient norm:	8.1247E-03
Nbr of threads:	16



In [15]:
# Table of estimated parameters; its 'Value' column is reused in the
# comparison table at the end of the notebook.
param_logit = results_logit.getEstimatedParameters()
param_logit

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ASC_CAR,-0.261838,0.047307,-5.534875,3.114498e-08,0.061496,-4.257798,2.064506e-05
ASC_TRAIN,-0.451015,0.069678,-6.472835,9.618062e-11,0.09324,-4.837114,1.31738e-06
B_COST,-1.084663,0.051826,-20.929115,0.0,0.068235,-15.895902,0.0
B_FR,-5.35324,0.963865,-5.553932,2.793141e-08,0.983023,-5.44569,5.160495e-08
B_TIME,-1.276782,0.056938,-22.424014,0.0,0.104436,-12.225485,0.0


# Random parameter: normal distribution

Read the results from file

In [16]:
# The model with a normally distributed parameter is not specified in the
# cells visible here: its results are always loaded from the stored pickle
# file, whatever the value of `numberOfDraws`.
results_normal = get_results_from_url('rc-02normal.pickle')
param_normal = results_normal.getEstimatedParameters()
stats_normal = results_normal.getGeneralStatistics()

# Random parameter: lognormal distribution


Read the results from file

In [17]:
# The model with a lognormally distributed parameter is not specified in
# the cells visible here: its results are always loaded from the stored
# pickle file, whatever the value of `numberOfDraws`.
results_lognormal = get_results_from_url('rc-03lognormal.pickle')
param_lognormal = results_lognormal.getEstimatedParameters()
stats_lognormal = results_lognormal.getGeneralStatistics()

# Latent classes

We consider two classes in the population. Individuals in the first class have considered all variables when making their choice. For them, the specification of the utility functions is the same as for the logit model.

In [18]:
# Class 1 utilities: same expressions as V1, V2 and V3 of the logit model,
# sharing the same parameters.
V1_1 = (ASC_TRAIN +
        B_TIME * TRAIN_TT_SCALED +
        B_COST * TRAIN_COST_SCALED +
        B_FR * TRAIN_HE_SCALED)
V2_1 = (B_TIME * SM_TT_SCALED +
        B_COST * SM_COST_SCALED +
        B_FR * SM_HE_SCALED)
V3_1 = (ASC_CAR +
        B_TIME * CAR_TT_SCALED +
        B_COST * CAR_CO_SCALED)
# Keyed by alternative number, with the same keys as `av`.
V_1 = {1: V1_1,
       2: V2_1,
       3: V3_1}

The second class of individuals ignored the travel time variable when making the choice. Therefore, this variable is removed from the utility function.

In [19]:
# Class 2 utilities: the class 1 expressions without the B_TIME terms.
# The remaining parameters (ASC_TRAIN, ASC_CAR, B_COST, B_FR) are the same
# objects as in class 1, so they are shared between the two classes.
V1_2 = (ASC_TRAIN +
        B_COST * TRAIN_COST_SCALED +
        B_FR * TRAIN_HE_SCALED)
V2_2 = (B_COST * SM_COST_SCALED +
        B_FR * SM_HE_SCALED)
V3_2 = (ASC_CAR +
        B_COST * CAR_CO_SCALED)
# Keyed by alternative number, with the same keys as `av`.
V_2 = {1: V1_2,
       2: V2_2,
       3: V3_2}

The following parameter captures the probability of belonging to class 1.

In [20]:
# Class membership parameter. Per the Biogeme `Beta` signature (name,
# starting value, lower bound, upper bound, status), it starts at 0.5 and
# is constrained between 0 and 1 -- TODO confirm against the installed version.
OMEGA = Beta('OMEGA', 0.5, 0, 1, 0)
# The two class probabilities sum to one by construction.
prob_class_1 = OMEGA
prob_class_2 = 1 - OMEGA

## Model

We first calculate the choice probability for each class.

In [21]:
# Logit probability of the alternative reported in CHOICE, conditional on
# each class. `models.logit` is used here rather than `models.loglogit`,
# because the class probabilities are mixed before the log is taken.
prob_1 = models.logit(V_1, av, CHOICE)
prob_2 = models.logit(V_2, av, CHOICE)

The choice probability is obtained by using the class membership model.

In [22]:
# Unconditional choice probability: class-conditional probabilities
# weighted by the class membership probabilities.
prob = prob_class_1 * prob_1 + prob_class_2 * prob_2
# NOTE(review): this rebinds `logprob`, which held the logit log probability
# until now. Re-running the logit estimation cell after this one would
# estimate the latent class expression under the logit model name.
logprob = log(prob)

## Estimation

In [23]:
# With `numberOfDraws` set to None, load the stored latent class results.
# Otherwise, estimate the model now and report the name of the pickle file
# recorded in the results.
if numberOfDraws is None:
    results_latent = get_results_from_url('rc-04latentClass.pickle')
else:
    # `biogeme` is rebound here, replacing the logit model object.
    biogeme = bio.BIOGEME(database, logprob)
    biogeme.modelName = '04latentClass'
    results_latent = biogeme.estimate()
    print(f'Results saved in file {results_latent.data.pickleFileName}')

## Results

In [24]:
# Keep the general statistics: the final log likelihood is reused in the
# comparison table at the end of the notebook.
stats_latent = results_latent.getGeneralStatistics()
# Display the formatted statistics.
print(results_latent.printGeneralStatistics())

Number of estimated parameters:	6
Sample size:	6768
Excluded observations:	3960
Init log likelihood:	-6964.663
Final log likelihood:	-5191.09
Likelihood ratio test for the init. model:	3547.146
Rho-square for the init. model:	0.255
Rho-square-bar for the init. model:	0.254
Akaike Information Criterion:	10394.18
Bayesian Information Criterion:	10435.1
Final gradient norm:	1.4933E-02
Nbr of threads:	16



In [25]:
# Table of estimated parameters; its 'Value' column is reused in the
# comparison table at the end of the notebook.
param_latent = results_latent.getEstimatedParameters()
param_latent

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ASC_CAR,0.00295,0.054079,0.054551,0.9564965,0.054812,0.05382,0.9570783
ASC_TRAIN,-0.108823,0.078194,-1.391698,0.1640138,0.07871,-1.382577,0.1667946
B_COST,-1.269444,0.061306,-20.706568,0.0,0.085837,-14.788988,0.0
B_FR,-6.119264,1.052546,-5.813776,6.107918e-09,1.054918,-5.800699,6.603907e-09
B_TIME,-2.806484,0.174748,-16.060174,0.0,0.170123,-16.496801,0.0
OMEGA,0.748568,0.021778,34.373045,0.0,0.021524,34.778455,0.0


# Comparison

We build a summary data frame. We first gather the parameter estimates for each model.

In [26]:
# Estimated-parameter tables of the four models, in the same order as the
# column labels assigned to the summary below.
parameters = [param_logit, param_normal, param_lognormal, param_latent]
# Keep only the 'Value' column of each table, as a one-column data frame.
parameters_values = [estimates[['Value']] for estimates in parameters]
# Put the four columns side by side, aligned on the parameter names. A
# parameter that does not appear in a model is missing (NaN) in its column.
summary = pd.concat(parameters_values, axis='columns')
summary.columns = ['Logit', 'Normal', 'Lognormal', 'Latent']

Then we gather the value of the final log likelihood for each model.

In [27]:
# General statistics of each model, keyed by the column labels used in the
# summary data frame.
stats = {
    'Logit': stats_logit, 
    'Normal': stats_normal, 
    'Lognormal': stats_lognormal, 
    'Latent': stats_latent, 
}

In [28]:
# Final log likelihood of each model. Each entry of the general statistics
# is indexed with [0] to obtain the numeric value (presumably the entries
# are (value, format) pairs -- TODO confirm against the Biogeme version).
loglike = {
    model: model_stats['Final log likelihood'][0]
    for model, model_stats in stats.items()
}
loglike_row = pd.Series(data=loglike, name='Log likelihood')
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# so the row is added with pd.concat instead. Any 'Log likelihood' row left
# by a previous execution of this cell is dropped first, so that running
# the cell again does not duplicate it.
summary = pd.concat([
    summary.drop(index='Log likelihood', errors='ignore'),
    loglike_row.to_frame().T,
])
# Display the table with empty cells where a model lacks a parameter.
summary.fillna('')

Unnamed: 0,Logit,Normal,Lognormal,Latent
ASC_CAR,-0.261838,0.011932,0.054452,0.00295
ASC_TRAIN,-0.451015,-0.103973,-0.067536,-0.108823
B_COST,-1.084663,-1.294047,-1.385676,-1.269444
B_FR,-5.35324,-6.371798,-5.969799,-6.119264
B_TIME,-1.276782,-2.27339,0.572966,-2.806484
B_TIME_S,,1.683225,1.245342,
OMEGA,,,,0.748568
Log likelihood,-5315.386329,-5197.436873,-5215.046312,-5191.089957
