In [1]:
import pandas as pd
import pickle
from urllib.request import urlopen
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.results as res
import biogeme.version as ver
from biogeme import models
from biogeme.expressions import Beta, log, exp, bioDraws, MonteCarlo

Version of Biogeme

In [2]:
# Show the installed Biogeme version so the outputs below can be traced to it.
print(ver.getText())

biogeme 3.2.8 [2021-09-02]
Version entirely written in Python
Home page: http://biogeme.epfl.ch
Submit questions to https://groups.google.com/d/forum/biogeme
Michel Bierlaire, Transport and Mobility Laboratory, Ecole Polytechnique Fédérale de Lausanne (EPFL)



The estimation may take a significant amount of time. Therefore, this notebook can be executed using models that have been estimated. The results are read from file if the variable `numberOfDraws` below is set to `None`.

In [3]:
# Switch between estimating and loading results: set an integer number of
# draws (e.g. the commented value) to estimate the mixture model in this
# notebook, or keep None to read pre-estimated results from file.
#numberOfDraws=10000
numberOfDraws = None

The following URL prefix and function are used to obtain the data and the pre-estimated results from files available online.

In [4]:
# Common URL prefix of the online assets. File names are appended to it to
# download the data set and the pickled estimation results.
url_root = (
    'https://courses.edx.org/'
    'asset-v1:EPFLx+ChoiceModels2x+3T2021+type@asset+block@'
)

In [5]:
def get_results_from_url(file):
    """Download pickled estimation results and wrap them as Biogeme results.

    :param file: name of the pickle file; it is appended to ``url_root``.
    :return: a ``res.bioResults`` object built from the unpickled content.

    NOTE(review): ``pickle.load`` can execute arbitrary code contained in
    the payload, so this must only be used with a trusted ``url_root``.
    """
    address = f'{url_root}{file}'
    with urlopen(address) as response:
        raw_results = pickle.load(response)
    return res.bioResults(raw_results)

# Data

In [6]:
# Download the Swissmetro data file into a data frame and wrap it in a
# Biogeme database named 'swissmetro'.
# NOTE(review): the data frame is bound to the name `pandas`, which is easy to
# confuse with the library itself (imported here as `pd`).
pandas = pd.read_table(f'{url_root}swissmetro.dat')
database = db.Database('swissmetro', pandas)

The following statement allows you to use the names of the variables in the database as Python variables.

In [7]:
# Inject every entry of database.variables into the notebook namespace, so
# names such as PURPOSE or CHOICE can be used directly in the cells below.
globals().update(database.variables)

We exclude some observations: those whose `PURPOSE` is neither 1 nor 3, and those for which `CHOICE` is 0.

In [8]:
# An observation is flagged when PURPOSE is neither 1 nor 3, or when CHOICE
# is 0. The product of the two conditions acts as a logical "and", the sum as
# a logical "or"; "> 0" turns the result back into a condition.
exclude = ((PURPOSE != 1) * (PURPOSE != 3) + (CHOICE == 0)) > 0
# Drop the flagged observations from the database (modifies it in place).
database.remove(exclude)

# Parameters

In [9]:
# Coefficients used in the utility functions, each declared with the same
# arguments after its name: 0, None, None, 0.
# NOTE(review): presumably these follow Biogeme's Beta(name, initial value,
# lower bound, upper bound, status) signature, with status 0 meaning
# "to be estimated" -- confirm against the Biogeme documentation.
ASC_CAR = Beta('ASC_CAR', 0, None, None, 0)
ASC_TRAIN = Beta('ASC_TRAIN', 0, None, None, 0)
B_TIME = Beta('B_TIME', 0, None, None, 0)
B_COST = Beta('B_COST', 0, None, None, 0)
B_FR = Beta('B_FR', 0, None, None, 0)

# Variables

In [10]:
# Costs are kept only when GA == 0; otherwise they are multiplied by zero.
SM_COST = SM_CO * (GA == 0)
TRAIN_COST = TRAIN_CO * (GA == 0)
# Car and train availabilities are set to zero when SP == 0.
CAR_AV_SP = CAR_AV * (SP != 0)
TRAIN_AV_SP = TRAIN_AV * (SP != 0)
# Travel times and costs are divided by 100.
TRAIN_TT_SCALED = TRAIN_TT / 100
TRAIN_COST_SCALED = TRAIN_COST / 100
SM_TT_SCALED = SM_TT / 100
SM_COST_SCALED = SM_COST / 100
CAR_TT_SCALED = CAR_TT / 100
CAR_CO_SCALED = CAR_CO / 100
# The *_HE variables are divided by 1000.
TRAIN_HE_SCALED = TRAIN_HE / 1000
SM_HE_SCALED = SM_HE / 1000
# Indicator of INCOME <= 1, used in the class membership model.
LOW_INC = INCOME <= 1

# Availability conditions

In [11]:
# Availability expression of each alternative, keyed by the same alternative
# ids as the utility dictionaries (1: train, 2: SM, 3: car).
av = {1: TRAIN_AV_SP,
      2: SM_AV,
      3: CAR_AV_SP}

# Logit model

## Utility functions

In [12]:
# Utility of the train alternative: constant, time, cost and B_FR term.
V1 = (ASC_TRAIN +
      B_TIME * TRAIN_TT_SCALED +
      B_COST * TRAIN_COST_SCALED +
      B_FR * TRAIN_HE_SCALED)
# Utility of the SM alternative: no constant, so it acts as the reference.
V2 = (B_TIME * SM_TT_SCALED +
      B_COST * SM_COST_SCALED +
      B_FR * SM_HE_SCALED)
# Utility of the car alternative: constant, time and cost only.
V3 = (ASC_CAR +
      B_TIME * CAR_TT_SCALED +
      B_COST * CAR_CO_SCALED)
# Utilities keyed by alternative id, matching the keys of `av`.
V = {1: V1,
     2: V2,
     3: V3}

## Model

In [13]:
# Contribution of one observation to the log likelihood, built with
# models.loglogit from the utilities V, the availabilities av and CHOICE.
logprob = models.loglogit(V, av, CHOICE)

## Estimation

In [14]:
# Build the estimation object for the logit model, name it '01logit', and
# estimate it. The name `biogeme` is reused for later models.
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = '01logit'
results_logit = biogeme.estimate()

## Results

In [15]:
# Keep the general statistics for the final comparison table, and print
# their formatted text version.
stats_logit = results_logit.getGeneralStatistics()
print(results_logit.printGeneralStatistics())

Number of estimated parameters:	5
Sample size:	6768
Excluded observations:	3960
Init log likelihood:	-5315.386
Final log likelihood:	-5315.386
Likelihood ratio test for the init. model:	-0
Rho-square for the init. model:	0
Rho-square-bar for the init. model:	-0.000941
Akaike Information Criterion:	10640.77
Bayesian Information Criterion:	10674.87
Final gradient norm:	8.1247E-03
Nbr of threads:	16



In [16]:
# Table of estimated parameters; kept for the final comparison and displayed
# as the cell output.
param_logit = results_logit.getEstimatedParameters()
param_logit

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ASC_CAR,-0.261838,0.047307,-5.534875,3.114498e-08,0.061496,-4.257798,2.064506e-05
ASC_TRAIN,-0.451015,0.069678,-6.472835,9.618062e-11,0.09324,-4.837114,1.31738e-06
B_COST,-1.084663,0.051826,-20.929115,0.0,0.068235,-15.895902,0.0
B_FR,-5.35324,0.963865,-5.553932,2.793141e-08,0.983023,-5.44569,5.160495e-08
B_TIME,-1.276782,0.056938,-22.424014,0.0,0.104436,-12.225485,0.0


# Random parameter: normal distribution

Read the results from file

In [17]:
# Load the pre-estimated model with a normally distributed parameter, then
# keep its statistics and its parameter table for the comparison.
results_normal = get_results_from_url('rc-02normal.pickle')
stats_normal = results_normal.getGeneralStatistics()
param_normal = results_normal.getEstimatedParameters()

# Random parameter: lognormal distribution


Read the results from file

In [18]:
# Load the pre-estimated model with a lognormally distributed parameter, then
# keep its statistics and its parameter table for the comparison.
results_lognormal = get_results_from_url('rc-03lognormal.pickle')
stats_lognormal = results_lognormal.getGeneralStatistics()
param_lognormal = results_lognormal.getEstimatedParameters()

# Latent classes

Read the results from file

In [19]:
# Load the pre-estimated latent class model, then keep its statistics and
# its parameter table for the comparison.
results_latent = get_results_from_url('rc-04latentClass.pickle')
stats_latent = results_latent.getGeneralStatistics()
param_latent = results_latent.getEstimatedParameters()

# Latent classes with class membership model

Read the results from file

In [20]:
# Load the pre-estimated latent class model with a class membership model,
# then keep its statistics and its parameter table for the comparison.
results_latentsocio = get_results_from_url('rc-05latentClass.pickle')
stats_latentsocio = results_latentsocio.getGeneralStatistics()
param_latentsocio = results_latentsocio.getEstimatedParameters()

# Latent classes with random parameter

We again consider two classes in the population. Individuals in the first class have considered all variables when making their choice. For them, the specification of the utility function is the same as for the logit model, except that the time coefficient is now distributed in the population.

In [21]:
# Scale parameter of the random time coefficient, starting at 1.
B_TIME_S = Beta('B_TIME_S', 1, None, None, 0)
# Random time coefficient: B_TIME plus B_TIME_S times a draw declared with
# type 'NORMAL' under the name 'B_TIME_RND'.
B_TIME_RND = B_TIME + B_TIME_S * bioDraws('B_TIME_RND', 'NORMAL')

In [22]:
# Utilities for class 1: same specification as the logit model, except that
# the time coefficient is the random coefficient B_TIME_RND. It must be used
# for every alternative; using the fixed B_TIME for some of them would apply
# the random coefficient to the train alternative only.
V1_1 = (ASC_TRAIN +
        B_TIME_RND * TRAIN_TT_SCALED +
        B_COST * TRAIN_COST_SCALED +
        B_FR * TRAIN_HE_SCALED)
V2_1 = (B_TIME_RND * SM_TT_SCALED +
        B_COST * SM_COST_SCALED +
        B_FR * SM_HE_SCALED)
V3_1 = (ASC_CAR +
        B_TIME_RND * CAR_TT_SCALED +
        B_COST * CAR_CO_SCALED)
# Utilities keyed by alternative id, matching the keys of `av`.
V_1 = {1: V1_1,
       2: V2_1,
       3: V3_1}

The second class of individuals ignored the travel time variable when making the choice. Therefore, this variable is removed from the utility function.

In [23]:
# Utilities for class 2: same as the logit specification with the travel
# time terms removed.
V1_2 = (ASC_TRAIN +
        B_COST * TRAIN_COST_SCALED +
        B_FR * TRAIN_HE_SCALED)
V2_2 = (B_COST * SM_COST_SCALED +
        B_FR * SM_HE_SCALED)
V3_2 = (ASC_CAR +
        B_COST * CAR_CO_SCALED)
# Utilities keyed by alternative id, matching the keys of `av`.
V_2 = {1: V1_2,
       2: V2_2,
       3: V3_2}

The following parameters are involved in the class membership model.

In [24]:
# Coefficients of the class membership model: an intercept plus one
# coefficient per explanatory variable used in OMEGA. All are declared with
# the same arguments as the utility coefficients (0, None, None, 0).
G_INTERCEPT = Beta('G_INTERCEPT', 0, None, None, 0)
G_MALE = Beta('G_MALE', 0, None, None, 0)
G_GA = Beta('G_GA', 0, None, None, 0)
G_PURP3 = Beta('G_PURP3', 0, None, None, 0)
G_LOW_INC = Beta('G_LOW_INC', 0, None, None, 0)
G_FIRST = Beta('G_FIRST', 0, None, None, 0)

Class membership model. Note that `OMEGA` can potentially take any real value. We transform it into the probability of belonging to class 1 using `1 / (1 + exp(OMEGA))`; the probability of belonging to class 2 is its complement.

In [25]:
# Linear index of the class membership model.
OMEGA = (
    G_INTERCEPT +
    G_MALE * MALE +
    G_GA * GA +
    G_PURP3 * (PURPOSE == 3) +
    G_LOW_INC * LOW_INC +
    G_FIRST * FIRST
)
# Probability of class 1; with this form it decreases as OMEGA increases.
prob_class_1 = 1 / (1 + exp(OMEGA))
# Only two classes exist, so class 2 takes the remaining probability.
prob_class_2 = 1 - prob_class_1

## Model

We first calculate the choice probability for each class.

In [26]:
# Class 1: logit probability of the chosen alternative, which depends on the
# draws through B_TIME_RND.
cond_prob_1 = models.logit(V_1, av, CHOICE)
# Wrap it in MonteCarlo to obtain the class-1 choice probability.
# NOTE(review): presumably MonteCarlo averages the expression over the draws
# -- confirm against the Biogeme documentation.
prob_1 = MonteCarlo(cond_prob_1)
# Class 2 has no random coefficient, so the logit probability is used as is.
prob_2 = models.logit(V_2, av, CHOICE)

The choice probability is obtained by using the class membership model.

In [27]:
# Choice probability: class probabilities weighted by the class membership
# probabilities.
prob = prob_class_1 * prob_1 + prob_class_2 * prob_2
# Contribution to the log likelihood. This rebinds `logprob`, previously
# used for the logit model.
logprob = log(prob)

## Estimation

In [28]:
# Either load the pre-estimated results or estimate the model here, depending
# on the numberOfDraws switch.
if numberOfDraws is None:
    results_latentrandom = get_results_from_url('rc-06mixedLatentClass.pickle')
else:
    # Pass the requested number of draws to Biogeme; otherwise the setting
    # would only select this branch and the library default would be used
    # for the Monte-Carlo integration.
    biogeme = bio.BIOGEME(database, logprob, numberOfDraws=numberOfDraws)
    biogeme.modelName = '06mixedLatentClass'
    results_latentrandom = biogeme.estimate()
    print(f'Results saved in file {results_latentrandom.data.pickleFileName}')

## Results

In [29]:
# Keep the general statistics for the final comparison table, and print
# their formatted text version.
stats_latentrandom = results_latentrandom.getGeneralStatistics()
print(results_latentrandom.printGeneralStatistics())

Number of estimated parameters:	12
Sample size:	6768
Excluded observations:	3960
Init log likelihood:	-6918.84
Final log likelihood:	-4928.17
Likelihood ratio test for the init. model:	3981.34
Rho-square for the init. model:	0.288
Rho-square-bar for the init. model:	0.286
Akaike Information Criterion:	9880.341
Bayesian Information Criterion:	9962.18
Final gradient norm:	1.1539E-02
Number of draws:	10000
Draws generation time:	0:01:53.447179
Types of draws:	['B_TIME_RND: NORMAL']
Nbr of threads:	24



In [30]:
# Table of estimated parameters; kept for the final comparison and displayed
# as the cell output.
param_latentrandom = results_latentrandom.getEstimatedParameters()
param_latentrandom

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ASC_CAR,0.039294,0.053988,0.727823,0.4667218,0.058842,0.667781,0.5042734
ASC_TRAIN,-0.177545,0.077498,-2.290957,0.02196588,0.083333,-2.130538,0.03312722
B_COST,-1.460821,0.064688,-22.582688,0.0,0.095558,-15.287212,0.0
B_FR,-6.305167,1.055306,-5.974728,2.304748e-09,1.039252,-6.067023,1.303031e-09
B_TIME,-3.04735,0.165805,-18.379096,0.0,0.179765,-16.951841,0.0
B_TIME_S,0.637396,0.2978,2.140347,0.03232674,0.248031,2.569829,0.01017489
G_FIRST,-1.040965,0.211713,-4.916877,8.793586e-07,0.22016,-4.72821,2.265079e-06
G_GA,4.257477,0.519906,8.188938,2.220446e-16,0.500534,8.505867,0.0
G_INTERCEPT,-1.551245,0.310602,-4.994315,5.904504e-07,0.383389,-4.046138,5.206963e-05
G_LOW_INC,0.341916,0.230252,1.484962,0.1375539,0.236878,1.443429,0.1488998


# Comparison

We build a summary data frame. We first gather the parameter estimates for each model.

In [31]:
# Parameter tables of the six models, in the order of the summary columns.
parameters = [
    param_logit,
    param_normal,
    param_lognormal,
    param_latent,
    param_latentsocio,
    param_latentrandom,
]
# Keep only the 'Value' column of each table, as a one-column data frame.
parameters_values = [estimates[['Value']] for estimates in parameters]
# Place the columns side by side; rows are aligned on the parameter names.
summary = pd.concat(parameters_values, axis=1)
# Replace the six identical 'Value' headers by the model names.
summary.columns = [
    'Logit',
    'Normal',
    'Lognormal',
    'Latent',
    'Latent with class mbship',
    'Latent with random param.',
]

Then we gather the value of the final log likelihood for each model.

In [32]:
# General statistics of each model, keyed by the column labels used in the
# summary table (insertion order follows the column order).
stats = dict([
    ('Logit', stats_logit),
    ('Normal', stats_normal),
    ('Lognormal', stats_lognormal),
    ('Latent', stats_latent),
    ('Latent with class mbship', stats_latentsocio),
    ('Latent with random param.', stats_latentrandom),
])

In [33]:
# Final log likelihood of each model. Each statistic is indexed with [0] to
# extract its value.
loglike = {
    k: v['Final log likelihood'][0]
    for k, v in stats.items()
}
loglike_row = pd.Series(data=loglike,
                        name='Log likelihood')
# DataFrame.append was removed in pandas 2.0, so the row is added with
# pd.concat instead. Dropping any 'Log likelihood' row left by a previous run
# keeps the cell idempotent instead of duplicating the row on each execution.
summary = pd.concat([
    summary.drop(index='Log likelihood', errors='ignore'),
    loglike_row.to_frame().T,
])
# Display the table with blanks where a model does not have the parameter.
summary.fillna('')

Unnamed: 0,Logit,Normal,Lognormal,Latent,Latent with class mbship,Latent with random param.
ASC_CAR,-0.261838,0.011932,0.054452,0.00295,0.026593,0.039294
ASC_TRAIN,-0.451015,-0.103973,-0.067536,-0.108823,-0.19006,-0.177545
B_COST,-1.084663,-1.294047,-1.385676,-1.269444,-1.45144,-1.460821
B_FR,-5.35324,-6.371798,-5.969799,-6.119264,-6.287138,-6.305167
B_TIME,-1.276782,-2.27339,0.572966,-2.806484,-2.968873,-3.04735
B_TIME_S,,1.683225,1.245342,,,0.637396
OMEGA,,,,0.748568,,
G_FIRST,,,,,-0.989434,-1.040965
G_GA,,,,,4.213248,4.257477
G_INTERCEPT,,,,,-1.524404,-1.551245
