In [1]:
import pandas as pd
import pickle
from urllib.request import urlopen
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
import biogeme.results as res
import biogeme.version as ver
from biogeme.expressions import Beta, log, exp, bioDraws, MonteCarlo

Version of Biogeme

In [2]:
# Print the Biogeme version text so the recorded outputs below can be tied to
# the release that produced them.
print(ver.getText())

biogeme 3.2.8 [2021-09-02]
Version entirely written in Python
Home page: http://biogeme.epfl.ch
Submit questions to https://groups.google.com/d/forum/biogeme
Michel Bierlaire, Transport and Mobility Laboratory, Ecole Polytechnique Fédérale de Lausanne (EPFL)



The estimation may take a significant amount of time. Therefore, this notebook can also be run with previously estimated models: if the variable `numberOfDraws` below is set to `None`, the results are read from file instead of being re-estimated.

In [3]:
# Number of draws used when the lognormal mixture model is estimated.
# Set it to an integer (e.g. 10000) to re-estimate that model, or leave it as
# None to read the previously estimated results from file.
numberOfDraws = None

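The data file and the previously estimated results are available online, under the root URL defined below. The function that follows reads an estimation result from a pickle file stored at that location.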

In [4]:
# Common prefix of every online asset (data file and pickled results) used in
# this notebook; file names are appended directly to it.
url_root = 'https://courses.edx.org/asset-v1:EPFLx+ChoiceModels2x+3T2021+type@asset+block@'

In [5]:
def get_results_from_url(file):
    """Download a pickled estimation result and wrap it in ``bioResults``.

    :param file: name of the pickle file; it is appended to ``url_root``.
    :return: the ``res.bioResults`` object built from the unpickled content.

    SECURITY: ``pickle.load`` can execute arbitrary code while loading.
    Only use this function with a ``url_root`` that you trust.
    """
    with urlopen(f'{url_root}{file}') as response:
        raw_results = pickle.load(response)
    return res.bioResults(raw_results)

# Data

In [6]:
# Read the tab-separated data file from the online location and wrap the
# resulting data frame in a Biogeme database named 'swissmetro'.
# Note: `pandas` here is the data frame, not the library (imported as `pd`).
pandas = pd.read_csv(f'{url_root}swissmetro.dat', sep='\t')
database = db.Database('swissmetro', pandas)

The following statement allows you to use the names of the variables in the database as Python variables.

In [7]:
# Copy every entry of `database.variables` into the notebook's global
# namespace, so that names such as PURPOSE, CHOICE or TRAIN_TT can be used
# directly in the expressions below. Any existing global with the same name
# is silently overwritten.
globals().update(database.variables)

We exclude the observations for which `PURPOSE` is neither 1 nor 3, as well as those for which `CHOICE` is 0.

In [8]:
# Boolean logic is spelled with arithmetic: `*` plays the role of AND and `+`
# the role of OR, so an observation is flagged when
# (PURPOSE is neither 1 nor 3) OR (CHOICE == 0).
# Assumes each comparison evaluates to 0/1 — TODO confirm.
exclude = ((PURPOSE != 1) * (PURPOSE != 3) + (CHOICE == 0)) > 0
# Remove the flagged observations; the return value is not used, so this
# presumably modifies `database` in place — confirm.
database.remove(exclude)

# Parameters

In [9]:
# Unknown parameters of the model. Every one is created with the same
# arguments (presumably: initial value 0, no lower/upper bound, status 0 =
# to be estimated — see the Biogeme `Beta` documentation).
ASC_CAR, ASC_TRAIN, B_TIME, B_COST, B_FR = (
    Beta(name, 0, None, None, 0)
    for name in ('ASC_CAR', 'ASC_TRAIN', 'B_TIME', 'B_COST', 'B_FR')
)

# Variables

In [10]:
# Costs are multiplied by the comparison (GA == 0), i.e. they are set to zero
# whenever GA != 0.
# NOTE(review): GA presumably flags travellers who do not pay the fare —
# confirm against the data description.
SM_COST = SM_CO * (GA == 0)
TRAIN_COST = TRAIN_CO * (GA == 0)
# Car and train availabilities are set to zero whenever SP == 0.
CAR_AV_SP = CAR_AV * (SP != 0)
TRAIN_AV_SP = TRAIN_AV * (SP != 0)
# Travel times and costs are divided by 100.
# NOTE(review): presumably to keep the coefficients at comparable
# magnitudes — confirm.
TRAIN_TT_SCALED = TRAIN_TT / 100
TRAIN_COST_SCALED = TRAIN_COST / 100
SM_TT_SCALED = SM_TT / 100
SM_COST_SCALED = SM_COST / 100
CAR_TT_SCALED = CAR_TT / 100
CAR_CO_SCALED = CAR_CO / 100
# The *_HE variables are divided by 1000.
TRAIN_HE_SCALED = TRAIN_HE / 1000
SM_HE_SCALED = SM_HE / 1000
# Indicator of INCOME <= 1; not referenced by any model in the visible cells.
LOW_INC = INCOME <= 1

# Availability conditions

In [11]:
# Availability expression of each alternative, keyed by alternative id
# (1: train, 2: SM, 3: car).
av = {1: TRAIN_AV_SP, 2: SM_AV, 3: CAR_AV_SP}

# Logit model

## Utility functions

In [12]:
# Alternative 1 (train): constant + time + cost + TRAIN_HE term.
V1 = (
    ASC_TRAIN
    + B_TIME * TRAIN_TT_SCALED
    + B_COST * TRAIN_COST_SCALED
    + B_FR * TRAIN_HE_SCALED
)
# Alternative 2 (SM): no constant, so it is the reference alternative.
V2 = (
    B_TIME * SM_TT_SCALED
    + B_COST * SM_COST_SCALED
    + B_FR * SM_HE_SCALED
)
# Alternative 3 (car): constant + time + cost.
V3 = (
    ASC_CAR
    + B_TIME * CAR_TT_SCALED
    + B_COST * CAR_CO_SCALED
)
# Utility of each alternative, keyed by alternative id.
V = {1: V1, 2: V2, 3: V3}

## Model

In [13]:
# Expression to be maximized: built by `models.loglogit` from the utilities
# `V`, the availabilities `av` and the observed `CHOICE`.
logprob = models.loglogit(V, av, CHOICE)

## Estimation

In [14]:
# Set up the estimation of the logit model on the filtered database.
biogeme = bio.BIOGEME(database, logprob)
# The model name is set before estimating.
# NOTE(review): Biogeme presumably uses it to name its output files — confirm.
biogeme.modelName = '01logit'
# NOTE(review): the recorded output below reports identical initial and final
# log likelihood (-5315.386). That would be consistent with the estimation
# having started from values saved by an earlier run rather than from the
# initial values given to `Beta` — confirm, since a fresh run may print
# different "Init" statistics.
results_logit = biogeme.estimate()

## Results

In [15]:
# Keep the general statistics of the logit model; they are reused in the
# comparison table at the end of the notebook.
stats_logit = results_logit.getGeneralStatistics()
# Print the formatted statistics report.
print(results_logit.printGeneralStatistics())

Number of estimated parameters:	5
Sample size:	6768
Excluded observations:	3960
Init log likelihood:	-5315.386
Final log likelihood:	-5315.386
Likelihood ratio test for the init. model:	-0
Rho-square for the init. model:	0
Rho-square-bar for the init. model:	-0.000941
Akaike Information Criterion:	10640.77
Bayesian Information Criterion:	10674.87
Final gradient norm:	8.1247E-03
Nbr of threads:	16



In [16]:
# Table of estimated parameters of the logit model; its 'Value' column is
# reused in the comparison table at the end of the notebook.
param_logit = results_logit.getEstimatedParameters()
# Bare last expression: displays the table as the cell output.
param_logit

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ASC_CAR,-0.261838,0.047307,-5.534875,3.114498e-08,0.061496,-4.257798,2.064506e-05
ASC_TRAIN,-0.451015,0.069678,-6.472835,9.618062e-11,0.09324,-4.837114,1.31738e-06
B_COST,-1.084663,0.051826,-20.929115,0.0,0.068235,-15.895902,0.0
B_FR,-5.35324,0.963865,-5.553932,2.793141e-08,0.983023,-5.44569,5.160495e-08
B_TIME,-1.276782,0.056938,-22.424014,0.0,0.104436,-12.225485,0.0


# Random parameter: normal distribution

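The model with a normally distributed time coefficient is not estimated in this notebook. Its estimation results are read from a pickle file.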

In [17]:
# The model with a normally distributed coefficient is never estimated here:
# its results are always downloaded, whatever the value of `numberOfDraws`.
results_normal = get_results_from_url('rc-02normal.pickle')
# Parameter estimates and general statistics, kept for the comparison table
# at the end of the notebook.
param_normal = results_normal.getEstimatedParameters()
stats_normal = results_normal.getGeneralStatistics()

# Random parameter: lognormal distribution


In [18]:
# Coefficient multiplying the normal draw (starting value 1).
B_TIME_S = Beta('B_TIME_S', 1, None, None, 0)
# Random time coefficient: minus the exponential of a normally distributed
# term, so it is strictly negative for every draw.
# B_TIME is the parameter defined in the "Parameters" section, but here it
# enters inside the exponential. Its estimate is therefore not comparable, in
# sign or magnitude, with the B_TIME of the logit model.
B_TIME_RND = -exp(B_TIME + B_TIME_S * bioDraws('B_TIME_RND', 'NORMAL'))

## Utility functions

In [19]:
# Same specification as the logit model, with the fixed time coefficient
# replaced by the random coefficient B_TIME_RND. These assignments rebind
# the names V1, V2, V3 and V used by the logit model.

# Alternative 1 (train)
V1 = (
    ASC_TRAIN
    + B_TIME_RND * TRAIN_TT_SCALED
    + B_COST * TRAIN_COST_SCALED
    + B_FR * TRAIN_HE_SCALED
)
# Alternative 2 (SM)
V2 = (
    B_TIME_RND * SM_TT_SCALED
    + B_COST * SM_COST_SCALED
    + B_FR * SM_HE_SCALED
)
# Alternative 3 (car)
V3 = (
    ASC_CAR
    + B_TIME_RND * CAR_TT_SCALED
    + B_COST * CAR_CO_SCALED
)
# Utility of each alternative, keyed by alternative id.
V = {1: V1, 2: V2, 3: V3}

## Model

In [20]:
# Choice probability conditional on the draws: the utilities in `V` contain
# the random coefficient B_TIME_RND.
prob = models.logit(V, av, CHOICE)
# `MonteCarlo` is applied to the conditional probability before taking the
# log (presumably averaging it over the draws — see the Biogeme
# documentation). This rebinds `logprob`, previously the logit expression.
logprob = log(MonteCarlo(prob))

## Estimation

In [21]:
# Estimate the lognormal model only when a number of draws has been given;
# otherwise fall back on the previously estimated results stored online.
if numberOfDraws is not None:
    biogeme = bio.BIOGEME(database, logprob, numberOfDraws=numberOfDraws)
    biogeme.modelName = '03lognormal'
    results_lognormal = biogeme.estimate()
    print(f'Results saved in file {results_lognormal.data.pickleFileName}')
else:
    results_lognormal = get_results_from_url('rc-03lognormal.pickle')

## Results

In [22]:
# Keep the general statistics of the lognormal model; they are reused in the
# comparison table at the end of the notebook.
stats_lognormal = results_lognormal.getGeneralStatistics()
# Print the formatted statistics report.
print(results_lognormal.printGeneralStatistics())

Number of estimated parameters:	6
Sample size:	6768
Excluded observations:	3960
Init log likelihood:	-5701.952
Final log likelihood:	-5215.046
Likelihood ratio test for the init. model:	973.8122
Rho-square for the init. model:	0.0854
Rho-square-bar for the init. model:	0.0843
Akaike Information Criterion:	10442.09
Bayesian Information Criterion:	10483.01
Final gradient norm:	1.2361E-02
Number of draws:	10000
Draws generation time:	0:01:43.083423
Types of draws:	['B_TIME_RND: NORMAL']
Nbr of threads:	16



In [23]:
# Table of estimated parameters of the lognormal model; its 'Value' column is
# reused in the comparison table at the end of the notebook.
param_lognormal = results_lognormal.getEstimatedParameters()
# Bare last expression: displays the table as the cell output.
param_lognormal

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ASC_CAR,0.054452,0.061301,0.888268,0.3743967,0.066259,0.821803,0.411189
ASC_TRAIN,-0.067536,0.083656,-0.807315,0.419485,0.087253,-0.77403,0.4389133
B_COST,-1.385676,0.074961,-18.48526,0.0,0.098536,-14.062635,0.0
B_FR,-5.969799,1.052202,-5.673624,1.39808e-08,1.054736,-5.659992,1.513804e-08
B_TIME,0.572966,0.065452,8.753921,0.0,0.0714,8.0247,1.110223e-15
B_TIME_S,1.245342,0.104267,11.943724,0.0,0.136345,9.13374,0.0


# Comparison

We build a summary data frame. We first gather the parameter estimates for the three models.

In [24]:
# Parameter tables of the three models, in the order of the summary columns.
parameters = [param_logit, param_normal, param_lognormal]
# Keep only the 'Value' column of each table, as a one-column data frame.
parameters_values = [
    estimates['Value'].to_frame() for estimates in parameters
]
# Align the three columns side by side on the parameter names; a parameter
# that is absent from a model shows up as NaN in that model's column.
summary = pd.concat(parameters_values, axis='columns')
summary.columns = ['Logit', 'Normal', 'Lognormal']

Then we gather the value of the final log likelihood for each model.

In [25]:
# General statistics of each model, keyed by the summary column name.
stats = dict(
    Logit=stats_logit,
    Normal=stats_normal,
    Lognormal=stats_lognormal,
)

In [26]:
# Final log likelihood of each model. Each entry of the general statistics
# is indexed with [0] to obtain the value itself.
loglike = {
    model_name: model_stats['Final log likelihood'][0]
    for model_name, model_stats in stats.items()
}
loglike_row = pd.Series(data=loglike,
                        name='Log likelihood')
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0:
# the row is added with `pd.concat` instead. Any 'Log likelihood' row left by
# a previous execution of this cell is dropped first, so that re-running the
# cell does not duplicate the row.
summary = pd.concat(
    [
        summary.drop(index=loglike_row.name, errors='ignore'),
        loglike_row.to_frame().T,
    ]
)
# Display only: parameters absent from a model are shown as empty cells.
summary.fillna('')

Unnamed: 0,Logit,Normal,Lognormal
ASC_CAR,-0.261838,0.011932,0.054452
ASC_TRAIN,-0.451015,-0.103973,-0.067536
B_COST,-1.084663,-1.294047,-1.385676
B_FR,-5.35324,-6.371798,-5.969799
B_TIME,-1.276782,-2.27339,0.572966
B_TIME_S,,1.683225,1.245342
Log likelihood,-5315.386329,-5197.436873,-5215.046312
