In [1]:
import pandas as pd
import pickle
from urllib.request import urlopen
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
import biogeme.results as res
from biogeme.expressions import Beta, log, exp, bioDraws, MonteCarlo

The estimation may take a significant amount of time. Therefore, this notebook can also be executed using models that have already been estimated: the results are read from file if the variable `numberOfDraws` below is set to `None`.

In [2]:
# Set to an integer (e.g. 10000) to run the estimation instead of reading
# previously estimated results; None reads the results from file.
numberOfDraws = None

The following URL root and helper function are used to obtain the files available online.

In [3]:
# Common prefix of the URLs of the online files used in this notebook
# (the data file and the pickled estimation results).
url_root = ''.join([
    'https://courses.edx.org/',
    'asset-v1:EPFLx+ChoiceModels2x+3T2021+type@asset+block@',
])

In [4]:
def get_results_from_url(file):
    """Download pickled estimation results and wrap them in a results object.

    :param file: name of the pickle file, appended to ``url_root``.
    :return: a ``res.bioResults`` object built from the unpickled data.
    """
    # NOTE(security): unpickling can execute arbitrary code. This is only
    # acceptable because the files come from the trusted course server.
    with urlopen(f'{url_root}{file}') as remote:
        raw_results = pickle.load(remote)
    return res.bioResults(raw_results)

# Data

In [5]:
# Read the Swissmetro data file from the course URL and wrap the resulting
# data frame in a Biogeme database named 'swissmetro'.
# NOTE(review): the data frame is bound to the name `pandas`, which is easy
# to confuse with the pandas module (imported here as `pd`).
pandas = pd.read_table(f'{url_root}swissmetro.dat')
database = db.Database('swissmetro', pandas)

The following statement allows you to use the names of the variables as Python variables.

In [6]:
# Copy every entry of `database.variables` into the global namespace, so that
# the cells below can refer to PURPOSE, CHOICE, GA, etc. directly.
globals().update(database.variables)

We exclude some observations: those for which `PURPOSE` is neither 1 nor 3, and those for which `CHOICE` is 0.

In [7]:
# Flag the observations to remove: PURPOSE is neither 1 nor 3, or CHOICE is 0.
# The product of the two comparisons plays the role of a logical "and" and the
# sum the role of a logical "or" (assuming comparisons evaluate to 0/1).
exclude = ((PURPOSE != 1) * (PURPOSE != 3) + (CHOICE == 0)) > 0
database.remove(exclude)

# Parameters

In [8]:
# Parameters of the utility functions. All of them are created with the same
# arguments after the name: 0, None, None, 0.
# NOTE(review): presumably (initial value, lower bound, upper bound, status)
# in the Biogeme `Beta` signature -- confirm against the Biogeme docs.
ASC_CAR, ASC_TRAIN, B_TIME, B_COST, B_FR = (
    Beta(name, 0, None, None, 0)
    for name in ('ASC_CAR', 'ASC_TRAIN', 'B_TIME', 'B_COST', 'B_FR')
)

# Variables

In [9]:
# Train and Swissmetro costs only count when GA == 0 (they are zero otherwise).
TRAIN_COST = TRAIN_CO * (GA == 0)
SM_COST = SM_CO * (GA == 0)

# Train and car availabilities are switched off when SP == 0.
TRAIN_AV_SP = TRAIN_AV * (SP != 0)
CAR_AV_SP = CAR_AV * (SP != 0)

# Travel times divided by 100.
TRAIN_TT_SCALED = TRAIN_TT / 100
SM_TT_SCALED = SM_TT / 100
CAR_TT_SCALED = CAR_TT / 100

# Costs divided by 100.
TRAIN_COST_SCALED = TRAIN_COST / 100
SM_COST_SCALED = SM_COST / 100
CAR_CO_SCALED = CAR_CO / 100

# The *_HE variables are divided by 1000.
TRAIN_HE_SCALED = TRAIN_HE / 1000
SM_HE_SCALED = SM_HE / 1000

# Indicator of INCOME <= 1, used later in the class membership model.
LOW_INC = INCOME <= 1

# Availability conditions

In [10]:
# Availability of each alternative; the keys (1, 2, 3) match the keys of the
# utility dictionaries defined below.
av = {1: TRAIN_AV_SP, 2: SM_AV, 3: CAR_AV_SP}

# Logit model

## Utility functions

In [11]:
# Utility of alternative 1: constant, travel time, cost and the *_HE term.
V1 = (
    ASC_TRAIN
    + B_TIME * TRAIN_TT_SCALED
    + B_COST * TRAIN_COST_SCALED
    + B_FR * TRAIN_HE_SCALED
)
# Utility of alternative 2: same terms, without alternative specific constant.
V2 = (
    B_TIME * SM_TT_SCALED
    + B_COST * SM_COST_SCALED
    + B_FR * SM_HE_SCALED
)
# Utility of alternative 3: constant, travel time and cost only.
V3 = (
    ASC_CAR
    + B_TIME * CAR_TT_SCALED
    + B_COST * CAR_CO_SCALED
)
# Utilities keyed by alternative number, like the availabilities in `av`.
V = {1: V1, 2: V2, 3: V3}

## Model

In [12]:
# Log of the logit probability of the alternative given by CHOICE, built from
# the utilities V and the availability conditions av.
logprob = models.loglogit(V, av, CHOICE)

## Estimation

In [13]:
# Estimate the logit model on the database, under the model name '01logit'.
# Unlike the last model of this notebook, this estimation is not guarded by
# `numberOfDraws`: it runs every time the cell is executed.
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = '01logit'
results_logit = biogeme.estimate()

## Results

In [14]:
# Keep the general statistics (reused in the comparison at the end of the
# notebook) and print their text summary.
stats_logit = results_logit.getGeneralStatistics()
print(results_logit.printGeneralStatistics())

Number of estimated parameters:	5
Sample size:	6768
Excluded observations:	3960
Init log likelihood:	-5315.386
Final log likelihood:	-5315.386
Likelihood ratio test for the init. model:	-0
Rho-square for the init. model:	0
Rho-square-bar for the init. model:	-0.000941
Akaike Information Criterion:	10640.77
Bayesian Information Criterion:	10674.87
Final gradient norm:	8.1247E-03
Nbr of threads:	16



In [15]:
# Table of estimated parameters; its 'Value' column is reused in the
# comparison at the end of the notebook.
param_logit = results_logit.getEstimatedParameters()
param_logit

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ASC_CAR,-0.261838,0.047307,-5.534875,3.114498e-08,0.061496,-4.257798,2.064506e-05
ASC_TRAIN,-0.451015,0.069678,-6.472835,9.618062e-11,0.09324,-4.837114,1.31738e-06
B_COST,-1.084663,0.051826,-20.929115,0.0,0.068235,-15.895902,0.0
B_FR,-5.35324,0.963865,-5.553932,2.793141e-08,0.983023,-5.44569,5.160495e-08
B_TIME,-1.276782,0.056938,-22.424014,0.0,0.104436,-12.225485,0.0


# Random parameter: normal distribution

Read the results from file

In [16]:
# Load the previously estimated model with a normally distributed parameter,
# then extract its parameter table and general statistics.
results_normal = get_results_from_url('rc-02normal.pickle')
param_normal, stats_normal = (
    results_normal.getEstimatedParameters(),
    results_normal.getGeneralStatistics(),
)

# Random parameter: lognormal distribution


Read the results from file

In [17]:
# Load the previously estimated model with a lognormally distributed
# parameter, then extract its parameter table and general statistics.
results_lognormal = get_results_from_url('rc-03lognormal.pickle')
param_lognormal, stats_lognormal = (
    results_lognormal.getEstimatedParameters(),
    results_lognormal.getGeneralStatistics(),
)

# Latent classes

Read the results from file

In [18]:
# Load the previously estimated latent class model, then extract its
# parameter table and general statistics.
results_latent = get_results_from_url('rc-04latentClass.pickle')
param_latent, stats_latent = (
    results_latent.getEstimatedParameters(),
    results_latent.getGeneralStatistics(),
)

# Latent classes with class membership model

We consider again two classes in the population. The first class contains the individuals who considered all variables when making their choice. For them, the specification of the utility function is the same as for the logit model.

In [19]:
# Class 1 utilities: same specification as the logit model above, with the
# travel time term included for every alternative.
V1_1 = (
    ASC_TRAIN
    + B_TIME * TRAIN_TT_SCALED
    + B_COST * TRAIN_COST_SCALED
    + B_FR * TRAIN_HE_SCALED
)
V2_1 = (
    B_TIME * SM_TT_SCALED
    + B_COST * SM_COST_SCALED
    + B_FR * SM_HE_SCALED
)
V3_1 = (
    ASC_CAR
    + B_TIME * CAR_TT_SCALED
    + B_COST * CAR_CO_SCALED
)
V_1 = {1: V1_1, 2: V2_1, 3: V3_1}

The second class of individuals ignored the travel time variable when making the choice. Therefore, this variable is removed from the utility function.

In [20]:
# Class 2 utilities: same as class 1 with the B_TIME terms removed.
V1_2 = (
    ASC_TRAIN
    + B_COST * TRAIN_COST_SCALED
    + B_FR * TRAIN_HE_SCALED
)
V2_2 = (
    B_COST * SM_COST_SCALED
    + B_FR * SM_HE_SCALED
)
V3_2 = (
    ASC_CAR
    + B_COST * CAR_CO_SCALED
)
V_2 = {1: V1_2, 2: V2_2, 3: V3_2}

The following parameters are involved in the class membership model.

In [21]:
# Parameters of the class membership model, created with the same arguments
# (0, None, None, 0) as the parameters of the utility functions.
G_INTERCEPT, G_MALE, G_GA, G_PURP3, G_LOW_INC, G_FIRST = (
    Beta(name, 0, None, None, 0)
    for name in (
        'G_INTERCEPT',
        'G_MALE',
        'G_GA',
        'G_PURP3',
        'G_LOW_INC',
        'G_FIRST',
    )
)

Class membership model. Note that `OMEGA` can potentially take any real value. We have to transform it into a probability, that is, a value between 0 and 1. For class 1, we use the transform `1 / (1 + exp(OMEGA))`; the probability of class 2 is the complement.

In [22]:
# Linear function of the socio-economic characteristics.
OMEGA = (
    G_INTERCEPT
    + G_MALE * MALE
    + G_GA * GA
    + G_PURP3 * (PURPOSE == 3)
    + G_LOW_INC * LOW_INC
    + G_FIRST * FIRST
)
# Map OMEGA to the probability of class 1; class 2 gets the complement, so
# that the two class probabilities sum to one.
prob_class_1 = 1 / (1 + exp(OMEGA))
prob_class_2 = 1 - prob_class_1

## Model

We first calculate the choice probability for each class.

In [23]:
# Logit probability of the alternative given by CHOICE within each class,
# using the class-specific utilities and the common availabilities `av`.
prob_1 = models.logit(V_1, av, CHOICE)
prob_2 = models.logit(V_2, av, CHOICE)

The choice probability is obtained by using the class membership model.

In [24]:
# Mixture of the two class-specific choice probabilities, weighted by the
# class membership probabilities.
prob = prob_class_1 * prob_1 + prob_class_2 * prob_2
# Note: this rebinds `logprob`, which previously held the logit model.
logprob = log(prob)

## Estimation

In [25]:
# Estimate the model only when requested; otherwise load the results that
# were estimated previously.
if numberOfDraws is not None:
    biogeme = bio.BIOGEME(database, logprob)
    biogeme.modelName = '05latentClass'
    results_latentsocio = biogeme.estimate()
    print(f'Results saved in file {results_latentsocio.data.pickleFileName}')
else:
    results_latentsocio = get_results_from_url('rc-05latentClass.pickle')

## Results

In [26]:
# Keep the general statistics (reused in the comparison below) and print
# their text summary.
stats_latentsocio = results_latentsocio.getGeneralStatistics()
print(results_latentsocio.printGeneralStatistics())

Number of estimated parameters:	11
Sample size:	6768
Excluded observations:	3960
Init log likelihood:	-6964.663
Final log likelihood:	-4928.798
Likelihood ratio test for the init. model:	4071.731
Rho-square for the init. model:	0.292
Rho-square-bar for the init. model:	0.291
Akaike Information Criterion:	9879.595
Bayesian Information Criterion:	9954.615
Final gradient norm:	1.7210E-02
Nbr of threads:	16



In [27]:
# Table of estimated parameters; its 'Value' column is reused in the
# comparison below.
param_latentsocio = results_latentsocio.getEstimatedParameters()
param_latentsocio

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ASC_CAR,0.026593,0.052506,0.506472,0.6125255,0.055788,0.47667,0.6335969
ASC_TRAIN,-0.19006,0.076349,-2.48935,0.01279768,0.080989,-2.34674,0.01893848
B_COST,-1.45144,0.063685,-22.790952,0.0,0.095266,-15.235734,0.0
B_FR,-6.287138,1.05284,-5.971601,2.349368e-09,1.037588,-6.059377,1.366495e-09
B_TIME,-2.968873,0.146068,-20.325338,0.0,0.153076,-19.394729,0.0
G_FIRST,-0.989434,0.193427,-5.115278,3.132794e-07,0.208886,-4.736722,2.172024e-06
G_GA,4.213248,0.519083,8.11671,4.440892e-16,0.499629,8.432755,0.0
G_INTERCEPT,-1.524404,0.301512,-5.055867,4.284402e-07,0.366282,-4.161834,3.157019e-05
G_LOW_INC,0.334647,0.223976,1.494123,0.1351434,0.230073,1.454527,0.1458004
G_MALE,-0.940101,0.199159,-4.720362,2.354249e-06,0.212399,-4.426113,9.594631e-06


# Comparison

We build a summary data frame. We first gather the parameter estimates for each model.

In [28]:
# Parameter tables of the five models, in the order of the summary columns.
parameters = [
    param_logit,
    param_normal,
    param_lognormal,
    param_latent,
    param_latentsocio,
]
# Keep only the estimates, as one single-column data frame per model.
parameters_values = [
    estimates['Value'].to_frame() for estimates in parameters
]
# Align the models side by side on the parameter names; a parameter that is
# absent from a model is left missing in that model's column.
summary = pd.concat(parameters_values, axis=1)
summary.columns = [
    'Logit',
    'Normal',
    'Lognormal',
    'Latent',
    'Latent with class mbship',
]

Then we gather the value of the final log likelihood for each model.

In [29]:
# General statistics of each model, keyed by the column labels of `summary`.
stats = dict([
    ('Logit', stats_logit),
    ('Normal', stats_normal),
    ('Lognormal', stats_lognormal),
    ('Latent', stats_latent),
    ('Latent with class mbship', stats_latentsocio),
])

In [30]:
# Final log likelihood of each model. Each statistic is indexed with [0]
# (presumably a (value, format) pair -- confirm against the Biogeme docs).
loglike = {
    model: model_stats['Final log likelihood'][0]
    for model, model_stats in stats.items()
}
loglike_row = pd.Series(data=loglike, name='Log likelihood')
# `DataFrame.append` was removed in pandas 2.0, so the row is added with
# `pd.concat`. Any 'Log likelihood' row left by a previous execution is
# dropped first, so that re-running this cell does not duplicate the row.
summary = pd.concat([
    summary.drop(index='Log likelihood', errors='ignore'),
    loglike_row.to_frame().T,
])
# Display the table with blanks for parameters that are absent from a model.
summary.fillna('')

Unnamed: 0,Logit,Normal,Lognormal,Latent,Latent with class mbship
ASC_CAR,-0.261838,0.011932,0.054452,0.00295,0.026593
ASC_TRAIN,-0.451015,-0.103973,-0.067536,-0.108823,-0.19006
B_COST,-1.084663,-1.294047,-1.385676,-1.269444,-1.45144
B_FR,-5.35324,-6.371798,-5.969799,-6.119264,-6.287138
B_TIME,-1.276782,-2.27339,0.572966,-2.806484,-2.968873
B_TIME_S,,1.683225,1.245342,,
OMEGA,,,,0.748568,
G_FIRST,,,,,-0.989434
G_GA,,,,,4.213248
G_INTERCEPT,,,,,-1.524404
