In [1]:
import pandas as pd
import itertools
import pickle
from urllib.request import urlopen
import biogeme.database as db
import biogeme.biogeme as bio
import biogeme.models as models
import biogeme.results as res
from biogeme.expressions import Beta, log, exp

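We first define the root URL of the course files. The function below then downloads a pickle file containing previously saved estimation results and returns it as a Biogeme results object.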

In [2]:
# Common prefix of every course asset URL; a file name is appended to it
# directly, without any separator.
url_root = ''.join([
    'https://courses.edx.org/',
    'asset-v1:EPFLx+ChoiceModels2x+3T2021+type@asset+block@',
])

In [3]:
def get_results_from_url(file):
    """Download pickled estimation results and wrap them in a results object.

    :param file: name of the pickle file; it is appended to ``url_root``.
    :return: ``res.bioResults`` built from the unpickled content.

    NOTE(review): ``pickle.load`` can execute arbitrary code embedded in the
    payload, so this must only be used with a trusted ``url_root``.
    """
    with urlopen(f'{url_root}{file}') as response:
        payload = pickle.load(response)
    return res.bioResults(payload)

# Data

In [4]:
# Read the Swissmetro data file from the course site and wrap the resulting
# data frame in a Biogeme database.
# NOTE(review): the data frame is bound to the name `pandas`, which is easy
# to confuse with the library itself (imported in this notebook as `pd`).
swissmetro_url = f'{url_root}swissmetro.dat'
pandas = pd.read_table(swissmetro_url)
database = db.Database('swissmetro', pandas)

The following statement allows you to use the names of the variables as Python variables.

In [5]:
# Copy every entry of `database.variables` into the notebook's global
# namespace, so that later cells can refer to the variables by bare name
# (e.g. PURPOSE, CHOICE, GA).
# NOTE(review): presumably one entry per column of the data file — confirm.
globals().update(database.variables)

We exclude the observations where `PURPOSE` is neither 1 nor 3, as well as those where `CHOICE` is 0.

In [6]:
# The comparisons are combined arithmetically: `*` acts as "and", `+` acts
# as "or", and `> 0` turns the sum back into a condition.
purpose_not_1_nor_3 = (PURPOSE != 1) * (PURPOSE != 3)
choice_is_zero = (CHOICE == 0)
exclude = (purpose_not_1_nor_3 + choice_is_zero) > 0

# Drop the flagged observations from the database.
database.remove(exclude)

# Parameters

In [7]:
# Unknown parameters of the utility functions. All five are created with the
# same arguments after the name: 0, None, None, 0.
# NOTE(review): presumably (initial value, lower bound, upper bound, status)
# — confirm against the Beta documentation.
ASC_CAR, ASC_TRAIN, B_TIME, B_COST, B_FR = (
    Beta(label, 0, None, None, 0)
    for label in ('ASC_CAR', 'ASC_TRAIN', 'B_TIME', 'B_COST', 'B_FR')
)

# Variables

In [8]:
# Train and Swissmetro costs are multiplied by the (GA == 0) indicator,
# so they are zero whenever GA is not 0.
TRAIN_COST = TRAIN_CO * (GA == 0)
SM_COST = SM_CO * (GA == 0)

# Train and car availabilities are multiplied by the (SP != 0) indicator.
TRAIN_AV_SP = TRAIN_AV * (SP != 0)
CAR_AV_SP = CAR_AV * (SP != 0)

# The *_TT and cost variables are divided by 100.
TRAIN_TT_SCALED = TRAIN_TT / 100
SM_TT_SCALED = SM_TT / 100
CAR_TT_SCALED = CAR_TT / 100
TRAIN_COST_SCALED = TRAIN_COST / 100
SM_COST_SCALED = SM_COST / 100
CAR_CO_SCALED = CAR_CO / 100

# The *_HE variables are divided by 1000.
TRAIN_HE_SCALED = TRAIN_HE / 1000
SM_HE_SCALED = SM_HE / 1000

# Indicators used later in the class membership model.
LOW_INC = (INCOME <= 1)
BUSINESS = PURPOSE == 3

# Availability conditions

In [9]:
# Availability expression of each alternative, keyed by alternative id
# (the same 1/2/3 keys as the utility dictionaries).
av = {1: TRAIN_AV_SP, 2: SM_AV, 3: CAR_AV_SP}

# Logit model

## Utility functions

In [10]:
# Utility function of each alternative. Alternatives 1 and 3 carry a
# constant (ASC_TRAIN, ASC_CAR); alternative 2 has none.
V1 = (
    ASC_TRAIN
    + B_TIME * TRAIN_TT_SCALED
    + B_COST * TRAIN_COST_SCALED
    + B_FR * TRAIN_HE_SCALED
)
V2 = (
    B_TIME * SM_TT_SCALED
    + B_COST * SM_COST_SCALED
    + B_FR * SM_HE_SCALED
)
V3 = (
    ASC_CAR
    + B_TIME * CAR_TT_SCALED
    + B_COST * CAR_CO_SCALED
)

# Utilities keyed by alternative id, matching the keys of `av`.
V = {1: V1, 2: V2, 3: V3}

## Model

In [11]:
# Expression passed to the estimation below, built from the utilities `V`,
# the availabilities `av` and the observed `CHOICE`.
# NOTE(review): presumably the log of the logit probability of the chosen
# alternative — confirm against the `models.loglogit` documentation.
logprob = models.loglogit(V, av, CHOICE)

## Estimation

In [12]:
# Build the estimation object from the database and the `logprob`
# expression, name the model, and run the estimation.
# NOTE(review): the name `biogeme` is rebound by the latent class estimation
# cell further down.
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = '01logit'
results_logit = biogeme.estimate()

## Results

In [13]:
# Keep the general statistics of the logit model (reused in the final
# comparison), then print their formatted version.
stats_logit = results_logit.getGeneralStatistics()
print(results_logit.printGeneralStatistics())

Number of estimated parameters:	5
Sample size:	6768
Excluded observations:	3960
Init log likelihood:	-5315.386
Final log likelihood:	-5315.386
Likelihood ratio test for the init. model:	-0
Rho-square for the init. model:	0
Rho-square-bar for the init. model:	-0.000941
Akaike Information Criterion:	10640.77
Bayesian Information Criterion:	10674.87
Final gradient norm:	8.1247E-03
Nbr of threads:	16



In [14]:
# Table of estimated parameters of the logit model; kept in `param_logit`
# for the final comparison and displayed as the cell output.
param_logit = results_logit.getEstimatedParameters()
param_logit

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ASC_CAR,-0.261838,0.047307,-5.534875,3.114498e-08,0.061496,-4.257798,2.064506e-05
ASC_TRAIN,-0.451015,0.069678,-6.472835,9.618062e-11,0.09324,-4.837114,1.31738e-06
B_COST,-1.084663,0.051826,-20.929115,0.0,0.068235,-15.895902,0.0
B_FR,-5.35324,0.963865,-5.553932,2.793141e-08,0.983023,-5.44569,5.160495e-08
B_TIME,-1.276782,0.056938,-22.424014,0.0,0.104436,-12.225485,0.0


# Random parameter: normal distribution

Read the results from file

In [15]:
# Load the saved results of the model with a normally distributed parameter,
# then extract the estimates and general statistics used in the comparison.
results_normal = get_results_from_url('rc-02normal.pickle')
param_normal = results_normal.getEstimatedParameters()
stats_normal = results_normal.getGeneralStatistics()

# Random parameter: lognormal distribution


Read the results from file

In [16]:
# Load the saved results of the model with a lognormally distributed
# parameter, then extract the estimates and general statistics used in the
# comparison.
results_lognormal = get_results_from_url('rc-03lognormal.pickle')
param_lognormal = results_lognormal.getEstimatedParameters()
stats_lognormal = results_lognormal.getGeneralStatistics()

# Latent classes

Read the results from file

In [17]:
# Load the saved results of the latent class model, then extract the
# estimates and general statistics used in the comparison.
results_latent = get_results_from_url('rc-04latentClass.pickle')
param_latent = results_latent.getEstimatedParameters()
stats_latent = results_latent.getGeneralStatistics()

# Latent classes with class membership model

We consider again two classes in the population. The first class of individuals has considered all variables when making their choice. For them, the specification of the utility function is the same as for the logit model.

In [18]:
# Class 1 utilities: same terms as the logit specification, including the
# B_TIME term.
V1_1 = (
    ASC_TRAIN
    + B_TIME * TRAIN_TT_SCALED
    + B_COST * TRAIN_COST_SCALED
    + B_FR * TRAIN_HE_SCALED
)
V2_1 = (
    B_TIME * SM_TT_SCALED
    + B_COST * SM_COST_SCALED
    + B_FR * SM_HE_SCALED
)
V3_1 = (
    ASC_CAR
    + B_TIME * CAR_TT_SCALED
    + B_COST * CAR_CO_SCALED
)

# Keyed by alternative id, matching the keys of `av`.
V_1 = {1: V1_1, 2: V2_1, 3: V3_1}

The second class of individuals ignored the travel time variable when making the choice. Therefore, this variable is removed from the utility function.

In [19]:
# Class 2 utilities: the class 1 specification without the B_TIME term.
V1_2 = (
    ASC_TRAIN
    + B_COST * TRAIN_COST_SCALED
    + B_FR * TRAIN_HE_SCALED
)
V2_2 = (
    B_COST * SM_COST_SCALED
    + B_FR * SM_HE_SCALED
)
V3_2 = (
    ASC_CAR
    + B_COST * CAR_CO_SCALED
)

# Keyed by alternative id, matching the keys of `av`.
V_2 = {1: V1_2, 2: V2_2, 3: V3_2}

The following parameters are involved in the class membership model.

In [20]:
# Parameters of the class membership model, all created with the same
# arguments after the name: 0, None, None, 0.
# NOTE(review): presumably (initial value, lower bound, upper bound, status)
# — confirm against the Beta documentation.
G_INTERCEPT, G_MALE, G_GA, G_BUSINESS, G_LOW_INC, G_FIRST = (
    Beta(label, 0, None, None, 0)
    for label in (
        'G_INTERCEPT',
        'G_MALE',
        'G_GA',
        'G_BUSINESS',
        'G_LOW_INC',
        'G_FIRST',
    )
)

The following function returns the expressions for the class membership probabilities. If `value` is set to `True`, the values instead of the expressions are returned.

Note that `W` can potentially take any real value. We transform it into probabilities using the logistic function: the probability of class 1 is `1 / (1 + exp(W))`, and the probability of class 2 is `exp(W) / (1 + exp(W))`.

In [21]:
 def omega(MALE, GA, BUSINESS, LOW_INC, FIRST, value=False):
     """Return the membership probabilities of the two latent classes.

     A linear score ``W`` is built from the ``G_*`` parameters and the five
     arguments. Class 1 receives ``1 / (1 + exp(W))`` and class 2 receives
     ``exp(W) / (1 + exp(W))``, so the two probabilities sum to one.

     :param MALE: term multiplied by ``G_MALE`` in the score.
     :param GA: term multiplied by ``G_GA`` in the score.
     :param BUSINESS: term multiplied by ``G_BUSINESS`` in the score.
     :param LOW_INC: term multiplied by ``G_LOW_INC`` in the score.
     :param FIRST: term multiplied by ``G_FIRST`` in the score.
         In this notebook the five arguments are passed either as
         expressions (model specification) or as 0/1 numbers (simulation).
     :param value: if True, return ``getValue()`` of each probability
         instead of the expressions themselves.
     :return: tuple ``(class 1, class 2)``.
     """
     W = (G_INTERCEPT +
          G_MALE * MALE +
          G_GA * GA +
          G_BUSINESS * BUSINESS +
          G_LOW_INC * LOW_INC +
          G_FIRST * FIRST)
     # W may be any real number; the logistic transform maps it to (0, 1).
     OMEGA_1 = 1 / (1 + exp(W))
     OMEGA_2 = exp(W) / (1 + exp(W))
     if value:
         return OMEGA_1.getValue(), OMEGA_2.getValue()
     return OMEGA_1, OMEGA_2

In [22]:
# Class membership probabilities as expressions (`value` is left at False),
# built from the corresponding variables of the data set.
prob_class_1, prob_class_2 = omega(MALE, GA, BUSINESS, LOW_INC, FIRST)

## Model

We first calculate the choice probability for each class.

In [23]:
# Choice probability conditional on each class: the same availabilities and
# observed choice, with the class-specific utilities.
prob_1, prob_2 = (
    models.logit(class_utilities, av, CHOICE)
    for class_utilities in (V_1, V_2)
)

The choice probability is obtained by using the class membership model.

In [24]:
# Mixture over the two classes: each class-specific choice probability is
# weighted by the corresponding class membership probability.
prob = prob_class_1 * prob_1 + prob_class_2 * prob_2
# NOTE(review): this rebinds `logprob`, which previously held the expression
# of the logit model.
logprob = log(prob)

## Estimation

In [25]:
# Estimate the latent class model with class membership: build the
# estimation object from the database and the mixture `logprob`, name the
# model, and run the estimation.
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = '05latentClass'
results_latentsocio = biogeme.estimate()
# Report the name of the pickle file attached to the results.
print(f'Results saved in file {results_latentsocio.data.pickleFileName}')

Results saved in file 05latentClass~01.pickle


## Results

In [26]:
# Keep the general statistics of this model (reused in the final
# comparison), then print their formatted version.
stats_latentsocio = results_latentsocio.getGeneralStatistics()
print(results_latentsocio.printGeneralStatistics())

Number of estimated parameters:	11
Sample size:	6768
Excluded observations:	3960
Init log likelihood:	-4928.798
Final log likelihood:	-4928.798
Likelihood ratio test for the init. model:	-0
Rho-square for the init. model:	0
Rho-square-bar for the init. model:	-0.00223
Akaike Information Criterion:	9879.595
Bayesian Information Criterion:	9954.615
Final gradient norm:	2.6227E-02
Nbr of threads:	16



In [27]:
# Table of estimated parameters of this model; kept in `param_latentsocio`
# for the final comparison and displayed as the cell output.
param_latentsocio = results_latentsocio.getEstimatedParameters()
param_latentsocio

Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
ASC_CAR,0.026467,0.052503,0.504109,0.6141847,0.055776,0.474522,0.6351278
ASC_TRAIN,-0.189438,0.076345,-2.481334,0.01308917,0.080963,-2.339798,0.01929415
B_COST,-1.451455,0.063687,-22.790603,0.0,0.095269,-15.235395,0.0
B_FR,-6.298512,1.052949,-5.981783,2.207075e-09,1.037793,-6.069138,1.285984e-09
B_TIME,-2.969263,0.146064,-20.328519,0.0,0.153037,-19.402202,0.0
G_BUSINESS,1.468889,0.284945,5.154991,2.536435e-07,0.361635,4.061798,4.869613e-05
G_FIRST,-0.988795,0.193323,-5.114731,3.141887e-07,0.208682,-4.738278,2.155415e-06
G_GA,4.210703,0.518003,8.128728,4.440892e-16,0.497814,8.458387,0.0
G_INTERCEPT,-1.523706,0.301364,-5.056026,4.280832e-07,0.365904,-4.164227,3.124088e-05
G_LOW_INC,0.334157,0.223943,1.492151,0.1356595,0.230036,1.452629,0.1463269


# Membership probability

We use the `itertools.product` function to enumerate all the combinations of values of the binary variables. We also use the `omega` function defined above, that returns the class membership probabilities.

In [28]:
# Class membership probabilities for every 0/1 combination of the five
# binary variables, listed in the argument order of `omega`.
# NOTE(review): the labels keep the leading space of the original cell.
socio_labels = [' MALE', ' GA', ' BUSINESS', ' LOW_INC', ' FIRST']

# Rows are collected in a list and the data frame is built once at the end:
# `DataFrame.append` was removed in pandas 2.0, and growing a frame row by
# row copies it at every iteration.
records = []
for combination in itertools.product([0, 1], repeat=len(socio_labels)):
    # A dedicated local name is used so that the model expression `prob`
    # defined in an earlier cell is not overwritten.
    class_probabilities = omega(*combination, value=True)
    # Probabilities are stored directly as percentage strings.
    record = {
        f'Class {k + 1}': f'{100 * p:.1f}%'
        for k, p in enumerate(class_probabilities)
    }
    record.update(zip(socio_labels, combination))
    records.append(record)

# Column order follows the key order of the records: the two classes first,
# then the binary variables (kept as the integers 0 and 1).
simulation = pd.DataFrame(records)

In [29]:
# Display the table of class membership probabilities.
simulation

Unnamed: 0,Class 1,Class 2,MALE,GA,BUSINESS,LOW_INC,FIRST
0,82.1%,17.9%,0.0,0.0,0.0,0.0,0.0
1,92.5%,7.5%,0.0,0.0,0.0,0.0,1.0
2,76.7%,23.3%,0.0,0.0,0.0,1.0,0.0
3,89.8%,10.2%,0.0,0.0,0.0,1.0,1.0
4,51.4%,48.6%,0.0,0.0,1.0,0.0,0.0
5,74.0%,26.0%,0.0,0.0,1.0,0.0,1.0
6,43.1%,56.9%,0.0,0.0,1.0,1.0,0.0
7,67.0%,33.0%,0.0,0.0,1.0,1.0,1.0
8,6.4%,93.6%,0.0,1.0,0.0,0.0,0.0
9,15.5%,84.5%,0.0,1.0,0.0,0.0,1.0


# Comparison

We build a summary data frame. We first gather the parameter estimates for each model.

In [30]:
# One table of estimates per model, in the same order as the column labels
# assigned to `summary` below.
parameters = [
    param_logit,
    param_normal,
    param_lognormal,
    param_latent,
    param_latentsocio,
]

# Keep only the 'Value' column of each table, as a one-column data frame.
parameters_values = [pd.DataFrame(table['Value']) for table in parameters]

# Put the columns side by side; rows are aligned on the index, so a
# parameter that is absent from a model is left missing in its column.
summary = pd.concat(parameters_values, axis='columns')
summary.columns = [
    'Logit',
    'Normal',
    'Lognormal',
    'Latent',
    'Latent with class mbship',
]

Then we gather the value of the final log likelihood for each model.

In [31]:
# General statistics of each model, keyed by the labels used for the
# columns of `summary`.
stats = dict([
    ('Logit', stats_logit),
    ('Normal', stats_normal),
    ('Lognormal', stats_lognormal),
    ('Latent', stats_latent),
    ('Latent with class mbship', stats_latentsocio),
])

In [32]:
# Final log likelihood of each model: element 0 of the
# 'Final log likelihood' entry of its general statistics.
# NOTE(review): presumably each entry is a (value, format) pair — confirm.
loglike = {
    model: statistics['Final log likelihood'][0]
    for model, statistics in stats.items()
}
loglike_row = pd.Series(data=loglike, name='Log likelihood')

# `DataFrame.append` was removed in pandas 2.0; `pd.concat` adds the same
# row, labelled with the name of the series. A 'Log likelihood' row left by
# a previous run of this cell is dropped first, so that re-running the cell
# does not duplicate it.
summary = pd.concat([
    summary.drop(index=loglike_row.name, errors='ignore'),
    loglike_row.to_frame().T,
])

# Show missing entries as blanks in the displayed table only.
summary.fillna('')

Unnamed: 0,Logit,Normal,Lognormal,Latent,Latent with class mbship
ASC_CAR,-0.261838,0.011932,0.054452,0.00295,0.026467
ASC_TRAIN,-0.451015,-0.103973,-0.067536,-0.108823,-0.189438
B_COST,-1.084663,-1.294047,-1.385676,-1.269444,-1.451455
B_FR,-5.35324,-6.371798,-5.969799,-6.119264,-6.298512
B_TIME,-1.276782,-2.27339,0.572966,-2.806484,-2.969263
B_TIME_S,,1.683225,1.245342,,
OMEGA,,,,0.748568,
G_BUSINESS,,,,,1.468889
G_FIRST,,,,,-0.988795
G_GA,,,,,4.210703
