In [1]:
import pandas as pd
import pickle
from urllib.request import urlopen
import biogeme.database as db
import biogeme.biogeme as bio
from biogeme import models
import biogeme.results as res
import biogeme.optimization as opt
import biogeme.messaging as msg
from biogeme.expressions import (
    Beta,
    bioDraws,
    PanelLikelihoodTrajectory,
    MonteCarlo,
    log,
)


The estimation of these models may take a while. If you want to run the notebook using pre-estimated results saved on file, set the number of draws below to `None`.

In [2]:
# Number of draws passed to Biogeme for the Monte-Carlo integration. Use an
# integer such as 10000 to re-estimate the models; leave it at None to load
# the pre-estimated results from file instead.
#numberOfDraws = 10000
numberOfDraws = None

In [3]:
def get_results_from_url(file):
    """Load pre-estimated Biogeme results from a pickle file hosted online.

    The address is formed by appending ``file`` to the module-level
    ``url_root``. The downloaded content is unpickled and wrapped in a
    ``res.bioResults`` object.

    :param file: name of the pickle file, relative to ``url_root``.
    :return: the ``res.bioResults`` object built from the unpickled data.
    """
    # NOTE(security): unpickling can execute arbitrary code, so this must
    # only be pointed at a trusted location.
    with urlopen(f'{url_root}{file}') as response:
        raw_results = pickle.load(response)
    return res.bioResults(raw_results)

In [4]:
# Common prefix of the addresses of the data file and of the pickle files
# holding the pre-estimated results.
url_root = (
    'https://courses.edx.org/'
    'asset-v1:EPFLx+ChoiceModels2x+3T2021+type@asset+block@'
)

# Dynamic Choice Models

We analyze the smoking behavior of individuals, as a function of their age and the price of tobacco using synthetic data. We develop a model that predicts, for every year, the probability to smoke or not. 

## Data

We use synthetic data that has been generated as follows. We postulate a true model for the data generation process. It is a mixture of logit models with
two alternatives: ``smoking`` or ``not smoking``.
The utility for individual $n$ associated with "not smoking" in year $t$ is  
\begin{equation}
U_{0nt}= \varepsilon_{0nt}
\end{equation}
 and the utility associated with "smoking" is 
\begin{equation}
U_{1nt}= \beta_{nt} y_{n,t-1} + \beta^p_{nt} P_{t} + c_n + \varepsilon_{1nt},
\end{equation}
where 

- $\beta_{nt} = 10$,

- $y_{n,t-1}=1$ if $n$ is smoking at time $t-1$, $0$ otherwise,

- $\beta^p_{nt} = -0.1$,

- $P_t$ is the price of cigarets at time $t$,

- $c_n$ is an individual specific constant that captures the a priori, intrinsic attraction of each individual towards smoking. It is assumed to be normally distributed in the population, with zero mean and standard deviation 50: $N(0, 50^2)$, and constant over $t$.

We generate a sample of 1000 individuals, and we simulate their smoking behavior from the age of 16 until the age of 100.

The date of birth of each individual is uniformly distributed between 2000 and 2020.
The price of cigarets in 2000 is supposed to be 10. The price of cigarets in year $t$ is $$P_t = 10 \cdot 1.02^{t-2000},$$
which represents a price increase of 2% per year.



In [5]:
logger = msg.bioMessage()
logger.setGeneral()

## True value of the parameters

We store the true value of the parameters for future comparison

In [6]:
# True values of the parameters, indexed by parameter name and stored in a
# single 'Value' column so that they line up with the estimation results.
trueParameters = pd.Series(
    {
        'coef_price': -0.1,
        'beta_last_year': 10,
        'cte_mean': 0,
        'cte_std': 50,
    },
    name='Value',
).to_frame()

## Data

The observations are available in the following data file.

In [7]:
# Read the comma-separated observations into a data frame and display it.
df = pd.read_csv(f'{url_root}smoking.dat')
df

Unnamed: 0,Age,Price,Smoking,LastYear,Id,Smoking45
0,17,14.859474,1,0,0,1
1,18,15.156663,1,1,0,1
2,19,15.459797,1,1,0,1
3,20,15.768993,1,1,0,1
4,21,16.084372,1,1,0,1
...,...,...,...,...,...,...
82995,95,69.633277,1,1,999,1
82996,96,71.025942,1,1,999,1
82997,97,72.446461,1,1,999,1
82998,98,73.895390,1,1,999,1


The data contains the following columns:

- the age of the individual,
- the price of the cigarets,
- a variable that is 1 if the individual is smoking, 0 otherwise,
- a variable that is 1 if the individual was smoking last year, 0 otherwise,
- a unique id for each individual,
- a variable that is 1 if the individual was smoking at the age of 45, in the beginning of the observation period.

In [8]:
# Wrap the data frame in a Biogeme database named 'smoking'.
database = db.Database('smoking', df)
# Expose the database variables as names in the notebook namespace, so that
# the model specifications below can refer to Price, LastYear and Smoking
# directly.
globals().update(database.variables)

## Estimation procedure

The following procedure estimates the choice model (or reads the estimation results from file), and returns the estimated parameters in a Pandas format. 

In [9]:
def estimate(the_logprob, the_name, the_database):
    """Return the estimated parameters of a model in a Pandas format.

    When the module-level ``numberOfDraws`` is ``None``, the results are
    read from the pre-estimated file ``<the_name>.pickle`` through
    ``get_results_from_url``. Otherwise the model is estimated with Biogeme,
    using ``numberOfDraws`` draws. In both cases the general statistics of
    the results are printed.

    :param the_logprob: expression handed to ``bio.BIOGEME`` as the model
        to estimate.
    :param the_name: name of the model; used as the Biogeme model name and
        as the stem of the pickle file.
    :param the_database: Biogeme database handed to ``bio.BIOGEME``. Not
        used when the results are read from file.
    :return: the value of ``getEstimatedParameters()`` on the results.
    """
    if numberOfDraws is not None:
        model = bio.BIOGEME(
            the_database,
            the_logprob,
            numberOfDraws=numberOfDraws,
            removeUnusedVariables=False,
        )
        model.modelName = the_name
        results = model.estimate(algorithm=opt.bioBfgs)
    else:
        results = get_results_from_url(f'{the_name}.pickle')
    print(results.printGeneralStatistics())
    return results.getEstimatedParameters()


## Static model

The static model considers the data as cross-sectional. No state dependence, and no serial correlation is captured.

In [10]:
# Parameters of the static model: a constant and a price coefficient.
# The Beta arguments are presumably (name, starting value, lower bound,
# upper bound, status) — confirm against the Biogeme documentation. Both
# parameters start at 0, with None given for the bounds.
cte_mean = Beta('cte_mean', 0, None, None, 0)
coef_price = Beta('coef_price', 0, None, None, 0)

In [11]:
# Utility of smoking: price effect plus a constant.
V_s = coef_price * Price + cte_mean
# Utility of not smoking, used as the reference.
V_ns = 0
# Utilities keyed by the values taken by the Smoking variable (0 or 1).
V = {0: V_ns, 1: V_s}
# Log of the logit probability of the observed choice. None is passed as the
# availability argument — presumably meaning that both alternatives are
# always available; confirm against the Biogeme documentation.
logprob = models.loglogit(V, None, Smoking)

In [12]:
%%time
r_static = estimate(logprob, 
                    'static_model',
                    database)
r_static

Number of estimated parameters:	2
Sample size:	83000
Excluded observations:	0
Init log likelihood:	-57531.22
Final log likelihood:	-57529.39
Likelihood ratio test for the init. model:	3.653632
Rho-square for the init. model:	3.18e-05
Rho-square-bar for the init. model:	-3.01e-06
Akaike Information Criterion:	115062.8
Bayesian Information Criterion:	115081.4
Final gradient norm:	2.1735E-01
Nbr of threads:	36

CPU times: user 16.9 ms, sys: 2.39 ms, total: 19.3 ms
Wall time: 505 ms


Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
coef_price,-0.000101,0.000337,-0.299416,0.764623,0.000337,-0.299398,0.764636
cte_mean,0.017433,0.015998,1.089679,0.275855,0.015999,1.089618,0.275881


## Comparison of the estimates

In [13]:
summary = pd.concat(
    [trueParameters['Value'], 
     r_static['Value']], 
    axis='columns')
summary.columns = ['True', 'Static']
summary.fillna('')

Unnamed: 0,True,Static
coef_price,-0.1,-0.000101
beta_last_year,10.0,
cte_mean,0.0,0.017433
cte_std,50.0,


The estimated price coefficient is not significant. Indeed, price is the only variable that the model considers. Ignoring state dependence generates endogeneity. The model "thinks" that individuals are insensitive to price, as they choose an alternative that is expensive.  

## Dynamic model

The dynamic model adds the choice of last year as an explanatory variable

In [14]:
# Coefficient of the lagged choice (LastYear), started at 0.
beta_last_year = Beta('beta_last_year', 0, None, None, 0)

In [15]:
# Utility of smoking: last year's choice, price effect and a constant.
V_s = beta_last_year * LastYear + coef_price * Price + cte_mean
# Utility of not smoking, used as the reference.
V_ns = 0
# Utilities keyed by the values taken by the Smoking variable (0 or 1).
V = {0: V_ns, 1: V_s}
# Log of the logit probability of the observed choice.
logprob = models.loglogit(V, None, Smoking)

In [16]:
%%time
r_dynamic = estimate(logprob, 
                     'dynamic_model',
                     database)
r_dynamic

Number of estimated parameters:	3
Sample size:	83000
Excluded observations:	0
Init log likelihood:	-57531.22
Final log likelihood:	-2226.1
Likelihood ratio test for the init. model:	110610.2
Rho-square for the init. model:	0.961
Rho-square-bar for the init. model:	0.961
Akaike Information Criterion:	4458.201
Bayesian Information Criterion:	4486.181
Final gradient norm:	2.5151E-01
Nbr of threads:	36

CPU times: user 17.8 ms, sys: 2.21 ms, total: 20 ms
Wall time: 504 ms


Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
beta_last_year,24.985031,0.831062,30.063965,0.0,1.86397,13.404199,0.0
coef_price,-0.238493,0.010535,-22.637177,0.0,0.022272,-10.708417,0.0
cte_mean,1.509947,0.208223,7.251594,4.118927e-13,0.440293,3.429415,0.000605


### Comparison of the estimates

In [17]:
summary = pd.concat(
    [
        trueParameters['Value'], 
        r_static['Value'], 
        r_dynamic['Value']
    ], 
    axis='columns'
)
summary.columns = [
    'True', 
    'Static', 
    'Dynamic'
]
summary.fillna('')

Unnamed: 0,True,Static,Dynamic
coef_price,-0.1,-0.000101,-0.238493
beta_last_year,10.0,,24.985031
cte_mean,0.0,0.017433,1.509947
cte_std,50.0,,


The introduction of the lag variable has greatly increased the final log likelihood, from `-57529.39` to `-2226.1`. Note that the error term in the model is not the same as in the true model. Indeed, serial correlation has been ignored. Therefore, the coefficients cannot be directly compared. But their ratio can be compared, as it cancels the scale. And it can be seen that the ratios are almost the same. 

In [18]:
(
    summary.loc['coef_price', 'True'] / 
    summary.loc['beta_last_year', 'True']
) 

-0.01

In [19]:
(
    summary.loc['coef_price', 'Dynamic'] / 
    summary.loc['beta_last_year', 'Dynamic'] 
)

-0.00954543005555873

## Static model with serial correlation

We now introduce the agent effect to capture serial correlation. First, we tell Biogeme that the data is organized as a panel, meaning that there are several observations corresponding to the same individuals.
Therefore, instead of considering that there is a sample of 83000 independent observations, Biogeme knows that there is actually a sample of 1000 individuals, for which a trajectory is observed.

Sample size ignoring the panel nature of the data

In [20]:
database.getSampleSize()

83000

Declaring the panel nature of the data

In [21]:
database.panel('Id')
database.getSampleSize()

1000

In [22]:
# Standard deviation of the individual-specific constant, started at 1.
cte_std = Beta('cte_std', 1, None, None, 0)
# Random constant: mean plus scaled draws. 'agent' labels the draws and
# 'NORMAL_ANTI' is their type (presumably antithetic normal draws — confirm
# against the Biogeme documentation).
cte = cte_mean + cte_std * bioDraws('agent', 'NORMAL_ANTI')

In [23]:
# Utility of smoking, with the random constant in place of the fixed one.
V_s = coef_price * Price + cte
# Utility of not smoking, used as the reference.
V_ns = 0
# Utilities keyed by the values taken by the Smoking variable (0 or 1).
V = {0: V_ns, 1: V_s}
# Probability (not its logarithm) of each observed choice, given the draws.
obsprob = models.logit(V, None, Smoking)
# Conditional probability of the whole trajectory of an individual —
# presumably the product of obsprob over that individual's observations;
# confirm against the Biogeme documentation.
condprobIndiv = PanelLikelihoodTrajectory(obsprob)
# Integrate over the draws by Monte-Carlo simulation, then take the log.
logprob = log(MonteCarlo(condprobIndiv))

In [24]:
%%time
r_serial_static = estimate(logprob, 
                           'static_model_serial',
                           database)
r_serial_static

Number of estimated parameters:	3
Sample size:	1000
Observations:	83000
Excluded observations:	0
Init log likelihood:	-2138.128
Final log likelihood:	-2011
Likelihood ratio test for the init. model:	254.2556
Rho-square for the init. model:	0.0595
Rho-square-bar for the init. model:	0.0581
Akaike Information Criterion:	4028
Bayesian Information Criterion:	4042.723
Final gradient norm:	5.6591E-01
Number of draws:	10000
Draws generation time:	0:00:12.859917
Types of draws:	['agent: NORMAL_ANTI']
Nbr of threads:	36

CPU times: user 19.7 ms, sys: 2.22 ms, total: 21.9 ms
Wall time: 514 ms


Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
coef_price,-0.042236,-0.0,1.797693e+308,0.0,0.0,1.797693e+308,0.0
cte_mean,0.383475,-0.0,1.797693e+308,0.0,0.0,1.797693e+308,0.0
cte_std,100.016857,-0.0,1.797693e+308,0.0,0.0,1.797693e+308,0.0


It appears that there were some numerical issues during the estimation of the parameters. It is certainly due to the high value of the `cte_std` parameter. The values of the standard errors and the $t$-test are not valid.

## Dynamic model with serial correlation

We now introduce the state dependence in the model, to make it dynamic.

In [25]:
# Utility of smoking: last year's choice, price effect and random constant.
V_s = beta_last_year * LastYear + coef_price * Price + cte
# Utility of not smoking, used as the reference.
V_ns = 0
# Utilities keyed by the values taken by the Smoking variable (0 or 1).
V = {0: V_ns, 1: V_s}
# Probability (not its logarithm) of each observed choice, given the draws.
obsprob = models.logit(V, None, Smoking)
# Conditional probability of the whole trajectory of an individual —
# presumably the product of obsprob over that individual's observations;
# confirm against the Biogeme documentation.
condprobIndiv = PanelLikelihoodTrajectory(obsprob)
# Integrate over the draws by Monte-Carlo simulation, then take the log.
logprob = log(MonteCarlo(condprobIndiv))

In [26]:
%%time
r_serial_dynamic = estimate(logprob, 
                            'dynamic_model_serial',
                            database)
r_serial_dynamic

Number of estimated parameters:	4
Sample size:	1000
Observations:	83000
Excluded observations:	0
Init log likelihood:	-2137.947
Final log likelihood:	-1057.267
Likelihood ratio test for the init. model:	2161.36
Rho-square for the init. model:	0.505
Rho-square-bar for the init. model:	0.504
Akaike Information Criterion:	2122.534
Bayesian Information Criterion:	2142.165
Final gradient norm:	6.2756E-03
Number of draws:	10000
Draws generation time:	0:00:13.051239
Types of draws:	['agent: NORMAL_ANTI']
Nbr of threads:	36

CPU times: user 19.9 ms, sys: 2.1 ms, total: 22 ms
Wall time: 525 ms


Unnamed: 0,Value,Std err,t-test,p-value,Rob. Std err,Rob. t-test,Rob. p-value
beta_last_year,10.221223,0.659238,15.504603,0.0,0.527755,19.36736,0.0
coef_price,-0.108952,0.013032,-8.360177,0.0,0.012339,-8.829663,0.0
cte_mean,-1.807713,5.152594,-0.350836,0.7257117,15.873518,-0.113882,0.9093311
cte_std,48.021917,7.788851,6.165469,7.027474e-10,8.263847,5.811085,6.206917e-09


### Comparison of the estimates

In [27]:
summary = pd.concat(
    [
        trueParameters['Value'], 
        r_static['Value'], 
        r_dynamic['Value'], 
        r_serial_static['Value'],
        r_serial_dynamic['Value'], 

    ], 
    axis='columns'
)
summary.columns = [
    'True', 
    'Static', 
    'Dynamic', 
    'Static + serial', 
    'Dynamic + serial'
]
summary.fillna('')

Unnamed: 0,True,Static,Dynamic,Static + serial,Dynamic + serial
coef_price,-0.1,-0.000101,-0.238493,-0.042236,-0.108952
beta_last_year,10.0,,24.985031,,10.221223
cte_mean,0.0,0.017433,1.509947,0.383475,-1.807713
cte_std,50.0,,,100.016857,48.021917


If the number of draws is sufficiently high, we observe that the parameters are quite well recovered. We can actually test the hypothesis that they are equal to their true values, using a $t$-test.

In [28]:
def t_test(param, true_value, results=None):
    """Compute the robust $t$-statistic of an estimate against a given value.

    The statistic is the difference between the estimated value and
    ``true_value``, divided by the robust standard error of the estimate.

    :param param: name of the parameter, used as row label in ``results``.
    :param true_value: value the estimate is compared with.
    :param results: table of estimated parameters with the columns
        ``'Value'`` and ``'Rob. Std err'``. When omitted, the module-level
        ``r_serial_dynamic`` is used, as in the original version.
    :return: the value of the $t$-statistic.
    """
    if results is None:
        # Default to the dynamic model with serial correlation.
        results = r_serial_dynamic
    estimated_value = results.loc[param, 'Value']
    robust_std_err = results.loc[param, 'Rob. Std err']
    return (estimated_value - true_value) / robust_std_err

In [29]:
t_test('coef_price', -0.1)

-0.7255004069003811

In [30]:
t_test('beta_last_year', 10)

0.4191771984284349

In [31]:
t_test('cte_mean', 0)

-0.11388231472320963

In [32]:
t_test('cte_std', 50)

-0.23936594294180186

Each of these $t$-statistics is sufficiently low in absolute value so that the hypothesis cannot be rejected.