# An introduction to Biogeme

## Biogeme Basics: Logit Model

In [None]:
import pandas  as pd
import numpy as np
import biogeme.database  as db
import biogeme.biogeme  as bio
import matplotlib.pyplot as plt

**Import Swissmetro data**

In [None]:
# Load the tab-separated Swissmetro survey file into a DataFrame and hand it
# to Biogeme. NOTE: the DataFrame is bound to the name `pandas`; the pandas
# library itself is only reachable as `pd`, and later cells rely on both names.
pandas = pd.read_csv("data/swissmetro.dat",sep='\t')
# NOTE(review): the first argument is presumably just the name Biogeme gives
# to the database, not a path that is opened - confirm against the Biogeme docs.
database = db.Database("data/swissmetro", pandas)

## Let's see what this dataset has

* The dataset consists of survey data collected on trains between St. Gallen and Geneva, Switzerland, during March 1998.
* Because the innovation does not exist yet, its impact has to be assessed with data from surveys of hypothetical markets/situations that include it.
* Survey data were collected on rail-based travel by interviewing 470 respondents. Due to data problems, only 441 are used here. A similar method was used for relevant car trips. A total of 1070 persons filled in the survey completely and were willing to participate in the second SP (stated preference) survey, which was generated using the same approach used for the rail interviews. 750 usable SP surveys were returned from the license-plate based survey.
* Nine stated choice situations were generated for each of the respondents, offering three alternatives: rail, Swissmetro and car.

Bierlaire, M., Axhausen, K. and Abay, G. (2001), The acceptance of modal innovation: The case of Swissmetro, in ‘Proceedings of the Swiss Transport Research Conference’, Ascona, Switzerland.

![](img/swissmetro_var1.png)

![](img/swissmetro_var2.png)

In [None]:
# Histogram of the PURPOSE column with one unit-wide bin per integer code.
binwidth = 1
data = pandas['PURPOSE']

# Tick labels for the codes 1..9, in code order.
purpose_labels = (
    'Commuter',
    'Shopping',
    'Business',
    'Leisure',
    'Return from work',
    'Return from shopping',
    'Return from business',
    'Return from leisure',
    'Other',
)

# Edges run from the smallest code to one past the largest code.
lowest_code, highest_code = min(data), max(data)
bin_edges = np.arange(lowest_code, highest_code + binwidth + 1, binwidth)
plt.hist(data, bins=bin_edges)

# Place the labels at 1..9 and turn them vertical in a single call; the
# trailing semicolon keeps the notebook from echoing the return value.
plt.xticks(np.arange(1, 10), purpose_labels, rotation=90);

**Use column names as variables**

In [None]:
from headers import *

**Exclude some unwanted entries**

In [None]:
# Build the row filter from arithmetic on comparison results (assumed to be
# 0/1 indicators): a product acts as logical AND, a positive sum as logical OR.

# 1 when PURPOSE is neither 1 nor 3 (labelled 'Commuter' and 'Business' in
# the histogram tick labels of this notebook).
other_purpose = (PURPOSE != 1) * (PURPOSE != 3)
# 1 when CHOICE is 0.
no_choice = (CHOICE == 0)

exclude = (other_purpose + no_choice) > 0

# Drop every observation for which the filter is non-zero.
database.remove(exclude)

**Define some dummy variables**

In [None]:
# Costs that are zeroed whenever GA is non-zero: the raw cost is multiplied by
# the (GA == 0) indicator.
# NOTE(review): GA presumably flags holders of an annual season ticket - confirm
# against the variable description of the dataset.
SM_COST = SM_CO * ( GA == 0 )
TRAIN_COST = TRAIN_CO * ( GA == 0 )

# Availability indicators that are forced to zero when SP is 0. They are
# created through DefineVariable under an explicit name and are used further
# down as the availability conditions of the car and train alternatives.
CAR_AV_SP = DefineVariable ('CAR_AV_SP', CAR_AV * ( SP !=0 ), database)
TRAIN_AV_SP = DefineVariable ('TRAIN_AV_SP', TRAIN_AV * ( SP != 0 ), database)

**Rescale some data**

In [None]:
# Divide every travel time and cost by 100 and register the result under a
# "*_SCALED" name; these scaled variables are the ones used in the utility
# functions below.
# NOTE(review): DefineVariable(name, expression, database) presumably adds a
# column called `name` to the database - confirm against the Biogeme docs.
TRAIN_TT_SCALED   = DefineVariable('TRAIN_TT_SCALED',   TRAIN_TT / 100.0, database)
TRAIN_COST_SCALED = DefineVariable('TRAIN_COST_SCALED', TRAIN_COST / 100, database)
SM_TT_SCALED      = DefineVariable('SM_TT_SCALED',      SM_TT / 100.0   , database)
SM_COST_SCALED    = DefineVariable('SM_COST_SCALED',    SM_COST / 100   , database)
CAR_TT_SCALED     = DefineVariable('CAR_TT_SCALED',     CAR_TT / 100    , database)
CAR_CO_SCALED     = DefineVariable('CAR_CO_SCALED',     CAR_CO / 100    , database)

In [None]:
# Re-bind `pandas` to the data held by the Biogeme database so that the plots
# and the summary below use the database's own copy of the data.
# NOTE(review): presumably this reflects the rows removed and the variables
# defined above - confirm.
pandas = database.data

In [None]:
# Histogram of the PURPOSE column, drawn again now that `pandas` has been
# re-bound to `database.data`; one unit-wide bin per integer code.
binwidth = 1
data = pandas['PURPOSE']

# Tick labels for the codes 1..9, in code order.
purpose_labels = (
    'Commuter',
    'Shopping',
    'Business',
    'Leisure',
    'Return from work',
    'Return from shopping',
    'Return from business',
    'Return from leisure',
    'Other',
)

# Edges run from the smallest code to one past the largest code.
lowest_code, highest_code = min(data), max(data)
bin_edges = np.arange(lowest_code, highest_code + binwidth + 1, binwidth)
plt.hist(data, bins=bin_edges)

# Place the labels at 1..9 and turn them vertical in a single call; the
# trailing semicolon keeps the notebook from echoing the return value.
plt.xticks(np.arange(1, 10), purpose_labels, rotation=90);

In [None]:
# Histogram of the MALE column with unit-wide bins; positions 0 and 1 are
# labelled 'Female' and 'Male'.
binwidth = 1
data = pandas['MALE']

gender_edges = np.arange(min(data), max(data) + binwidth + 1, binwidth)
plt.hist(data, bins=gender_edges)

# Set the two labels and rotate them in one call; the trailing semicolon
# keeps the notebook from echoing the return value.
plt.xticks(np.arange(2), ('Female', 'Male'), rotation=90);

In [None]:
# One figure per availability column: (figure number, title, column name).
availability_plots = (
    (1, 'Train Availability', 'TRAIN_AV'),
    (2, 'Car Availability', 'CAR_AV'),
    (3, 'Swissmetro Availability', 'SM_AV'),
)

for figure_number, figure_title, column_name in availability_plots:
    plt.figure(figure_number)
    plt.title(figure_title)
    data = pandas[column_name]
    # Unit-wide bins from the smallest value to one past the largest value.
    plt.hist(data, bins=np.arange(min(data), max(data) + 1 + 1, 1))
    # Positions 0 and 1 are labelled 'No' / 'Yes', written vertically.
    plt.xticks(np.arange(2), ('No', 'Yes'), rotation=90)

In [None]:
# Widen the pandas display limits so the summary table is not truncated.
display_limits = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
}
for option_name, option_value in display_limits.items():
    pd.set_option(option_name, option_value)

# Last expression of the cell: the notebook renders the summary statistics.
pandas.describe()

**Define the utility functions**

\begin{align}
V_1 & = \beta_{Train} + \beta_{time}X_{Train_{TT}} + \beta_{cost}X_{Train_{cost}}\\
V_2 & = \beta_{SM} +  \beta_{time}X_{SM_{TT}} + \beta_{cost}X_{SM_{cost}}\\
V_3 & = \beta_{Car} + \beta_{time}X_{Car_{TT}} + \beta_{cost}X_{Car_{cost}}\\
\end{align}

**Create parameters to be estimated**

`Beta`
1. name of parameter
2. default value for the parameter
3. lower bound
4. upper bound
5. status flag: `0` if the parameter is to be estimated, `1` if it is kept fixed at its default value

In [None]:
# Parameters of the model. Positional arguments of Beta, as listed in the text
# above: name, default value, lower bound, upper bound, estimation flag.
# All start at 0 and are unbounded (None, None).
# NOTE(review): in Biogeme a flag of 0 marks a parameter to be estimated and 1
# keeps it fixed at its default value, so ASC_SM would be the constant
# normalised to 0 - confirm against the installed Biogeme version.
ASC_CAR = Beta('ASC_CAR',0,None ,None ,0)
ASC_TRAIN = Beta('ASC_TRAIN',0,None ,None ,0)
ASC_SM = Beta('ASC_SM',0,None ,None ,1)
B_TIME = Beta('B_TIME',0,None ,None ,0)
B_COST = Beta('B_COST',0,None ,None ,0)

**Define the utility functions**

In [None]:
# Utility of each alternative: its own constant plus the shared time and cost
# coefficients applied to that alternative's scaled travel time and cost.

# Train
V1 = (
    ASC_TRAIN
    + B_TIME * TRAIN_TT_SCALED
    + B_COST * TRAIN_COST_SCALED
)

# Swissmetro
V2 = (
    ASC_SM
    + B_TIME * SM_TT_SCALED
    + B_COST * SM_COST_SCALED
)

# Car
V3 = (
    ASC_CAR
    + B_TIME * CAR_TT_SCALED
    + B_COST * CAR_CO_SCALED
)

**Associate utility functions with alternatives and associate availability of alternatives**

Create a python dictionary with all utility functions

Create a python dictionary with availability of choices

In [None]:
# Both dictionaries are keyed by the same alternative ids:
# 1 = train, 2 = Swissmetro, 3 = car.

# Utility function of each alternative.
V = {1: V1, 2: V2, 3: V3}

# Availability condition of each alternative.
av = {1: TRAIN_AV_SP, 2: SM_AV, 3: CAR_AV_SP}

**Define the model**

In [None]:
# Model contribution of one observation, built from the utilities `V`, the
# availabilities `av` and the observed CHOICE.
# NOTE(review): going by its name, bioLogLogit presumably returns the log of
# the logit probability of the chosen alternative - confirm against the
# Biogeme docs. The name is not imported explicitly; it has to come from the
# star import of `headers`.
logprob = bioLogLogit(V,av,CHOICE)

**Define the Biogeme object**

* Give the database with all variables
* Give the log likelihood model

In [None]:
# Combine the database and the log likelihood expression into the Biogeme
# object that is estimated below.
biogeme  = bio.BIOGEME(database, logprob)

# The model name is also the file-name prefix that the clean-up cell at the
# end of the notebook uses to find the generated output files.
biogeme.modelName = "swissmetro_logit_basic"

**Estimate the model**

1. A `.html` file can be generated with a report of the results and can be opened with a browser
2. A `.pickle` file can also be generated with a snapshot of the results. This file can then be used in other scripts

In [None]:
# Choose the output files before estimating: HTML report on, pickle off.
biogeme.generateHtml = True
biogeme.generatePickle = False

# Run the estimation; `results` is used by all the reporting cells below.
results = biogeme.estimate()

# Report the names of the generated files.
# NOTE(review): pickle generation is switched off above, so pickleFileName is
# presumably empty (or possibly unset) at this point - confirm.
print(f"HTML file:    {results.data.htmlFileName}")
print(f"Pickle file:  {results.data.pickleFileName }")

**Print results**

In [None]:
# Print one line per parameter: the name padded to 10 characters, "=", a tab,
# and the value with 3 significant digits.
betas = results.getBetaValues()
for beta_name, beta_value in betas.items():
    print(f"{beta_name:10}=\t{beta_value:.3g}")

**Get the variance-covariance matrix**

In [None]:
# Last expression of the cell, so the notebook displays whatever
# getRobustVarCovar() returns (the heading above calls it the
# variance-covariance matrix).
results.getRobustVarCovar()

**Get the general statistics**

In [None]:
# Print every general statistic as "<name padded to 45 characters>= <value>",
# where the value is the first element of the statistic's entry.
gs = results.getGeneralStatistics()

for stat_name, stat_entry in gs.items():
    print(f"{stat_name.ljust(45)}= {stat_entry[0]}")

**Clean up output files**

In [None]:
import glob
import os

# Directory that collects every output file written for this model.
result_dir = "results"
# exist_ok=True replaces the check-then-create sequence and keeps the cell
# safe to re-run when the directory is already there.
os.makedirs(result_dir, exist_ok=True)

# Everything in the working directory whose name starts with the model name,
# except the notebook of the same name.
notebook_name = biogeme.modelName + '.ipynb'
result_files = [
    x for x in glob.glob(biogeme.modelName + '*') if x != notebook_name
]

if result_files:
    print('Moving the following files:')
    for result_file in result_files:
        print('\t', result_file)
        # os.replace overwrites an existing target on every platform, whereas
        # os.rename raises FileExistsError on Windows when the cell is re-run
        # and a file of the same name is already in the results directory.
        os.replace(result_file, os.path.join(result_dir, result_file))