In [37]:
import pandas as pd
import matplotlib.pyplot as plt

import biogeme.database as db

import biogeme.biogeme as bio
from IPython.core.display_functions import display
from biogeme.expressions import Beta, Variable, log, exp
from biogeme.results import compileEstimationResults, compile_estimation_results
from biogeme.models import loglogit


# Mathematical Modelling of Behaviour - Group Project

**Group 9**

*Julien Marie F Ars, Chady Bensaid, Mohamed Amine Lazrak, Joshua Oyewole Oyebanji*

## 0. Initialisation

First, we load the data into a pandas DataFrame and then wrap it in a Biogeme `Database`.

In [2]:
# Load the LPMC sample; the file is tab-separated despite its `.dat` extension.
df = pd.read_csv("data/lpmc09.dat", sep="\t")
# Leave the frame as the last expression so the notebook renders it for inspection.
df

Unnamed: 0,trip_id,household_id,person_n,trip_n,travel_mode,purpose,fueltype,faretype,bus_scale,survey_year,...,dur_pt_access,dur_pt_rail,dur_pt_bus,dur_pt_int,pt_interchanges,dur_driving,cost_transit,cost_driving_fuel,cost_driving_ccharge,driving_traffic_percent
0,4,0,1,2,4,3,1,4,1.0,1,...,0.203056,0.000000,0.189444,0.000000,0,0.229167,1.50,0.78,0.0,0.130909
1,30,8,1,3,1,3,1,1,1.0,1,...,0.098889,0.000000,0.044444,0.000000,0,0.087500,1.50,0.20,0.0,0.292064
2,38,9,1,1,4,3,1,3,0.0,1,...,0.194722,0.000000,0.107778,0.000000,0,0.207778,0.00,0.49,0.0,0.467914
3,49,12,0,3,4,3,1,5,0.0,1,...,0.082500,0.000000,0.061944,0.000000,0,0.062500,0.00,0.20,0.0,0.124444
4,102,20,2,0,2,3,1,1,1.0,1,...,0.339722,0.183333,0.116667,0.266667,1,0.250833,3.00,0.89,0.0,0.170543
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,81046,17605,0,2,3,3,1,5,0.0,3,...,0.160000,0.000000,0.391667,0.069722,1,0.314444,0.00,0.83,10.5,0.468198
4996,81048,17605,0,4,4,3,1,5,0.0,3,...,0.248056,0.000000,0.220833,0.000000,0,0.206667,0.00,0.55,0.0,0.205645
4997,81064,17608,0,0,3,1,6,1,1.0,3,...,0.123333,0.083333,0.116667,0.183333,2,0.374722,4.40,0.83,10.5,0.835434
4998,81067,17609,0,0,4,3,1,1,1.0,3,...,0.090278,0.000000,0.116667,0.000000,0,0.142500,1.50,0.42,0.0,0.087719


In [13]:
# Wrap the DataFrame in a Biogeme Database; this is the object later handed to
# `bio.BIOGEME` for estimation. The first argument is the name given to the database.
database = db.Database("london_passenger_mode_choice", df)

Let's explore the available columns in more detail and, at the same time, create a Biogeme `Variable` instance for each of them:

In [21]:
# For every column, print a one-line summary (name, non-null count, mean,
# minimum, maximum, number of distinct values) and expose the column as a
# Biogeme `Variable`.
for c in df.columns:
    print(
        c,
        df[c].count(),
        df[c].mean(),
        df[c].min(),
        df[c].max(),
        # Same figure as len(df[c].unique()): NaN, if any, counts as one value.
        df[c].nunique(dropna=False),
    )
    # Bind the Variable to a global carrying the column's own name, so that
    # later cells can write expressions directly in terms of column names.
    globals()[c] = Variable(c)

trip_id 5000 40862.7946 4 81079 5000
household_id 5000 8779.049 0 17613 4155
person_n 5000 0.8154 0 7 8
trip_n 5000 1.5228 0 15 15
travel_mode 5000 3.0344 1 4 4
purpose 5000 2.9076 1 5 5
fueltype 5000 2.8352 1 6 6
faretype 5000 2.2494 1 5 5
bus_scale 5000 0.6456 0.0 1.0 3
survey_year 5000 1.9922 1 3 3
travel_year 5000 2013.1904 2012 2015 4
travel_month 5000 6.7214 1 12 12
travel_date 5000 15.3074 1 31 31
day_of_week 5000 3.9066 1 7 7
start_time 5000 13.714623333305802 0.0 23.91666667 415
age 5000 39.4466 5 93 88
female 5000 0.5246 0 1 2
driving_license 5000 0.6178 0 1 2
car_ownership 5000 0.9784 0 2 3
distance 5000 4563.3578 99 32918 3661
dur_walking 5000 1.1194776666612 0.028611111 7.688333332999999 3656
dur_cycling 5000 0.3589819444438 0.0072222219999999 2.315 2375
dur_pt_access 5000 0.1600849444384 0.0 0.7605555559999999 1255
dur_pt_rail 5000 0.087053333335 0.0 1.116666667 64
dur_pt_bus 5000 0.1740006666632 0.0 1.65 1531
dur_pt_int 5000 0.044296333333999995 0.0 0.541388889 346
pt_in

Now, we compute the total travel time for public transport (`pt`) and the total driving cost.

In [23]:
# Total public-transport travel time: rail, interchange, bus and access legs.
# The operand order is kept as-is because it determines the expression that is
# printed below.
dur_pt = (
    dur_pt_rail
    + dur_pt_int
    + dur_pt_bus
    + dur_pt_access
)

# Total driving cost: `cost_driving_ccharge` (presumably the congestion
# charge -- TODO confirm) plus the fuel cost.
cost_driving = cost_driving_ccharge + cost_driving_fuel

# Both names are symbolic expressions built from `Variable` objects, so
# printing them shows the formula rather than numeric values.
print(dur_pt, cost_driving)


(((dur_pt_rail + dur_pt_int) + dur_pt_bus) + dur_pt_access) (cost_driving_ccharge + cost_driving_fuel)


We also need to check whether the `car_ownership` and `driving_license` variables act as availability conditions for the driving alternative:

In [34]:
# Trips made by driving (travel_mode == 4) by respondents who report neither a
# driving license nor a car. Any row returned here means that these two
# variables cannot be treated as strict availability conditions.
df.loc[
    (df["travel_mode"] == 4)
    & (df["driving_license"] == 0)
    & (df["car_ownership"] == 0),
    ["travel_mode", "driving_license", "car_ownership"],
]

Unnamed: 0,travel_mode,driving_license,car_ownership
44,4,0,0
103,4,0,0
169,4,0,0
295,4,0,0
296,4,0,0
...,...,...,...
4509,4,0,0
4675,4,0,0
4755,4,0,0
4847,4,0,0


Since some respondents still choose to drive without owning a car or holding a driving license, these variables are not treated as availability conditions.

## 1. Model #0 Specification

We define our first model as :
$$ V_{pt} = \beta_{pt} + \beta_{cost} * \mathrm{cost\_transit} + \beta_{time} *  \mathrm{dur\_pt}$$
$$ V_{driving} = \beta_{driving} + \beta_{cost} *  \mathrm{cost\_driving} + \beta_{time} *  \mathrm{dur\_driving}$$
$$ V_{cycling} = \beta_{cycling} + \beta_{time} *  \mathrm{dur\_cycling} $$
$$ V_{walking} = \beta_{time} *  \mathrm{dur\_walking}$$

In [41]:
# --- Parameters -------------------------------------------------------------
# Beta arguments follow the Biogeme signature: name, starting value, lower
# bound, upper bound, status. All parameters start at 0, are unbounded, and
# use status 0 (i.e. they are estimated).

# Alternative-specific constants. Walking has none: it is the reference
# alternative, so its constant is normalised to 0.
asc_pt = Beta("asc_pt", 0, None, None, 0)
asc_driving = Beta("asc_driving", 0, None, None, 0)
asc_cycling = Beta("asc_cycling", 0, None, None, 0)

# Generic coefficients, shared by every alternative in which the attribute
# appears.
beta_cost = Beta("beta_cost", 0, None, None, 0)
beta_time = Beta("beta_time", 0, None, None, 0)

# --- Utility functions ------------------------------------------------------
# Cost enters only the public-transport and driving utilities.
V_pt = asc_pt + beta_cost * cost_transit + beta_time * dur_pt
V_driving = asc_driving + beta_cost * cost_driving + beta_time * dur_driving
V_cycling = asc_cycling + beta_time * dur_cycling
V_walking = beta_time * dur_walking

# Utilities keyed by the `travel_mode` code of the corresponding alternative.
V = {1: V_walking, 2: V_cycling, 3: V_pt, 4: V_driving}

# --- Estimation -------------------------------------------------------------
# Log-probability of the chosen alternative under a logit model; `av=None`
# means that no availability conditions are passed.
logprob = loglogit(V, av=None, i=travel_mode)

biogeme_0 = bio.BIOGEME(database, logprob)
biogeme_0.modelName = "MODEL_0"

results_0 = biogeme_0.estimate()

# Last expression of the cell: the table of estimated parameters.
results_0.get_estimated_parameters()

Unnamed: 0,Value,Rob. Std err,Rob. t-test,Rob. p-value
asc_cycling,-3.794242,0.106195,-35.729152,0.0
asc_driving,-1.244754,0.078746,-15.807208,0.0
asc_pt,-0.529325,0.053172,-9.955005,0.0
beta_cost,-0.156193,0.013416,-11.642401,0.0
beta_time,-5.080437,0.195095,-26.040781,0.0


## 2. Model #1 Specification