In [1]:
import pandas as pd

# Read a space-delimited .dat file
file_path = 'D:/7995/lpmc.dat'  # Replace with your file path
df = pd.read_csv(file_path, delim_whitespace=True)  # Assume space-delimited


  df = pd.read_csv(file_path, delim_whitespace=True)  # Assume space-delimited


In [2]:
import numpy as np
import pandas as pd
import biogeme.database as db
import biogeme.biogeme as bio

In [3]:
from biogeme import models
from biogeme.expressions import Beta

In [4]:
database = db.Database('london', df)

In [5]:
globals().update(database.variables)

In [6]:
list(df.columns)

['trip_id',
 'household_id',
 'person_n',
 'trip_n',
 'travel_mode',
 'purpose',
 'fueltype',
 'faretype',
 'bus_scale',
 'survey_year',
 'travel_year',
 'travel_month',
 'travel_date',
 'day_of_week',
 'start_time',
 'age',
 'female',
 'driving_license',
 'car_ownership',
 'distance',
 'dur_walking',
 'dur_cycling',
 'dur_pt_access',
 'dur_pt_rail',
 'dur_pt_bus',
 'dur_pt_int',
 'pt_interchanges',
 'dur_driving',
 'cost_transit',
 'cost_driving_fuel',
 'cost_driving_ccharge',
 'driving_traffic_percent']

In [7]:
# Parameters to be estimated
ASC_WALKING = Beta('ASC_WALKING', 0, None, None, 1)
ASC_CYCLING = Beta('ASC_CYCLING', 0, None, None, 0)
ASC_PT = Beta('ASC_PT', 0, None, None, 0)
ASC_DRIVING = Beta('ASC_DRIVING', 0, None, None, 0)
B_TIME_WALKING = Beta('B_TIME_WALKING', 0, None, None, 0)
B_TIME_CYCLING = Beta('B_TIME_CYCLING', 0, None, None, 0)
B_TIME_DRIVING = Beta('B_TIME_DRIVING', 0, None, None, 0)
B_COST_DRIVING = Beta('B_COST_DRIVING', 0, None, None, 0)
B_COST_PT = Beta('B_COST_PT', 0, None, None, 0)
B_TIME_PT_BUS = Beta('B_TIME_PT_BUS', 0, None, None, 0)
B_TIME_PT_RAIL = Beta('B_TIME_PT_RAIL', 0, None, None, 0)
B_TIME_PT_ACCESS = Beta('B_TIME_PT_ACCESS', 0, None, None, 0)
B_TIME_PT_INT = Beta('B_TIME_PT_INT_WAIT', 0, None, None, 0)
B_TRAFFIC_DRIVING = Beta('B_TRAFFIC_DRIVING', 0, None, None, 0)

# Utility functions

V1 = (
    ASC_WALKING + 
    B_TIME_WALKING * dur_walking
)

V2 = (
    ASC_CYCLING +
    B_TIME_CYCLING * dur_cycling
)

V3 = (
    ASC_PT +
    B_COST_PT * cost_transit + 
    B_TIME_PT_ACCESS * dur_pt_access + 
    B_TIME_PT_RAIL * dur_pt_rail + 
    B_TIME_PT_BUS * dur_pt_bus +
    B_TIME_PT_INT * dur_pt_int
)
      
V4 = (
    ASC_DRIVING +
    B_TIME_DRIVING * dur_driving +
    B_COST_DRIVING * (cost_driving_fuel + cost_driving_ccharge) +
    B_TRAFFIC_DRIVING * driving_traffic_percent
)
      
# Associate utility functions with the numbering of alternatives
V = {1: V1,
     2: V2,
     3: V3,
     4: V4}

# Associate the availability conditions with the alternatives

av = {1: 1,
      2: 1,
      3: 1,
      4: 1}

In [8]:
# Definition of the model. This is the contribution of each
# observation to the log likelihood function.
logprob = models.loglogit(V, av, travel_mode)

# Create the Biogeme object
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = 'lpmc_validation'

# Estimate the parameters
results = biogeme.estimate()

In [9]:
groups = 'household_id'
def split(slices):
    ids = df[groups].unique()
    np.random.shuffle(ids)
    the_slices_ids = np.array_split(ids, slices)
    theSlices = [
        df[df[groups].isin(ids)]
        for ids in the_slices_ids
    ]
    estimationSets = []
    validationSets = []
    for i, v in enumerate(theSlices):
        estimationSets.append(
            pd.concat(theSlices[:i] + theSlices[i + 1:])
        )
        validationSets.append(v)
    return zip(estimationSets, validationSets)

In [10]:
import pandas as pd
import numpy as np
from collections import namedtuple

# Define a named tuple to store estimation and validation sets
DatasetSplit = namedtuple('DatasetSplit', ['estimation', 'validation'])

def split(slices=5, groups='household_id'):
    ids = df[groups].unique()  # Get unique household IDs or group IDs
    np.random.shuffle(ids)  # Shuffle them randomly
    the_slices_ids = np.array_split(ids, slices)  # Split into N parts
    theSlices = [
        df[df[groups].isin(ids)]  # Get the subset of the dataframe based on the slice of IDs
        for ids in the_slices_ids
    ]
    
    estimationSets = []
    validationSets = []
    
    # For each slice, define the estimation and validation sets
    for i, v in enumerate(theSlices):
        estimationSet = pd.concat(theSlices[:i] + theSlices[i + 1:])  # All data except slice i
        validationSet = v  # Slice i
        estimationSets.append(DatasetSplit(estimation=estimationSet, validation=validationSet))
    
    return estimationSets

# Example usage of split function
validationData = split(slices=5)

# Assuming biogeme has been defined and initialized earlier, and 'results' is the model estimation result
validation_results = biogeme.validate(results, validationData)

for slide in validation_results:
    print(
        f'Log likelihood for {slide.shape[0]} validation data: '
        f'{slide["Loglikelihood"].mean()}'
    )


Log likelihood for 16537 validation data: -0.8387741250663399
Log likelihood for 16435 validation data: -0.8397768599444548
Log likelihood for 15823 validation data: -0.8455486705776686
Log likelihood for 16123 validation data: -0.8379846052923369
Log likelihood for 16168 validation data: -0.8295852237652763


In [11]:
# Get the summary of the results
summary = results.getEstimatedParameters()

# Display the results
print(summary)

# You can also print specific metrics like log-likelihood, number of iterations, etc.
print(f'Log-likelihood: {results.data.logLike:.2f}')
print(f'Number of iterations: {results.data.niter}')

                       Value  Rob. Std err  Rob. t-test  Rob. p-value
ASC_CYCLING        -4.737083      0.050536   -93.736195           0.0
ASC_DRIVING        -1.429229      0.035376   -40.400942           0.0
ASC_PT             -2.305768      0.036500   -63.172545           0.0
B_COST_DRIVING     -0.100153      0.004214   -23.764264           0.0
B_COST_PT          -0.185923      0.008029   -23.156057           0.0
B_TIME_CYCLING     -4.936345      0.129450   -38.133235           0.0
B_TIME_DRIVING     -4.231305      0.123544   -34.249504           0.0
B_TIME_PT_ACCESS   -4.823427      0.112365   -42.926574           0.0
B_TIME_PT_BUS      -1.991148      0.071800   -27.731694           0.0
B_TIME_PT_INT_WAIT -4.344064      0.176279   -24.643055           0.0
B_TIME_PT_RAIL     -1.660247      0.132316   -12.547556           0.0
B_TIME_WALKING     -8.222140      0.104374   -78.775747           0.0
B_TRAFFIC_DRIVING  -3.020438      0.058185   -51.911110           0.0
Log-likelihood: -679

AttributeError: 'rawResults' object has no attribute 'niter'

In [12]:
# Compute null log-likelihood
null_loglikelihood = biogeme.calculateNullLoglikelihood(av)
print(f"Null log-likelihood: {null_loglikelihood}")

# Estimate the model
results = biogeme.estimate()

# Extract the model log-likelihood
model_loglikelihood = results.data.logLike
print(f"Model log-likelihood: {model_loglikelihood}")

# Degrees of freedom (number of estimated parameters)
num_parameters = len(results.getEstimatedParameters())

# Likelihood Ratio Test
from scipy.stats import chi2

lr_test_statistic = -2 * (null_loglikelihood - model_loglikelihood)
p_value = chi2.sf(lr_test_statistic, num_parameters)

print(f"Likelihood Ratio Test Statistic: {lr_test_statistic}")
print(f"P-value: {p_value}")

# Adjusted Rho-Square
adjusted_rho_square = 1 - (model_loglikelihood - num_parameters) / null_loglikelihood
print(f"Adjusted Rho-Square: {adjusted_rho_square}")


Null log-likelihood: -112409.06456558515
Model log-likelihood: -67929.36181464508
Likelihood Ratio Test Statistic: 88959.40550188016
P-value: 0.0
Adjusted Rho-Square: 0.3955793327058241


In [13]:
# Compute the Chi-square statistic
chi_square = -2 * (null_loglikelihood - model_loglikelihood)

# Degrees of freedom (number of estimated parameters)
degrees_of_freedom = len(results.getEstimatedParameters())  # Number of parameters estimated

# Print the results
print(f"Chi-Square: {chi_square}")
print(f"Degrees of Freedom: {degrees_of_freedom}")


Chi-Square: 88959.40550188016
Degrees of Freedom: 13
