# MNL推定を行うためのコード
準備したデータを用いてBiogemeで交通手段選択モデルを推定します．

In [114]:
# import library
import numpy as np
import pandas as pd

import biogeme.database as db
from biogeme.expressions import Variable

import biogeme.biogeme as bio
from biogeme import models
from biogeme.expressions import Beta

## データの読み込み&加工
準備したデータを読み込んで以下の加工を行います．
1. 着目する交通手段だけに限定したデータにする(今回は鉄道・幹線バス・乗用車等)．
2. Biogemeは数値データしか読み込めない（文字列が許されるのはヘッダー行のみ）ので，文字列を含む列（名称の列など）を削除する．
3. 欠損値を含む行を削除する．
4. `num` 列の値の回数だけ各行を複製する．

In [117]:
# Read the prepared trip data (the CSV is Shift-JIS encoded) and preview it.
csv_path = "../data/data.csv"
df = pd.read_csv(csv_path, encoding='shift_jis')
df.head()

Unnamed: 0,O_code,O_name,D_code,D_name,purpose_code,purpose_name,mode_code,mode_name,sex_code,sex_name,...,car_time,bus_time,ship_time,rail_time,air_time,car_cost,bus_cost,ship_cost,rail_cost,air_cost
0,1,道北,48,道東,1,仕事,1,航空,1.0,Male,...,304.3508,515.8,,397.3,306.3,2900.0,5450.0,,13310.0,28740.0
1,1,道北,48,道東,1,仕事,1,航空,1.0,Male,...,304.3508,515.8,,397.3,306.3,2900.0,5450.0,,13310.0,28740.0
2,1,道北,48,道東,1,仕事,1,航空,1.0,Male,...,304.3508,515.8,,397.3,306.3,2900.0,5450.0,,13310.0,28740.0
3,1,道北,48,道東,1,仕事,2,鉄道,1.0,Male,...,304.3508,515.8,,397.3,306.3,2900.0,5450.0,,13310.0,28740.0
4,1,道北,48,道東,1,仕事,2,鉄道,1.0,Male,...,304.3508,515.8,,397.3,306.3,2900.0,5450.0,,13310.0,28740.0


In [118]:
# Modes kept for the analysis (matched against the text column `mode_name`).
target_modes = ["幹線バス", "鉄道", "乗用車等"]
# Text (name) columns, plus the time/cost columns of the ship and air modes
# that are not part of the choice set.
unused_columns = [
    'O_name', 'D_name', 'purpose_name', 'mode_name', 'sex_name',
    'ship_time', 'ship_cost', 'air_time', 'air_cost',
]

df = (
    df[df['mode_name'].isin(target_modes)]
    .reset_index(drop=True)
    .drop(columns=unused_columns)
    .copy()
)

df.head()

Unnamed: 0,O_code,D_code,purpose_code,mode_code,sex_code,age_code,num,car_time,bus_time,rail_time,car_cost,bus_cost,rail_cost
0,1,48,1,2,1.0,30.0,2.0,304.3508,515.8,397.3,2900.0,5450.0,13310.0
1,1,48,1,2,1.0,40.0,2.0,304.3508,515.8,397.3,2900.0,5450.0,13310.0
2,1,48,1,2,2.0,30.0,3.0,304.3508,515.8,397.3,2900.0,5450.0,13310.0
3,1,48,3,2,2.0,40.0,1.0,304.3508,515.8,397.3,2900.0,5450.0,13310.0
4,1,48,1,4,1.0,20.0,1.0,304.3508,515.8,397.3,2900.0,5450.0,13310.0


In [121]:
# Remove every row that contains a missing value and renumber the index from 0.
df = (
    df
    .dropna()
    .reset_index(drop=True)
    .copy()
)
df.head()

Unnamed: 0,O_code,D_code,purpose_code,mode_code,sex_code,age_code,num,car_time,bus_time,rail_time,car_cost,bus_cost,rail_cost
0,1,48,1,2,1.0,30.0,2.0,304.3508,515.8,397.3,2900.0,5450.0,13310.0
1,1,48,1,2,1.0,40.0,2.0,304.3508,515.8,397.3,2900.0,5450.0,13310.0
2,1,48,1,2,2.0,30.0,3.0,304.3508,515.8,397.3,2900.0,5450.0,13310.0
3,1,48,3,2,2.0,40.0,1.0,304.3508,515.8,397.3,2900.0,5450.0,13310.0
4,1,48,1,4,1.0,20.0,1.0,304.3508,515.8,397.3,2900.0,5450.0,13310.0


In [123]:
# Replicate each row `num` times so that the weight stored in `num` is
# represented by that many identical rows.
# The original cell read from `df4`, a name that is never defined in this
# notebook (left over from an earlier kernel session), so it failed with a
# NameError under Restart & Run All. The frame prepared above is `df`.
# NOTE: this cell is not idempotent; re-running it alone expands the rows again.
df = df.loc[df.index.repeat(df['num'])].reset_index(drop=True).copy()
df.head()

Unnamed: 0,O_code,D_code,purpose_code,mode_code,sex_code,age_code,num,car_time,bus_time,rail_time,car_cost,bus_cost,rail_cost
0,1,48,1,2,1.0,30.0,2.0,304.3508,515.8,397.3,2900.0,5450.0,13310.0
1,1,48,1,2,1.0,30.0,2.0,304.3508,515.8,397.3,2900.0,5450.0,13310.0
2,1,48,1,2,1.0,40.0,2.0,304.3508,515.8,397.3,2900.0,5450.0,13310.0
3,1,48,1,2,1.0,40.0,2.0,304.3508,515.8,397.3,2900.0,5450.0,13310.0
4,1,48,1,2,2.0,30.0,3.0,304.3508,515.8,397.3,2900.0,5450.0,13310.0


## Biogemeによる推定
### データの読み込み

In [126]:
# Wrap the prepared DataFrame in a Biogeme Database named 'junryudou';
# this object is what the BIOGEME estimator is constructed from below.
database = db.Database ('junryudou', df)

### 各列の数値の変数への変換

In [11]:
# Bind database columns to Biogeme Variable expressions so they can be used
# when writing the utility functions.

# Trip and traveller attributes.
origin, destination, purpose, mode, sex, age = map(
    Variable,
    ('O_code', 'D_code', 'purpose_code', 'mode_code', 'sex_code', 'age_code'),
)

# Travel time and travel cost of each alternative.
car_time, bus_time, rail_time = map(Variable, ('car_time', 'bus_time', 'rail_time'))
car_cost, bus_cost, rail_cost = map(Variable, ('car_cost', 'bus_cost', 'rail_cost'))

In [12]:
# Rescaled attributes used in the utility functions: times are divided by 10
# and costs by 100. B_time and B_cost are therefore expressed per 10 time
# units and per 100 cost units respectively.
TIME_DIVISOR = 10
COST_DIVISOR = 100

car_time_scaled = car_time / TIME_DIVISOR
rail_time_scaled = rail_time / TIME_DIVISOR
bus_time_scaled = bus_time / TIME_DIVISOR

car_cost_scaled = car_cost / COST_DIVISOR
rail_cost_scaled = rail_cost / COST_DIVISOR
bus_cost_scaled = bus_cost / COST_DIVISOR

## パラメータの定義

In [14]:
# Signature: parameter = Beta('name', value, lowerBound, upperBound, status)
# All parameters start at 0 with no bounds. status 0 -> estimated,
# status 1 -> held fixed at the given value.

ASC_car = Beta('ASC_car', 0, None, None, 0)
ASC_rail = Beta('ASC_rail', 0, None, None, 0)
ASC_bus = Beta ('ASC_bus', 0, None, None, 1)  # fixed at 0, not estimated (bus is the reference alternative)
B_time = Beta ('B_time', 0, None, None, 0)
B_cost = Beta ('B_cost', 0, None, None, 0)

## 効用関数の定義

In [16]:
# Utility of each alternative: its alternative-specific constant plus the
# time and cost coefficients (shared by all three alternatives) applied to
# the scaled time and cost of that alternative.
V_car = ASC_car + B_time * car_time_scaled + B_cost * car_cost_scaled
V_rail = ASC_rail + B_time * rail_time_scaled + B_cost * rail_cost_scaled
V_bus = ASC_bus + B_time * bus_time_scaled + B_cost * bus_cost_scaled

## 選択結果との対応付け

In [18]:
# Map each value of `mode_code` to the utility of that alternative.
# 5: private car, 2: rail, 4: intercity bus
V = {5: V_car, 2: V_rail, 4: V_bus }

## 選択可能性の設定

In [20]:
# Availability of each alternative, keyed by the same mode codes as V.
# The constant 1 marks all three alternatives as available for every observation.
av = {5: 1, 2: 1, 4: 1}

## モデルの定義と推定

In [22]:
# Log-probability of the chosen alternative (`mode`) under the logit model
# defined by the utilities V and the availabilities av.
logprob = models.loglogit(V, av, mode)
# BIOGEME instance that ties the database to the log-likelihood expression.
the_biogeme = bio.BIOGEME(database, logprob)
# Use `model_name`: the camelCase `modelName` attribute is deprecated in
# Biogeme 3.3 and emits a warning into the cell output.
the_biogeme.model_name = 'mode_MNL_scaled'
# Log-likelihood of the null model (equal probability for every available
# alternative); displayed as the cell output.
the_biogeme.calculate_null_loglikelihood(av)

File biogeme.toml has been created
  the_biogeme.modelName = 'mode_MNL_scaled'


-2435276.28249398

In [23]:
# Run the estimation by calling the estimate method of the BIOGEME instance `the_biogeme`.
MNL_results = the_biogeme.estimate()  # the returned results object is wrapped/inspected in the cells below

## 推定結果の出力

In [25]:
# Print the short text summary of the estimation (sample size, log likelihoods, rho-square, AIC/BIC).
print(MNL_results.short_summary())

Results for model mode_MNL_scaled
Nbr of parameters:		4
Sample size:			2216684
Excluded data:			0
Null log likelihood:		-2435276
Final log likelihood:		-1111394
Likelihood ratio test (null):		2647765
Rho square (null):			0.544
Rho bar square (null):			0.544
Akaike Information Criterion:	2222795
Bayesian Information Criterion:	2222846



### 結果取得用のライブラリをインポート

In [None]:
from biogeme.results_processing.estimation_results import EstimationResults
from biogeme.results_processing.variance_covariance import EstimateVarianceCovariance

### 推定結果を使いやすいクラスに渡す

In [None]:
# Make sure `res` is an EstimationResults object: reuse MNL_results when it
# already is one, otherwise (raw estimation results) wrap it.
if isinstance(MNL_results, EstimationResults):
    res = MNL_results
else:
    res = EstimationResults(MNL_results)


### 推定されたパラメータを取得

In [83]:
# Estimated values (betas), as a dict keyed by parameter name
betas: dict[str, float] = res.get_beta_values()

### 推定されたパラメータの分散共分散行列を取得

In [86]:
# Get the conventional (Rao-Cramer) variance-covariance matrix of the estimates
vc = res.get_variance_covariance_matrix(
    variance_covariance_type=EstimateVarianceCovariance.RAO_CRAMER # specify ROBUST instead to obtain the robust values
)


### 統計指標の計算と出力

In [89]:
# Standard errors and t statistics of the estimates.
# NOTE(review): assumes the rows/columns of `vc` follow the same parameter
# order as the keys of `betas` - confirm against the Biogeme API.
names = list(betas)  # parameter names, in the dict's key order
values = np.array([betas[name] for name in names], dtype=float)
cov = np.asarray(vc, dtype=float)
se = np.sqrt(np.diag(cov))  # standard error = sqrt of each diagonal variance
t_vals = values / se

# p-values (normal approximation)
from math import erf, erfc, sqrt
def p_from_t(t):
    """Return the two-sided p-value of a t statistic under the standard normal.

    Mathematically this is 2 * (1 - Phi(|t|)), with Phi the standard normal
    CDF, which equals erfc(|t| / sqrt(2)).

    The value is computed with erfc directly rather than as 1 - erf(...):
    the subtraction cancels catastrophically in the upper tail and returns
    exactly 0.0 once |t| exceeds roughly 8.3, whereas erfc stays accurate
    until the result underflows.

    Parameters
    ----------
    t : float
        Test statistic (its sign is ignored).

    Returns
    -------
    float
        Two-sided p-value in [0, 1].
    """
    return erfc(abs(t) / sqrt(2.0))

p_vals = list(map(p_from_t, t_vals))

# Show the results as a table: one row per parameter, with its estimate,
# non-robust standard error, t statistic and p-value.
summary_columns = {
    "Value": values,
    "Std err. (non-robust)": se,
    "t-test (non-robust)": t_vals,
    "p-value (non-robust)": p_vals,
}
df_nonrobust = pd.DataFrame(summary_columns, index=names)
display(df_nonrobust)


Unnamed: 0,Value,Std err. (non-robust),t-test (non-robust),p-value (non-robust)
ASC_car,2.465207,0.005218,472.440131,0.0
B_time,-0.155094,0.000248,-625.896909,0.0
B_cost,-0.014733,0.00012,-122.731146,0.0
ASC_rail,0.665536,0.006766,98.370786,0.0


## 考察の準備

In [128]:
# Show the full-precision estimates (the table above is rounded for display).
print(betas)

{'ASC_car': 2.46520686356558, 'B_time': -0.15509366774385386, 'B_cost': -0.01473261969528295, 'ASC_rail': 0.6655360065406377}


### 推定されたパラメータを取得して変数に代入

In [131]:
# Pull the individual estimates out of `betas` for the prediction step.
# ASC_bus is not in `betas`: it was held fixed at 0 rather than estimated.
res_ASC_car, res_ASC_rail, res_B_time, res_B_cost = (
    betas[name] for name in ('ASC_car', 'ASC_rail', 'B_time', 'B_cost')
)

### 推定されたパラメータで予測値と的中率を計算

In [134]:
# Predicted utilities from the estimated parameters, using the same
# scaling as in the estimation (time / 10, cost / 100).
car_time_part = res_B_time * df['car_time'] / 10
car_cost_part = res_B_cost * df['car_cost'] / 100
df['pred_V_car'] = res_ASC_car + car_time_part + car_cost_part

rail_time_part = res_B_time * df['rail_time'] / 10
rail_cost_part = res_B_cost * df['rail_cost'] / 100
df['pred_V_rail'] = res_ASC_rail + rail_time_part + rail_cost_part

# No constant for bus: ASC_bus was fixed at 0.
bus_time_part = res_B_time * df['bus_time'] / 10
bus_cost_part = res_B_cost * df['bus_cost'] / 100
df['pred_V_bus'] = bus_time_part + bus_cost_part

In [168]:
# Logit choice probabilities: exp(V_i) / sum_j exp(V_j).
# Each exponential is evaluated once and reused for numerator and denominator.
exp_car = np.exp(df['pred_V_car'])
exp_rail = np.exp(df['pred_V_rail'])
exp_bus = np.exp(df['pred_V_bus'])
den = exp_car + exp_rail + exp_bus

df['pred_P_car'] = exp_car / den
df['pred_P_rail'] = exp_rail / den
df['pred_P_bus'] = exp_bus / den

In [170]:
# Probability column -> mode code (5: car, 2: rail, 4: bus).
code_map = {'pred_P_car': 5, 'pred_P_rail': 2, 'pred_P_bus': 4}
cols = list(code_map)

# The predicted mode is the alternative with the highest predicted probability.
most_likely_col = df[cols].idxmax(axis=1)
df['pred_mode'] = most_likely_col.map(code_map).astype('Int64')

In [171]:
# Label each row by whether the predicted mode equals the observed one.
is_hit = df['pred_mode'].astype('Int64') == df['mode_code'].astype('Int64')
df['judge'] = np.where(is_hit, 'correct', 'wrong')

In [174]:
n_correct = (df["judge"] == "correct").sum()
n_wrong = (df["judge"] == "wrong").sum()
n_judged = n_correct + n_wrong

# Sanity check: every row must be labelled either 'correct' or 'wrong'.
print(n_judged == len(df))

# Hit rate (%): share of rows whose predicted mode matches the observed mode.
acc = n_correct / n_judged * 100
print(f"的中率: {acc}%")

True
的中率: 82.66884228875203%
