In [1]:
import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides any warnings emitted by the model
# fits later in the notebook — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from pandas import DataFrame
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.utils import check_random_state
import seaborn as sns
import matplotlib.pyplot as plt
import japanize_matplotlib

# Apply the "ggplot" matplotlib style to figures drawn after this point.
plt.style.use("ggplot")
# Caption per metric key: "se" -> left panel, mean squared error;
# "bias" -> middle panel, squared bias; "variance" -> right panel, variance.
# Presumably consumed as y-axis labels by the plotting cells — TODO confirm.
y_label_dict = dict(
    se="左図：平均二乗誤差",
    bias="中図：二乗バイアス",
    variance="右図：バリアンス",
)

# import open bandit pipeline (obp)
import obp
from obp.dataset import (
    SyntheticBanditDatasetWithActionEmbeds as SyntheticBanditDataset,
    logistic_polynomial_reward_function,
)
from obp.ope import (
    OffPolicyEvaluation,
    RegressionModel,
    InverseProbabilityWeighting as IPS,
    DirectMethod as DM,
    DoublyRobust as DR,
)
from utils import eps_greedy_policy, aggregate_simulation_results

In [2]:
# Print the installed obp version so the results can be tied to a specific release.
print(obp.__version__)

0.5.7


## (データ収集方策が収集した)ログデータのサイズを変化させたときのDM・IPS・DR推定量の平均二乗誤差・二乗バイアス・バリアンスの挙動


In [9]:
## Simulation settings

# Seed shared by the synthetic dataset and the models, plus a RandomState built from it.
random_state = 12345
random_ = check_random_state(random_state)

# Problem size.
dim_context = 10  # dimension of the context vector x
n_actions = 20  # number of actions, |A|
beta = 1  # behavior-policy parameter (changed from -3 to 1: a more stable value)

# Experiment size.
num_runs = 500  # number of simulation repetitions
test_data_size = 100_000  # size of the test data used to approximate the evaluation policy's true value
num_data_list = [250, 500, 1000, 2000, 4000, 8000]  # sizes of the logged data collected by the behavior policy

In [13]:
def _fit_reward_estimates(base_model, logged_data, n_actions):
    """Fit a reward model q_hat(x, a) on logged data and return its predictions.

    Parameters
    ----------
    base_model
        Unfitted scikit-learn classifier wrapped by obp's ``RegressionModel``.
    logged_data
        Bandit feedback; its "context", "action" and "reward" entries are used.
    n_actions
        Number of actions passed to ``RegressionModel``.

    Returns
    -------
    Whatever ``RegressionModel.fit_predict`` returns; it is later handed to the
    estimators through ``estimated_rewards_by_reg_model``.
    """
    reward_model = RegressionModel(n_actions=n_actions, base_model=base_model)
    return reward_model.fit_predict(
        context=logged_data["context"],  # context; x
        action=logged_data["action"],  # action; a
        reward=logged_data["reward"],  # reward; r
        random_state=random_state,
    )


# Width of the per-action feature vectors handed to the dataset as `action_context`.
dim_action_context = 10

## Synthetic data generator.
## Built once, outside the loop over log sizes, so that every `num_data` is evaluated
## on the same environment and against the same ground-truth policy value (the action
## context used to be re-drawn from `random_` for each size).
dataset = SyntheticBanditDataset(
    n_actions=n_actions,
    dim_context=dim_context,
    # scale reduced to 0.5 (author's note: for stability)
    action_context=random_.normal(size=(n_actions, dim_action_context), scale=0.5),
    beta=beta,
    reward_function=logistic_polynomial_reward_function,
    random_state=random_state,
)

## Test data used to approximate the true performance (policy value) of the evaluation policy
test_data = dataset.obtain_batch_bandit_feedback(n_rounds=test_data_size)

## Approximate the true policy value of the evaluation policy
policy_value = dataset.calc_ground_truth_policy_value(
    expected_reward=test_data["expected_reward"],
    action_dist=eps_greedy_policy(test_data["expected_reward"]),
)

# NOTE(review): the saved output of this cell ends with
# "ValueError: p < 0, p > 1 or p contains NaNs" roughly 31 runs into num_data=250.
# The traceback was not preserved, so the failing call is unknown. Presumably it is a
# Bernoulli reward draw inside `obtain_batch_bandit_feedback` receiving a probability
# outside [0, 1] — TODO confirm, and check whether SyntheticBanditDatasetWithActionEmbeds
# (imported under the name SyntheticBanditDataset) is really the intended dataset class.
result_df_list = []
for num_data in num_data_list:
    estimated_policy_value_list = []
    for _ in tqdm(range(num_runs), desc=f"num_data={num_data}..."):
        ## Generate logged data from the distribution induced by the behavior policy
        offline_logged_data = dataset.obtain_batch_bandit_feedback(n_rounds=num_data)

        ## Action-choice probabilities of the evaluation policy on the logged data
        pi = eps_greedy_policy(offline_logged_data["expected_reward"])

        ## Estimated expected-reward functions \hat{q}(x,a): logistic regression and MLP
        estimated_rewards_lr = _fit_reward_estimates(
            LogisticRegression(C=100, random_state=random_state),
            offline_logged_data,
            dataset.n_actions,
        )
        estimated_rewards_mlp = _fit_reward_estimates(
            MLPClassifier(hidden_layer_sizes=(10, 10), random_state=random_state),
            offline_logged_data,
            dataset.n_actions,
        )

        ## Run off-policy evaluation on the logged data
        ope = OffPolicyEvaluation(
            bandit_feedback=offline_logged_data,
            ope_estimators=[
                IPS(estimator_name="IPS"),
                DR(estimator_name="DR"),
                DM(estimator_name="lr"),
                DM(estimator_name="mlp"),
            ],
        )
        estimated_policy_values = ope.estimate_policy_values(
            action_dist=pi,  # \pi(a|x)
            # DR shares the MLP reward model; the two DM variants use LR and MLP respectively.
            estimated_rewards_by_reg_model={
                "DR": estimated_rewards_mlp,
                "lr": estimated_rewards_lr,
                "mlp": estimated_rewards_mlp,
            },
        )
        estimated_policy_value_list.append(estimated_policy_values)

    ## Aggregate the simulation results for this log size
    result_df_list.append(
        aggregate_simulation_results(
            estimated_policy_value_list,
            policy_value,
            "num_data",
            num_data,
        )
    )
result_df = pd.concat(result_df_list).reset_index(level=0)

num_data=250...:   6%|▌         | 31/500 [00:03<00:51,  9.13it/s]


ValueError: p < 0, p > 1 or p contains NaNs