In [22]:
import random
from typing import Callable, Optional, TypedDict
import numpy as np
from obp.dataset import SyntheticBanditDataset, OpenBanditDataset
from obp.policy import BernoulliTS
from obp.ope import ReplayMethod, InverseProbabilityWeighting, BaseOffPolicyEstimator
from obp.ope import OffPolicyEvaluation, InverseProbabilityWeighting as IPW

import polars as pl

In [2]:
class BanditFeedbackDict(TypedDict):
    n_rounds: int  # ラウンド数
    n_actions: int  # アクション数s
    context: np.ndarray  # 文脈 (shape: (n_rounds, dim_context))
    action_context: np.ndarray  # アクション特徴量 (shape: (n_actions, dim_action_features))
    action: np.ndarray  # 実際に選択されたアクション (shape: (n_rounds,))
    position: Optional[np.ndarray]  # ポジション (shape: (n_rounds,) or None)
    reward: np.ndarray  # 報酬 (shape: (n_rounds,))
    expected_reward: np.ndarray  # 期待報酬 (shape: (n_rounds, n_actions))
    pi_b: np.ndarray  # データ収集方策 P(a|x) (shape: (n_rounds, n_actions))
    pscore: np.ndarray  # 傾向スコア (shape: (n_rounds,))

In [33]:
# 実際にzozotownで収集されたバンディットフィードバックデータの使い方確認

# データセットクラスのインスタンス化
dataset = OpenBanditDataset(behavior_policy="random", campaign="all")
# 属性の確認
print(f"Number of Rounds: {dataset.n_rounds}")
print(f"Number of Actions: {dataset.n_actions}")
print(f"slate size: {dataset.len_list}")

# バンディットフィードバックを辞書形式で取得
bandit_feedback, _ = dataset.obtain_batch_bandit_feedback(test_size=0.3, is_timeseries_split=True)
print(bandit_feedback.keys())

# データ収集方策の性能のon-policy評価結果
print(f"{OpenBanditDataset.calc_on_policy_policy_value_estimate('bts', 'all')}")
print(f"{OpenBanditDataset.calc_on_policy_policy_value_estimate('random', 'all')}")
print("--------------------")

# バンディットフィードバックの中身を確認
for round_idx in range(3):
    print(f"Round: {round_idx}")
    print(f"Context: {bandit_feedback['context'][round_idx]}")
    print(f"Position: {bandit_feedback['position'][round_idx]}")
    print(f"Action: {bandit_feedback['action'][round_idx]}")
    print(f"Action context: {bandit_feedback['action_context'][round_idx]}")
    print(f"Reward: {bandit_feedback['reward'][round_idx]}")
    print(f"Propensity Score: {bandit_feedback['pscore'][round_idx]}")
    print("--------------------")

INFO:obp.dataset.real:When `data_path` is not given, this class downloads the small-sized version of Open Bandit Dataset.
INFO:obp.dataset.real:When `data_path` is not given, this class downloads the small-sized version of Open Bandit Dataset.
INFO:obp.dataset.real:When `data_path` is not given, this class downloads the small-sized version of Open Bandit Dataset.


Number of Rounds: 10000
Number of Actions: 80
slate size: 3
dict_keys(['n_rounds', 'n_actions', 'action', 'position', 'reward', 'pscore', 'context', 'action_context'])
0.0042
0.0038
--------------------
Round: 0
Context: [1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1]
Position: 2
Action: 14
Action context: [ 5.         10.          4.         -0.49917163]
Reward: 0
Propensity Score: 0.0125
--------------------
Round: 1
Context: [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
Position: 2
Action: 14
Action context: [ 1.         10.          4.         -0.54377537]
Reward: 0
Propensity Score: 0.0125
--------------------
Round: 2
Context: [1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0]
Position: 2
Action: 27
Action context: [ 1.         12.          1.          0.97275186]
Reward: 0
Propensity Score: 0.0125
--------------------


In [37]:
# オフ方策学習
evaluation_policy = BernoulliTS(
    n_actions=dataset.n_actions,
    len_list=dataset.len_list,
    is_zozotown_prior=False,
    # zozoが用意した事前分布のパラメータを使う場合
    # is_zozotown_prior=True,
    # campaign="all",
    # random_state=12345,
)
# パラメータ更新
for round_idx in range(bandit_feedback["n_rounds"]):
    action = bandit_feedback["action"][round_idx]
    reward = bandit_feedback["reward"][round_idx]
    evaluation_policy.update_params(action=action, reward=reward)
action_dist = evaluation_policy.compute_batch_action_dist(n_rounds=bandit_feedback["n_rounds"], n_sim=100000)
print(f"{action_dist.shape}")  # (n_rounds, n_actions, len_list)

(7000, 80, 3)


In [38]:
# オフ方策評価
ope = OffPolicyEvaluation(bandit_feedback=bandit_feedback, ope_estimators=[IPW()])
estimated_policy_value = ope.estimate_policy_values(action_dist=action_dist)

# オフ方策学習した新方策の性能のオフライン評価値と、データ収集方策の性能のオンライン評価値を比較
relative_policy_value_of_bernoulli_ts = estimated_policy_value["ipw"] / bandit_feedback["reward"].mean()
print(relative_policy_value_of_bernoulli_ts)

2.9904571428571427
