<a href="https://colab.research.google.com/github/oreilly-japan/ml-at-work/blob/master/chap11/02_bandit_algorithm_compare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

![表紙](https://www.oreilly.co.jp/books/images/picture978-4-87311-947-2.gif)

このノートブックはオライリー・ジャパンより発行の書籍[『仕事ではじめる機械学習 第2版』](https://www.oreilly.co.jp/books/9784873119472/)のサンプルコードです。コードの解説等は書籍をご参照ください。なお、このコードを動作させた結果について、著者およびオライリー・ジャパンは一切の責任を負いません。

## 11.7 各種バンディットアルゴリズムの比較

In [3]:
import numpy as np
import scipy
import scipy.stats
import matplotlib.pyplot as plt

In [4]:
def random_strategy(success_counts, fail_counts):
    """Pick an arm uniformly at random, ignoring the observed results.

    Args:
        success_counts: Per-arm success counts; only its length is used.
        fail_counts: Per-arm failure counts; unused, accepted so that every
            strategy can be called with the same arguments.

    Returns:
        Index of the chosen arm, drawn with ``np.random.choice``.
    """
    n_arms = len(success_counts)
    return np.random.choice(n_arms)

def baysian_ucb_strategy(success_counts, fail_counts, q=0.95):
    """Pick the arm whose Beta distribution has the largest ``q``-quantile.

    Each arm is scored with the ``q``-quantile of
    ``Beta(success_counts + 1, fail_counts + 1)``.

    Args:
        success_counts: NumPy array of per-arm success counts.
        fail_counts: NumPy array of per-arm failure counts.
        q: Quantile used as the score of each arm.

    Returns:
        Index of the arm with the highest quantile (first one on ties).
    """
    arm_distribution = scipy.stats.beta(success_counts + 1, fail_counts + 1)
    upper_bounds = arm_distribution.ppf(q)
    return np.argmax(upper_bounds)

def ucb1_strategy(success_counts, fail_counts):
    """Pick an arm with the UCB1 rule.

    Each arm is scored as ``mean + sqrt(2 * ln(total) / trials)`` where
    ``mean`` is its observed success rate, ``trials`` its number of pulls and
    ``total`` the number of pulls over all arms.

    Args:
        success_counts: Per-arm success counts (array-like).
        fail_counts: Per-arm failure counts (array-like).

    Returns:
        Index of the selected arm. If any arm has not been tried yet, the
        first such arm is returned so that every arm is tried once before
        the scores are compared.
    """
    success_counts = np.asarray(success_counts, dtype=float)
    fail_counts = np.asarray(fail_counts, dtype=float)
    trial_counts = success_counts + fail_counts

    # An untried arm has an undefined mean (0/0) and an unbounded confidence
    # term; select it explicitly instead of relying on NaN ordering in argmax.
    untried = trial_counts == 0
    if untried.any():
        return np.argmax(untried)

    mean = success_counts / trial_counts
    total_count = np.sum(trial_counts)
    ucb = (2.0 * np.log(total_count) / trial_counts) ** 0.5
    return np.argmax(mean + ucb)

def softmax_strategy(success_counts, fail_counts, t=0.05):
    """Pick an arm at random with softmax probabilities over success rates.

    The selection probability of an arm is proportional to
    ``exp(mean / t)`` where ``mean`` is its observed success rate.

    Args:
        success_counts: Per-arm success counts (array-like).
        fail_counts: Per-arm failure counts (array-like).
        t: Temperature; smaller values concentrate the probability on the
            arms with the highest observed success rate.

    Returns:
        Index of the arm drawn with ``np.random.choice``.
    """
    success_counts = np.asarray(success_counts, dtype=float)
    trial_counts = success_counts + np.asarray(fail_counts, dtype=float)

    # Arms without any trial would give 0/0 = NaN and make the probability
    # vector invalid; their success rate is estimated as 0.0 instead.
    mean = np.divide(
        success_counts,
        trial_counts,
        out=np.zeros_like(trial_counts),
        where=trial_counts > 0,
    )

    # Subtracting the maximum leaves the softmax unchanged mathematically but
    # keeps exp() from overflowing to inf (and the ratio to NaN) for small t.
    logits = mean / t
    weights = np.exp(logits - np.max(logits))
    select_rate = weights / np.sum(weights)
    return np.random.choice(len(select_rate), p=select_rate)

def softmax_annealing_strategy(success_counts, fail_counts, initial_t=0.1, k=100.0):
    """Softmax arm selection whose temperature decreases with the trial count.

    The temperature is ``initial_t / ln(k * total + 2)`` where ``total`` is
    the number of trials over all arms, so the selection becomes greedier as
    more results are observed. The selection probability of an arm is
    proportional to ``exp(mean / temperature)``.

    Args:
        success_counts: Per-arm success counts (array-like).
        fail_counts: Per-arm failure counts (array-like).
        initial_t: Numerator of the temperature schedule.
        k: Multiplier applied to the total trial count inside the logarithm.

    Returns:
        Index of the arm drawn with ``np.random.choice``.
    """
    success_counts = np.asarray(success_counts, dtype=float)
    trial_counts = success_counts + np.asarray(fail_counts, dtype=float)

    # Arms without any trial would give 0/0 = NaN and make the probability
    # vector invalid; their success rate is estimated as 0.0 instead.
    mean = np.divide(
        success_counts,
        trial_counts,
        out=np.zeros_like(trial_counts),
        where=trial_counts > 0,
    )

    # The "+ 2" keeps the logarithm positive even before the first trial.
    t = initial_t / np.log(k * np.sum(trial_counts) + 2)

    # The temperature keeps shrinking, so mean / t grows without bound;
    # subtracting the maximum prevents exp() from overflowing to inf.
    logits = mean / t
    weights = np.exp(logits - np.max(logits))
    select_rate = weights / np.sum(weights)
    return np.random.choice(len(select_rate), p=select_rate)

def thompson_sampling_strategy(success_counts, fail_counts):
    """Pick an arm by Thompson sampling.

    One random value per arm is drawn from
    ``Beta(success_counts + 1, fail_counts + 1)`` and the arm with the
    largest draw is selected.

    Args:
        success_counts: NumPy array of per-arm success counts.
        fail_counts: NumPy array of per-arm failure counts.

    Returns:
        Index of the arm with the largest sampled value.
    """
    alpha = success_counts + 1
    beta = fail_counts + 1
    sampled_rates = scipy.stats.beta.rvs(alpha, beta)
    return np.argmax(sampled_rates)

In [None]:
# Simulate every strategy on the same Bernoulli bandit and record, per round,
# the cumulative conversion rate and the share of rounds given to arm 0.

# True conversion rate of each arm; arm 0 has the highest rate.
actual_cvr = [0.12, 0.11, 0.10]
bandit_round = 10000
random_seed = 1234
# Number of initial rounds served uniformly at random before the strategy
# under test takes over.
warmup_rounds = 1000

strategy_list = [
    ("Random", random_strategy),
    ("Baysian_UCB", baysian_ucb_strategy),
    ("UCB1", ucb1_strategy),
    ("Softmax", softmax_strategy),
    ("Softmax_annealing", softmax_annealing_strategy),
    ("Thompson_sampling", thompson_sampling_strategy),
]

# One inner list per strategy (same order as strategy_list), one value per round.
scores = []
arm1_select_rates = []

for name, select_arm_method in strategy_list:
    # Reset the random seed so every strategy starts from the same state.
    # scipy draws from numpy's random generator, so this seeds scipy as well.
    np.random.seed(random_seed)

    # Size the counters from actual_cvr so changing the number of arms only
    # requires editing that list.
    success_counts = np.zeros(len(actual_cvr))
    fail_counts = np.zeros(len(actual_cvr))

    scores.append([])
    arm1_select_rates.append([])

    for i in range(bandit_round):
        if i < warmup_rounds:  # serve at random during the warm-up rounds
            selected_arm = random_strategy(success_counts, fail_counts)
        else:
            selected_arm = select_arm_method(success_counts, fail_counts)

        # Decide whether the selected arm converted in this round.
        if np.random.rand() < actual_cvr[selected_arm]:
            success_counts[selected_arm] += 1
        else:
            fail_counts[selected_arm] += 1

        total_count = np.sum(success_counts + fail_counts)

        # Cumulative conversion rate over all arms so far.
        score = np.sum(success_counts) / total_count
        scores[-1].append(score)

        # Share of all rounds so far in which arm 0 was selected.
        arm1_select_rate = (success_counts[0] + fail_counts[0]) / total_count
        arm1_select_rates[-1].append(arm1_select_rate)

In [None]:
# Plot the cumulative conversion rate of every strategy over the rounds and
# print each strategy's value after the last round.
plt.figure(dpi=400)
for (algorithm_name, _), strategy_scores in zip(strategy_list, scores):
    print(algorithm_name, strategy_scores[-1])
    plt.plot(strategy_scores, label=algorithm_name)

# The y-range is the same for every curve, so set it once after plotting.
plt.ylim(0.0, 0.2)

# Anchor the legend's upper-left corner at the axes' upper-right corner,
# i.e. draw it outside the plot area.
plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
plt.show()

In [None]:
# Plot, for every strategy, the share of rounds in which arm 0 was selected
# and print each strategy's value after the last round.
plt.figure(dpi=400)
for (algorithm_name, _), select_rates in zip(strategy_list, arm1_select_rates):
    print(algorithm_name, select_rates[-1])
    plt.plot(select_rates, label=algorithm_name)

# The y-range is the same for every curve, so set it once after plotting.
plt.ylim(0, 1)

# Anchor the legend's upper-left corner at the axes' upper-right corner,
# i.e. draw it outside the plot area.
plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
plt.show()