<a href="https://colab.research.google.com/github/maiquangtuan/Reinforcement-learning-for-beginer/blob/main/Multi_arm_bandit_problem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The multi-armed bandit problem is a classical problem in reinforcement learning. You are given a set of actions to choose from. Each action returns a reward drawn from a stationary distribution. We will build an agent that tries to maximize the total reward.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In this setting, we define an environment for the agent: when the agent takes an action a from the action space, the reward is either zero or one, drawn from a stationary distribution (each action has its own fixed probability of returning one).

In [None]:
class Environment():
  """Bernoulli bandit: every arm pays 1 with a fixed probability, else 0."""

  def __init__(self, probs):
    """Store the per-arm success probabilities.

    Args:
      probs: Indexable collection; ``probs[a]`` is the chance that arm
        ``a`` yields a reward of 1.
    """
    self.probs = probs

  def step(self, action):
    """Pull arm ``action`` once and return the reward (1 or 0)."""
    # One uniform draw from [0, 1) per pull decides success.
    draw = np.random.random()
    if draw < self.probs[action]:
      return 1
    return 0

In [None]:
class Agent():
  """Epsilon-greedy agent that keeps a running mean reward per action."""

  def __init__(self, n_actions, eps):
    """Create an agent with zeroed counts and value estimates.

    Args:
      n_actions: Number of available actions (arms).
      eps: Probability of picking a uniformly random action.
    """
    self.n_actions = n_actions
    self.eps = eps
    # n[a]: how many times action a was taken; q[a]: its estimated value.
    self.n = np.zeros(n_actions, dtype = int)
    self.q = np.zeros(n_actions, dtype = float)

  def update_Q(self, action, reward):
    """Fold ``reward`` into the running average for ``action``."""
    self.n[action] += 1
    # Incremental mean: move the estimate by 1/n of the prediction error.
    step_size = 1.0/self.n[action]
    error = reward - self.q[action]
    self.q[action] += step_size*error

  def get_action(self):
    """Return the next action under the epsilon-greedy policy."""
    explore = np.random.random() < self.eps
    if explore:
      return np.random.randint(self.n_actions)
    # Exploit: choose among all actions tied for the highest estimate,
    # breaking ties at random.
    best_actions = np.flatnonzero(self.q == self.q.max())
    return np.random.choice(best_actions)

In [None]:
def experiment(probs, N_episodes, epsilon=None):
    """Run one epsilon-greedy bandit experiment and record its history.

    Args:
        probs: Per-arm success probabilities, passed to ``Environment``.
        N_episodes: Number of action/reward steps to simulate.
        epsilon: Exploration probability handed to ``Agent``. When left as
            ``None`` the module-level ``eps`` is used, which is what the
            two-argument call has always relied on.

    Returns:
        A tuple ``(actions, rewards)`` of NumPy arrays holding the action
        taken and the reward received at each step.

    Raises:
        NameError: If ``epsilon`` is omitted and no module-level ``eps``
            has been defined yet.
    """
    if epsilon is None:
        # Backward-compatible fallback: read the global exploration rate.
        epsilon = eps
    env = Environment(probs)  # initialize arm probabilities
    agent = Agent(len(env.probs), epsilon)  # initialize agent
    actions, rewards = [], []
    for _ in range(N_episodes):
        action = agent.get_action()  # sample policy
        reward = env.step(action)  # take step + get reward
        agent.update_Q(action, reward)  # update Q
        actions.append(action)
        rewards.append(reward)
    return np.array(actions), np.array(rewards)

In [None]:
import os
# --- Configuration ---
probs = [0.10, 0.50, 0.60, 0.80, 0.10,
         0.25, 0.60, 0.45, 0.75, 0.65] # bandit arm probabilities of success
N_experiments = 10000 # number of experiments to perform
N_steps = 500 # number of steps (episodes)
eps = 0.1 # probability of random exploration (fraction)
save_fig = True # save file in same directory
output_dir = os.path.join(os.getcwd(), "output")

# Run multi-armed bandit experiments
print("Running multi-armed bandits with nActions = {}, eps = {}".format(len(probs), eps))
R = np.zeros((N_steps,))  # reward history sum
A = np.zeros((N_steps, len(probs)))  # action history sum
step_rows = np.arange(N_steps)  # row index for each step of one experiment
# Report about 100 times per run; integer interval, never zero.
report_every = max(1, N_experiments // 100)
for i in range(N_experiments):
    actions, rewards = experiment(probs, N_steps)  # perform experiment
    if (i + 1) % report_every == 0:
        print("[Experiment {}/{}] ".format(i + 1, N_experiments) +
              "n_steps = {}, ".format(N_steps) +
              "reward_avg = {}".format(np.sum(rewards) / len(rewards)))
    R += rewards
    # Count the chosen arm at every step in one vectorised update; each
    # (step, arm) pair occurs once per experiment so += is safe here.
    A[step_rows, actions] += 1

# 1-based step numbers shared by both plots so they match the x-limits.
steps = list(range(1, N_steps + 1))

# Plot reward results
R_avg = R / N_experiments  # np.float was removed from NumPy; plain division suffices
plt.plot(steps, R_avg, ".")
plt.xlabel("Step")
plt.ylabel("Average Reward")
plt.grid()
ax = plt.gca()
plt.xlim([1, N_steps])
if save_fig:
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, "rewards.png"), bbox_inches="tight")
else:
    plt.show()
plt.close()

# Plot action results
for i, prob in enumerate(probs):
    A_pct = 100 * A[:,i] / N_experiments
    plt.plot(steps, A_pct, "-",
             linewidth=4,
             label="Arm {} ({:.0f}%)".format(i+1, 100*prob))
plt.xlabel("Step")
plt.ylabel("Count Percentage (%)")
leg = plt.legend(loc='upper left', shadow=True)
plt.xlim([1, N_steps])
plt.ylim([0, 100])
# get_lines() is used instead of the legendHandles attribute, which newer
# matplotlib releases no longer provide.
for legobj in leg.get_lines():
    legobj.set_linewidth(4.0)
if save_fig:
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, "actions.png"), bbox_inches="tight")
else:
    plt.show()
plt.close()

Running multi-armed bandits with nActions = 10, eps = 0.1
[Experiment 100/10000] n_steps = 500, reward_avg = 0.638
[Experiment 200/10000] n_steps = 500, reward_avg = 0.694
[Experiment 300/10000] n_steps = 500, reward_avg = 0.7
[Experiment 400/10000] n_steps = 500, reward_avg = 0.67
[Experiment 500/10000] n_steps = 500, reward_avg = 0.754
[Experiment 600/10000] n_steps = 500, reward_avg = 0.742
[Experiment 700/10000] n_steps = 500, reward_avg = 0.782
[Experiment 800/10000] n_steps = 500, reward_avg = 0.73
[Experiment 900/10000] n_steps = 500, reward_avg = 0.758
[Experiment 1000/10000] n_steps = 500, reward_avg = 0.714
[Experiment 1100/10000] n_steps = 500, reward_avg = 0.722
[Experiment 1200/10000] n_steps = 500, reward_avg = 0.73
[Experiment 1300/10000] n_steps = 500, reward_avg = 0.676
[Experiment 1400/10000] n_steps = 500, reward_avg = 0.78
[Experiment 1500/10000] n_steps = 500, reward_avg = 0.738
[Experiment 1600/10000] n_steps = 500, reward_avg = 0.786
[Experiment 1700/10000] n_ste