# Implementing Thompson sampling

Now, let's learn how to implement the Thompson sampling method to find the best arm.


# Reference: 
    
Deep Reinforcement Learning with Python

By: Sudharsan Ravichandiran

In [28]:
import gym
import gym_bandits
import numpy as np

## Creating the bandit environment

In [29]:
from bandits import BanditTwoArmedHighLowFixed
# Create the bandit environment that is stepped once per round further down.
# NOTE(review): reward behaviour comes from the `bandits` module, which is not shown here — confirm there.
env = BanditTwoArmedHighLowFixed()

Let's check the probability distribution of the arms:

In [30]:
# Show the environment's p_dist attribute (one entry per arm in the output below).
print(env.p_dist)

[0.8, 0.2]


In [31]:
# Pull counter per arm (two arms); incremented each time an arm is played.
count = np.zeros(shape=(2,))

In [32]:
# Cumulative reward collected from each of the two arms.
sum_rewards = np.zeros(shape=(2,))

In [33]:
# Average reward per arm (sum_rewards / count), refreshed after every pull.
Q = np.zeros(shape=(2,))

In [34]:
# Number of rounds to play, i.e. how many times an arm is selected and pulled.
num_rounds = 300

In [35]:
# Per-arm alpha parameter of the beta distribution; starts at 1 and is
# incremented whenever the arm returns a reward of 1.
alpha = np.ones(shape=(2,))

In [36]:
# Per-arm beta parameter of the beta distribution; starts at 1 and is
# incremented whenever the arm returns a reward other than 1.
beta = np.ones(shape=(2,))

In [37]:
def thompson_sampling(alpha,beta):
    """Select an arm with Thompson sampling.

    For every arm one value is drawn from a beta distribution with shape
    parameters ``alpha[i] + 1`` and ``beta[i] + 1``; the arm whose draw is
    largest is selected. The number of arms is taken from the length of
    ``alpha`` instead of being fixed at two.

    Parameters
    ----------
    alpha : sequence of float
        Per-arm alpha values (must keep ``alpha[i] + 1 > 0``).
    beta : sequence of float
        Per-arm beta values, same length as ``alpha``.

    Returns
    -------
    int
        Zero-based index of the arm with the largest sampled value.

    Raises
    ------
    ValueError
        If ``alpha`` and ``beta`` have different lengths, or are empty
        (the latter is raised by ``np.argmax``).
    """
    if len(alpha) != len(beta):
        raise ValueError("alpha and beta must have the same length")

    # Draw arm by arm, in index order, so that for a given seed the random
    # stream is consumed exactly as in the two-arm case.
    samples = [np.random.beta(a + 1, b + 1) for a, b in zip(alpha, beta)]

    return np.argmax(samples)

In [38]:
for round_idx in range(num_rounds):

    # let Thompson sampling decide which arm to play this round
    arm = thompson_sampling(alpha, beta)

    # play the chosen arm; only the reward is used inside this loop
    next_state, reward, done, info = env.step(arm)

    # bookkeeping for the played arm: cumulative reward and pull counter
    sum_rewards[arm] += reward
    count[arm] += 1

    # refresh the running average reward of that arm
    Q[arm] = sum_rewards[arm] / count[arm]

    # a reward of exactly 1 counts as a win and raises alpha for the arm;
    # any other reward raises beta instead
    if reward == 1:
        alpha[arm] += 1
    else:
        beta[arm] += 1
    

In [39]:
# Show the average reward observed for each arm after all rounds.
print(Q)

[0.78716216 0.        ]


In [40]:
# np.argmax yields a 0-based index; add 1 so the arm is reported with a 1-based label.
best_arm_label = np.argmax(Q) + 1
print('The optimal arm is arm {}'.format(best_arm_label))

The optimal arm is arm 1
