# Implementing epsilon-greedy 

# Reference: 
    
Deep Reinforcement Learning with Python

By: Sudharsan Ravichandiran



In [33]:
import gym
import gym_bandits
import numpy as np

In [34]:
# from gym import envs
# print(envs.registry)

## Creating the bandit environment

In [35]:
from bandits import BanditTwoArmedHighLowFixed
env = BanditTwoArmedHighLowFixed()

Let's check the probability distribution of the arm:

In [36]:
print(env.p_dist)

[0.8, 0.2]


In [37]:
count = np.zeros(2)

In [38]:
sum_rewards = np.zeros(2)

In [39]:
Q = np.zeros(2)

Define `num_rounds` - number of rounds (iterations):

In [40]:
num_rounds = 1

In [41]:
def epsilon_greedy(epsilon):
    
    if np.random.uniform(0,1) < epsilon:
        return env.action_space.sample()
    else:
        return np.argmax(Q)

## Start pulling the arm

Now, let's play the game and try to find the best arm using the epsilon-greedy method.

In [42]:
for i in range(num_rounds):
    
    #select the arm based on the epsilon-greedy method
    arm = epsilon_greedy(0.5)

    #pull the arm and store the reward and next state information
    next_state, reward, done, info = env.step(arm) 

    #increment the count of the arm by 1
    count[arm] += 1
    
    #update the sum of rewards of the arm
    sum_rewards[arm]+=reward

    #update the average reward of the arm
    Q[arm] = sum_rewards[arm]/count[arm]

In [43]:
print(Q)

[0. 0.]


In [44]:
print('The optimal arm is arm {}'.format(np.argmax(Q)+1))

The optimal arm is arm 1
