# Experiment 3

## Problem Statement:

Implement a python program to solve the muti-armed bandit problem using the Upper Confidence Bound Algorithm. Compare the reward obtained with random sampling. 

## Code

In [1]:
# importing required libraries
import numpy as np

In [2]:
# implementing the bandit class to create a bandit object
class Bandit:
    def __init__(self, name, estimated_reward):
        self.name = name
        self.estimated_reward = estimated_reward
        self.estimated_mean_reward = 0
        self.n = 0

    def pull(self):
        return np.random.randn() + self.estimated_reward
        # return np.random.randint(1, 10)

    def update(self, reward):
        self.n += 1
        self.estimated_mean_reward = (1 - 1.0/self.n) * \
            self.estimated_mean_reward + 1.0/self.n * reward

In [3]:
# implementing exploration by random sampling
def random_explore(bandits, num_iterations):
    for i in range(num_iterations):
        j = np.random.randint(0, len(bandits))
        x = bandits[j].pull()
        bandits[j].update(x)

In [4]:
# implementing exploration by mean sampling
def mean_explore(bandits, num_iterations):
    for i in range(num_iterations):
        j = np.argmax([b.estimated_mean_reward for b in bandits])
        x = bandits[j].pull()
        bandits[j].update(x)

In [5]:

# implementing the greedy_exploit function to return the best bandit
def greedy_exploit(bandits):
    return max(bandits, key=lambda x: x.estimated_mean_reward)

# implementing the epsilon greedy function to return the best bandit
def epsilon_greedy_exploit(bandits, epsilon=0.1):
    if np.random.random() < epsilon:
        # Randomly select a bandit with probability epsilon (exploration)
        return np.random.choice(bandits)
    else:
        # Exploit the bandit with the highest current estimated reward with probability 1-epsilon
        return max(bandits, key=lambda x: x.estimated_reward)

In [6]:
# initialize machines A, B, C, D, E with true means between 5 and 10
np.random.seed(5)
machines = ['A', 'B', 'C', 'D', 'E']
# bandits = [Bandit(machine, 5 + 5 * np.random.rand()) for machine in machines]
estimated_rewards = [4, 3, 5, 7, 2]
bandits = [Bandit(machine, mean_reward) for machine, mean_reward in zip(machines, estimated_rewards)]

In [7]:
# random_explore(bandits, 1000)
# print("Bandit Details Using Random\n")
# for bandit in bandits:
#     print("Bandit:", bandit.name, "Number of Times: ", bandit.n,"Estimated Mean Reward: ",
#           bandit.estimated_mean_reward)

In [8]:
# initial pull for each bandit
all_rewards = []
for bandit in bandits:
    reward = bandit.pull()
    bandit.update(reward)
    all_rewards.append(reward)

# Number of trials
N = 1000 - len(bandits)
for _ in range(N):
    chosen_bandit = epsilon_greedy_exploit(bandits)
    reward = chosen_bandit.pull()
    chosen_bandit.update(reward)
    all_rewards.append(reward)

# calculating mean and cumulative reward
mean_rewards = [np.mean(all_rewards[:i+1]) for i in range(len(all_rewards))]
cumulative_rewards = np.cumsum(all_rewards).tolist()

print("Bandit Details Using Greedy Algorithm \n")
for bandit in bandits:
    print(
        f"Machine {bandit.name} - Number of Times {bandit.n} Estimated Reward: {bandit.estimated_reward:.2f}, Estimated Mean Reward: {bandit.estimated_mean_reward:.2f}")

print(f"\nFinal Mean Reward: {mean_rewards[-1]:.2f}")
print(f"Final Cumulative Reward: {cumulative_rewards[-1]:.2f}")

Bandit Details Using Greedy Algorithm 

Machine A - Number of Times 21 Estimated Reward: 4.00, Estimated Mean Reward: 4.42
Machine B - Number of Times 30 Estimated Reward: 3.00, Estimated Mean Reward: 2.84
Machine C - Number of Times 17 Estimated Reward: 5.00, Estimated Mean Reward: 5.33
Machine D - Number of Times 914 Estimated Reward: 7.00, Estimated Mean Reward: 7.03
Machine E - Number of Times 18 Estimated Reward: 2.00, Estimated Mean Reward: 2.24

Final Mean Reward: 6.73
Final Cumulative Reward: 6734.47
