In [53]:
import random
import math
import matplotlib.pyplot as plt

In [54]:
class KArmBandit:
    """k-armed bandit agent using sample-average action-value estimates.

    Two action-selection rules share the same estimates: epsilon-greedy
    (`choose_action_epsilon_greedy`) and upper-confidence-bound
    (`choose_action_ucb`). After acting, feed the observed reward back with
    `update`.

    Attributes:
        k: number of arms (articles).
        aligned_articles: arm indices the caller considers "aligned"; stored
            only for the caller's convenience, never read by the agent.
        epsilon: probability of picking a uniformly random arm in
            epsilon-greedy selection.
        ucb_c: exploration coefficient that scales the UCB confidence bonus.
        q_values: current estimated mean reward per arm.
        action_counts: number of times each arm has been updated.
        time_step: number of UCB selections made so far.
    """

    def __init__(self, k, aligned_articles, epsilon = 0.1, ucb_c = 1.0):
        """Create an agent with all estimates and counts at zero.

        Raises:
            ValueError: if `k` is smaller than 1 (there would be no arm to pick).
        """
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")
        self.k = k
        self.aligned_articles = aligned_articles
        self.epsilon = epsilon
        self.ucb_c = ucb_c
        self.q_values = [0.0] * k
        self.action_counts = [0] * k
        self.time_step = 0

    def choose_action_epsilon_greedy(self):
        """Return an arm index using the epsilon-greedy rule.

        With probability `epsilon` a uniformly random arm is explored;
        otherwise the arm with the highest estimate is exploited (ties go to
        the lowest index).
        """
        if random.random() < self.epsilon:
            # Exploration: uniformly random arm.
            return random.randrange(self.k)
        # Exploitation: greedy arm; max() keeps the first maximum it sees.
        return max(range(self.k), key=self.q_values.__getitem__)

    def choose_action_ucb(self):
        """Return an arm index using the UCB rule and advance `time_step`.

        Score of arm a: q(a) + ucb_c * sqrt(ln(t) / N(a)), where t is the
        current time step and N(a) is how often arm a has been updated.
        An arm with N(a) == 0 is treated as maximising and is returned
        immediately (lowest index first), so every arm is tried once
        regardless of the reward scale or the value of `ucb_c`.
        """
        self.time_step += 1

        # Untried arms have an unbounded confidence interval: pick them first
        # instead of approximating "infinity" with a tiny denominator, which
        # breaks down when estimates are large or ucb_c is 0.
        for arm, pulls in enumerate(self.action_counts):
            if pulls == 0:
                return arm

        log_t = math.log(self.time_step)
        ucb_values = [
            q + self.ucb_c * math.sqrt(log_t / pulls)
            for q, pulls in zip(self.q_values, self.action_counts)
        ]
        # Ties go to the lowest index.
        return max(range(self.k), key=ucb_values.__getitem__)

    def update(self, action, reward):
        """Fold `reward` into the running mean for arm `action`.

        Uses the incremental sample-average rule
        Q <- Q + (reward - Q) / N, so no reward history is stored.
        """
        self.action_counts[action] += 1
        self.q_values[action] += (reward - self.q_values[action]) / self.action_counts[action]
        


In [57]:
# Experiment configuration. Re-running this cell resets both agents, the
# tallies and the random stream, so the simulation below starts from the same
# state every time.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)  # makes the stochastic rewards and exploration reproducible

k = 5  # number of articles (arms)
aligned_articles = [0, 1]  # arm indices that receive the higher mean reward
bandit_epsilon = KArmBandit(k, aligned_articles, epsilon=0.1)
bandit_ucb = KArmBandit(k, aligned_articles, ucb_c=2)
trials = 1000  # number of simulated rounds
aligned_shown_epsilon = 0  # rounds in which epsilon-greedy showed an aligned article
aligned_shown_ucb = 0  # rounds in which UCB showed an aligned article

In [58]:
for t in range(trials):
    # One reward draw per article for this round: aligned articles are
    # centred on 5, all others on 1, each with unit standard deviation.
    # Both agents are scored against the same draws.
    rewards = [
        random.gauss(5, 1) if arm in aligned_articles else random.gauss(1, 1)
        for arm in range(k)
    ]

    # Epsilon-greedy agent: select, learn from its reward, tally aligned picks.
    action_epsilon = bandit_epsilon.choose_action_epsilon_greedy()
    bandit_epsilon.update(action_epsilon, rewards[action_epsilon])
    aligned_shown_epsilon += int(action_epsilon in aligned_articles)

    # UCB agent: same three steps.
    action_ucb = bandit_ucb.choose_action_ucb()
    bandit_ucb.update(action_ucb, rewards[action_ucb])
    aligned_shown_ucb += int(action_ucb in aligned_articles)
        

In [60]:
def _print_summary(heading, bandit, aligned_shown):
    """Print one agent's learned estimates and how often it showed an aligned article."""
    print(heading)
    print("Estimated Q-values:", bandit.q_values)
    print(f"Aligned Articles Shown: {aligned_shown}/{trials} times.")


_print_summary("Epsilon-Greedy:", bandit_epsilon, aligned_shown_epsilon)
# The leading newline separates the two reports with a blank line.
_print_summary("\nUCB:", bandit_ucb, aligned_shown_ucb)

Epsilon-Greedy:
Estimated Q-values: [5.060048276765348, 4.649084798891544, 0.8354181986868464, 0.9531421471603001, 0.9748118624345828]
Aligned Articles Shown: 941/1000 times.

UCB:
Estimated Q-values: [5.0721823912347, 4.930348747231807, 1.2797468153440468, -0.6393614603305118, -0.12279132587352914]
Aligned Articles Shown: 996/1000 times.
