In [114]:
import numpy as np

In [115]:
class AgentBase:
    def __init__(self, id, n_options=2, q_init=0.1):
        # properties
        self.id = id

        # arrays to hold behavioural history
        self.Q_vals = [np.array([q_init for i in range(n_options)])]
        self.choices = []
        self.correct = []
        self.payoffs = []
        
        self.n_actions = n_options # TODO hmm

    def update_state(self, reward, correct):
        self.payoffs.append(reward)
        self.correct.append(correct)

    def __repr__(self):
        return f"Agent {self.id}"
    

class AgentEWA(AgentBase):
    def __init__(self, id, phi=0.1, lam=2):
        super().__init__(id)
        self.phi = phi
        self.lam = lam
        self.choice_f = lambda x: np.exp(x)/sum(np.exp(x)) # TODO parameter?

    def choose_action(self) -> int:
        choice_probabilities = self.choice_f(self.Q_vals[-1])
        choice = np.random.choice(self.n_actions, 1, p=choice_probabilities).item()
        self.choices.append(choice)
        
        return choice

    def update_Qvals(self, choice, reward):
        last_q = self.Q_vals[-1].copy() # get most recent Qvals
        #update all choices?
        for idx, val in enumerate(last_q):
            if idx == choice:
                last_q[idx] = (1 - self.phi) * last_q[choice] + self.phi * reward 
            else:
                last_q[idx] = (1 - self.phi) * last_q[choice] + self.phi * 0

        self.Q_vals.append(last_q)
    

class AgentQ(AgentBase):
    def __init__(self, id, epsilon=0.5):
        super().__init__(id)
        self.epsilon = epsilon

    def choose_action(self) -> int:
        q_vals_current = self.Q_vals[-1]
        p = np.random.uniform()
        # choose random with p=epsilon otherwise greedy
        if p < self.epsilon:
            choice = np.random.randint(0, self.n_actions)
        else:
            choice = np.argmax(q_vals_current)

        self.choices.append(choice)
        
        return int(choice)
    
    def update_Qvals(self, choice, reward):
        last_q = self.Q_vals[-1].copy() # get most recent Qvals
        chosen_q = last_q[choice]
        new_q = chosen_q + (reward - chosen_q) / len(self.Q_vals) # == n of steps, >= 1
        last_q[choice] = new_q
        
        self.Q_vals.append(last_q)


In [118]:
class Env:
    def __init__(self, 
                 n_agents=1, 
                 n_options=2, 
                 payoff_structure=(0.6, 0.59, 0)
                 ):
        self.n_agents = n_agents
        self.n_options = n_options
        # payoffs (rewards)
        self.payoff_better, self.payoff_worse, \
            self.payoff_sd = payoff_structure
        self.best_action = np.random.randint(0, n_options) # best action drawn at random

    def return_rewards(self, choice_idx: int) -> tuple[float, bool]:
        assert isinstance(choice_idx, int) and (choice_idx < self.n_options)
        # TODO
        better = np.random.normal(self.payoff_better, self.payoff_sd)
        worse = np.random.normal(self.payoff_worse, self.payoff_sd)

        return (better, True) if (choice_idx == self.best_action) else (worse, False)
    
    def __repr__(self):
        return f"""MAB env\nAgents: {self.n_agents}\nSize of action space: {self.n_options}
Best idx: {self.best_action}
Rewards (high, low, SD): {self.payoff_better, self.payoff_worse, self.payoff_sd}"""


In [128]:
e = Env()
a = AgentQ("A")

for i in range(1000):
    choice = a.choose_action()
    reward, is_optimal = e.return_rewards(choice)
    a.update_state(reward, is_optimal)
    a.update_Qvals(choice, reward)


a.Q_vals[-1]

array([0.6       , 0.45437954])

In [107]:
a.__dict__

{'id': 'A',
 'Q_vals': [array([0.1, 0.1]),
  array([0.59, 0.1 ]),
  array([0.59, 0.1 ]),
  array([0.59      , 0.26666667])],
 'choices': [0, 0, 1, 0],
 'correct': [False, False, True],
 'payoffs': [0.59, 0.59, 0.6],
 'n_actions': 2,
 'epsilon': 0.5}