In [7]:
import numpy as np

In [14]:
class EWAagent:
    def __init__(self, id, phi=0.1, lam=2, n_options=2, q_init=0.1):
        # properties
        self.id = id
        self.phi = phi
        self.lam = lam
        self.choice_f = lambda x: np.exp(x)/sum(np.exp(x)) # TODO parameter?

        # arrays to hold behavioural history
        self.Q_vals = [np.array([q_init for i in range(n_options)])]
        self.choices = []
        self.correct = []
        self.payoffs = []
        
        self.n_actions = n_options # TODO hmm

    def update_Q_vals(self, choice, reward):
        last_q = self.Q_vals[-1].copy() # get current rewards
        #update all choices?
        for idx, val in enumerate(last_q):
            if idx == choice:
                last_q[idx] = (1 - self.phi) * last_q[choice] + self.phi * reward 
            else:
                last_q[idx] = (1 - self.phi) * last_q[choice] + self.phi * 0
        
        self.Q_vals.append(last_q)

    def update_state(self, reward, correct):
        self.payoffs.append(reward)
        self.correct.append(correct)
        
    def choose_action(self) -> int:
        choice_probabilities = self.choice_f(self.Q_vals[-1])
        choice = np.random.choice(self.n_actions, 1, p=choice_probabilities).item()
        self.choices.append(choice)
        
        return choice

    def __repr__(self):
        return f"Agent {self.id}"
    

In [27]:
max([np.random.randint(0, 3) for i in range(1000)])

2

In [15]:
class Env:
    def __init__(self, 
                 n_agents=1, 
                 n_options=2, 
                 payoff_structure=(0.6, 0.59, 0)
                 ):
        self.n_agents = n_agents
        self.n_options = n_options
        # payoffs
        self.payoff_better, self.payoff_worse, \
            self.payoff_sd = payoff_structure
        self.optimal_action = np.random.randint(0, n_options) # best action drawn at random

    def return_rewards(self, choice_idx: int) -> tuple[float, bool]:
        assert isinstance(choice_idx, int) and (choice_idx < self.n_options)
        # TODO
        better = np.random.normal(self.payoff_better, self.payoff_sd)
        worse = np.random.normal(self.payoff_worse, self.payoff_sd)

        return (better, True) if (choice_idx == self.optimal_action) else (worse, False)
    
    def __repr__(self):
        return f"""MAB env\nAgents: {self.n_agents}\nSize of action space: {self.n_options}
Rewards (high, low, SD): {self.payoff_better, self.payoff_worse, self.payoff_sd}"""


In [21]:
e = Env()
a = EWAagent("A")

for i in range(1000):
    choice = a.choose_action()
    reward, is_optimal = e.return_rewards(choice)
    a.update_state(reward, is_optimal)
    a.update_Q_vals(choice, reward)


a.Q_vals[-1]

array([0.35472462, 0.41472462])

In [11]:
np.full([4,3], 1)

array([[1, 1, 1],
       [1, 1, 1],
       [1, 1, 1],
       [1, 1, 1]])