In [1]:
import utils
from utils import *

def default_params():
    """Build a fresh dictionary holding the framework's default parameters.

    Returns:
        dict: a new dict on every call, so callers may modify their copy
        without affecting later calls. Keys appear in the order: runner,
        optimization, exploration, then the remaining settings.
    """
    # Runner parameters
    runner_settings = {
        'max_steps': 120,
        'num_episodes': 1,
        'num_actions': 7,
        'start_action': 0,
        'correct_action': 1,    # Zero indexed
    }

    # Optimization parameters
    optimization_settings = {
        'alpha': 0.5,
        'alpha_decay': 40,
        'alpha_min': 0.001,
    }

    # Exploration parameters
    exploration_settings = {
        'epsilon': 1,
        'epsilon_decay': 20,
        'epsilon_min': 0.01,
    }

    # Remaining settings that the original listed without a group heading
    other_settings = {
        'gamma': 0.95,
        'plots': False,
        'noise': False,
        'surrogate': False,
        'surrogate_c_interval': 50,
        'surrogate_c_interval_min': 0,
    }

    # Unpacking in this order keeps the key order of the combined dict stable.
    return {
        **runner_settings,
        **optimization_settings,
        **exploration_settings,
        **other_settings,
    }

## Thompson sampling

In [2]:
import numpy as np
from scipy.stats import beta

class ThompsonSamplingAgent:
    """Thompson-sampling agent with one Beta(alpha, beta) posterior per action.

    Rewards are mapped linearly from ``reward_range`` onto [0, 1] and added
    to the chosen action's ``alpha`` (and the complement to its ``beta``).

    Args:
        num_actions: number of actions; sizes the ``alpha``/``beta`` arrays.
        reward_range: indexable ``(low, high)`` bounds of the expected
            reward, with ``high > low``.
        verbose: when True (the default, matching the previous behaviour)
            every chosen action and observed reward is printed.

    Raises:
        ValueError: if ``reward_range`` does not satisfy ``high > low``;
            such a range would make the normalisation divide by zero or
            invert the reward ordering.
    """

    def __init__(self, num_actions, reward_range, verbose=True):
        low, high = reward_range[0], reward_range[1]
        if not high > low:
            raise ValueError(
                "reward_range must be (low, high) with high > low, got %r" % (reward_range,)
            )
        self.num_actions = num_actions
        self.alpha = np.ones(num_actions)  # Initialize alpha (successes) to 1
        self.beta = np.ones(num_actions)   # Initialize beta (failures) to 1
        self.reward_range = reward_range
        self.verbose = verbose

    def choose_action(self):
        """Draw one sample per action from its Beta posterior and return the argmax index."""
        sampled_rewards = np.random.beta(self.alpha, self.beta)
        action = np.argmax(sampled_rewards)
        if self.verbose:
            print('a: ', action)
        return action

    def observe_reward(self, action, reward):
        """Update the posterior of ``action`` with an observed ``reward``.

        The reward is normalised with ``reward_range`` and clamped to
        [0, 1], so a reward outside the declared range counts as the
        nearest bound.
        """
        if self.verbose:
            print('r: ', reward)
        low, high = self.reward_range[0], self.reward_range[1]
        normalized_reward = (reward - low) / (high - low)
        # Without the clamp an out-of-range reward adds a negative amount to
        # alpha or beta; once a parameter drops to <= 0, np.random.beta
        # raises ValueError on the next choose_action().
        normalized_reward = min(1.0, max(0.0, normalized_reward))
        self.alpha[action] += normalized_reward
        self.beta[action] += 1 - normalized_reward

# Experiment setup: seven actions, rewards assumed to lie between -6 and 0.
num_actions, reward_range = 7, (-6, 0)

# How many interaction rounds to simulate
num_steps = 1000

# Fresh agent for this experiment
agent = ThompsonSamplingAgent(num_actions, reward_range)

# Interaction loop: pick an action, get a reward, update the agent.
for t in range(num_steps):
    chosen_action = agent.choose_action()

    # Placeholder environment (replace with actual reward from your environment):
    # one uniform draw per action is made every step and the chosen action's
    # draw is used, so all actions share the same reward distribution here.
    true_rewards = np.random.uniform(*reward_range, size=num_actions)
    observed_reward = true_rewards[chosen_action]

    agent.observe_reward(chosen_action, observed_reward)

# Report the action picked by one more call to choose_action() after training.
optimal_action = agent.choose_action()
print("Optimal Action:", optimal_action)


a:  3
r:  -3.6740022884052967
a:  0
r:  -2.720244479358529
a:  4
r:  -5.316339829169271
a:  0
r:  -5.745861927411482
a:  5
r:  -4.035048503316372
a:  6
r:  -5.056021588242745
a:  4
r:  -5.096538521621208
a:  2
r:  -3.532840675810945
a:  0
r:  -5.286356600773033
a:  0
r:  -3.4825922242579948
a:  0
r:  -5.844888644208186
a:  2
r:  -0.7662976654920044
a:  2
r:  -3.9682104749512477
a:  1
r:  -4.836787045490994
a:  5
r:  -2.945161650593666
a:  2
r:  -1.7442076371475332
a:  2
r:  -0.8622951217231893
a:  6
r:  -4.1273049421767585
a:  1
r:  -1.188706250172161
a:  3
r:  -4.391247530604307
a:  5
r:  -1.375094133683704
a:  1
r:  -1.5218151845261936
a:  1
r:  -3.7379434457256746
a:  2
r:  -2.55985432823217
a:  1
r:  -5.676832256770711
a:  5
r:  -2.7081843834669774
a:  4
r:  -4.035385942135896
a:  5
r:  -1.5909231598882316
a:  3
r:  -4.8090479777604855
a:  2
r:  -3.620490262089888
a:  5
r:  -4.599298402763544
a:  5
r:  -4.567576777538672
a:  6
r:  -1.5221036463241386
a:  3
r:  -3.1813840878782607
a

In [3]:
class ThompsonSamplingAgentTemporaryWrapper(ThompsonSamplingAgent):
    """Adapter giving ThompsonSamplingAgent a ``choose_action(state)`` / ``learn(...)`` surface.

    Also defines placeholder attributes ``epsilon``, ``Q``, ``N`` and ``t``.
    NOTE(review): presumably these exist because ``runner`` and
    ``utils.print_agent_stats`` read them -- confirm against utils.
    """

    def __init__(self, *args, **kwargs):
        # Placeholders are set first, then every argument is forwarded
        # untouched to the parent constructor.
        self.epsilon, self.Q, self.N, self.t = None, [], None, None
        super().__init__(*args, **kwargs)

    def choose_action(self, state=None):
        """Pick an action via the parent class; ``state`` is accepted but ignored."""
        picked = super().choose_action()
        return picked

    def learn(self, state=None, action=None, reward=None, next_state=None):
        """Pass ``action`` and ``reward`` to the parent's ``observe_reward``.

        ``state`` and ``next_state`` are accepted but not used.
        """
        outcome = super().observe_reward(action=action, reward=reward)
        return outcome
    
# Framework settings handed to the runner further down
params = default_params()

# Number of actions and assumed reward bounds for the agent.
# NOTE(review): 7 repeats params['num_actions'] -- keep the two in sync.
# TODO: get reward range from method
num_actions, reward_range = 7, (-5, 0)

# Thompson Sampling agent wrapped so it can be passed to the runner
agent = ThompsonSamplingAgentTemporaryWrapper(num_actions, reward_range)

# Run the framework with this agent, then print the agent's stats.
(
    q_values_for_chart,
    rewards,
    episode_length,
    selected_action,
    reward_processor,
    alphas,
    epsilons,
) = runner(agent=agent, params=params)
utils.print_agent_stats(agent)

# Report the action picked by one more call to choose_action() after the run.
optimal_action = agent.choose_action()
print("Optimal Action:", optimal_action)

a:  6
r:  -5
a:  5
r:  -4
a:  2
r:  -1
a:  3
r:  -2
a:  0
r:  -1
a:  6
r:  -5
a:  2
r:  -1
a:  0
r:  -1
a:  4
r:  -3
a:  3
r:  -2
a:  4
r:  -3
a:  0
r:  -1
a:  2
r:  -1
a:  0
r:  -1
a:  0
r:  -1
a:  2
r:  -1
a:  0
r:  -1
a:  0
r:  -1
a:  1
r:  0
a:  1
r:  0
a:  3
r:  -2
a:  2
r:  -1
a:  1
r:  0
a:  0
r:  -1
a:  1
r:  0
a:  1
r:  0
a:  2
r:  -1
a:  2
r:  -1
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  2
r:  -1
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  0
r:  -1
a:  0
r:  -1
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  2
r:  -1
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  4
r:  -3
a:  1
r:  0
a:  1
r:  0
a:  2
r:  -1
a:  1
r:  0
a:  2
r:  -1
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  1
r:  0
a:  1
Q-table:
[]
Number of times action was taken:
None
Total timesteps:
None
a: 