In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
from monitor import interact
import gym
import numpy as np

In [19]:
from collections import defaultdict

class Agent:

    def __init__(self, nA=6, eps=0.005, alpha=1, gamma=1.0):
        """ Set up a tabular agent that acts epsilon-greedily.

        Params
        ======
        - nA: number of actions available to the agent
        - eps: probability of taking a uniformly random action
        - alpha: step size used when updating an action value
        - gamma: discount applied to the next state's expected value
        """
        self.nA = nA
        self.eps = eps
        self.alpha = alpha
        self.gamma = gamma
        # States that have never been seen start with an all-zero row.
        self.Q = defaultdict(lambda: np.zeros(self.nA))

    def _epsilon_greedy_probs(self, action_values):
        """ Build the epsilon-greedy action distribution for one row of Q.

        Every action gets eps / nA; the first maximal action gets the
        remaining (1 - eps) on top of that.
        """
        best_action = np.argmax(action_values)
        probs = np.ones(self.nA) * (self.eps / self.nA)
        probs[best_action] += (1 - self.eps)
        return probs

    def select_action(self, state):
        """ Pick an action for the given state.

        Params
        ======
        - state: the current state of the environment

        Returns
        =======
        - action: an integer, compatible with the task's action space
        """
        draw = np.random.random()
        # Explore whenever the draw does not exceed eps; the value table is
        # only consulted on the greedy path.
        if not draw > self.eps:
            return np.random.choice(self.nA)
        return np.argmax(self.Q[state])

    def step(self, state, action, reward, next_state, done):
        """ Apply one Expected-Sarsa-style update from a sampled transition.

        Params
        ======
        - state: the previous state of the environment
        - action: the agent's previous choice of action
        - reward: last reward received
        - next_state: the current state of the environment
        - done: whether the episode is complete (True or False);
                accepted for interface compatibility, not read by this update
        """
        old_value = self.Q[state][action]
        next_values = self.Q[next_state]

        # Expectation of the next state's action values under the
        # epsilon-greedy policy derived from those same values.
        probs = self._epsilon_greedy_probs(next_values)
        expected_next = sum(probs * next_values)

        td_target = reward + (self.gamma * expected_next)
        self.Q[state][action] = old_value + (self.alpha * (td_target - old_value))

In [4]:
# Taxi-v3 environment shared by every optimization run in this notebook
env = gym.make("Taxi-v3")

In [5]:
# https://wooono.tistory.com/102

from bayes_opt import BayesianOptimization

# Search range (lower, upper) for each hyperparameter
pbounds = dict(
    eps=(0.001, 0.1),
    alpha=(0.1, 1),
    gamma=(0.3, 1),
)

# Objective handed to the optimizer
def opt_function(eps, alpha, gamma):
    """Score one hyperparameter setting with a freshly created Agent.

    Builds `Agent(eps, alpha, gamma)`, runs `interact(env, agent, verbose=False)`
    and returns the second element of its two-item result (presumably the
    best average reward, going by how the value is used -- confirm against
    `monitor.interact`).
    """
    candidate = Agent(eps=eps, alpha=alpha, gamma=gamma)
    _unused, score = interact(env, candidate, verbose=False)
    return score

# Build the optimizer.
#   f            : target function to maximize
#   pbounds      : search range per hyperparameter
#   verbose      : 2 = always print, 1 = print only on a new maximum, 0 = silent
#   random_state : seed
bo = BayesianOptimization(
    f=opt_function,
    pbounds=pbounds,
    verbose=2,
    random_state=47,
)

# Run the search for the maximum target value.
#   init_points : number of initial random-search evaluations
#   n_iter      : number of optimization iterations
#   acq         : acquisition function - Expected Improvement (EI),
#                 Probability of Improvement (PI), Upper Confidence Bound (UCB)
#   xi          : exploration strength
bo.maximize(
    init_points=3,
    n_iter=50,
    acq='ei',
    xi=0.01,
)

print(bo.max)

|   iter    |  target   |   alpha   |    esp    |   gamma   |
-------------------------------------------------------------
| [0m 1       [0m | [0m 4.82    [0m | [0m 0.2021  [0m | [0m 0.09747 [0m | [0m 0.8101  [0m |
| [95m 2       [0m | [95m 6.15    [0m | [95m 0.4163  [0m | [95m 0.07105 [0m | [95m 0.8597  [0m |
| [95m 3       [0m | [95m 7.27    [0m | [95m 0.681   [0m | [95m 0.04205 [0m | [95m 0.7942  [0m |
| [0m 4       [0m | [0m 6.53    [0m | [0m 0.9757  [0m | [0m 0.05574 [0m | [0m 0.9547  [0m |
| [0m 5       [0m | [0m 5.0     [0m | [0m 0.786   [0m | [0m 0.08894 [0m | [0m 0.9198  [0m |
| [0m 6       [0m | [0m 4.68    [0m | [0m 0.3885  [0m | [0m 0.08556 [0m | [0m 0.6104  [0m |
| [0m 7       [0m | [0m 6.98    [0m | [0m 0.9826  [0m | [0m 0.05415 [0m | [0m 0.8778  [0m |
| [95m 8       [0m | [95m 7.75    [0m | [95m 0.6564  [0m | [95m 0.03034 [0m | [95m 0.7665  [0m |
| [95m 9       [0m | [95m 8.92    [0m | 

In [15]:
from collections import defaultdict

class Agent2:

    def __init__(self, nA=6, eps=0.005, eps_decay=0.9, alpha=1, gamma=1.0):
        """ Set up a tabular epsilon-greedy agent whose epsilon decays.

        Params
        ======
        - nA: number of actions available to the agent
        - eps: initial probability mass spread uniformly over all actions
        - eps_decay: factor eps is multiplied by each time an episode ends
        - alpha: step size used when updating an action value
        - gamma: discount applied to the next state's expected value
        """
        self.nA = nA
        self.eps = eps
        self.eps_decay = eps_decay
        self.alpha = alpha
        self.gamma = gamma
        # States that have never been seen start with an all-zero row.
        self.Q = defaultdict(lambda: np.zeros(self.nA))

    def get_policy_probs(self, state):
        """ Return the epsilon-greedy action distribution for a state.

        Every action gets eps / nA; the first maximal action in Q[state]
        gets the remaining (1 - eps) on top of that.
        """
        best_action = np.argmax(self.Q[state])
        uniform_share = self.eps / self.nA
        probs = np.ones(self.nA) * uniform_share
        probs[best_action] += (1 - self.eps)
        return probs

    def select_action(self, state):
        """ Sample an action from the current epsilon-greedy distribution.

        Params
        ======
        - state: the current state of the environment

        Returns
        =======
        - action: an integer, compatible with the task's action space
        """
        distribution = self.get_policy_probs(state)
        return np.random.choice(self.nA, p=distribution)

    def step(self, state, action, reward, next_state, done):
        """ Apply one Expected-Sarsa-style update, then decay eps on episode end.

        Params
        ======
        - state: the previous state of the environment
        - action: the agent's previous choice of action
        - reward: last reward received
        - next_state: the current state of the environment
        - done: whether the episode is complete (True or False)
        """
        old_value = self.Q[state][action]

        # Expectation of the next state's action values under the current
        # epsilon-greedy policy.
        probs = self.get_policy_probs(next_state)
        expected_next = sum(probs * self.Q[next_state])

        td_target = reward + (self.gamma * expected_next)
        self.Q[state][action] = old_value + (self.alpha * (td_target - old_value))

        # Epsilon shrinks only after the update, once per finished episode.
        if done:
            self.eps = self.eps * self.eps_decay

In [17]:
# Search range (lower, upper) for each hyperparameter, including eps decay
pbounds2 = dict(
    eps=(0.001, 0.1),
    eps_decay=(0.1, 0.9),
    alpha=(0.1, 1),
    gamma=(0.3, 1),
)

def opt_function2(eps, eps_decay, alpha, gamma):
    """Score one hyperparameter setting with a freshly created Agent2.

    Builds `Agent2(eps, eps_decay, alpha, gamma)`, runs
    `interact(env, agent, verbose=False)` and returns the second element of
    its two-item result (presumably the best average reward, going by how
    the value is used -- confirm against `monitor.interact`).
    """
    candidate = Agent2(eps=eps, eps_decay=eps_decay, alpha=alpha, gamma=gamma)
    _unused, score = interact(env, candidate, verbose=False)
    return score

# Optimize Agent2's hyperparameters with explicit acquisition settings
# (acq='ei', xi=0.01).
bo2 = BayesianOptimization(
    f=opt_function2,
    pbounds=pbounds2,
    verbose=2,
    random_state=47,
)
bo2.maximize(
    init_points=3,
    n_iter=30,
    acq='ei',
    xi=0.01,
)
print(bo2.max)

|   iter    |  target   |   alpha   |    eps    | eps_decay |   gamma   |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m 9.11    [0m | [0m 0.2021  [0m | [0m 0.09747 [0m | [0m 0.683   [0m | [0m 0.546   [0m |
| [0m 2       [0m | [0m 8.54    [0m | [0m 0.7368  [0m | [0m 0.08016 [0m | [0m 0.6164  [0m | [0m 0.5902  [0m |
| [0m 3       [0m | [0m 8.73    [0m | [0m 0.7354  [0m | [0m 0.02542 [0m | [0m 0.3048  [0m | [0m 0.3168  [0m |
| [0m 4       [0m | [0m 9.0     [0m | [0m 0.215   [0m | [0m 0.09028 [0m | [0m 0.6996  [0m | [0m 0.5469  [0m |
| [0m 5       [0m | [0m 8.73    [0m | [0m 0.2044  [0m | [0m 0.09766 [0m | [0m 0.6244  [0m | [0m 0.5422  [0m |
| [0m 6       [0m | [0m 8.83    [0m | [0m 0.2266  [0m | [0m 0.07713 [0m | [0m 0.7246  [0m | [0m 0.5572  [0m |
| [0m 7       [0m | [0m 8.7     [0m | [0m 0.1849  [0m | [0m 0.1     [0m | [0m 0.6913  [0m | [0m 0.5427  [0m 

In [22]:
# Optimize Agent's hyperparameters again; no acquisition arguments are
# passed to maximize here, so the library's own defaults apply.
bo3 = BayesianOptimization(
    f=opt_function,
    pbounds=pbounds,
    verbose=2,
    random_state=47,
)
bo3.maximize(
    init_points=3,
    n_iter=30,
)
print(bo3.max)

|   iter    |  target   |   alpha   |    eps    |   gamma   |
-------------------------------------------------------------
| [0m 1       [0m | [0m 5.74    [0m | [0m 0.2021  [0m | [0m 0.09747 [0m | [0m 0.8101  [0m |
| [95m 2       [0m | [95m 6.36    [0m | [95m 0.4163  [0m | [95m 0.07105 [0m | [95m 0.8597  [0m |
| [95m 3       [0m | [95m 7.17    [0m | [95m 0.681   [0m | [95m 0.04205 [0m | [95m 0.7942  [0m |
| [95m 4       [0m | [95m 8.19    [0m | [95m 0.6665  [0m | [95m 0.01628 [0m | [95m 0.7712  [0m |
| [95m 5       [0m | [95m 8.38    [0m | [95m 0.6456  [0m | [95m 0.01384 [0m | [95m 0.7457  [0m |
| [95m 6       [0m | [95m 8.71    [0m | [95m 0.7053  [0m | [95m 0.001   [0m | [95m 0.7016  [0m |
| [95m 7       [0m | [95m 9.12    [0m | [95m 0.6711  [0m | [95m 0.001   [0m | [95m 0.5987  [0m |
| [0m 8       [0m | [0m 8.58    [0m | [0m 0.7774  [0m | [0m 0.001   [0m | [0m 0.5595  [0m |
| [0m 9       [0m | [0m 8.8

In [23]:
# Optimize Agent2's hyperparameters again; no acquisition arguments are
# passed to maximize here, so the library's own defaults apply.
bo4 = BayesianOptimization(
    f=opt_function2,
    pbounds=pbounds2,
    verbose=2,
    random_state=47,
)
bo4.maximize(
    init_points=3,
    n_iter=30,
)
print(bo4.max)

|   iter    |  target   |   alpha   |    eps    | eps_decay |   gamma   |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m 8.93    [0m | [0m 0.2021  [0m | [0m 0.09747 [0m | [0m 0.683   [0m | [0m 0.546   [0m |
| [0m 2       [0m | [0m 8.73    [0m | [0m 0.7368  [0m | [0m 0.08016 [0m | [0m 0.6164  [0m | [0m 0.5902  [0m |
| [0m 3       [0m | [0m 8.74    [0m | [0m 0.7354  [0m | [0m 0.02542 [0m | [0m 0.3048  [0m | [0m 0.3168  [0m |
| [95m 4       [0m | [95m 8.99    [0m | [95m 0.1997  [0m | [95m 0.07971 [0m | [95m 0.6642  [0m | [95m 0.5726  [0m |
| [0m 5       [0m | [0m 8.83    [0m | [0m 0.2069  [0m | [0m 0.09399 [0m | [0m 0.6782  [0m | [0m 0.5194  [0m |
| [0m 6       [0m | [0m 8.73    [0m | [0m 0.1781  [0m | [0m 0.06616 [0m | [0m 0.8858  [0m | [0m 0.9891  [0m |
| [0m 7       [0m | [0m 8.84    [0m | [0m 0.1914  [0m | [0m 0.1     [0m | [0m 0.6936  [0m | [0m 0.606  