In [5]:
import gym
import numpy as np
import optuna


# Define environment
env = gym.make('FrozenLake-v1')


def epsilon_greedy(q_table, state, epsilon):
    """
    Choose an action using the epsilon-greedy policy
    """
    if np.random.uniform(0, 1) < epsilon:
        action = env.action_space.sample()
    else:
        action = np.argmax(q_table[int(state), :])
    return action


def sarsa(q_table, epsilon, discount_factor, epsilon_decay, max_episodes, max_timesteps):
    """
    SARSA algorithm for Q-learning
    """
    rewards = []
    for i_episode in range(max_episodes):
        state = env.reset()
        action = epsilon_greedy(q_table, state, epsilon)
        done = False
        episode_reward = 0
        for t in range(max_timesteps):
            next_state, reward, done, _ = env.step(action)
            next_action = epsilon_greedy(q_table, next_state, epsilon)
            # Update Q table
            q_table[int(state), action] += \
                epsilon * (reward + discount_factor * q_table[int(next_state), next_action] -
                           q_table[int(state), action])
            state = next_state
            action = next_action
            episode_reward += reward
            if done:
                break
        # Decay epsilon
        epsilon = max(epsilon * epsilon_decay, 0.01)
        rewards.append(episode_reward)
    return q_table, rewards


def objective(trial):
    """
    Define objective function for Optuna hyperparameter tuning
    """
    # Sample hyperparameters
    discount_factor = trial.suggest_float('discount_factor', 0.1, 0.99)
    epsilon_decay = trial.suggest_float('epsilon_decay', 0.1, 0.99)
    epsilon_min = trial.suggest_float('epsilon_min', 0.01, 0.1)
    epsilon_max = trial.suggest_float('epsilon_max', 0.5, 1.0)
    n_episodes = trial.suggest_int('n_episodes', 100, 1000)
    max_timesteps = trial.suggest_int('max_timesteps', 100, 1000)

    # Initialize Q table and hyperparameters
    q_table = np.zeros((env.observation_space.n, env.action_space.n))
    epsilon = epsilon_max

    # Train agent using SARSA algorithm
    _, rewards = sarsa(q_table, epsilon, discount_factor, epsilon_decay, n_episodes, max_timesteps)

    # Return the average of last 100 rewards as objective value
    return np.mean(rewards[-100:])


# Run hyperparameter tuning for 50 trials
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Print best hyperparameters and objective value
best_params = study.best_params
best_obj = study.best_value
print(f"Best hyperparameters: {best_params}")
print(f"Best objective value: {best_obj}")


[32m[I 2023-04-27 14:44:28,095][0m A new study created in memory with name: no-name-bc47ceb9-2811-4656-b62c-61311921b843[0m
[33m[W 2023-04-27 14:44:28,097][0m Trial 0 failed with parameters: {'discount_factor': 0.5545461807149378, 'epsilon_decay': 0.7816670734093348, 'epsilon_min': 0.04952111898211908, 'epsilon_max': 0.6696693140716973, 'n_episodes': 377, 'max_timesteps': 580} because of the following error: TypeError("int() argument must be a string, a bytes-like object or a number, not 'tuple'").[0m
Traceback (most recent call last):
  File "/Applications/anaconda3/lib/python3.9/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/q4/72ybtynd5sjf5ppl_5vpsw080000gn/T/ipykernel_79363/3134833353.py", line 66, in objective
    _, rewards = sarsa(q_table, epsilon, discount_factor, epsilon_decay, n_episodes, max_timesteps)
  File "/var/folders/q4/72ybtynd5sjf5ppl_5vpsw080000gn/T/ipykernel_79363/3134833353.py", line 2

TypeError: int() argument must be a string, a bytes-like object or a number, not 'tuple'