In [1]:
import os
import random

import gym
import imageio
import numpy as np
import pickle5 as pickle
from tqdm.notebook import tqdm

import Q_learning_simple_utils as q_utils

## Frozen Lake Environment

In [2]:
# Deterministic FrozenLake-v1: 4x4 map with slipping turned off
env = gym.make('FrozenLake-v1', map_name="4x4", is_slippery=False)

# Show the observation space and the action space, one per line
print(env.observation_space, env.action_space, sep="\n")

Discrete(16)
Discrete(4)


In [3]:
# Read the sizes of the discrete state and action spaces from the environment
n_states, n_actions = env.observation_space.n, env.action_space.n

print("There are ", n_states, " possible states")
print("There are ", n_actions, " possible actions")

There are  16  possible states
There are  4  possible actions


In [4]:
# q_utils (Q_learning_simple_utils) is imported with the other imports in the
# notebook's first cell, so every dependency is visible in one place.
# Allocate the FrozenLake Q-table: one row per state, one column per action.
Qtable_frozenlake = q_utils.initialize_q_table(n_states, n_actions)

In [5]:
# --- Environment ---
env_id = "FrozenLake-v1"      # environment name
max_steps = 99                # cap on the number of steps in one episode
gamma = 0.95                  # discount factor

# --- Training ---
n_training_episodes = 10_000  # number of training episodes
learning_rate = 0.7           # learning rate

# --- Exploration ---
max_epsilon = 1.0             # exploration probability at the start
min_epsilon = 0.05            # lower bound of the exploration probability
decay_rate = 0.0005           # exponential decay rate of the exploration probability

# --- Evaluation ---
n_eval_episodes = 100         # number of evaluation episodes
eval_seed = []                # evaluation seeds handed to evaluate_agent; empty here

In [6]:
# Train the FrozenLake agent.
# The table handed to the trainer is freshly initialised with the same shape as
# the current Qtable_frozenlake (a NumPy array, states x actions), so re-running
# this cell restarts training rather than resuming from a previous run's result.
Qtable_frozenlake = q_utils.train_q_learning(
    n_training_episodes,
    min_epsilon,
    max_epsilon,
    decay_rate,
    learning_rate,
    gamma,
    env,
    max_steps,
    q_utils.initialize_q_table(*Qtable_frozenlake.shape),
)

100%|██████████| 10000/10000 [00:01<00:00, 5564.21it/s]


In [7]:
# Display the trained FrozenLake Q-table (one row per state, one column per action).
Qtable_frozenlake

array([[0.73509189, 0.77378094, 0.77378094, 0.73509189],
       [0.73509189, 0.        , 0.81450625, 0.77378094],
       [0.77378094, 0.857375  , 0.77378094, 0.81450625],
       [0.81450625, 0.        , 0.77378094, 0.77378094],
       [0.77378094, 0.81450625, 0.        , 0.73509189],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.        , 0.81450625],
       [0.        , 0.        , 0.        , 0.        ],
       [0.81450625, 0.        , 0.857375  , 0.77378094],
       [0.81450625, 0.9025    , 0.9025    , 0.        ],
       [0.857375  , 0.95      , 0.        , 0.857375  ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.95      , 0.857375  ],
       [0.9025    , 0.95      , 1.        , 0.9025    ],
       [0.        , 0.        , 0.        , 0.        ]])

In [8]:
# Evaluate the trained FrozenLake Q-table and report the mean and standard
# deviation of the reward returned by evaluate_agent.
mean_reward, std_reward = q_utils.evaluate_agent(
    env, max_steps, n_eval_episodes, Qtable_frozenlake, eval_seed
)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

100%|██████████| 100/100 [00:00<00:00, 7092.16it/s]

Mean_reward=1.00 +/- 0.00





## Taxi Environment

In [9]:
# Switch to the Taxi-v3 environment. This rebinds `env`, so the FrozenLake
# cells above would operate on Taxi if they were re-run after this point.
env = gym.make("Taxi-v3")

In [10]:
# Re-read the state and action space sizes, now from the Taxi environment
n_states, n_actions = env.observation_space.n, env.action_space.n

print("There are ", n_states, " possible states")
print("There are ", n_actions, " possible actions")

There are  500  possible states
There are  6  possible actions


In [11]:
# Allocate the Taxi Q-table from the Taxi state/action counts computed above.
Qtable_taxi = q_utils.initialize_q_table(n_states, n_actions)

In [15]:
# --- Environment ---
env_id = "Taxi-v3"            # environment name
max_steps = 99                # cap on the number of steps in one episode
gamma = 0.95                  # discount factor

# --- Training ---
n_training_episodes = 25_000  # number of training episodes
learning_rate = 0.7           # learning rate

# --- Exploration ---
max_epsilon = 1.0             # exploration probability at the start
min_epsilon = 0.05            # lower bound of the exploration probability
decay_rate = 0.005            # exponential decay rate of the exploration probability

# --- Evaluation ---
n_eval_episodes = 100         # number of evaluation episodes
# Evaluation seeds handed to evaluate_agent; the list is empty in this notebook.
# NOTE(review): this list used to carry a "do not modify" marker while being
# empty - confirm whether a fixed list of seeds was meant to be kept here.
eval_seed = []


In [13]:
# Train the Taxi agent.
# The table handed to the trainer is freshly initialised with the same shape as
# the current Qtable_taxi, so re-running this cell (e.g. after editing the
# hyperparameter cell) restarts training rather than resuming from a previous
# run's result.
Qtable_taxi = q_utils.train_q_learning(
    n_training_episodes,
    min_epsilon,
    max_epsilon,
    decay_rate,
    learning_rate,
    gamma,
    env,
    max_steps,
    q_utils.initialize_q_table(*Qtable_taxi.shape),
)

100%|██████████| 25000/25000 [00:09<00:00, 2645.39it/s]


In [16]:
# Evaluate the trained Taxi Q-table and report the mean and standard
# deviation of the reward returned by evaluate_agent.
mean_reward, std_reward = q_utils.evaluate_agent(
    env, max_steps, n_eval_episodes, Qtable_taxi, eval_seed
)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

100%|██████████| 100/100 [00:00<00:00, 3469.89it/s]

Mean_reward=7.76 +/- 2.53



