<a href="https://colab.research.google.com/github/mohamedyosef101/101_learning_area/blob/area/Reinforcement%20Learning/02_FrozenLake-v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Source**: [Q-learning hands-on](https://huggingface.co/learn/deep-rl-course/unit2/hands-on). huggingface.co

# Frozen Lake
Frozen Lake involves crossing a frozen lake from start to goal, walking over the ice without falling into any holes. By default the player may not always move in the intended direction because the ice is slippery; this notebook creates the environment with `is_slippery=False`, so every move goes in the intended direction.

<br>

*More in [Gymnasium Docs](https://gymnasium.farama.org/environments/toy_text/frozen_lake/)*

In [4]:
# Python packages used by this notebook (pyglet is pinned to 1.5.1).
!pip install gymnasium pygame pyglet==1.5.1 pickle5
# Refresh the apt package index before installing system packages.
!sudo apt-get update
# System packages: OpenGL bindings, then ffmpeg and the Xvfb virtual framebuffer.
!sudo apt-get install -y python3-opengl
!apt install ffmpeg xvfb
# Python wrapper used further down to start the virtual display.
!pip3 install pyvirtualdisplay

Collecting pickle5
  Downloading pickle5-0.0.11.tar.gz (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.1/132.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pickle5
  Building wheel for pickle5 (setup.py) ... [?25l[?25hdone
  Created wheel for pickle5: filename=pickle5-0.0.11-cp310-cp310-linux_x86_64.whl size=255312 sha256=9f2bbe6c51e8fb298f6cb6b5a19209fbacd6bc58267b6b234c2b567a213b357b
  Stored in directory: /root/.cache/pip/wheels/7d/14/ef/4aab19d27fa8e58772be5c71c16add0426acf9e1f64353235c
Successfully built pickle5
Installing collected packages: pickle5
Successfully installed pickle5-0.0.11
Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 http://archive.ubuntu.com/ubunt

In [None]:
import os

# Send signal 9 to this kernel's own process so the runtime stops immediately.
# NOTE(review): presumably done so the packages installed above are picked up
# after the runtime restarts -- confirm.
kernel_pid = os.getpid()
os.kill(kernel_pid, 9)

In [1]:
# Start a 1400x900 virtual display with visible=0.
from pyvirtualdisplay import Display

vd = Display(size=(1400, 900), visible=0)
vd.start()

<pyvirtualdisplay.display.Display at 0x7f6426fd7f40>

# Import the packages

In [2]:
import numpy as np
import gymnasium as gym
import random
import os

import tqdm
from tqdm.notebook import tqdm

import pickle5 as pikle

# Explore the environment

In [3]:
# Build the 4x4 FrozenLake-v1 environment with slipping disabled.
env = gym.make(
    "FrozenLake-v1",
    render_mode="rgb_array",
    map_name="4x4",
    is_slippery=False,
)

In [4]:
# Observation space: print the space object itself, followed by one
# observation sampled at random from it.
print(f"""Observation Space {env.observation_space}\n
Sample observation {env.observation_space.sample()}
""")

Observation Space Discrete(16)

Sample observation 8



In [5]:
# Action space: print `action_space.n` (labelled "shape" in the output),
# followed by one action sampled at random from the space.
print(f"""Action space shape {env.action_space.n}\n
Action space sample {env.action_space.sample()}
""")

Action space shape 4

Action space sample 2



**Actions**
* 0: left
* 1: down
* 2: right
* 3: up

**Reward**
- +1: reach the goal
- 0: reach a hole or a frozen tile

# Create Q-table

In [12]:
# Q-table dimensions: one row per state, one column per action
state_space, action_space = env.observation_space.n, env.action_space.n

def Q_table_0(state_space, action_space):
  """Return a zero-filled Q-table of shape (state_space, action_space).

  Args:
    state_space: number of rows (one per state).
    action_space: number of columns (one per action).

  Returns:
    A NumPy array of zeros with the requested shape.
  """
  table_shape = (state_space, action_space)
  return np.zeros(table_shape)

# Zero-initialised Q-table sized for the FrozenLake environment
frozenlake_Qtable = Q_table_0(state_space=state_space,
                              action_space=action_space)

# Define the policies
* Epsilon-greedy policy (acting policy)
* Greedy-policy (updating policy)

In [7]:
def greedy_policy(Qtable, state):
  """Return the index of the highest-valued action for `state`.

  Args:
    Qtable: table indexed as Qtable[state] -> per-action values.
    state: row index into `Qtable`.

  Returns:
    The result of `np.argmax` over that row (the first index wins on ties).
  """
  state_action_values = Qtable[state]
  return np.argmax(state_action_values)

def epsilon_greedy_policy(Qtable, state, epsilon):
  """Choose an action for `state`, exploring or exploiting based on `epsilon`.

  A number is drawn uniformly from [0, 1]. If it is strictly greater than
  `epsilon` the greedy action is returned; otherwise an action is sampled
  from the action space of the module-level `env`.

  Args:
    Qtable: table indexed as Qtable[state] -> per-action values.
    state: row index into `Qtable`.
    epsilon: threshold compared against the uniform draw.

  Returns:
    The selected action.
  """
  # exploit: the draw exceeded the threshold
  if random.uniform(0, 1) > epsilon:
    return greedy_policy(Qtable, state)

  # explore: random action from the global environment's action space
  return env.action_space.sample()


# Define the hyperparameters


In [8]:
# --- Training ---
n_training_episodes = 10_000  # number of training episodes
lr = 0.7                      # learning rate used in the Q-value update

# --- Evaluation ---
n_eval_episodes = 100         # number of evaluation episodes
eval_seed = []                # per-episode reset seeds; empty -> unseeded resets

# --- Environment ---
env_id = "FrozenLake-v1"
max_steps = 99                # cap on steps per episode
gamma = 0.95                  # multiplier on the next state's best Q-value

# --- Exploration (IMPORTANT) ---
max_epsilon = 1.0             # epsilon at the start of training
min_epsilon = 0.05            # floor of the epsilon schedule
decay_rate = 5e-4             # exponent coefficient of the epsilon schedule

# Training loop

In [13]:
def train(n_training_episodes,
          max_epsilon, min_epsilon, decay_rate,
          env, max_steps, Qtable):
  """Run tabular Q-learning on `env` and return the updated Q-table.

  Actions are chosen with `epsilon_greedy_policy`; epsilon decays
  exponentially with the episode index, from `max_epsilon` towards
  `min_epsilon`. The learning rate `lr` and the factor `gamma` are read from
  module-level variables.

  Args:
    n_training_episodes: number of episodes to run.
    max_epsilon: epsilon used for episode 0.
    min_epsilon: value epsilon approaches as the episode index grows.
    decay_rate: coefficient of the episode index in the exponential decay.
    env: environment exposing `reset()` -> (state, info) and
      `step(action)` -> (new_state, reward, terminated, truncated, info).
    max_steps: maximum number of steps per episode.
    Qtable: table indexed as Qtable[state][action]; updated IN PLACE.

  Returns:
    The same `Qtable` object that was passed in, after training.
  """
  for episode in tqdm(range(n_training_episodes)):

    # reduce epsilon: the exponent must scale with the episode index,
    # otherwise epsilon is the same constant for every episode
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

    # reset the environment
    state, info = env.reset()

    for _ in range(max_steps):

      # take action with epsilon greedy
      action = epsilon_greedy_policy(Qtable, state, epsilon)

      # get the new values
      new_state, reward, terminated, truncated, info = env.step(action)

      # update Qtable
      # Q(s, a) := Q(s, a) + lr [R(s, a) + gamma * max Q(s', a') - Q(s, a)]
      td_target = reward + gamma * np.max(Qtable[new_state])
      Qtable[state][action] = Qtable[state][action] + lr * (
          td_target - Qtable[state][action]
      )

      # stop the episode as soon as the environment reports it is over
      if terminated or truncated:
        break

      state = new_state
  return Qtable

# Train on the FrozenLake environment. `train` updates the table it is given
# in place and returns that same table.
agent = train(
    n_training_episodes=n_training_episodes,
    max_epsilon=max_epsilon,
    min_epsilon=min_epsilon,
    decay_rate=decay_rate,
    env=env,
    max_steps=max_steps,
    Qtable=frozenlake_Qtable,
)

# Show the learned Q-table (rows: states, columns: actions)
agent

  0%|          | 0/10000 [00:00<?, ?it/s]

array([[0.73509189, 0.77378094, 0.77378094, 0.73509189],
       [0.73509189, 0.        , 0.81450625, 0.77378094],
       [0.77378094, 0.857375  , 0.77378094, 0.81450625],
       [0.81450625, 0.        , 0.77378094, 0.77378094],
       [0.77378094, 0.81450625, 0.        , 0.73509189],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.        , 0.81450625],
       [0.        , 0.        , 0.        , 0.        ],
       [0.81450625, 0.        , 0.857375  , 0.77378094],
       [0.81450625, 0.9025    , 0.9025    , 0.        ],
       [0.857375  , 0.95      , 0.        , 0.857375  ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.95      , 0.857375  ],
       [0.9025    , 0.95      , 1.        , 0.9025    ],
       [0.        , 0.        , 0.        , 0.        ]])

# Evaluation

In [14]:
def eval_agent(env, max_steps, n_eval_episodes, Q, seed):
  """Roll out the greedy policy and summarise the per-episode total reward.

  Args:
    env: environment exposing `reset(...)` -> (state, info) and
      `step(action)` -> (new_state, reward, terminated, truncated, info).
    max_steps: maximum number of steps per episode.
    n_eval_episodes: number of episodes to run.
    Q: Q-table passed to `greedy_policy` to pick every action.
    seed: when truthy, indexed by episode number to seed each reset;
      when falsy (e.g. an empty list), resets are unseeded.

  Returns:
    (mean_reward, std_reward) computed over all evaluation episodes.
  """
  returns_per_episode = []

  for ep_idx in tqdm(range(n_eval_episodes)):
    # seed the reset only when seeds were supplied
    reset_kwargs = {"seed": seed[ep_idx]} if seed else {}
    state, info = env.reset(**reset_kwargs)

    episode_return = 0
    for _ in range(max_steps):
      # always exploit during evaluation
      chosen_action = greedy_policy(Q, state)
      state, reward, terminated, truncated, info = env.step(chosen_action)
      episode_return += reward

      if terminated or truncated:
        break

    returns_per_episode.append(episode_return)

  return np.mean(returns_per_episode), np.std(returns_per_episode)


# Score the trained Q-table with the greedy policy and report mean +/- std
mean_reward, std_reward = eval_agent(
    env=env,
    max_steps=max_steps,
    n_eval_episodes=n_eval_episodes,
    Q=agent,
    seed=eval_seed,
)
print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")

  0%|          | 0/100 [00:00<?, ?it/s]

Mean reward = 1.00 +/- 0.00
