# Installations / Préparations

## install

In [None]:
!pip install importlib-metadata==4.13.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting importlib-metadata==4.13.0
  Downloading importlib_metadata-4.13.0-py3-none-any.whl (23 kB)
Installing collected packages: importlib-metadata
  Attempting uninstall: importlib-metadata
    Found existing installation: importlib-metadata 6.0.0
    Uninstalling importlib-metadata-6.0.0:
      Successfully uninstalled importlib-metadata-6.0.0
Successfully installed importlib-metadata-4.13.0


In [None]:
!pip install git+https://github.com/osigaud/bbrl_gym

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/osigaud/bbrl_gym
  Cloning https://github.com/osigaud/bbrl_gym to /tmp/pip-req-build-to0piqc7
  Running command git clone --filter=blob:none --quiet https://github.com/osigaud/bbrl_gym /tmp/pip-req-build-to0piqc7
  Resolved https://github.com/osigaud/bbrl_gym to commit 5557075ecd7d4171ac0c21be3c69a94bcae655a9
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting mazemdp>=0.7.3
  Downloading mazemdp-0.7.3-py3-none-any.whl (15 kB)
Collecting swig
  Downloading swig-4.1.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
Collecting gym==0.21.0
  D

In [None]:
!pip install git+https://github.com/osigaud/SimpleMazeMDP

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/osigaud/SimpleMazeMDP
  Cloning https://github.com/osigaud/SimpleMazeMDP to /tmp/pip-req-build-436fyc2s
  Running command git clone --filter=blob:none --quiet https://github.com/osigaud/SimpleMazeMDP /tmp/pip-req-build-436fyc2s
  Resolved https://github.com/osigaud/SimpleMazeMDP to commit 34dee7221967fbfb31be99ae89f04f19c92bd1b2
  Installing build dependencies ... [?25l[?25hdone
[0m  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: mazemdp
  Building wheel for mazemdp (pyproject.toml) ... [?25l[?25hdone
  Created wheel for mazemdp: filename=mazemdp-0.7.4.dev1+g34dee72-py3-none-any.whl size=15533 sha256=eacf030de2deb4e5806ddc28e905ccd72eacb5a841e613320daba5cc723cd797
  Stored in directory: /t

## import

In [None]:
import os
from typing import Tuple, List

import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from mazemdp.toolbox import egreedy, egreedy_loc, softmax, sample_categorical
from mazemdp.maze_plotter import show_videos
from mazemdp.mdp import Mdp
from bbrl_gym.envs.maze_mdp import MazeMDPEnv

# For visualization: frame rate read through the VIDEO_FPS environment variable,
# and the folder the rendered videos are later loaded from.
os.environ["VIDEO_FPS"] = "5"
# exist_ok=True makes the cell idempotent without a separate (racy) isdir() check.
os.makedirs("./videos", exist_ok=True)

from IPython.display import Video

import gym
import bbrl_gym

Matplotlib backend: module://ipykernel.pylab.backend_inline


## settings

In [None]:
# Settings
NB_EPISODES = 20   # number of training episodes handed to q_learning_eps
TIMEOUT = 50       # episode timeout handed to q_learning_eps
MAZE_LARGEUR = 4   # maze width ("largeur"), forwarded as "width" to gym.make
MAZE_HAUTEUR = 3   # maze height ("hauteur"), forwarded as "height" to gym.make
EPSILON = 0.02     # exploration rate of the epsilon-greedy policy
TAU = 0.1          # not used by the cells shown here; presumably a softmax temperature - confirm

## création du maze

In [None]:
# Build the maze environment from the notebook settings.
# NOTE(review): "ratio" is presumably the proportion of wall cells - confirm.
env = gym.make(
    "MazeMDP-v0",
    kwargs={
        "width": MAZE_LARGEUR,
        "height": MAZE_HAUTEUR,
        "ratio": 0.2,
    },
)
env.reset()

# Display the maze as created (the original note stated that no agent moves at this stage).
env.init_draw("The maze")

Output()

# Q-Learning 3D

## epsilon-greedy

In [None]:
def calcul_transition(mdp: "MazeMDPEnv", but):
    """Build the deterministic transition tensor of the maze for the goal ``but``.

    The result ``P`` has shape ``(nb_states, n_actions, nb_states)``;
    ``P[s, a, s']`` is 1.0 when action ``a`` taken in state ``s`` leads to ``s'``
    and 0.0 otherwise. A move that has no target cell (border of the maze or
    missing cell) keeps the agent in place.

    The geometry is derived only from ``mdp.coord_x`` / ``mdp.coord_y``; no
    notebook-level maze dimension is consulted, so the result always matches the
    environment that was passed in.

    Args:
        mdp: environment exposing ``nb_states``, ``action_space.n`` and the
            per-state coordinates ``coord_x`` and ``coord_y``.
        but: index of the goal state. Every action taken in that state leads to
            the last state index.

    Returns:
        The transition tensor as an ``np.ndarray``. The row of the last state
        index is left at zero.
    """
    nb_states = mdp.nb_states
    nb_actions = mdp.action_space.n
    # The last state index is the one the goal is redirected to below; it is not
    # given any coordinate-based move.
    nb_cells = nb_states - 1

    transition_matrix = np.zeros((nb_states, nb_actions, nb_states))

    # Coordinates of every regular state and the reverse lookup (coordinates -> state).
    coords = [(mdp.coord_x[s], mdp.coord_y[s]) for s in range(nb_cells)]
    state_at = {xy: s for s, xy in enumerate(coords)}

    # Coordinate offset of each action index: 0 = north, 1 = south, 2 = east, 3 = west.
    # NOTE(review): this keeps the original convention (coord_x decreases when going
    # north, coord_y increases when going east) - confirm it matches the environment.
    moves = ((-1, 0), (1, 0), (0, 1), (0, -1))

    for s, (x, y) in enumerate(coords):
        for a, (dx, dy) in enumerate(moves):
            # Fall back to the current state when no cell exists in that direction.
            s_prime = state_at.get((x + dx, y + dy), s)
            transition_matrix[s, a, s_prime] = 1.0

        # Any additional action has no associated move: it loops on the same state.
        for a in range(len(moves), nb_actions):
            transition_matrix[s, a, s] = 1.0

    # Transition matrix of the terminal (goal) state: every action leads to the last state.
    transition_matrix[but, :, :] = 0
    transition_matrix[but, :, -1] = 1

    return transition_matrix

In [None]:
def get_policy_from_q(Q: np.ndarray, but: int) -> np.ndarray:
    """Return, for every state, the greedy action with respect to goal ``but``.

    Args:
        Q: value array indexed as ``Q[state, goal, action]``.
        but: index of the goal whose action values are used.

    Returns:
        One action index per state (arg-max over the action axis).
    """
    goal_action_values = Q[:, but, :]
    return goal_action_values.argmax(axis=1)

In [None]:
# Compute the reward and transition matrices associated with one goal
def calcul_goal(mdp: MazeMDPEnv, but: int):
    """Return ``(reward_matrix, transition_matrix)`` for the goal state ``but``.

    The reward matrix has shape ``(nb_states, n_actions)`` and is zero everywhere
    except on the row of ``but``, where every action is worth 1. The transition
    matrix is the one produced by ``calcul_transition`` for the same goal.
    """
    nb_actions = mdp.action_space.n

    # Rewards: only the goal state pays off, whatever the action
    new_r = np.zeros((mdp.nb_states, nb_actions))
    new_r[but, :] = 1.0

    # Transitions adapted to this goal
    return new_r, calcul_transition(mdp, but)

In [None]:
# Update the MDP so that it matches the selected goal
def maj_goal(mdp: MazeMDPEnv, but, r_list, P_list):
    """Install the reward and transition matrices of goal ``but`` into ``mdp``.

    ``r_list[but]`` replaces ``mdp.mdp.r`` and ``P_list[but]`` replaces both
    ``mdp.P`` and ``mdp.mdp.P``. Nothing is returned.
    """
    # Rewards of the wrapped MDP
    mdp.mdp.r = r_list[but]

    # Transitions: same matrix for the environment and the wrapped MDP
    goal_transitions = P_list[but]
    mdp.P = mdp.mdp.P = goal_transitions

In [None]:
# --------------------------- Q-Learning epsilon-greedy version -------------------------------#

# Goal-conditioned variant: given an exploration rate epsilon, the Q-learning algorithm
# computes a state-goal-action value function while acting with an epsilon-greedy policy
# alpha is the learning rate


def q_learning_eps(
    mdp: MazeMDPEnv,
    epsilon: float,
    nb_episodes: int = 20,
    timeout: int = 50,
    alpha: float = 0.5,
    render: bool = True,
) -> Tuple[np.ndarray, List[np.ndarray]]:
    """Learn one Q-table per goal with epsilon-greedy Q-learning.

    Reward and transition matrices are first computed for every goal index in
    ``range(mdp.nb_states - 1)``. Each episode then resets the environment, draws
    a goal at random, installs the matching matrices with ``maj_goal`` and applies
    the Q-learning update to the slice of ``Q`` belonging to that goal.

    Args:
        mdp: the maze environment. It is modified: its timeout is set, and its
            reward/transition matrices are overwritten by ``maj_goal`` at each
            episode; in render mode ``current_state`` and ``terminal_states`` are
            also assigned at the end.
        epsilon: exploration rate passed to ``egreedy``.
        nb_episodes: number of episodes to run.
        timeout: value passed to ``mdp.set_timeout``.
        alpha: learning rate of the update.
        render: when true, draw the value function and policy during learning
            and once per goal at the end.

    Returns:
        ``(Q, r_list)`` where ``Q`` is indexed as ``Q[state, goal, action]`` and
        ``r_list`` holds the reward matrix computed for each goal.
    """
    # Initialize the state-goal-action value function
    # 3-D: states / goals / actions
    Q = np.zeros((mdp.nb_states, mdp.nb_states, mdp.action_space.n))
    r_list = []   # reward matrices, one per goal
    P_list = []   # transition matrices, one per goal

    # Precompute the matrices for each of the goals
    for s in range(mdp.nb_states-1):
        new_r, new_P = calcul_goal(mdp, s)
        r_list.append(new_r)
        P_list.append(new_P)

    # Run learning cycle
    mdp.set_timeout(timeout)  # episode length

    for _ in tqdm(range(nb_episodes)):
        # Draw the first state of episode i using a uniform distribution over all the states
        s = mdp.reset(uniform=True)
        but = np.random.randint(mdp.nb_states-1)   # draw the goal state of this episode
        print("but : ", but)

        maj_goal(mdp, but, r_list, P_list)    # install the matrices matching this goal

        if render:
            mdp.init_draw("Q-learning e-greedy"+str(but))

        # NOTE(review): only the reward/transition matrices are switched above; nothing here
        # updates the environment's terminal states during training - confirm that
        # done() / step() actually end the episode on `but`.
        done = mdp.mdp.done()

        while not done:
            if render:
                # Show the agent in the maze
                mdp.draw_v_pi(Q[:,but,:], get_policy_from_q(Q, but))

            # Draw an action using an epsilon-greedy policy
            a = egreedy(Q[:, but, :], s, epsilon)

            # Perform a step of the MDP
            [s_prime, r, done, _] = mdp.step(a)

            # Temporal-difference error, computed for the goal of the current episode
            delta = r + mdp.gamma * np.max(Q[s_prime, but]) - Q[s, but, a]

            Q[s, but, a] += alpha * delta

            # Update the agent position
            s = s_prime


    if render:
        # Show the final policy
        for but in tqdm(range(mdp.nb_states-1)):
            # Attributes assigned on the environment before drawing the policy of this goal
            mdp.current_state = 0
            mdp.terminal_states = [but]
            mdp.draw_v_pi(Q[:,but,:], get_policy_from_q(Q, but), title="Q-learning e-greedy"+str(but))
    return Q, r_list

In [None]:
# Train with the notebook settings; alpha and render keep their defaults.
Q, r_list = q_learning_eps(
    env,
    epsilon=EPSILON,
    nb_episodes=NB_EPISODES,
    timeout=TIMEOUT,
)

  0%|          | 0/20 [00:00<?, ?it/s]

but :  2


Output()

but :  7
but :  3
but :  8
but :  7
but :  6
but :  4
but :  9
but :  7
but :  4
but :  7
but :  1
but :  2
but :  3
but :  7
but :  0
but :  7
but :  2
but :  4
but :  1


  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# Inspect what was learned for one particular goal
but_test = 4
# Video name prefix; the recorded run shows it resolves to videos/Q-learninge-greedy<goal>.avi
video_prefix = "Q-learninge-greedy" + str(but_test)

# Greedy action of every state for this goal
print(get_policy_from_q(Q, but_test))
show_videos("videos/", video_prefix)
# Action values of every state for this goal
print(Q[:, but_test, :])

[0 0 2 0 2 0 0 3 0 0 0]
Converting videos/Q-learninge-greedy4.avi


[[0.     0.     0.     0.    ]
 [0.     0.     0.     0.    ]
 [0.     0.     0.225  0.    ]
 [0.     0.     0.     0.    ]
 [0.     0.     0.875  0.    ]
 [0.     0.     0.     0.    ]
 [0.     0.     0.     0.    ]
 [0.     0.     0.     0.3375]
 [0.     0.     0.     0.    ]
 [0.     0.     0.     0.    ]
 [0.     0.     0.     0.    ]]
