## TP7 : Reinforcement learning

L'exemple ci-dessous simule le problème du taxi sans aucune aide du reinforcement learning : chaque action est choisie de façon complètement aléatoire, sans prendre en compte la récompense (reward).

In [124]:
import gym
from time import sleep
from IPython.display import clear_output

# Build the Taxi environment and keep the inner object exposed by `.env`
env = gym.make("Taxi-v2").env

# Pin the environment's state attribute to 328 so the run starts from a known state
env.s = 328


# Episode bookkeeping: step counter, penalty counter and last reward seen
epochs = 0
penalties = 0
reward = 0

# One snapshot per step (rendered text plus transition data), replayed later
frames = []

done = False

# Purely random agent: the reward is recorded but never used to choose actions
while not done:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)

    # A reward of -10 is counted as one penalty
    penalties += int(reward == -10)

    # Append this step's snapshot to the list used for the animation
    frames.append(dict(
        frame=env.render(mode='ansi'),
        state=state,
        action=action,
        reward=reward,
    ))

    epochs += 1

print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))

def renderFrames(frames):
    """Replay recorded frames one at a time in the cell output.

    Each entry of ``frames`` is a dict with the keys 'frame', 'state',
    'action' and 'reward'. For every entry the cell output is cleared, the
    global ``epochs`` and ``penalties`` totals are printed, then the frame
    itself followed by its 1-based timestep, state, action and reward.
    The function sleeps 0.1 s after each frame.
    """
    for index, snapshot in enumerate(frames):
        # wait=True postpones the clearing until new output is ready to be shown
        clear_output(wait=True)
        print(
            "Timesteps taken: {}".format(epochs),
            "Penalties incurred: {}".format(penalties),
            snapshot['frame'],
            f"Timestep: {index + 1}",
            f"State: {snapshot['state']}",
            f"Action: {snapshot['action']}",
            f"Reward: {snapshot['reward']}",
            sep="\n",
        )
        sleep(.1)
        
# Replay every recorded step of the random run (renderFrames sleeps 0.1 s per frame)
renderFrames(frames)

Timesteps taken: 4696
Penalties incurred: 1544
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[34;1mY[0m| :[43m [0m|B: |
+---------+
  (South)

Timestep: 172
State: 448
Action: 0
Reward: -1


KeyboardInterrupt: 

In [44]:
# Replay only the very first recorded frame
l = [frames[0]]
renderFrames(l)

+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)

Timestep: 1
State: 328
Action: 3
Reward: -1


In [42]:
# Print the environment object's string form to see which environment is loaded
print(env)

<TaxiEnv<Taxi-v2>>


In [None]:
# Look up entry 450 of the environment's `P` attribute.
# NOTE(review): presumably the per-action transition table of state 450 — confirm against the gym docs
env.P[450]

In [133]:
import gym
import numpy as np
import random
from IPython.display import clear_output

# Fresh Taxi-v2 environment used for training
env = gym.make("Taxi-v2").env

# Q-table: one row per state, one column per action, every value starting at zero
q_table = np.zeros([env.observation_space.n, env.action_space.n])

# Hyperparameters
alpha = 0.1    # weight given to the new estimate in each update
gamma = 0.6    # discount applied to the best value of the next state
epsilon = 0.1  # probability of taking a random action

for i in range(1, 100001):
    state = env.reset()

    # Per-episode counters
    epochs = 0
    penalties = 0
    reward = 0
    done = False

    while not done:
        # Epsilon-greedy choice: a random action with probability epsilon,
        # otherwise the highest-valued action recorded for this state
        action = (
            env.action_space.sample()
            if random.uniform(0, 1) < epsilon
            else np.argmax(q_table[state])
        )

        next_state, reward, done, info = env.step(action)

        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])

        # Blend the old estimate with the reward plus the discounted best next value
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        # A reward of -10 is counted as one penalty
        penalties += int(reward == -10)

        state = next_state
        epochs += 1

    # Refresh the progress display every 100 episodes
    if not i % 100:
        clear_output(wait=True)
        for label, value in (
            ("Episode:", i),
            ("Epochs:", epochs),
            ("Penalties:", penalties),
            ("Reward:", reward),
        ):
            print(label, value)

print("Training finished.")


Episode: 100000
Epochs: 14
Penalties: 0
Reward: 20
Training finished.


### Application

We will manually test our Q-learned model by reusing the code from above, replacing the random action with the best action found in the Q-table.

In [153]:
# Create a fresh Taxi environment for the evaluation run
env = gym.make("Taxi-v2").env

# Start from the state returned by reset() instead of pinning env.s = 328
state = env.reset()

# Safety cap on the episode length: a purely greedy policy can cycle between
# states forever when the Q-table is not trained well enough, and nothing else
# in the loop below would stop it.
# NOTE(review): 200 is meant to match the step limit gym usually registers for
# Taxi — confirm for the installed gym version.
max_steps = 200

# Reset the step and penalty counters (renderFrames prints these globals)
epochs = 0
penalties, reward = 0, 0

# One snapshot per step, replayed by renderFrames below
frames = []

done = False

while not done and epochs < max_steps:
    # Always exploit: take the highest-valued action for the current state
    action = np.argmax(q_table[state])
    state, reward, done, info = env.step(action)

    # A reward of -10 is counted as one penalty
    if reward == -10:
        penalties += 1

    # Append this step's snapshot to the list used for the animation
    frames.append({
        'frame': env.render(mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward
    }
    )

    epochs += 1

renderFrames(frames)

# Printed after the replay so clear_output() inside renderFrames does not erase it
if not done:
    print("Stopped after {} steps without finishing the episode.".format(max_steps))

Timesteps taken: 13
Penalties incurred: 0
+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 13
State: 0
Action: 5
Reward: 20


In [143]:
# Show the learned Q-values of state 328 (one value per action)
print(q_table[328])

[ -2.41120978  -2.27325184  -2.39520254  -2.35629281 -10.51375912
 -10.60841848]


### Experiments

The first training run is very greedy (epsilon = 0.1).

q_table[328] = [ -2.41120978  -2.27325184  -2.39520254  -2.35629281 -10.51375912
 -10.60841848].

Increasing the randomness (epsilon): training takes longer, but it still works fine in the end.

q_table[328] = [ -2.41837066  -2.27325184  -2.41837066  -2.3639511  -11.3639511
 -11.3639511 ].
 
All our tests seem to work fine because we train the Q-table for 100,000 episodes.



### SARSA algorithm for the taxi problem

A good policy is to explore a lot at the beginning and then gradually reduce epsilon, so that the agent increasingly exploits the knowledge stored in the Q-table to maximize its rewards.