## Q-Learning is an off-policy value-based method that uses a Temporal Difference Learning approach to train its action-value function

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

import warnings
warnings.filterwarnings('ignore')

### All neeeded imports

In [3]:
import numpy as np
import gym #OPenAI - Taxi Environment
import random # to generate random numbers

### Creating the Tax-Env

In [6]:
env = gym.make('Taxi-v3')
env.render()

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |[35mB[0m: |
+---------+



### Creating the Q-table and initializing it

Parameters *action_size* and *state_size* give info about how many rows (**states**) and columns (**actions**) we need.

In [9]:
action_size = env.action_space.n
state_size = env.observation_space.n

action_size, state_size

(6, 500)

#### Initializing the Qtable with zeros at the start of the training

In [10]:
Qtable = np.zeros((state_size, action_size))
Qtable

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

### Creating hyperparameters

In [11]:
total_episodes = 1000
total_test_episodes = 100
# max steps per episode
max_steps = 99

learning_rate = 0.7
# discounting rate
gamma = 0.6 

# Exploration / Exploitation trade-off
# exploration rate
epsilon = 1.0
# exploration probability at start
epsilon_max = 1.0
# minimum exploration probability 
epsilon_min = 0.01
# exponential decay rate for exploration prob
decay_rate = 0.01

### Q learning algorithm

In [22]:
for episode in range(total_episodes):
    # Reset the environment
    state = env.reset()
    step = 0
    done = False
    
    for step in range(max_steps):
        # 3. Choose an action a in the current world state (s)
        ## First we randomize a number
        exp_exp_tradeoff = random.uniform(0,1)
        
        ## If this number > greater than epsilon --> exploitation (taking the biggest Q value for this state)
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(Qtable[state,:])
        
        # Else doing a random choice --> exploration
        else:
            action = env.action_space.sample() # returns the index of the action with the highest Q-value for that state
        
        # Take the action (a) and observe the outcome state(s') and reward (r)
        new_state, reward, done, info = env.step(action)

        # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
        Qtable[state, action] = Qtable[state, action] + learning_rate * (reward + gamma * 
                                    np.max(Qtable[new_state, :]) - Qtable[state, action])
                
        # Our new state is state
        state = new_state
        
        # If done : finish episode
        if done == True: 
            break
    
    # Reduce epsilon (because we need less and less exploration)
    epsilon = epsilon_min + (epsilon_max - epsilon_min)*np.exp(-decay_rate*episode)

### Running Taxi agent on Qtable

After 50 000 episodes, Q-table can be used as a "cheatsheet" to play Taxi.

In [28]:
env.reset()
rewards = []

for episode in range(total_test_episodes):
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    #print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        # UNCOMMENT IT IF YOU WANT TO SEE OUR AGENT PLAYING
        # env.render()
        # Take the action (index) that have the maximum expected future reward given that state
        action = np.argmax(Qtable[state,:])
        
        new_state, reward, done, info = env.step(action)
        
        total_rewards += reward
        
        if done:
            rewards.append(total_rewards)
            print ("Score", total_rewards)
            break
        state = new_state
env.close()
print ("Score over time: " +  str(sum(rewards)/total_test_episodes))

EPISODE  0
Score 9
EPISODE  1
Score 10
EPISODE  2
Score 7
EPISODE  3
Score 6
EPISODE  4
Score 11
EPISODE  5
Score 6
EPISODE  6
Score 3
EPISODE  7
Score 8
EPISODE  8
Score 11
EPISODE  9
Score 7
EPISODE  10
Score 5
EPISODE  11
Score 10
EPISODE  12
Score 5
EPISODE  13
Score 11
EPISODE  14
Score 5
EPISODE  15
Score 5
EPISODE  16
Score 9
EPISODE  17
Score 6
EPISODE  18
Score 9
EPISODE  19
Score 8
EPISODE  20
Score 9
EPISODE  21
Score 7
EPISODE  22
EPISODE  23
Score 7
EPISODE  24
Score 9
EPISODE  25
Score 9
EPISODE  26
Score 8
EPISODE  27
Score 8
EPISODE  28
Score 9
EPISODE  29
Score 8
EPISODE  30
Score 8
EPISODE  31
EPISODE  32
Score 7
EPISODE  33
Score 7
EPISODE  34
Score 7
EPISODE  35
Score 8
EPISODE  36
Score 10
EPISODE  37
Score 9
EPISODE  38
Score 8
EPISODE  39
Score 10
EPISODE  40
EPISODE  41
Score 10
EPISODE  42
Score 4
EPISODE  43
Score 11
EPISODE  44
Score 8
EPISODE  45
Score 11
EPISODE  46
EPISODE  47
Score 15
EPISODE  48
Score 10
EPISODE  49
Score 11
EPISODE  50
Score 9
EPISODE  