# Q-Learning with Taxi-v3 🚕

Resource: [Deep Reinforcement Learning Course](https://simoninithomas.github.io/deep-rl-course/)

### Import the libraries 



In [1]:
import numpy as np
import gym
import random

##  Creating the environment 


In [2]:
# Build the environment from its registered id, then draw its current grid.
ENV_ID = "Taxi-v3"
env = gym.make(ENV_ID)
env.render()

+---------+
|R: | :[43m [0m:[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



## Creating the Q-table and initializing it 🗄️


In [4]:
# Read the sizes of the observation and action spaces; they become the
# dimensions of the Q-table.
state_space = env.observation_space.n
action_space = env.action_space.n

print("There are", state_space, "states")
print("There are", action_space, "actions")

There are 500 states
There are 6 actions


In [19]:
# One row per state and one column per action; every entry starts at zero.
Q_table = np.zeros(shape=(state_space, action_space))
print(Q_table.shape)
print(Q_table)

(500, 6)
[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


## Defining the hyperparameters 

In [20]:
# --- Episode budget ---
total_episodes = 15_000        # number of training episodes
total_test_episodes = 100      # number of evaluation episodes
max_steps = 200                # upper bound on steps within a single episode

# --- Q-learning update ---
learning_rate = 0.01           # weight applied to each temporal-difference correction
discount_factor = 0.99         # weight of the best next-state value in the update target
# NOTE(review): the evaluation output recorded in this notebook shows the taxi
# repeating the same move, so these values may under-train the agent --
# confirm by re-running before relying on the reported score.

# --- Exploration schedule ---
max_epsilon = 1.0              # exploration probability at the start of training
min_epsilon = 0.001            # floor the exploration probability decays toward
epsilon = max_epsilon          # current exploration probability (updated during training)
decay_rate = 0.01              # exponent rate of the per-episode epsilon decay

## Defining the epsilon-greedy policy 


In [21]:
def epsilon_greedy_policy(Q_table, state):
    """Pick an action for `state` using an epsilon-greedy rule.

    A single uniform draw from [0, 1] is compared with the module-level
    `epsilon`. When the draw is strictly greater, the index of the largest
    entry of `Q_table[state]` is returned (exploitation); otherwise the action
    comes from `env.action_space.sample()` (exploration).

    Args:
        Q_table: table indexed as `Q_table[state]`, giving one value per action.
        state: index of the current state.

    Returns:
        The chosen action.
    """
    exploit = random.uniform(0, 1) > epsilon
    if not exploit:
        return env.action_space.sample()
    return np.argmax(Q_table[state])

## Defining the Q-Learning algorithm and training our agent 🧠
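- Now we implement the Q-Learning algorithm. After every step, the Q-value of the visited state-action pair is updated with:

$$Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \gamma \max_{a'} Q(s', a') - Q(s, a) \right]$$

where $\alpha$ is `learning_rate` and $\gamma$ is `discount_factor`.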

In [22]:
 for episode in range(total_episodes):
    # Each episode begins from a freshly reset environment.
    state = env.reset()
    step, done = 0, False

    for step in range(max_steps):
        action = epsilon_greedy_policy(Q_table, state)
        new_state, reward, done, info = env.step(action)

        # Q-learning update: move Q(s, a) toward
        # reward + discount_factor * max over a' of Q(s', a').
        td_target = reward + discount_factor * np.max(Q_table[new_state])
        Q_table[state][action] += learning_rate * (td_target - Q_table[state][action])

        if done:
            break

        state = new_state

    # Shrink epsilon exponentially with the episode index, from max_epsilon
    # toward min_epsilon, so later episodes explore less.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

In [23]:
Q_table

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-3.33119602, -3.32356226, -3.32286057, -3.32228315,  9.3857871 ,
        -3.34377937],
       [-1.50807608, -1.62612553, -1.62809824, -1.6262152 , 14.08941466,
        -1.67015063],
       ...,
       [-0.73710889, -0.6967866 , -0.74550559, -0.73963433, -0.79147751,
        -0.78123045],
       [-2.19554925, -2.20069545, -2.19559269, -2.20327494, -2.28915048,
        -2.28559163],
       [-0.0199    , -0.0199    , -0.029701  ,  0.1591912 , -0.1       ,
        -0.29701   ]])

## Running the trained taxi agent




In [25]:
# Evaluate the learned Q-table: run `total_test_episodes` episodes acting
# greedily (no exploration), render every step, and report the average
# total reward per episode.
import time  # not used in this cell; kept in case other cells rely on it
rewards = []

frames = []  # not used in this cell; kept in case other cells rely on it
for episode in range(total_test_episodes):
    state = env.reset()
    done = False
    total_rewards = 0
    print("****************************************************")
    print("EPISODE ", episode)
    for step in range(max_steps):
        env.render()
        # Take the action (index) that has the maximum expected future reward in this state.
        action = np.argmax(Q_table[state])
        new_state, reward, done, info = env.step(action)
        total_rewards += reward

        if done:
            break
        state = new_state

    # Record the score of every episode, including ones that used up
    # `max_steps` without the environment reporting `done`. Appending only on
    # `done` would drop those episodes from the sum while the average below
    # still divides by `total_test_episodes`.
    rewards.append(total_rewards)
env.close()
print ("Score over time: " +  str(sum(rewards)/total_test_episodes))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| : : : : |
| | : | : |
|Y|[43m [0m: |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y|[43m [0m: |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y|[43m [0m: |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y|[43m [0m: |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y|[43m [0m: |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y|[43m [0m: |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y|[43m [0m: |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y|[43m [0m: |B: |
+---------+
  (West)
+---------+
|[