In [4]:
import numpy as np
import gym

In [5]:
# Id under which the environment is looked up in gym's registry.
ENV_ID = 'Taxi-v2'
env = gym.make(ENV_ID)

In [6]:
# Print the environment's current board to the cell output.
env.render()

+---------+
|[34;1m[43mR[0m[0m: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+



In [7]:
# `.n` is the number of discrete states / actions; these two counts fix the
# dimensions of the Q-table built in the next cell.
state_size, action_size = env.observation_space.n, env.action_space.n
state_size, action_size

(500, 6)

In [8]:
# Q-table: one row per state, one column per action, every estimate starts at 0.
qtable = np.zeros(shape=(state_size, action_size))
qtable

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [9]:
# --- Training schedule ---
episodes_count = 10000        # Number of training episodes
max_steps_count = 100         # Upper bound on steps per episode; the episode is cut off after this

# --- Q-learning update ---
lr = 0.8                      # Learning rate
gamma = 0.95                  # Discount rate applied to future reward

# --- Epsilon-greedy exploration ---
max_epsilon = 1.0             # Exploration probability at the start of training
min_epsilon = 0.01            # Floor that the exploration probability decays towards
epsilon = max_epsilon         # Current exploration probability (starts at the maximum)
decay_rate = 0.005            # Exponential decay rate of epsilon per episode

In [10]:
rewards = []  # Total reward collected in each training episode

# Restart the exploration schedule so that re-running this cell does not begin
# with the small epsilon left over from a previous run.
epsilon = max_epsilon

for episode in range(episodes_count):
    state = env.reset()
    episode_reward_sum = 0

    for step in range(max_steps_count):
        # Epsilon-greedy selection: explore with probability epsilon,
        # otherwise take the action with the highest current Q-value.
        if np.random.uniform() <= epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(qtable[state, :])

        new_state, reward, done, info = env.step(action)

        # Q-learning update. When the transition ends the episode there is no
        # future return to bootstrap from, so the target is the reward alone.
        future_value = 0.0 if done else np.max(qtable[new_state, :])
        td_target = reward + gamma * future_value
        qtable[state, action] += lr * (td_target - qtable[state, action])

        state = new_state
        episode_reward_sum += reward

        # Stop as soon as the environment reports the episode is over.
        if done:
            break

    # Exponentially decay exploration from max_epsilon towards min_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    rewards.append(episode_reward_sum)

In [11]:
# Learned Q-values, followed by the mean total reward per training episode.
print(qtable)
mean_episode_reward = sum(rewards) / episodes_count
print(mean_episode_reward)

[[  0.           0.           0.           0.           0.
    0.        ]
 [107.86866464 247.87139419 220.17698769 254.31608818 273.30166436
  239.18969326]
 [239.31672453 272.59026531 253.56534682 288.58278685 304.98799375
  272.66613749]
 ...
 [ -1.7216      -1.607936    -2.2217728  285.01185394  -9.6
   -9.06548012]
 [ -4.34484421 186.07849892  -4.54570539  -4.74953796 -12.76566217
  -12.94408036]
 [ -1.7216      46.85092761  -1.568      365.20904765  -8.
  -10.208     ]]
1.076


In [15]:
# Play one episode greedily with the learned Q-table, rendering every step.
state = env.reset()

for step in range(max_steps_count):
    # Show the board before acting, then take the action with the highest
    # Q-value for the current state (no exploration).
    env.render()
    action = np.argmax(qtable[state, :])
    new_state, reward, done, info = env.step(action)

    if done:
        # Show the final board so the end of the episode is visible.
        env.render()

        # `step` is a zero-based loop index, so the number of actions taken
        # is step + 1.
        print("Number of steps", step + 1)
        break

    state = new_state
else:
    # The step budget ran out before the environment reported `done`.
    print("Episode did not finish within", max_steps_count, "steps")


+---------+
|[43mR[0m: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+

+---------+
|R: | : :[35mG[0m|
|[43m [0m: : : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : : : : |
|[43m [0m: : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |B: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |B: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[42mY[0m| : |B: |
+---------+
  (Pickup)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
|[42m_[0m| : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35mG[0m|
| : : : : |
|[42m_[0m: : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| :[42m_[0

In [None]:
# Close the environment once the rollout above is finished.
env.close()