In [4]:
%config IPCompleter.greedy=True

In [5]:
import numpy as np
import gym
import random

In [6]:
environment = gym.make("Taxi-v2")

In [7]:
environment.render()

+---------+
|R: | : :G|
| : : : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+



In [8]:
# Number of possible states
state_size = environment.observation_space.n

# Number of possible actions per state
action_size = environment.action_space.n

In [9]:
qtable = np.zeros((state_size, action_size))

In [10]:
print(qtable)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [11]:
total_episodes = 15000
learning_rate = 0.8
max_steps = 99
gamma = 0.95
epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.005

In [15]:
rewards = []

for episodes in range(total_episodes):
    state = environment.reset()
    step = 0
    done = False
    total_rewards = 0
    for step in range(max_steps):
        exploration_exploitation_tradeoff = random.uniform(0, 1)

        if exploration_exploitation_tradeoff > epsilon:
            action = np.argmax(qtable[state,:])
        else:
            action = environment.action_space.sample()

        new_state, reward, done, info = environment.step(action)
        qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * \
                                            np.max(qtable[new_state, :]) - qtable[state, action])
        total_rewards += reward
        state = new_state
        if done == True:
            break
        
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode) 
        rewards.append(total_rewards)


print(f"Score over time: {sum(rewards) / total_episodes}")
print(qtable)

Score over time: -18804.8936
[[  0.           0.           0.           0.           0.
    0.        ]
 [242.61561839 256.58771444 242.73168396 256.57900724 271.14502064
  247.36629252]
 [268.08500106 283.26796412 268.09423842 283.26680899 299.22970621
  274.25614966]
 ...
 [302.33562274 319.3052132  302.31647511 286.21402016 293.33724536
  293.33907302]
 [239.84865472 253.62523779 239.90868255 253.5336478  230.87442093
  230.84125098]
 [355.8700959  336.976636   355.99983472 375.89972016 346.87865364
  346.97410924]]


In [16]:
environment.reset()
print(environment.step(0))

(467, -1, False, {'prob': 1.0})


In [18]:
environment.reset()

for episode in range(5):
    state = environment.reset()
    
    step = 0
    
    done = False
    
    print("----------------------------------------------------")
    print("EPISODE ", episode)
    
    for step in range(max_steps):
        
        action = np.argmax(qtable[state,:])
        
        new_state, reward, done, info = environment.step(action)
        
        if done:
            environment.render()
            print("Number of steps", step)
            break
        state = new_state

environment.close()

----------------------------------------------------
EPISODE  0
+---------+
|[35m[42mR[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
Number of steps 13
----------------------------------------------------
EPISODE  1
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+
  (Dropoff)
Number of steps 13
----------------------------------------------------
EPISODE  2
+---------+
|R: | : :[35m[42mG[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
Number of steps 10
----------------------------------------------------
EPISODE  3
+---------+
|[35m[42mR[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
Number of steps 14
----------------------------------------------------
EPISODE  4
+---------+
|R: | : :[35m[42mG[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
Number of steps 11
