In [1]:
import numpy as np
import gym

In [2]:
# Create the "Taxi-v2" environment through gym and render its current state.
env = gym.make("Taxi-v2")
env.render()

+---------+
|[34;1mR[0m: | :[43m [0m:[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+



In [5]:
# Read the number of discrete actions and states from the environment's
# spaces, then report both. These two sizes are the dimensions used for the
# Q-table below.
action_size = env.action_space.n
state_size = env.observation_space.n

print("Action size: {}".format(action_size))
print("State size: {}".format(state_size))

Action size: 6
State size: 500


In [6]:
# initialize q-table: one row per state, one column per action, all zeros
qtable = np.zeros((state_size, action_size))

# hyperparameters
total_episodes = 50000  # number of training episodes
total_test_episodes = 100  # number of evaluation episodes
max_steps = 99  # upper bound on steps taken within a single episode

learning_rate = 0.7  # scales the temporal-difference correction applied to a Q-value
gamma = 0.618  # discount applied to the best Q-value of the next state

epsilon = 1.0  # exploration threshold: the agent exploits only when a uniform draw exceeds it
max_epsilon = 1.0  # value the decay schedule yields at episode 0
min_epsilon = 0.01  # floor the decay schedule approaches as episodes grow
decay_rate = 0.01  # rate of the exponential decay of epsilon, applied per episode index

In [10]:
def exploit(eps):
    """Decide between exploiting and exploring for one step.

    Draws a single sample from the uniform distribution on [0, 1) and
    compares it with ``eps``.

    Args:
        eps: Exploration threshold. The larger it is, the less often the
            result is true.

    Returns:
        A truthy value when the draw is strictly greater than ``eps``
        (exploit), a falsy value otherwise (explore).
    """
    draw = np.random.uniform()
    return draw > eps

In [13]:
for episode in range(total_episodes):
    # Each episode starts from a freshly reset environment.
    state = env.reset()
    step, done = 0, False

    for step in range(max_steps):
        # Epsilon-greedy selection: a random action when exploring,
        # otherwise the action with the highest Q-value for this state.
        if not exploit(epsilon):
            action = env.action_space.sample()
        else:
            action = np.argmax(qtable[state, :])

        # Apply the action and observe the transition.
        new_state, reward, done, info = env.step(action)

        # Q-learning update: move Q(state, action) towards
        # reward + gamma * max_a' Q(new_state, a'), scaled by learning_rate.
        best_next = np.max(qtable[new_state, :])
        td_target = reward + gamma * best_next
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        # Advance to the next state; stop once the episode has ended.
        state = new_state
        if done:
            break

    # Exponentially decay epsilon from max_epsilon towards min_epsilon
    # as the episode index grows.
    decay = np.exp(-decay_rate * episode)
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * decay

In [15]:
# play the game: run the greedy policy from the learned Q-table and
# report the score of every test episode plus the overall average.
# NOTE(review): this reset is immediately superseded by the per-episode
# reset inside the loop below.
env.reset()
rewards = []

for episode in range(total_test_episodes):
    state = env.reset()
    total_rewards = 0
    print("*" * 100)
    print("Episode {}".format(episode))
    for step in range(max_steps):
        env.render()
        # Take action which has the highest q value
        # in the current state
        action = np.argmax(qtable[state, :])
        new_state, reward, done, info = env.step(action)
        total_rewards += reward
        if done:
            break
        state = new_state
    # Record the score for every episode, including those that hit
    # max_steps without finishing. Previously such episodes were left out
    # of `rewards` while the average still divided by
    # total_test_episodes, which overstated the average score.
    rewards.append(total_rewards)
    print("Score", total_rewards)
env.close()
print("Average Score", sum(rewards) / total_test_episodes)

****************************************************************************************************
Episode 0
+---------+
|[35mR[0m: | : :G|
| : : : :[43m [0m|
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+

+---------+
|[35mR[0m: | : :G|
| : : :[43m [0m: |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : :[43m [0m: |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : :[43m [0m: : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| :[43m [0m: : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :G|
| : : : : |
|[43m [0m: : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |B: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : : : 

+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : :[42m_[0m|
| | : | : |
|Y| : |B: |
+---------+
  (East)
+---------+
|R: | : :[35mG[0m|
| : : : :[42m_[0m|
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35m[42mG[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
Score 8
****************************************************************************************************
Episode 57
+---------+
|R: | : :G|
| : :[43m [0m: : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+

+---------+
|R: | : :G|
| :[43m [0m: : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (West)
+---------+
|R: | : :G|
| : : : : |
| :[43m [0m: : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : : : |
|[43m [0m: : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (West)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
|[43m [0m| : |