In [1]:
import sys
import random
import numpy as np
from time import sleep
from IPython.display import clear_output

import gym
from taxiv3 import TaxiEnv

## Explore The Environment

In [2]:
# Create the Taxi environment instance (TaxiEnv is imported from taxiv3) that every later cell shares.
env = TaxiEnv()

In [3]:
# Put the environment into a new, random state and draw the grid.
print('Environment Display:')
env.reset()
env.render()

# Report the sizes of the observation (state) and action spaces.
print(f'State space {env.observation_space}')
print(f'Action space {env.action_space}')

Environment Display:
+---------+
|R:[43m [0m| : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+

State Space Discrete(500)
Action Space Discrete(6)


## Explore the State Encoding

In [4]:
# env.encode packs (taxi row, taxi column, passenger index, destination index)
# into a single integer state id.
state = env.encode(3, 1, 2, 0)
print(f"State: {state}")

# Force the environment into that state and draw it.
env.s = state
env.render()

State: 328
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+



## Apply a Random-Action Baseline

In [5]:
def init_var():
    """Return fresh per-episode bookkeeping values.

    Returns:
        tuple: ``(epochs, penalties, reward, done)`` initialised to
        ``(0, 0, 0, False)``.
    """
    epochs = penalties = reward = 0
    done = False
    return (epochs, penalties, reward, done)

In [6]:
# Fresh counters and an empty frame buffer for the random rollout.
epochs, penalties, reward, done = init_var()
frames = []  # rendered frames collected for the animation

# Start the rollout from a fixed, known state.
env.s = 328

In [7]:
# Random-action baseline: sample uniformly random actions from a fixed start
# state until the episode terminates. Rollouts that need more than
# MAX_RANDOM_STEPS steps are discarded and the rollout is attempted again.
START_STATE = 328        # the state produced by env.encode(3, 1, 2, 0) earlier
MAX_RANDOM_STEPS = 200   # longest rollout that is kept
PENALTY_REWARD = -10     # reward counted as a penalty (presumably an illegal
                         # pickup/drop-off -- confirm against taxiv3)

while True:
    # Reset at the top of every attempt so the cell does not depend on state
    # left behind by a previous cell or a previous run of this cell.
    env.s = START_STATE
    frames = []  # rendered frames collected for the animation
    epochs, penalties, reward, done = init_var()

    while not done:
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)

        if reward == PENALTY_REWARD:
            penalties += 1

        # put each rendered frame into dict for animation
        frames.append({
            'frame': env.render(mode='ansi'),
            'state': next_state,
            'action': action,
            'reward': reward
        })

        epochs += 1

    # Keep this rollout only if it finished within the step budget.
    if epochs <= MAX_RANDOM_STEPS:
        break

print(f'Timesteps taken: {epochs} steps')
print(f'Penalties incurred: {penalties} penalties')

Timesteps taken: 162 steps
Penalties incurred: 47 penalties


In [8]:
def print_frames(frames):
    """Animate recorded frames in the cell output, one every 0.1 seconds.

    Args:
        frames: iterable of dicts with the keys ``'frame'``, ``'state'``,
            ``'action'`` and ``'reward'``.
    """
    for step, snapshot in enumerate(frames, start=1):
        # wait=True defers clearing until the new output is ready to show.
        clear_output(wait=True)
        print(snapshot['frame'])
        print("\n".join([
            f"Timestep: {step}",
            f"State: {snapshot['state']}",
            f"Action: {snapshot['action']}",
            f"Reward: {snapshot['reward']}",
        ]))
        sleep(.1)
        
# Replay the frames recorded during the random rollout.
print_frames(frames)

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 162
State: 0
Action: 5
Reward: 20


## Create Q-table

In [9]:
# One row per state and one column per action; every estimate starts at zero.
Q_table = np.zeros(shape=(env.observation_space.n, env.action_space.n))

## Set Hyperparameters

In [10]:
num_episodes = 100_000  # number of training episodes

alpha = 0.1    # learning rate: weight given to the new estimate in each update
gamma = 0.6    # discount factor applied to the next state's best Q-value
epsilon = 0.1  # probability of taking a random (exploratory) action

## Train The Agent

In [11]:
# Per-episode metrics for plotting: steps taken and penalties incurred.
all_epochs = []
all_penalties = []

for i_episode in range(1, num_episodes+1):

    # monitor progress (overwrite the same output line every 100 episodes)
    if i_episode % 100 == 0:
        print("\rEpisode: {}/{}".format(i_episode, num_episodes), end="")
        sys.stdout.flush()

    # get initial state by restarting the environment
    state = env.reset()

    epochs = 0
    penalties = 0
    reward = 0
    done = False

    while not done:

        # apply epsilon-greedy policy
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample() # explore action space
        else:
            action = np.argmax(Q_table[state]) # exploit learned values

        next_state, reward, done, info = env.step(action)

        # Q-learning update: blend the old estimate with the target
        # reward + gamma * max_a' Q(next_state, a'), weighted by alpha.
        current_value = Q_table[state, action]
        Qsa_next = np.max(Q_table[next_state])

        new_value = (1 - alpha) * current_value + (alpha * (reward + gamma * Qsa_next))
        Q_table[state, action] = new_value

        # A reward of -10 is counted as a penalty (presumably an illegal
        # pickup/drop-off -- confirm against taxiv3).
        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1

    # Record this episode's metrics; these lists were previously declared
    # but never filled, so nothing could be plotted from them.
    all_epochs.append(epochs)
    all_penalties.append(penalties)

print("\nTraining finished.")

Episode 100000/100000
Training finished.


## Evaluate The Agent

In [12]:
# Number of greedy evaluation episodes.
# NOTE: this rebinds the num_episodes name that the training cell also reads.
num_episodes = 100

# Running totals across all evaluation episodes.
total_epochs = total_penalties = 0

# Maps episode number -> list of recorded frames for that episode.
frame_episodes = dict()

In [13]:
# Evaluate the learned policy: always take the highest-valued action (no
# exploration) and record every step so the episodes can be replayed later.
for i_episode in range(1, num_episodes+1):

    state = env.reset()
    epochs, penalties, reward = 0, 0, 0
    done = False
    frames = []  # frames of this episode only

    while not done:
        action = np.argmax(Q_table[state])
        next_state, reward, done, info = env.step(action)

        if reward == -10:
            penalties += 1

        # Snapshot of this step for the animation.
        frames.append(dict(
            frame=env.render(mode='ansi'),
            episode=i_episode,
            state=next_state,
            action=action,
            reward=reward,
        ))

        epochs += 1
        state = next_state

    # Fold this episode into the running totals and keep its frames.
    frame_episodes[i_episode] = frames
    total_penalties += penalties
    total_epochs += epochs

print(f'Results after {num_episodes} episodes:')
print(f'Average timesteps per episode: {total_epochs / num_episodes} steps')
print(f'Average penalties per episode: {total_penalties / num_episodes} penalties')

Results after 100 episodes:
Average timesteps per episode: 12.98 steps
Average penalties per episode: 0.0 penalties


In [14]:
def print_frames(frames):
    """Animate recorded frames in the cell output, one every 0.1 seconds.

    Defined once, outside the replay loop, instead of being re-created on
    every iteration.

    Args:
        frames: iterable of dicts with the keys ``'frame'``, ``'state'``,
            ``'action'`` and ``'reward'``. When a frame also carries an
            ``'episode'`` key, an "Episode: i/num_episodes" line is printed;
            frames without it (e.g. from the random rollout) are still
            accepted.
    """
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        print(frame['frame'])
        if 'episode' in frame:
            # num_episodes is read from the notebook's global scope.
            print(f"Episode: {frame['episode']}/{num_episodes}")
        print(f"Timestep: {i+1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(.1)


# Replay every evaluation episode in order (episode keys start at 1).
for i_episode in range(1, len(frame_episodes)+1):
    frames = frame_episodes[i_episode]
    print_frames(frames)

+---------+
|R: | : :[35m[34;1m[43mG[0m[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Episode: 100/100
Timestep: 7
State: 85
Action: 5
Reward: 20


---