In [1]:
import numpy as np
from time import sleep
from IPython.display import clear_output

import gym
from taxiv2 import TaxiEnv

from agent import Agent
from monitor import interact

## Explore The Environment

In [2]:
# Instantiate the Taxi environment class imported from the local `taxiv2`
# module; every later cell (display, training, evaluation) uses this `env`.
env = TaxiEnv()

In [3]:
# Draw one freshly reset board, then report the observation and action spaces.
print('Environment Display:')
env.reset()   # reset environment to a new, random state (return value unused)
env.render()

# The two report lines are driven from a small (template, space) table.
space_report = (
    ("State Space {}", env.observation_space),
    ("Action Space {}", env.action_space),
)
for template, space in space_report:
    print(template.format(space))

Environment Display:
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : :[43m [0m|
| | : | : |
|[34;1mY[0m| : |B: |
+---------+

State Space Discrete(500)
Action Space Discrete(6)


## Train The Agent

In [4]:
# Build a new Agent and pass it, together with the environment, to interact().
# The two values it returns are bound to `avg_rewards` and `best_avg_reward`
# (presumably the average-reward history and its best value -- confirm in
# monitor.py). The evaluation cell below reads `agent.Q_table` afterwards.
# NOTE(review): no random seed is set before this call, so the reported best
# average reward may not reproduce exactly -- confirm how Agent/interact draw
# their random numbers.
agent = Agent()
avg_rewards, best_avg_reward = interact(env, agent)

Episode 100000/100000 || Best average reward 5.07



## Evaluate The Agent

In [5]:
# Evaluation bookkeeping: how many episodes to run, plus zeroed counters
# for the steps taken and the penalties seen across all of them.
num_episodes = 100
total_epochs = total_penalties = 0   # ints are immutable, so sharing the 0 is safe
frame_episodes = dict()              # starts empty; not filled in by this cell

In [6]:
# Evaluate the trained agent by always taking the greedy action
# (argmax over the Q-table row of the current state) for `num_episodes`
# episodes, then report the average episode length and penalty count.

# Reward value that is counted as a "penalty" step below.
# NOTE(review): presumably the illegal pickup/drop-off reward, as in Gym's
# Taxi environment -- confirm against taxiv2.
PENALTY_REWARD = -10

# Safety cap on episode length. `env` is constructed directly from TaxiEnv
# with no visible time-limit wrapper, so a greedy policy that cycles between
# states would otherwise keep this loop running forever. 200 mirrors the step
# limit Gym applies to its registered Taxi environments.
MAX_STEPS_PER_EPISODE = 200

# Zero the accumulators in this cell so that re-running it starts from a
# clean slate instead of adding to the totals of a previous run.
total_epochs = 0
total_penalties = 0
truncated_episodes = 0

for i_episode in range(1, num_episodes+1):

    state = env.reset()
    epochs = 0
    penalties = 0
    done = False

    while not done and epochs < MAX_STEPS_PER_EPISODE:
        # Greedy action: no exploration during evaluation.
        action = np.argmax(agent.Q_table[state])
        next_state, reward, done, info = env.step(action)

        if reward == PENALTY_REWARD:
            penalties += 1

        state = next_state
        epochs += 1

    if not done:
        # The episode hit the step cap without the environment signalling done.
        truncated_episodes += 1

    total_epochs += epochs
    total_penalties += penalties

print(f'Results after {num_episodes} episodes:')
print(f'Average timesteps per episode: {total_epochs / num_episodes} steps')
print(f'Average penalties per episode: {total_penalties / num_episodes} penalties')
if truncated_episodes:
    # Only shown when the cap was actually hit, so normal output is unchanged.
    print(f'Episodes stopped at the {MAX_STEPS_PER_EPISODE}-step cap: {truncated_episodes}')

Results after 100 episodes:
Average timesteps per episode: 12.87 steps
Average penalties per episode: 0.0 penalties


---