# Tabular RL Agents

Implementations of (some) tabular RL agents from Sutton and Barto's ***Reinforcement Learning: An Introduction***.

In [1]:
import sys

In [3]:
import gym
from IPython import display
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from numpy import random
from time import sleep

from agents import Sarsa, QLearn, LambdaQLearn, DynaQLearn

In [3]:
# Create the Taxi-v3 environment and build a DynaQLearn agent for it.
# NOTE(review): the saved output of this cell is
# "AttributeError: module 'gym' has no attribute 'make'" -- presumably a local
# file or folder named `gym` shadows the installed package; confirm and re-run.
env = gym.make("Taxi-v3")
agent = DynaQLearn(env)

AttributeError: module 'gym' has no attribute 'make'

In [19]:
# Train the agent for a fixed number of episodes and record every step so the
# run can be replayed afterwards as a text animation.
#
# Each entry of `train_frames` is a dict with:
#   'frame'  - ANSI rendering of the environment after the step
#   'state'  - state id the environment is in after the step (matches 'frame')
#   'action' - action that was taken to get there
#   'reward' - reward returned by that step
train_frames = []
episodes = 10000
agent.epsilon = 0.1  # exploration setting used while training

for i in range(episodes):
    # Per-episode agent reset (presumably eligibility traces -- confirm in `agents`).
    agent.reset_e()
    state = env.reset()
    done = False
    action = agent.act(state)

    while not done:
        next_state, reward, done, info = env.step(action)
        # The next action is picked before updating because update() receives
        # the full (state, action, reward, next_state, next_action) tuple.
        next_action = agent.act(next_state)
        agent.update(state, action, reward, next_state, next_action)
        # Extra updates requested from the agent after each real step
        # (presumably model-based planning sweeps -- confirm in `agents`).
        agent.plan(n=10)

        # env.render() shows the environment *after* the step, so store the
        # post-step state alongside it. Logging the pre-step `state` here made
        # the replay print a state id that lagged one step behind the frame,
        # and disagreed with how the evaluation cell records its frames.
        train_frames.append({
            'frame': env.render(mode='ansi'),
            'state': next_state,
            'action': action,
            'reward': reward
            })

        state = next_state
        action = next_action

In [72]:
%matplotlib inline

for frame in train_frames[-1000:]:
    
    display.clear_output(wait=True)
    print(frame['frame']) # just update the data
    print(frame['reward'])
    print(frame['state'])

    sleep(.1)


+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)

-1
308


In [20]:
# Evaluation: run the trained agent for a fixed number of episodes, count the
# steps taken and the -10 rewards received, and keep every step in
# `test_frames` for later replay.

agent.epsilon = 0 #make it greedy

episodes = 100
test_frames = []
total_epochs = 0
total_penalties = 0

for _ in range(episodes):
    state = env.reset()
    epochs = penalties = reward = 0
    done = False

    while not done:
        action = agent.act(state)
        state, reward, done, info = env.step(action)

        # A reward of exactly -10 is counted as one penalty.
        penalties += 1 if reward == -10 else 0
        epochs += 1

        # Frame, state, action and reward all describe the situation after the step.
        test_frames.append(dict(
            frame=env.render(mode='ansi'),
            state=state,
            action=action,
            reward=reward,
        ))

    # Fold this episode's counters into the running totals.
    total_epochs += epochs
    total_penalties += penalties

print(
    f"Results after {episodes} episodes:",
    f"Average timesteps per episode: {total_epochs / episodes}",
    f"Average penalties per episode: {total_penalties / episodes}",
    sep="\n",
)

Results after 100 episodes:
Average timesteps per episode: 13.37
Average penalties per episode: 0.0


In [17]:
%matplotlib inline


for frame in test_frames[-1000:]:
    
    display.clear_output(wait=True)
    print(frame['frame']) # just update the data
    print(frame['reward'])
    print(frame['state'])

    sleep(.1)


+---------+
|R: | : :[35m[34;1m[43mG[0m[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

20
85
