# Tabular RL Agents

Implementations of some of the tabular RL agents from Sutton and Barto's ***Reinforcement Learning: An Introduction***

In [1]:
import sys

In [2]:
import gym
from IPython import display
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from numpy import random
from time import sleep

In [14]:
'''
Agents
'''


class Agent:
    """Base interface for the agents in this notebook.

    Every hook is a no-op that returns ``None``; concrete agents override
    the ones they need, so driver code can call all hooks unconditionally.
    """

    def act(self, state):
        """Choose an action for ``state`` (no-op here)."""
        return None

    def update(self, state, action, reward, next_state, next_action):
        """Learn from one observed transition (no-op here)."""
        return None

    def reset_e(self):
        """Reset eligibility traces, if the agent keeps any (no-op here)."""
        return None

    def plan(self, n=0):
        """Run ``n`` planning steps, if the agent keeps a model (no-op here)."""
        return None

    def reset_q(self):
        """Reset the action-value estimates (no-op here)."""
        return None



class Sarsa(Agent):
    """Tabular SARSA agent with epsilon-greedy action selection.

    Args:
        env: Environment exposing ``observation_space.n``,
            ``action_space.n`` and ``action_space.sample()``.
        params: Optional dict of hyper-parameters. Recognised keys are
            ``'epsilon'`` (default 0.1), ``'alpha'`` (default 0.01) and
            ``'gamma'`` (default 1).
    """

    def __init__(self, env, params=None):
        if params is None:
            params = {}

        self.env = env
        self.epsilon = params.get('epsilon', 0.1)  # Exploration probability
        self.alpha = params.get('alpha', 0.01)  # Learning rate
        self.gamma = params.get('gamma', 1)  # Discount rate

        # One action-value estimate per (state, action) pair, initialised to 0.
        self.q_matrix = np.zeros([self.env.observation_space.n, self.env.action_space.n])

    def update(self, state, action, reward, next_state, next_action):
        """Apply one SARSA update for the observed transition.

        The target bootstraps from the value of the action that was actually
        selected in ``next_state`` (``next_action``).
        """
        delta = (reward + self.gamma * self.q_matrix[next_state, next_action]
                 - self.q_matrix[state, action])
        self.q_matrix[state, action] += self.alpha * delta

    def act(self, state):
        """Return an epsilon-greedy action for ``state``.

        With probability ``epsilon`` a random action is sampled; otherwise
        the action with the highest current estimate is returned.
        """
        if random.uniform(0, 1) < self.epsilon:
            # Sample from the agent's own environment rather than relying on
            # a module-level ``env`` happening to exist and to match.
            return self.env.action_space.sample()
        return np.argmax(self.q_matrix[state])
        
class QLearn(Agent):
    """Tabular Q-learning agent with epsilon-greedy action selection.

    Args:
        env: Environment exposing ``observation_space.n``,
            ``action_space.n`` and ``action_space.sample()``.
        params: Optional dict of hyper-parameters. Recognised keys are
            ``'epsilon'`` (default 0.1), ``'alpha'`` (default 0.01) and
            ``'gamma'`` (default 1).
    """

    def __init__(self, env, params=None):
        if params is None:
            params = {}

        self.env = env
        self.epsilon = params.get('epsilon', 0.1)  # Exploration probability
        self.alpha = params.get('alpha', 0.01)  # Learning rate
        self.gamma = params.get('gamma', 1)  # Discount rate

        # One action-value estimate per (state, action) pair, initialised to 0.
        self.q_matrix = np.zeros([self.env.observation_space.n, self.env.action_space.n])

    def update(self, state, action, reward, next_state, next_action):
        """Apply one Q-learning update for the observed transition.

        The target bootstraps from the best estimate in ``next_state``;
        ``next_action`` is accepted for interface compatibility but unused.
        """
        delta = (reward + self.gamma * np.max(self.q_matrix[next_state])
                 - self.q_matrix[state, action])
        self.q_matrix[state, action] += self.alpha * delta

    def act(self, state):
        """Return an epsilon-greedy action for ``state``.

        With probability ``epsilon`` a random action is sampled; otherwise
        the action with the highest current estimate is returned.
        """
        if random.uniform(0, 1) < self.epsilon:
            # Sample from the agent's own environment rather than relying on
            # a module-level ``env`` happening to exist and to match.
            return self.env.action_space.sample()
        return np.argmax(self.q_matrix[state])
        
class LambdaQLearn(Agent):
    """Tabular Q-learning with eligibility traces, epsilon-greedy policy.

    Traces accumulate (+1 on each visit), decay by ``gamma * lambda`` after
    a greedy next action, and are cleared after a non-greedy one.

    Args:
        env: Environment exposing ``observation_space.n``,
            ``action_space.n`` and ``action_space.sample()``.
        params: Optional dict of hyper-parameters. Recognised keys are
            ``'epsilon'`` (default 0.1), ``'alpha'`` (default 0.01),
            ``'gamma'`` (default 1) and ``'lambda'`` (default 0.5).
    """

    def __init__(self, env, params=None):
        if params is None:
            params = {}

        self.env = env
        self.epsilon = params.get('epsilon', 0.1)  # Exploration probability
        self.alpha = params.get('alpha', 0.01)  # Learning rate
        self.gamma = params.get('gamma', 1)  # Discount rate
        self.lambda_ = params.get('lambda', 0.5)  # Eligibility decay

        n_states = self.env.observation_space.n
        n_actions = self.env.action_space.n
        self.q_matrix = np.zeros([n_states, n_actions])  # Action values
        self.e_matrix = np.zeros([n_states, n_actions])  # Eligibility traces

    def update(self, state, action, reward, next_state, next_action):
        """Update all traced action values from one observed transition."""
        next_q = self.q_matrix[next_state]

        # If the chosen next action ties for the maximum it counts as greedy,
        # so the traces are kept even when argmax would pick another index.
        if next_q[next_action] == np.max(next_q):
            max_a = next_action
        else:
            max_a = np.argmax(next_q)

        delta = reward + self.gamma * next_q[max_a] - self.q_matrix[state, action]
        self.e_matrix[state, action] += 1

        # Every pair is updated in proportion to its current eligibility.
        self.q_matrix += self.alpha * delta * self.e_matrix

        if max_a == next_action:
            self.e_matrix *= self.gamma * self.lambda_
        else:
            # An exploratory action was taken: cut the traces.
            self.reset_e()

    def act(self, state):
        """Return an epsilon-greedy action for ``state``.

        With probability ``epsilon`` a random action is sampled; otherwise
        the action with the highest current estimate is returned.
        """
        if random.uniform(0, 1) < self.epsilon:
            # Sample from the agent's own environment rather than relying on
            # a module-level ``env`` happening to exist and to match.
            return self.env.action_space.sample()
        return np.argmax(self.q_matrix[state])

    def reset_e(self):
        """Replace the eligibility traces with a fresh all-zero array."""
        self.e_matrix = np.zeros(self.e_matrix.shape)
        
class DynaQLearn(Agent):
    """Tabular Dyna-Q style agent: Q-learning plus replay from a learned model.

    The model remembers, for each visited (state, action) pair, the most
    recently observed reward and next state; ``plan`` replays those
    remembered transitions through the Q-learning update.

    Args:
        env: Environment exposing ``observation_space.n``,
            ``action_space.n`` and ``action_space.sample()``.
        params: Optional dict of hyper-parameters. Recognised keys are
            ``'epsilon'`` (default 0.1), ``'alpha'`` (default 0.01) and
            ``'gamma'`` (default 1).
    """

    def __init__(self, env, params=None):
        if params is None:
            params = {}

        self.env = env
        self.epsilon = params.get('epsilon', 0.1)  # Exploration probability
        self.alpha = params.get('alpha', 0.01)  # Learning rate
        self.gamma = params.get('gamma', 1)  # Discount rate

        n_states = self.env.observation_space.n
        n_actions = self.env.action_space.n
        self.q_matrix = np.zeros([n_states, n_actions])

        # Last axis (size 2) holds the remembered (reward, next_state) pair.
        self.model = np.zeros([n_states, n_actions, 2])
        # 1 where a (state, action) pair has been observed, 0 otherwise.
        self.model_bool = np.zeros([n_states, n_actions])

    def update(self, state, action, reward, next_state, next_action):
        """Apply one Q-learning update and record the transition in the model.

        ``next_action`` is accepted for interface compatibility but unused.
        """
        delta = (reward + self.gamma * np.max(self.q_matrix[next_state])
                 - self.q_matrix[state, action])
        self.q_matrix[state, action] += self.alpha * delta

        self.model_bool[state, action] = 1
        self.model[state, action, :] = reward, next_state

    def act(self, state):
        """Return an epsilon-greedy action for ``state``.

        With probability ``epsilon`` a random action is sampled; otherwise
        the action with the highest current estimate is returned.
        """
        if random.uniform(0, 1) < self.epsilon:
            # Sample from the agent's own environment rather than relying on
            # a module-level ``env`` happening to exist and to match.
            return self.env.action_space.sample()
        return np.argmax(self.q_matrix[state])

    def plan(self, n=1):
        """Run ``n`` simulated Q-learning updates from remembered transitions.

        Planning is skipped entirely until strictly more than ``n`` distinct
        (state, action) pairs have been observed.
        """
        if not self.model_bool.sum() > n:
            return

        # The set of observed pairs does not change while planning, so it is
        # computed once instead of on every iteration.
        observed = np.transpose(np.nonzero(self.model_bool))

        for _ in range(n):
            pstate, paction = observed[np.random.choice(observed.shape[0])]
            preward, pnext_state = self.model[pstate, paction]

            # The model array is float-valued; restore an integer state index.
            pnext_state = int(pnext_state)

            pdelta = (preward + self.gamma * np.max(self.q_matrix[pnext_state])
                      - self.q_matrix[pstate, paction])
            self.q_matrix[pstate, paction] += self.alpha * pdelta
        
        
        

In [3]:
# Create the Taxi-v3 environment that the cells below train and evaluate on.
env = gym.make("Taxi-v3")
# No params dict is passed, so the agent uses its default hyper-parameters.
agent = DynaQLearn(env)

AttributeError: module 'gym' has no attribute 'make'

In [19]:
# Train the agent, recording one rendered frame per environment step.
train_frames = []
episodes = 10000
agent.epsilon = 0.1

for i in range(episodes):
    # Clear any eligibility traces before the episode starts.
    agent.reset_e()
    state = env.reset()
    action = agent.act(state)
    done = False

    while not done:
        next_state, reward, done, info = env.step(action)
        # The next action is chosen before updating so that on-policy agents
        # can bootstrap from it.
        next_action = agent.act(next_state)

        agent.update(state, action, reward, next_state, next_action)
        agent.plan(n=10)

        # 'state' and 'action' are the pair that produced this reward.
        train_frames.append({'frame': env.render(mode='ansi'),
                             'state': state,
                             'action': action,
                             'reward': reward})

        state, action = next_state, next_action

In [72]:
%matplotlib inline

# Replay the last 1000 recorded training steps as a text animation.
for frame in train_frames[-1000:]:
    # wait=True defers clearing until the next output is ready.
    display.clear_output(wait=True)
    print(frame['frame'], frame['reward'], frame['state'], sep='\n')
    sleep(.1)


+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)

-1
308


In [20]:
# Evaluate the trained agent with exploration switched off.

agent.epsilon = 0  # purely greedy action selection

total_epochs = 0
total_penalties = 0
episodes = 100
test_frames = []

for _ in range(episodes):
    state = env.reset()
    epochs, penalties, reward = 0, 0, 0
    done = False

    while not done:
        action = agent.act(state)
        state, reward, done, info = env.step(action)

        # A reward of -10 is counted as a penalty.
        if reward == -10:
            penalties += 1
        epochs += 1

        # Here 'state' is the state reached after taking 'action'.
        test_frames.append({'frame': env.render(mode='ansi'),
                            'state': state,
                            'action': action,
                            'reward': reward})

    total_epochs += epochs
    total_penalties += penalties

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

Results after 100 episodes:
Average timesteps per episode: 13.37
Average penalties per episode: 0.0


In [17]:
%matplotlib inline


# Replay the last 1000 recorded evaluation steps as a text animation.
for frame in test_frames[-1000:]:
    # wait=True defers clearing until the next output is ready.
    display.clear_output(wait=True)
    print(frame['frame'], frame['reward'], frame['state'], sep='\n')
    sleep(.1)


+---------+
|R: | : :[35m[34;1m[43mG[0m[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

20
85
