In [1]:
import numpy as np
from collections import defaultdict
from monitor import interact
import gym

In [4]:
class Agent:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    Action values are stored in ``self.Q``, a ``defaultdict`` keyed by state
    whose entries are NumPy arrays of length ``nA`` initialised to zero.
    """

    def __init__(self, nA=6, epsilon=0.002, alpha=0.18, gamma=0.8):
        """ Initialize agent.

        Params
        ======
        - nA: number of actions available to the agent
        - epsilon: exploration rate of the epsilon-greedy policy, in [0, 1]
        - alpha: step size applied to each temporal-difference update
        - gamma: discount factor applied to the next state's best action value

        Raises
        ======
        - ValueError: if nA is smaller than 1 or epsilon lies outside [0, 1]
        """
        if nA < 1:
            raise ValueError("nA must be at least 1, got {}".format(nA))
        if not 0.0 <= epsilon <= 1.0:
            raise ValueError("epsilon must lie in [0, 1], got {}".format(epsilon))

        self.nA = nA
        # Unseen states get a fresh zero vector the first time they are indexed.
        self.Q = defaultdict(lambda: np.zeros(self.nA))
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        # Action probabilities used by the most recent select_action call;
        # stays an empty list until the first action is selected.
        self.prob = []

    def select_action(self, state):
        """ Given the state, select an action with an epsilon-greedy policy.

        Params
        ======
        - state: the current state of the environment

        Returns
        =======
        - action: an integer, compatible with the task's action space
        """
        # Every action receives the exploration share epsilon / nA ...
        policy_prob = np.full(self.nA, self.epsilon / self.nA)
        # ... and the greedy action additionally receives the remaining 1 - epsilon.
        greedy_action = np.argmax(self.Q[state])
        policy_prob[greedy_action] = 1 - self.epsilon + self.epsilon / self.nA
        self.prob = policy_prob
        return np.random.choice(np.arange(self.nA), p=policy_prob)

    def step(self, state, action, reward, next_state, done):
        """ Update the agent's knowledge, using the most recently sampled tuple.

        Applies Q[s][a] += alpha * (target - Q[s][a]), where the target is
        ``reward + gamma * max(Q[next_state])`` for a non-terminal transition
        and just ``reward`` when the episode has ended.

        Params
        ======
        - state: the previous state of the environment
        - action: the agent's previous choice of action
        - reward: last reward received
        - next_state: the current state of the environment
        - done: whether the episode is complete (True or False)
        """
        current_value = self.Q[state][action]
        if done:
            # Terminal transition: no bootstrapping, and Q[next_state] is not
            # touched so no table entry is created for the terminal state.
            target = reward
        else:
            target = reward + self.gamma * np.max(self.Q[next_state])
        self.Q[state][action] = current_value + self.alpha * (target - current_value)

In [5]:
env = gym.make('Taxi-v2')
# Size the agent's Q-table from the environment's own action space rather than
# relying on the Agent default (nA=6), so this cell stays correct if the
# environment id above is changed.
agent = Agent(nA=env.action_space.n)
# interact is imported from the local monitor module; it returns two values,
# unpacked here as avg_rewards and best_avg_reward.
avg_rewards, best_avg_reward = interact(env, agent)

Episode 20000/20000 || Best average reward 9.056

