In [1]:
import gym
import numpy as np
from collections import defaultdict, deque

In [2]:
class agent():
    def __init__(self, action_size, epsilon = 0.4, alpha = 0.3, alphaChange = 1, minAlpha = 0.3):
        self.action_size = action_size
        self.epsilon = epsilon
        self.alpha = alpha
        self.alpha_change = alphaChange
        self.alpha_min = minAlpha
        self.Q = defaultdict(lambda: np.zeros(self.action_size))
        self.gamma = 1.0
        self.gamma_min = 0.1
        self.gamma_change = 0.995
        self.gamma_change_interval = 5000

        self.adjustAlpha = lambda: self.alpha_min if self.alpha*self.alpha_change> self.alpha_min else self.alpha*self.alpha_change
        self.adjustGamma = lambda: self.gamma_min if self.gamma*self.gamma_change > self.gamma_min else self.gamma*self.gamma_change
        
    def selectAction(self, state, episodeNr):
        eps =self.epsilon/episodeNr
        policy = np.ones(self.action_size)*eps/self.action_size
        max_action = np.argmax(self.Q[state])
        policy[max_action] = 1 - eps + eps/self.action_size
        action_choosen = np.random.choice(np.arange(self.action_size), p=policy)
        return action_choosen, policy
    
    def stepSARSA(self, state, action, reward, next_state, done, episodeNr):
        if done == True:
            self.Q[state][action] = self.Q[state][action] + self.alpha*(reward - self.Q[state][action])
            if episodeNr == self.gamma_change_interval:
                self.gamma = self.adjustGamma()
            return
        
        next_action, _ = self.selectAction(next_state, episodeNr)
        self.Q[state][action] = self.Q[state][action] + self.alpha*(reward + self.gamma*self.Q[next_state][next_action] - self.Q[state][action])
        #self.alpha = self.adjustAlpha()
        if episodeNr == self.gamma_change_interval:
            self.gamma = self.adjustGamma()
        return
    
    def stepSARSAMax(self, state, action, reward, next_state, done, episodeNr):
        if done == True:
            self.Q[state][action] = self.Q[state][action] + self.alpha*(reward-self.Q[state][action])
            if episodeNr == self.gamma_change_interval:
                self.gamma = self.adjustGamma()
            return
        
        self.Q[state][action] = self.Q[state][action] + self.alpha*(reward + self.gamma*self.Q[next_state][np.argmax(self.Q[next_state])] - self.Q[state][action])
        #self.alpha = self.adjustAlpha()
        if episodeNr == self.gamma_change_interval:
            self.gamma = self.adjustGamma()
        return
    
    def stepExpectedSARSA(self, state, action, reward, next_state, done, episodeNr):
        if done == True:
            self.Q[state][action] = self.Q[state][action] + self.alpha*(reward-self.Q[state][action])
            if episodeNr == self.gamma_change_interval:
                self.gamma = self.adjustGamma()
            return
        
        _, policy = self.selectAction(next_state, episodeNr)
        
        expectedTab = np.dot(policy, self.Q[next_state])
        self.Q[state][action] = self.Q[state][action] + self.alpha*(reward + self.gamma*expectedTab - self.Q[state][action])
        #self.alpha = self.adjustAlpha()
        if episodeNr == self.gamma_change_interval:
            self.gamma = self.adjustGamma()
        return

In [4]:
env = gym.make('Taxi-v2')
Agent = agent(env.action_space.n)

results = deque(maxlen=100)
desired_avr = 9.7
best_avr = -10000
episode = 0

while True:
    episode += 1
    state = env.reset()
    episode_reward = 0
    while True:
        action, _ = Agent.selectAction(state, episode)
        next_state, reward, done, _ = env.step(action)
        episode_reward += reward
        Agent.stepExpectedSARSA(state, action, reward, next_state, done, episode)
        if done == True:
            results.append(episode_reward)
            break
        state = next_state
    if np.mean(results) > best_avr:
        best_avr = np.mean(results)
    if episode >=100 and episode%5000 == 0:
        print(episode, " average reward = ", np.mean(results), " best avr reward = ", best_avr)
        
    if episode >=100 and np.mean(results) >=9.3:
        print("ended in ", episode)
        break
    

5000  average reward =  8.16  best avr reward =  9.16
10000  average reward =  8.49  best avr reward =  9.22
ended in  12231
