# In [61]:  (Jupyter notebook cell marker)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Practical for course 'Reinforcement Learning',
Leiden University, The Netherlands
2022
By Thomas Moerland
"""
import matplotlib
#%matplotlib ipympl
matplotlib.use('Qt5Agg') # or TkAgg

import numpy as np
from Environment import StochasticWindyGridworld
from Helper import argmax

class QValueIterationAgent:
    ''' Tabular agent that stores the Q-value iteration solution.

    Holds a Q(s,a) table of shape (n_states, n_actions), performs Bellman
    optimality backups on single entries, and selects greedy actions.
    '''

    def __init__(self, n_states, n_actions, gamma, threshold=0.01):
        ''' Create an agent with an all-zero Q-table.

        n_states  -- number of states (rows of the Q-table)
        n_actions -- number of actions (columns of the Q-table)
        gamma     -- discount factor applied to next-state values
        threshold -- convergence tolerance; stored on the agent for callers,
                     not used by the agent's own methods
        '''
        self.n_states = n_states
        self.n_actions = n_actions
        self.gamma = gamma
        # Keep the tolerance instead of silently discarding the argument.
        self.threshold = threshold
        self.Q_sa = np.zeros((n_states, n_actions))

    def select_action(self, s):
        ''' Returns the greedy best action in state s.

        The choice among the Q-values of row s is delegated to the imported
        `argmax` helper.
        '''
        return argmax(self.Q_sa[s, :])

    def update(self, s, a, p_sas, r_sas):
        ''' Updates Q(s,a) in place with one Bellman optimality backup.

        Q(s,a) <- sum_s' p_sas[s'] * (r_sas[s'] + gamma * max_a' Q(s',a'))

        s, a   -- indices of the table entry to overwrite
        p_sas  -- sequence indexed by next state; weight of each next state
        r_sas  -- sequence indexed by next state; reward for each next state
        '''
        n = self.n_states
        # Only the first n_states entries take part in the sum.
        p = np.asarray(p_sas)[:n]
        r = np.asarray(r_sas)[:n]
        # Best attainable value in every next state, computed before the write
        # so the backup reads the table as it was on entry.
        best_next = self.Q_sa.max(axis=1)
        self.Q_sa[s, a] = np.sum(p * (r + self.gamma * best_next))
    
    
def Q_value_iteration(env, gamma=1.0, threshold=0.001):
    ''' Runs Q-value iteration on env and returns the resulting QValueIterationAgent.

    Every sweep backs up each (state, action) entry once, in place, using the
    transition and reward arrays returned by env.model(s, a). Sweeping continues
    while the largest absolute change seen in a sweep is >= threshold. After
    each sweep the current Q-values are rendered and the largest change printed.
    '''
    agent = QValueIterationAgent(env.n_states, env.n_actions, gamma)
    sweep = 0
    largest_change = np.inf  # guarantees the first sweep runs

    while largest_change >= threshold:
        sweep += 1
        largest_change = 0
        for state in range(agent.n_states):
            for action in range(agent.n_actions):
                transition_probs, rewards = env.model(state, action)
                previous = agent.Q_sa[state, action]
                agent.update(state, action, transition_probs, rewards)
                change = np.absolute(previous - agent.Q_sa[state, action])
                largest_change = np.maximum(largest_change, change)

        # Plot current Q-value estimates & print max error
        env.render(Q_sa=agent.Q_sa, plot_optimal_policy=True, step_pause=3)
        print("Q-value iteration, iteration {}, max error {}".format(sweep, largest_change))

    return agent

def experiment():
    ''' Solves the stochastic windy gridworld with Q-value iteration, then
    replays one greedy episode and reports its mean reward per timestep. '''
    gamma = 1.0
    threshold = 0.001
    env = StochasticWindyGridworld(initialize_model=True)
    env.render()
    QIagent = Q_value_iteration(env,gamma,threshold)

    # View optimal policy, accumulating the reward collected along the way
    done = False
    s = env.reset()
    total_reward = 0.0
    n_steps = 0
    while not done:
        a = QIagent.select_action(s)
        s_next, r, done = env.step(a)
        total_reward += r
        n_steps += 1
        env.render(Q_sa=QIagent.Q_sa,plot_optimal_policy=True,step_pause=1.0)
        s = s_next

    # Empirical value from the single episode above; n_steps >= 1 because the
    # loop body always runs at least once (done starts out False).
    # NOTE(review): the environment is presumably stochastic, so this number can
    # differ between runs -- average over several episodes if a stable estimate
    # is needed.
    mean_reward_per_timestep = total_reward / n_steps
    print("Mean reward per timestep under optimal policy: {}".format(mean_reward_per_timestep))

# Run the experiment only when this file is executed as a script, not on import.
if __name__ == '__main__':
    experiment()


# Captured output of the run above:
# Q-value iteration, iteration 1, max error 29.704
# Q-value iteration, iteration 2, max error 29.42784
# Q-value iteration, iteration 3, max error 23.327616
# Q-value iteration, iteration 4, max error 22.28608
# Q-value iteration, iteration 5, max error 22.28608
# Q-value iteration, iteration 6, max error 22.28608
# Q-value iteration, iteration 7, max error 22.28608
# Q-value iteration, iteration 8, max error 22.28608
# Q-value iteration, iteration 9, max error 22.28608
# Q-value iteration, iteration 10, max error 22.28608
# Q-value iteration, iteration 11, max error 22.28608
# Q-value iteration, iteration 12, max error 3.759488000000001
# Q-value iteration, iteration 13, max error 0.8901376000000063
# Q-value iteration, iteration 14, max error 0.17802752000000055
# Q-value iteration, iteration 15, max error 0.038578762219515284
# Q-value iteration, iteration 16, max error 0.008059003797498576
# Q-value iteration, iteration 17, max error 0.0016118007595018469
# Q-value iteration, iteration 18, max error 0.00032236015190