In [1]:
import sys
import numpy as np
import math
import random

In [2]:
import gym
import gym_maze

In [3]:
def simulate():
    """Train a tabular SARSA agent with eligibility traces on the global maze ``env``.

    Reads the module-level configuration (``env``, ``NUM_BUCKETS``,
    ``NUM_ACTIONS``, ``NUM_EPISODES``, ``MAX_T``, ``SOLVED_T``,
    ``STREAK_TO_END``, ``DEBUG_MODE``, ``RENDER_MAZE``) and the helpers
    ``select_action``, ``state_to_bucket``, ``get_explore_rate`` and
    ``get_learning_rate``.

    Training stops after ``NUM_EPISODES`` episodes, or earlier once more than
    ``STREAK_TO_END`` consecutive episodes finish within ``SOLVED_T`` steps.
    Progress is printed to stdout; nothing is returned.
    """
    # Learning-related parameters
    learning_rate = get_learning_rate(0)  # alpha
    explore_rate = get_explore_rate(0)    # probability of a random action
    discount_factor = 0.99                # gamma
    # Lambda of the eligibility traces. With 0 the traces are wiped after
    # every step, so each update reduces to one-step SARSA.
    trace_decay = 0

    # Number of consecutive episodes solved within SOLVED_T steps
    num_streaks = 0

    # Render the maze
    env.render()

    # Q(S, A) table: one entry per (bucketed state, action)
    q_table = np.zeros(NUM_BUCKETS + (NUM_ACTIONS,), dtype=float)

    for episode in range(NUM_EPISODES):
        # Reset the environment
        obv = env.reset()

        # Accumulated reward of this episode
        total_reward = 0

        # E(S, A) = 0: eligibility traces start fresh every episode
        E = np.zeros(NUM_BUCKETS + (NUM_ACTIONS,), dtype=float)

        # Initial state and initial action, both following the current policy
        state = state_to_bucket(obv)
        action = select_action(state, explore_rate, q_table)

        for t in range(MAX_T):

            # Take action A, observe R and S'
            obv, reward, done, _ = env.step(action)
            next_state = state_to_bucket(obv)
            total_reward += reward

            # Choose A' from S' (not from S): SARSA bootstraps on, and then
            # executes, the action the policy picks in the state just reached.
            next_action = select_action(next_state, explore_rate, q_table)

            current_q = q_table[state + (action,)]
            next_q = q_table[next_state + (next_action,)]

            # TD error: delta = R + gamma * Q(S', A') - Q(S, A)
            TD = reward + (discount_factor * (next_q) - current_q)

            # E(S, A) <- E(S, A) + 1 (accumulating trace)
            E[state + (action,)] += 1

            # For all s, a: Q(s, a) <- Q(s, a) + alpha * delta * E(s, a)
            q_table += learning_rate * TD * E

            # For all s, a: E(s, a) <- gamma * lambda * E(s, a)
            E = discount_factor * trace_decay * E

            # S <- S', A <- A'
            state = next_state
            action = next_action

            # Print data
            if DEBUG_MODE == 2:
                # Highest action value available in the state just reached
                best_q = np.amax(q_table[state])
                print("\nEpisode = %d" % episode)
                print("t = %d" % t)
                print("Action: %d" % action)
                print("State: %s" % str(state))
                print("Reward: %f" % reward)
                print("Best Q: %f" % best_q)
                print("Explore rate: %f" % explore_rate)
                print("Learning rate: %f" % learning_rate)
                print("Streaks: %d" % num_streaks)
                print("")

            elif DEBUG_MODE == 1:
                if done or t >= MAX_T - 1:
                    print("\nEpisode = %d" % episode)
                    print("t = %d" % t)
                    print("Explore rate: %f" % explore_rate)
                    print("Learning rate: %f" % learning_rate)
                    print("Streaks: %d" % num_streaks)
                    print("Total reward: %f" % total_reward)
                    print("")

            # Render the maze
            if RENDER_MAZE:
                env.render()

            # NOTE(review): presumably true once the user closes the maze
            # window - confirm against gym_maze.
            if env.is_game_over():
                sys.exit()

            if done:
                print("Episode %d finished after %f time steps with total reward = %f (streak %d)."
                      % (episode, t, total_reward, num_streaks))

                # Only episodes solved within SOLVED_T steps extend the streak
                if t <= SOLVED_T:
                    num_streaks += 1
                else:
                    num_streaks = 0
                break

            elif t >= MAX_T - 1:
                print("Episode %d timed out at %d with total reward = %f."
                      % (episode, t, total_reward))

        # Training is considered done once the streak exceeds STREAK_TO_END
        if num_streaks > STREAK_TO_END:
            break

        # Update parameters
        explore_rate = get_explore_rate(episode)
        learning_rate = get_learning_rate(episode)

In [4]:
def select_action(state, explore_rate, q_table):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    With probability ``explore_rate`` the action is sampled from the global
    ``env.action_space``; otherwise the index of the largest entry of
    ``q_table[state]`` is returned as a plain ``int``.
    """
    exploring = random.random() < explore_rate
    if not exploring:
        # Exploit: action with the highest Q-value for this state
        return int(np.argmax(q_table[state]))
    # Explore: uniformly sampled action from the environment
    return env.action_space.sample()

In [5]:
def get_explore_rate(t):
    """Return the exploration rate for episode index ``t``.

    Computes ``1 - log10((t + 1) / DECAY_FACTOR)``, caps it at 0.8 and never
    lets it drop below ``MIN_EXPLORE_RATE``.
    """
    decayed = 1.0 - math.log10((t + 1) / DECAY_FACTOR)
    capped = min(0.8, decayed)
    return max(MIN_EXPLORE_RATE, capped)

In [6]:
def get_learning_rate(t):
    """Return the learning rate for episode index ``t``.

    Computes ``1 - log10((t + 1) / DECAY_FACTOR)``, caps it at 0.8 and never
    lets it drop below ``MIN_LEARNING_RATE``.
    """
    decayed = 1.0 - math.log10((t + 1) / DECAY_FACTOR)
    capped = min(0.8, decayed)
    return max(MIN_LEARNING_RATE, capped)

In [7]:
def state_to_bucket(state):
    """Convert an observation into a tuple of bucket indices.

    Each component ``state[i]`` is mapped onto ``0 .. NUM_BUCKETS[i] - 1``
    using the bounds in ``STATE_BOUNDS[i]``. Values at or below the lower
    bound map to bucket 0, values at or above the upper bound map to the last
    bucket, and values in between are scaled linearly and rounded.
    """
    buckets = []
    for dim, value in enumerate(state):
        lower = STATE_BOUNDS[dim][0]
        if value <= lower:
            idx = 0
        elif value >= STATE_BOUNDS[dim][1]:
            idx = NUM_BUCKETS[dim] - 1
        else:
            # Linear map of [lower, upper] onto [0, NUM_BUCKETS[dim] - 1]
            span = STATE_BOUNDS[dim][1] - lower
            last = NUM_BUCKETS[dim] - 1
            shift = last * lower / span
            scale = last / span
            idx = int(round(scale * value - shift))
        buckets.append(idx)
    return tuple(buckets)

In [8]:
if __name__ == "__main__":

    # Initialize the "maze" environment
    env = gym.make("maze-random-10x10-plus-v0")

    # ------------------------------------------------------------------
    # Environment-related constants
    # ------------------------------------------------------------------
    # Number of discrete states (buckets) per state dimension: the upper
    # bound of the observation space plus one, cast to int.
    MAZE_SIZE = tuple((env.observation_space.high + np.ones(env.observation_space.shape)).astype(int))
    NUM_BUCKETS = MAZE_SIZE  # one bucket per grid cell

    # Number of discrete actions
    NUM_ACTIONS = env.action_space.n  # ["N", "S", "E", "W"]
    # (low, high) bounds for each state dimension
    STATE_BOUNDS = list(zip(env.observation_space.low, env.observation_space.high))

    # ------------------------------------------------------------------
    # Learning-related constants
    # ------------------------------------------------------------------
    MIN_EXPLORE_RATE = 0.001
    MIN_LEARNING_RATE = 0.2
    DECAY_FACTOR = np.prod(MAZE_SIZE, dtype=float) / 10.0

    # ------------------------------------------------------------------
    # Simulation-related constants
    # ------------------------------------------------------------------
    NUM_EPISODES = 50000
    MAX_T = np.prod(MAZE_SIZE, dtype=int) * 100   # step limit per episode
    STREAK_TO_END = 100                           # streak that ends training
    SOLVED_T = np.prod(MAZE_SIZE, dtype=int)      # steps allowed for a "solved" episode
    DEBUG_MODE = 0
    RENDER_MAZE = True
    ENABLE_RECORDING = True

    # ------------------------------------------------------------------
    # Begin simulation
    # ------------------------------------------------------------------
    # Directory handed to the Monitor wrapper (relative to the working
    # directory); kept in one place instead of a hard-coded literal.
    recording_folder = "recording"

    if ENABLE_RECORDING:
        env = gym.wrappers.Monitor(env, recording_folder, force=True)

    try:
        simulate()
    finally:
        # Close the monitored environment even if simulate() raises or calls
        # sys.exit(), so the recording is not left unfinished.
        if ENABLE_RECORDING:
            env.close()
        

pygame 2.0.0 (SDL 2.0.12, python 3.7.4)
Hello from the pygame community. https://www.pygame.org/contribute.html
Episode 0 finished after 566.000000 time steps with total reward = 0.434000 (streak 0).
Episode 1 finished after 5404.000000 time steps with total reward = -4.404000 (streak 0).
Episode 2 finished after 446.000000 time steps with total reward = 0.554000 (streak 0).
Episode 3 finished after 1033.000000 time steps with total reward = -0.033000 (streak 0).
Episode 4 finished after 143.000000 time steps with total reward = 0.857000 (streak 0).
Episode 5 finished after 3284.000000 time steps with total reward = -2.284000 (streak 0).
Episode 6 finished after 528.000000 time steps with total reward = 0.472000 (streak 0).
Episode 7 finished after 216.000000 time steps with total reward = 0.784000 (streak 0).
Episode 8 finished after 192.000000 time steps with total reward = 0.808000 (streak 0).
Episode 9 finished after 347.000000 time steps with total reward = 0.653000 (streak 0).
Ep

Episode 92 finished after 35.000000 time steps with total reward = 0.965000 (streak 41).
Episode 93 finished after 32.000000 time steps with total reward = 0.968000 (streak 42).
Episode 94 finished after 31.000000 time steps with total reward = 0.969000 (streak 43).
Episode 95 finished after 31.000000 time steps with total reward = 0.969000 (streak 44).
Episode 96 finished after 31.000000 time steps with total reward = 0.969000 (streak 45).
Episode 97 finished after 31.000000 time steps with total reward = 0.969000 (streak 46).
Episode 98 finished after 31.000000 time steps with total reward = 0.969000 (streak 47).
Episode 99 finished after 31.000000 time steps with total reward = 0.969000 (streak 48).
Episode 100 finished after 31.000000 time steps with total reward = 0.969000 (streak 49).
Episode 101 finished after 31.000000 time steps with total reward = 0.969000 (streak 50).
Episode 102 finished after 31.000000 time steps with total reward = 0.969000 (streak 51).
Episode 103 finish