# Exercise 5.12: Racetrack

This notebook contains an implementation of an off-policy Monte Carlo control algorithm to solve the racetrack problem. The racetrack is represented using a two-dimensional array. In retrospect, it would have been easier had the array been oriented with an origin on the lower left corner insted of the upper left corner.

The fully-observable state is represented using a 4-tuple consisting of car location x, car location y, velocity horizontal and velocity vertical.

A remaining question is how to properly handle the boundary/finishing line intersection (i.e. how to calculate the path). This implementation simply constructs a matrix of all possible paths and if one path hits the boundary, the car is respawned at the starting line.

The algorithm takes pretty long to train, thus it had to be ended prematurely.

In [13]:
using Base.Iterators
using StatsBase

In [14]:
gamma = 1.
epsilon = 0.1
actions = collect(product([-1, 0, 1], [-1, 0, 1]))
episodes = 100000

100000

In [15]:
mutable struct Racetrack
    positions::Array{Int32,2}
    velocity_horizontal::Int32
    velocity_vertical::Int32
    car_location::CartesianIndex{2}
    
    Racetrack(positions::Array{Int32,2}) = new(positions, 0, 0, CartesianIndex(0,0))
end

"""
Car spawns only at starting line positions
"""
car_spawn_location(r::Racetrack) = rand(findall(r.positions .== 1))

function reset_state(r::Racetrack)
    r.car_location = car_spawn_location(r)
    r.velocity_horizontal = 0
    r.velocity_vertical = 0
end

"""
State representation which can be used by arrays
"""
current_state(r::Racetrack) = CartesianIndex(r.car_location[1], r.car_location[2], r.velocity_horizontal + 1, r.velocity_vertical + 1)

function act(r::Racetrack, action_horizontal, action_vertical; noise::Bool=true)
    reward = -1
    episode_end = false
    zero_increments = rand() <= .1
    if zero_increments && noise
        action_horizontal = 0
        action_vertical = 0
    end
    new_velocity_horizontal = min(max(r.velocity_horizontal + action_horizontal, 0), 5)
    new_velocity_vertical = min(max(r.velocity_vertical + action_vertical, 0), 5)
    if !(new_velocity_horizontal == new_velocity_vertical == 0)
        r.velocity_horizontal = new_velocity_horizontal
        r.velocity_vertical = new_velocity_vertical
    end
    new_location_x = r.car_location[2] + r.velocity_horizontal
    new_location_y = r.car_location[1] - r.velocity_vertical
    safe_index_x = min(max(new_location_x, 1), size(r.positions)[2])
    safe_index_y = max(new_location_y, 1)
    episode_end = 3 in r.positions[r.car_location[1]:-1:safe_index_y, r.car_location[2]:safe_index_x]
    hit_boundary = r.positions[safe_index_y, safe_index_x] == 2
    r.car_location = CartesianIndex(new_location_y, new_location_x)
    if episode_end || hit_boundary
        reset_state(r)
    end
    if episode_end
        reward = 0
    end
    reward, episode_end
end

act (generic function with 1 method)

In [16]:
"""
Generates a racetrack using a 2-dimensional array. Uses the left-hand racetrack of Figure 5.5 as inspiration. Values of the array indicate:

0: valid position, 1: starting line, 2: boundary, 3: finishing line
"""
function generate_positions()
    positions = zeros(Int32, 33, 18)
    positions[33, 5:11] .= 1
    positions[9:33, 12:end] .= 2
    positions[8, 13:end] .= 2
    positions[31:33, 1:4] .= 2
    positions[24:30, 1:3] .= 2
    positions[16:23, 1:2] .= 2
    positions[6:15, 1] .= 2
    positions[5, 1:2] .= 2
    positions[3:4, 1:3] .= 2
    positions[2, 1:4] .= 2
    positions[1, 1:18] .= 2
    positions[2:7, 18] .= 3
    positions
end

"""
Runs an episode of the racetrack problem using an epsiolen-greedy policy and optional noise.
"""
function run_episode(r::Racetrack, policy, actions; epsilon=0.1, noise=true)
    episode_end = false
    soft_policy_prob = []
    rewards = []
    states = []
    episode_actions = []
    while !episode_end
        state = current_state(r)
        push!(states, state)
        explore = rand() < epsilon
        if explore
            action_index = rand(1:length(actions))
            push!(soft_policy_prob, epsilon / length(actions))
        else
            action_index = policy[state]
            push!(soft_policy_prob, 1 - epsilon + epsilon / length(actions))
        end
        action = actions[action_index]
        push!(episode_actions, action_index)
        reward, episode_end = act(r, action[1], action[2], noise=noise)
        push!(rewards, reward)
    end
    
    states, episode_actions, soft_policy_prob, rewards
end

run_episode

In [17]:
"""
Off-policy Monte Carlo algorithm to solve the racetrack problem
"""
function policy_improvement(r::Racetrack, gamma::Float64, actions, episodes::Int64; epsilon=0.1)
    num_actions = length(actions)
    car_locations = findall(positions .<= 1)
    velocity_horizontal_states = 0:5
    velocity_vertical_states = 0:5
    state_dims = (size(r.positions)[1], size(r.positions)[2], length(velocity_horizontal_states), length(velocity_vertical_states))
    state_action_dims = (size(r.positions)[1], size(r.positions)[2], length(velocity_horizontal_states), length(velocity_vertical_states), num_actions)
    state_iterator = product(car_locations, velocity_horizontal_states, velocity_vertical_states)
    values = zeros(state_action_dims)
    cum_weights = zeros(state_action_dims)
    policy = zeros(Int, state_dims)
    for (car_location, velocity_horizontal_state, velocity_vertical_state) in state_iterator
        state = CartesianIndex(car_location[1], car_location[2], velocity_horizontal_state + 1, velocity_vertical_state + 1)
        values[state, :] = rand(num_actions)
        policy[state] = argmax(values[state, :])
    end
    
    for episode in 1:episodes
        reset_state(r)
        states, episode_actions, soft_policy_prob, rewards = run_episode(r, policy, actions, epsilon=epsilon)
        
        G = 0
        W = 1
        for step in length(states):-1:1
            G = gamma * G + rewards[step]
            state = states[step]
            action = episode_actions[step]
            cum_weights[state, action] += W
            values[state, action] += (W / cum_weights[state, action]) * (G - values[state, action])
            policy[state] = argmax(values[state, :])
            if action != policy[state]
                break
            end
            W *= 1 / soft_policy_prob[step]
        end
    end
    
    policy
end


"""
Off-policy Monte Carlo algorithm with truncated weighted importance sampling to solve the racetrack problem
"""
function policy_improvement_importance(r::Racetrack, gamma::Float64, actions, episodes::Int64; epsilon=0.1)
    num_actions = length(actions)
    car_locations = findall(positions .<= 1)
    velocity_horizontal_states = 0:5
    velocity_vertical_states = 0:5
    state_dims = (size(r.positions)[1], size(r.positions)[2], length(velocity_horizontal_states), length(velocity_vertical_states))
    state_action_dims = (size(r.positions)[1], size(r.positions)[2], length(velocity_horizontal_states), length(velocity_vertical_states), num_actions)
    state_iterator = product(car_locations, velocity_horizontal_states, velocity_vertical_states)
    values = zeros(state_action_dims)
    R = zeros(state_action_dims)
    P = zeros(state_action_dims)
    cum_weights = zeros(state_action_dims)
    policy = zeros(Int, state_dims)
    for (car_location, velocity_horizontal_state, velocity_vertical_state) in state_iterator
        state = CartesianIndex(car_location[1], car_location[2], velocity_horizontal_state + 1, velocity_vertical_state + 1)
        values[state, :] = rand(num_actions)
        policy[state] = argmax(values[state, :])
    end
    
    for episode in 1:episodes
        reset_state(r)
        states, episode_actions, soft_policy_prob, rewards = run_episode(r, policy, actions, epsilon=epsilon)
        
        T = length(states)
        G = zeros(T)
        rho = 1
        W = zeros(T)
        W[T] = 1
        for step in T:-1:1
            G[step:T] .+= rewards[step]
            state = states[step]
            action = episode_actions[step]
            if step != T
                R[state, action] += (1 - gamma) * (sum(W[h] * G[h] for h=step:(T-1))) + gamma ^ (T - step - 1) * rho * G[T]
                P[state, action] += (1 - gamma) * sum(W[step:(T-1)]) + gamma ^ (T - step - 1) * rho
            else
                R[state, action] += G[T]
                P[state, action] += 0
            end
            values[state, action] = R[state, action] / P[state, action]
            policy[state] = argmax(values[state, :])
            if action != policy[state]
                break
            end
            W[step - 1] = W[step] * gamma^(step-1) / soft_policy_prob[step]
            rho *= 1 / soft_policy_prob[step]
        end
    end
    
    policy
end


policy_improvement_importance

In [18]:
positions = generate_positions()

33×18 Array{Int32,2}:
 2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
 2  2  2  2  0  0  0  0  0  0  0  0  0  0  0  0  0  3
 2  2  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  3
 2  2  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  3
 2  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  3
 2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  3
 2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  3
 2  0  0  0  0  0  0  0  0  0  0  0  2  2  2  2  2  2
 2  0  0  0  0  0  0  0  0  0  0  2  2  2  2  2  2  2
 2  0  0  0  0  0  0  0  0  0  0  2  2  2  2  2  2  2
 2  0  0  0  0  0  0  0  0  0  0  2  2  2  2  2  2  2
 2  0  0  0  0  0  0  0  0  0  0  2  2  2  2  2  2  2
 2  0  0  0  0  0  0  0  0  0  0  2  2  2  2  2  2  2
 ⋮              ⋮              ⋮              ⋮     
 2  2  0  0  0  0  0  0  0  0  0  2  2  2  2  2  2  2
 2  2  0  0  0  0  0  0  0  0  0  2  2  2  2  2  2  2
 2  2  2  0  0  0  0  0  0  0  0  2  2  2  2  2  2  2
 2  2  2  0  0  0  0  0  0  0  0  2  2  2  2  2  2  2
 2  2  

In [19]:
racetrack = Racetrack(positions)
reset_state(racetrack)
racetrack.car_location

CartesianIndex(33, 11)

In [20]:
policy = policy_improvement(racetrack, gamma, actions, episodes, epsilon=epsilon)

33×18×6×6 Array{Int64,4}:
[:, :, 1, 1] =
 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
 0  0  0  0  3  5  9  2  3  9  7  6  5  8  6  1  3  0
 0  0  0  9  5  2  4  6  3  1  6  2  1  4  8  4  6  0
 0  0  0  8  6  9  5  4  2  3  9  1  8  4  7  7  7  0
 0  0  1  8  1  3  2  7  4  1  3  8  2  6  9  9  7  0
 0  4  7  5  7  9  1  8  7  7  9  9  4  9  1  1  1  0
 0  3  3  3  9  4  5  9  5  8  1  5  4  1  5  7  8  0
 0  2  5  3  6  9  1  1  7  8  7  1  0  0  0  0  0  0
 0  2  9  3  1  6  9  5  3  7  9  0  0  0  0  0  0  0
 0  4  8  7  1  3  4  7  7  8  8  0  0  0  0  0  0  0
 0  8  6  3  5  2  2  9  9  7  4  0  0  0  0  0  0  0
 0  8  7  1  8  2  9  3  2  6  2  0  0  0  0  0  0  0
 0  2  8  9  9  4  1  4  2  4  4  0  0  0  0  0  0  0
 ⋮              ⋮              ⋮              ⋮     
 0  0  7  7  7  5  1  3  2  5  4  0  0  0  0  0  0  0
 0  0  9  2  8  1  9  1  6  6  2  0  0  0  0  0  0  0
 0  0  0  9  1  4  7  4  5  8  9  0  0  0  0  0  0  0
 0  0  0  9  4  8  8  7  9  7  6  0  0  0 

In [21]:
function draw_trajectory(racetrack, policy, actions, positions)
    states, episode_actions, soft_policy_prob, rewards = run_episode(racetrack, policy, actions, epsilon=1., noise=false)
    steps = length(states)
    positions_vis = copy(positions)
    for step in 1:steps
        y, x = states[step][1], states[step][2]
        positions_vis[y, x] = step
    end
    positions_vis
end

draw_trajectory (generic function with 1 method)

In [26]:
draw_trajectory(racetrack, policy, actions, positions)

33×18 Array{Int32,2}:
 2  2  2  2    2    2    2    2    2    2    2    2    2    2  2    2    2  2
 2  2  2  2    0    0    0    0    0    0    0    0    0    0  0    0    0  3
 2  2  2  0    0    0    0    0    0    0    0    0    0    0  0    0    0  3
 2  2  2  0    0    0    0    0    0    0    0    0    0    0  0    0    0  3
 2  2  0  0    0    0    0    0    0    0  219    0    0    0  0    0  268  3
 2  0  0  0    0    0    0    0    0    0    0    0  265  266  0  267    0  3
 2  0  0  0    0    0    0    0    0    0  263  264    0    0  0    0    0  3
 2  0  0  0    0    0    0    0    0    0  262    0    2    2  2    2    2  2
 2  0  0  0    0    0    0    0    0    0  261    2    2    2  2    2    2  2
 2  0  0  0    0    0    0    0    0    0    0    2    2    2  2    2    2  2
 2  0  0  0    0    0    0    0    0    0  260    2    2    2  2    2    2  2
 2  0  0  0    0    0    0    0    0  217    0    2    2    2  2    2    2  2
 2  0  0  0    0    0    0    0    0   58 

In [25]:
draw_trajectory(racetrack, policy, actions, positions)

33×18 Array{Int32,2}:
 2  2  2  2    2    2    2    2    2    2    2    2    2    2  2    2  2  2
 2  2  2  2    0    0    0    0    0    0    0    0  409    0  0  539  0  3
 2  2  2  0    0    0    0  172    0  340    0    0    0    0  0    0  0  3
 2  2  2  0    0    0    0    0   84    0    0    0  538    0  0    0  0  3
 2  2  0  0    0    0    0    0    0    0    0    0    0  224  0    0  0  3
 2  0  0  0    0    0    0  171    0  339   66    0    0    0  0    0  0  3
 2  0  0  0    0    0    0   83    0  537  408    0    0    0  0    0  0  3
 2  0  0  0    0    0    0    0    0    0    0  223    2    2  2    2  2  2
 2  0  0  0    0    0    0   82  338    0    0    2    2    2  2    2  2  2
 2  0  0  0    0    0    0  170    0    0   65    2    2    2  2    2  2  2
 2  0  0  0    0    0    0  536  337    0  222    2    2    2  2    2  2  2
 2  0  0  0    0    0    0   81  407    0  318    2    2    2  2    2  2  2
 2  0  0  0    0    0    0    0  336  507   64    2    2    2  2  

In [24]:
draw_trajectory(racetrack, policy, actions, positions)

33×18 Array{Int32,2}:
 2  2  2  2  2  2  2   2   2   2   2  2   2  2  2   2  2  2
 2  2  2  2  0  0  0   0   0   0   0  0   0  0  0   0  0  3
 2  2  2  0  0  0  0   0   0   0   0  0   0  0  0   0  0  3
 2  2  2  0  0  0  0   0   0   0   0  0   0  0  0  30  0  3
 2  2  0  0  0  0  0   0   0  27  28  0  29  0  0   0  0  3
 2  0  0  0  0  0  0   0   0  26   0  0   0  0  0   0  0  3
 2  0  0  0  0  0  0   0   0   0   0  0   0  0  0   0  0  3
 2  0  0  0  0  0  0   0  25   0   0  0   2  2  2   2  2  2
 2  0  0  0  0  0  0   0   0   0   0  2   2  2  2   2  2  2
 2  0  0  0  0  0  0   0   0   0   0  2   2  2  2   2  2  2
 2  0  0  0  0  0  0   0  24   0   0  2   2  2  2   2  2  2
 2  0  0  0  0  0  0   0   0   0   0  2   2  2  2   2  2  2
 2  0  0  0  0  0  0   0  23   0   0  2   2  2  2   2  2  2
 ⋮              ⋮                  ⋮                ⋮     
 2  2  0  0  0  0  0   0  20   0   0  2   2  2  2   2  2  2
 2  2  0  0  0  0  0   0   0   0   0  2   2  2  2   2  2  2
 2  2  2  0  0  0  