In [1]:
using LinearAlgebra, StatsBase

In [2]:
n_actions = 2
n_states = 2
P = Array{Float64, 3}(undef, n_actions, n_states, n_states)
# Transition probability matrices for actions 1, 2
P[1, :, :] = [
    0.7 0.3
    0.4 0.6
]

P[2, :, :] = [
    0.1 0.9
    0.8 0.2
]
nothing

In [3]:
R = Array{Float64, 3}(undef, n_actions, n_states, n_states)
# Transition reward matrices for actions 1, 2
R[1, :, :] = [
    11 -4
    -14 6
]

R[2, :, :] = [
    45 80
    1 -23
]
nothing

In [4]:
# One of the 4 possible policies.
mu = (2, 1) # on state 1 perform action 2, on state 2 perform action 1
nothing

In [5]:
# Let's build the transition probability matrix for this policy
Pmu = [
    P[mu[1], 1, :]'
    P[mu[2], 2, :]'
]

2×2 Array{Float64,2}:
 0.1  0.9
 0.4  0.6

In [6]:
# Let's build the transition reward matrix for this policy
Rmu = [
    R[mu[1], 1, :]'
    R[mu[2], 2, :]'
]

2×2 Array{Float64,2}:
  45.0  80.0
 -14.0   6.0

In [7]:
# Print for all possible policies, the transition probability matrix and the
# transition reward matrix
for mu in ((1,1), (1,2), (2,1), (2,2))
    println(mu)
    println([
        P[mu[1], 1, :]'
        P[mu[2], 2, :]'
    ])
    println([
        R[mu[1], 1, :]'
        R[mu[2], 2, :]'
    ])
    println()
end

(1, 1)
[0.7 0.3; 0.4 0.6]
[11.0 -4.0; -14.0 6.0]

(1, 2)
[0.7 0.3; 0.8 0.2]
[11.0 -4.0; 1.0 -23.0]

(2, 1)
[0.1 0.9; 0.4 0.6]
[45.0 80.0; -14.0 6.0]

(2, 2)
[0.1 0.9; 0.8 0.2]
[45.0 80.0; 1.0 -23.0]



In [8]:
# expected immediate reward for state i, given action a
r_bar(i, a) = sum(P[a, i, j] * R[a, i, j] for j in 1:n_states)
# expected immediate reward for the state 1, given action 1
r_bar(1, 1) # 0.7 * 11 + 0.3 * -0.4 = 7.7 - 1.2 = 6.5

6.499999999999999

In [9]:
# Stationary distribution of chain Pmu
PImu = (Pmu^100)[1, :]

2-element Array{Float64,1}:
 0.30769230769230815
 0.6923076923076932

In [10]:
# Average reward for policy mu
sum(PImu[i] * r_bar(i, mu[i]) for i in 1:n_states)

22.153846153846185

In [11]:
    Rμ = [
        R[mu[1], 1, :]'
        R[mu[2], 2, :]'
    ]

2×2 Array{Float64,2}:
  45.0  80.0
 -14.0   6.0

In [12]:
# Exhaustive enumeration to find the best policy O(|A|^|S|)
for μ in ((1,1), (1,2), (2,1), (2,2))
    Pμ = vcat([P[μ[i], i, :]' for i in 1:n_states]...)
    Rμ = vcat([R[μ[i], i, :]' for i in 1:n_states]...)
    
    Πμ = (Pμ^100)[1, :]
    
    ρμ = sum(Πμ[i] * r_bar(i, μ[i]) for i in 1:n_states)
    
    println("μ = $(μ), ρμ = $(round(ρμ, digits=2))")
end

μ = (1, 1), ρμ = 2.86
μ = (1, 2), ρμ = 3.69
μ = (2, 1), ρμ = 22.15
μ = (2, 2), ρμ = 33.99


In [13]:
# Exhaustive enumeration example A
n_actions = 2
n_states = 2
P = Array{Float64, 3}(undef, n_actions, n_states, n_states)
P[1, :, :] = [
    0.7 0.3
    0.4 0.6
]
P[2, :, :] = [
    0.9 0.1
    0.2 0.8
]
R = Array{Float64, 3}(undef, n_actions, n_states, n_states)
R[1, :, :] = [
    6 -5
    7 12
]
R[2, :, :] = [
    10 17
    -14 13
]
for μ in ((1,1), (1,2), (2,1), (2,2))
    Pμ = vcat([P[μ[i], i, :]' for i in 1:n_states]...)
    Rμ = vcat([R[μ[i], i, :]' for i in 1:n_states]...)
    
    Πμ = (Pμ^100)[1, :]
    
    ρμ = sum(Πμ[i] * r_bar(i, μ[i]) for i in 1:n_states)
    
    println("μ = $(μ), ρμ = $(round(ρμ, digits=2))")
end

μ = (1, 1), ρμ = 5.83
μ = (1, 2), ρμ = 5.64
μ = (2, 1), ρμ = 10.56
μ = (2, 2), ρμ = 9.67


In [14]:
# Calculate rho_mu from its definition (by simulation).
# See that the initial value doesnt count very much

# Sigma reward
function simulate(P, R, i0)
    i = i0
    acc = 0
    for _ in 1:10_000
        j = sample(1:2, Weights(P[i, :]))
        acc += R[i, j]
        i = j
    end
    return acc / 10_000
end

println("μ = (1, 1), $(simulate(P[1, :, :], R[1, :, :], 1))")
println("μ = (1, 1), $(simulate(P[1, :, :], R[1, :, :], 2))")
println("μ = (2, 2), $(simulate(P[2, :, :], R[2, :, :], 1))")
println("μ = (2, 2), $(simulate(P[2, :, :], R[2, :, :], 2))")

μ = (1, 1), 5.8112
μ = (1, 1), 5.7742
μ = (2, 2), 9.6081
μ = (2, 2), 9.7085


In [45]:
# Policy Iteration Algorithm

function policy_iteration()
    k = 1
    mu = [1, 1] # an arbitrary starting policy
    
    # how did I calculate this 10.43
    h = [0., 10.43] # arbitrary starting value function
    rho = 5.83 # arbitrary starting average reward
    
    # let's do 1 iteration
    # policy evaluation
    for i in 2:n_states
        h[i] = r_bar(i, mu[i]) - rho + sum(P[mu[i], i, j] * h[j] for j in 1:n_states)
    end

    # policy improvement
    for i in 1:n_states
        mu[i] = argmax([r_bar(i, a) + sum(P[a, i, j]*h[j] for j in 1:n_states) for a in 1:n_actions])
    end
    
    # h doesnt work...
    mu, h
end

policy_iteration()


([2, 1], [0.0, 10.428])

In [46]:
# TODO(Andrea): implement value iteration