In [None]:
include("utilities.jl");

In [None]:
# Observation mask handed to `observationKernel` further down: a 3×3 pattern
# with ones everywhere except the centre entry.
V = [
    1 1 1
    1 0 1
    1 1 1
];

In [None]:
# Experiment configuration: the grid of discount factors γ that is swept, the
# range of maze indices that is averaged over, and the maze size.

# Maze selection: mazes n_min, ..., n_max are used (n_averages of them).
n_min = 1;
n_max = 30;
n_averages = n_max - n_min + 1;

# Maze size; `w` and `h` are both set to the side length (presumably width and
# height of a square maze -- confirm against utilities.jl).
sidelength = 5;
w = h = sidelength;

# Discount factors: `steps` equally spaced values from 1 - k*ε up to 1 - ε.
k = 1000;
steps = 100;
ε = 3*10^-5;
γRange = range(1 - k*ε, 1 - ε; length=steps);

In [None]:
# Read the mazes that should be solved.
#
# The CSV is read as a Float64 matrix and reshaped into a 3-D array whose first
# axis indexes the maze and whose remaining two axes have length 2*sidelength+1.
# The number of mazes is inferred from the file (`:` in `reshape`) instead of
# being hard-coded, so files holding more or fewer than 100 mazes also work.
test = readdlm("mazes/mazes$sidelength.csv", ',', Float64)
test = reshape(test, :, 2*sidelength+1, 2*sidelength+1)

# Fail early with a readable message if the requested maze indices
# n_min .. n_min-1+n_averages are not all present in the file.
(1 <= n_min && n_min - 1 + n_averages <= size(test, 1)) || throw(ArgumentError(
    "mazes $(n_min):$(n_min - 1 + n_averages) requested, but the file contains " *
    "$(size(test, 1)) mazes",
))

# mazes[j] is a copy of maze number n_min-1+j as its own matrix.
mazes = [test[n_min-1+j, :, :] for j in 1:n_averages];

In [None]:
# Zero-initialised result tables for the DPO runs: one row per discount factor
# in γRange, one column per maze. They are filled in by the sweep below.
timesDPO = fill(0.0, length(γRange), n_averages);
rewardsDPO = zeros(Float64, size(timesDPO));

In [None]:
# Sweep over all mazes and all discount factors.
#
# For every maze a goal state is drawn at random, the model (transition kernel,
# observation kernel, instantaneous reward, initial distribution) is built, and
# for every γ in γRange the negated `RExact` of the softmax policy is minimised
# with LBFGS starting from θ = 0. Reward, solver run time and convergence flag
# are stored per (γ, maze) and written to one CSV per maze under data/.
# The cell evaluates to the wall-clock time of the whole sweep (`@elapsed`).

# Create the output directory up front so a missing folder does not abort the
# run at the first write, after a full γ sweep has already been computed.
mkpath("data");

@elapsed for j in 1:n_averages
    M = mazes[j]
    states = listOfStates(M)
    # NOTE(review): the goal is drawn from the default RNG with no seed set in
    # this cell, so results differ between runs unless seeded elsewhere.
    goal = rand(states)
    α = transitionKernel(M, A, goal)
    β = observationKernel(M, V)
    r = instReward(M, A, goal)
    μ = initialDistribution(M)
    nO = size(β, 1)
    # NOTE(review): `A` is not defined in this file (presumably in utilities.jl);
    # nA = 4 is assumed to match its number of actions -- confirm.
    nA = 4
    # One convergence flag per discount factor; typed and preallocated instead
    # of growing an untyped array with vcat on every iteration.
    statusDPO = Vector{Bool}(undef, length(γRange))
    for (i, γ) in enumerate(γRange)
        # DPO: minimise the negated reward over the policy parameters θ.
        obj(θ) = -RExact(softmaxPolicy(θ, nA, nO), α, β, γ, μ, r)
        solutionDPO = optimize(obj, zeros(nA*nO), LBFGS(), Optim.Options(g_tol=1e-3))
        rewardsDPO[i, j] = -Optim.minimum(solutionDPO)
        timesDPO[i, j] = Optim.time_run(solutionDPO)
        statusDPO[i] = Optim.converged(solutionDPO)
    end
    # File names carry the maze's index in the maze file, not the loop counter.
    index = j + n_min - 1
    writedlm("data/DPOSizeFixedRewards$index.csv", rewardsDPO[:, j], ',')
    writedlm("data/DPOSizeFixedTimes$index.csv", timesDPO[:, j], ',')
    writedlm("data/DPOSizeFixedStatus$index.csv", statusDPO, ',')
end