In [None]:
include("main.jl")
using PyCall
using CairoMakie
sepsis_gym = pyimport("custom_sepsis")
using Statistics


In [None]:
# Aggregated reward statistics for one configuration, as built by
# `calculate_mean_rewards` from several repeated runs.
struct MeansType
    # One reward curve per run, expanded so that each recorded value is repeated
    # over the gap back to the previously recorded key.
    individual_runs::Vector{Vector{Float64}}
    # Element-wise mean of the expanded per-run curves.
    mean_rewards::Vector{Float64}
    # `moving_avg` of the across-run mean taken at the recorded keys.
    smoothed_mean::Vector{Float64}
    # `moving_avg` of the across-run standard deviation taken at the recorded keys.
    smoothed_std::Vector{Float64}
    # Recorded keys (converted to Float64) that the smoothed curves are plotted against.
    keys_of_smoothed::Vector{Float64}
end
# Aggregate the reward curves of repeated runs into one `MeansType` per configuration.
#
# Arguments
#   results     - iterable of `(type, runs)` pairs (e.g. a Dict); every element of
#                 `runs` must expose `mean_rewards`, a mapping from a key to a reward.
#   window_size - forwarded unchanged to `moving_avg` (defined outside this section).
#
# Returns a Dict mapping each `type` to its `MeansType`.
#
# All runs of a configuration are read on the sorted key grid of the FIRST run,
# truncated to the smallest number of keys found in any run.
function calculate_mean_rewards(results, window_size)
    means = Dict()
    for (type, run) in results
        all_keys = [sort(collect(keys(model.mean_rewards))) for model in run]
        min_keys = minimum(length.(all_keys))
        ks = all_keys[1][1:min_keys]

        # Per-run curves: once expanded to one value per step, once sampled at `ks`.
        filled_rewards = Vector{Float64}[]
        all_rewards = Vector{Float64}[]
        for model in run
            # NOTE(review): the curve is seeded from key `1`, not from `ks[1]`;
            # this assumes every run recorded a value under key 1 - confirm.
            rewards = Float64[model.mean_rewards[1]]
            for i in 2:min_keys
                key = ks[i]
                # Repeat the value once per step since the previous recorded key.
                mult_factor = key - ks[i-1]
                append!(rewards, fill(model.mean_rewards[key], mult_factor))
            end
            push!(all_rewards, Float64[model.mean_rewards[k] for k in ks])
            push!(filled_rewards, rewards)
        end

        # `mean`/`std` over a vector of vectors work element-wise across runs.
        mean_rewards = mean(all_rewards)
        std_rewards = std(all_rewards)
        means[type] = MeansType(
            filled_rewards,
            mean(filled_rewards),
            moving_avg(mean_rewards, window_size),
            Float64.(moving_avg(std_rewards, window_size)),
            Float64.(ks),
        )
    end
    return means
end

In [None]:
# Raw Thompson-sampling runs, three repetitions per configuration.
# The Dirichlet runs are loaded through the Python `DirThompsonSampling.load_json`,
# the PPL runs through `load_jld`. Note that the first three file sets are numbered
# 0..2 while all others are numbered 1..3.
# NOTE(review): the `100` / `1` key suffix presumably encodes the update batch
# size ("every" = after every episode) - confirm.
ts = let load_dirichlet = sepsis_gym.DirThompsonSampling.load_json
    Dict(
        :Simple100 => map(i -> load_dirichlet("json/dirichlet/ts/Simple-$i.json"), 0:2),
        :Medium100 => map(i -> load_dirichlet("json/dirichlet/ts/Medium-$i.json"), 0:2),
        :None100 => map(i -> load_dirichlet("json/dirichlet/ts/None-$i.json"), 0:2),
        :Softmax100 => map(i -> load_jld("data/mcmc/runs/SoftmaxPPL-$i.jld"), 1:3),
        :SimplePPL100 => map(i -> load_jld("data/mcmc/runs/SimplePPL-$i.jld"), 1:3),
        :Simple1 => map(i -> load_dirichlet("json/dirichlet/ts/Simple-every-$i.json"), 1:3),
        :Medium1 => map(i -> load_dirichlet("json/dirichlet/ts/Medium-every-$i.json"), 1:3),
        :Softmax1 => map(i -> load_jld("data/mcmc/runs/SoftmaxPPL-every-$i.jld"), 1:3),
        :SimplePPL1 => map(i -> load_jld("data/mcmc/runs/SimplePPL-every-$i.jld"), 1:3),
    )
end

In [None]:
# Aggregate the raw runs in `ts` per configuration; 5 is passed as `window_size`.
means = calculate_mean_rewards(ts, 5)

In [None]:

# Precomputed reward statistics as read by `load_rewards_from_json`; each field
# is filled from the JSON entry of the same name.
struct MeanRewardsType
    # Values of the "mean_rewards" entry, converted to Float64.
    mean_rewards::Vector{Float64}
    # One Float64 vector per element of the "individual_runs" entry.
    individual_runs::Vector{Vector{Float64}}
    # Values of the "smoothed_mean" entry, converted to Float64.
    smoothed_mean::Vector{Float64}
    # Values of the "smoothed_std" entry, converted to Float64.
    smoothed_std::Vector{Float64}
    # The "name" entry.
    name::String
    # The "info" entry with keys and values both passed through `string`.
    info::Dict
end
# Read reward statistics with `JSON3.read(file_path)` and wrap them in a
# `MeanRewardsType`.
#
# The parsed document must provide the entries "mean_rewards", "individual_runs",
# "smoothed_mean", "smoothed_std", "name" and "info". Numeric entries are
# converted to Float64; "info" keys and values are converted with `string`.
function load_rewards_from_json(file_path)
    parsed = JSON3.read(file_path)
    as_floats(values) = [Float64(value) for value in values]
    return MeanRewardsType(
        as_floats(parsed["mean_rewards"]),
        [as_floats(single_run) for single_run in parsed["individual_runs"]],
        as_floats(parsed["smoothed_mean"]),
        as_floats(parsed["smoothed_std"]),
        parsed["name"],
        Dict(string(k) => string(v) for (k, v) in parsed["info"]),
    )
end

In [None]:
# Everything the metric tables are computed from: the aggregated
# Thompson-sampling results taken unchanged from `means`, plus the model-free
# baselines loaded from JSON.
# The DQN_AR ("json/dqn/asympt_results.json") and DQN_CR
# ("json/dqn/cumul_results.json") result files are currently not loaded.
rewards = let
    ts_keys = (
        :Simple100, :Medium100, :None100, :Softmax100, :SimplePPL100,
        :Simple1, :Medium1, :Softmax1, :SimplePPL1,
    )
    Dict(
        (key => means[key] for key in ts_keys)...,
        :DQN_SE => load_rewards_from_json("json/dqn/sample_eff.json"),
        :QLearning => load_rewards_from_json("json/qlearning/q_learning_results.json"),
    )
end

In [None]:
# Plot reward curves for the requested configurations and return `(fig, ax)`.
#
# Arguments
#   means      - mapping from type symbol to a result object. Thompson-sampling
#                types need `individual_runs`, `keys_of_smoothed`, `smoothed_mean`
#                and `smoothed_std`; model-free types need `mean_rewards`.
#   batch_size - only interpolated into the plot title.
#   types      - collection of type symbols to draw; unknown symbols are ignored.
#   x_lim      - right x-axis limit and length of the random-policy baseline.
#                Pass `nothing` to leave the axis limits automatic; the baseline
#                then spans the longest plotted curve.
#
# Relies on `colors_dict`, `label_dict`, `random_mean` and `moving_avg`, which are
# defined outside this section.
function plot_mean_rewards(means, batch_size, types, x_lim=15000)
    # NOTE(review): newer Makie releases appear to prefer `size` over
    # `resolution` - confirm against the pinned version before changing.
    fig = Figure(resolution=(900, 500))
    ax = Axis(fig[1, 1], xlabel = "Number of Episodes", ylabel = "Mean Reward Across 100'000 Episodes", title = "Mean Rewards for Thompson Sampling with Batch Size $batch_size")

    ts_types = [:Simple100, :Medium100, :None100, :Softmax100, :SimplePPL100, :Simple1, :Medium1, :Softmax1, :SimplePPL1]
    mfrl_types = [:DQN_SE, :DQN_AR, :DQN_CR, :QLearning]
    filtered_ts = filter(in(types), ts_types)
    filtered_mfrl = filter(in(types), mfrl_types)

    # Length of the longest curve, used for the baseline when `x_lim` is `nothing`.
    data_extent = 0

    for type in filtered_ts
        stats = means[type]
        # Faint line per individual run (the field is `individual_runs`).
        for run in stats.individual_runs
            lines!(ax, run, color=(colors_dict[type], 0.2))
            data_extent = max(data_extent, length(run))
        end
        ks = stats.keys_of_smoothed
        smoothed_mean_rewards = stats.smoothed_mean
        lines!(ax, ks, Float64.(smoothed_mean_rewards), color=colors_dict[type], linewidth=1.5, label=label_dict[type])
        # Shaded band of one smoothed standard deviation around the smoothed mean.
        smoothed_std_rewards = stats.smoothed_std
        low = smoothed_mean_rewards .- smoothed_std_rewards
        high = smoothed_mean_rewards .+ smoothed_std_rewards
        band!(ax, ks, low, high, color=(colors_dict[type], 0.2))
    end

    # Smooth the model-free curves up front so their length can contribute to
    # `data_extent`; they are still drawn after the baseline to keep the legend order.
    mfrl_curves = [type => moving_avg(means[type].mean_rewards, 100) for type in filtered_mfrl]
    for (_, curve) in mfrl_curves
        data_extent = max(data_extent, length(curve))
    end

    len = something(x_lim, data_extent)
    if len >= 1
        lines!(ax, 1:len, fill(random_mean, len), color=:black, linestyle=:dash, label="Random Policy")
    end

    if !isnothing(x_lim)
        xlims!(ax, 0, x_lim)
    end

    for (type, smoothed) in mfrl_curves
        lines!(ax, 1:length(smoothed), smoothed, color=(colors_dict[type]), label=label_dict[type])
    end

    Legend(fig[1, 2], ax, position = :right)

    ylims!(ax, -1, 0)
    ax.yticks = -1:0.05:0

    return fig, ax
end


In [None]:
# Plot the five `*100` Thompson-sampling configurations with the x-axis limited
# to 50000, write the figure to disk, and show it as the cell result.
fig, ax = plot_mean_rewards(means, 100, [:SimplePPL100, :Softmax100, :Simple100, :Medium100, :None100], 50000)
save("plots/ts100DQN1M.png", fig)
fig

In [None]:
# LaTeX row labels (inline math, sans-serif) for every configuration symbol used
# by the table generators. Built from four groups concatenated in a fixed order.
latex_labels = let
    # Labels without a suffix.
    plain = [
        :Softmax => "\$\\mathsf{SoftmaxPPL}\$",
        :None => "\$\\mathsf{FullDBN}\$",
        :Medium => "\$\\mathsf{MediumDBN}\$",
        :DQN => "\$\\mathsf{DQN}\$",
        :SmoothedDQN => "\$\\mathsf{DQN}\$",
        :Simple => "\$\\mathsf{SimpleDBN}\$",
        :SimplePPL => "\$\\mathsf{SimplePPL}\$",
    ]
    # Labels carrying the `_100` suffix.
    suffix_100 = [
        :Softmax100 => "\$\\mathsf{SoftmaxPPL\\_100}\$",
        :None100 => "\$\\mathsf{FullDBN\\_100}\$",
        :Medium100 => "\$\\mathsf{MediumDBN\\_100}\$",
        :SmoothedDQN100 => "\$\\mathsf{DQN\\_100}\$",
        :Simple100 => "\$\\mathsf{SimpleDBN\\_100}\$",
        :SimplePPL100 => "\$\\mathsf{SimplePPL\\_100}\$",
    ]
    # Labels carrying the `_1` suffix.
    suffix_1 = [
        :Softmax1 => "\$\\mathsf{SoftmaxPPL\\_1}\$",
        :Medium1 => "\$\\mathsf{MediumDBN\\_1}\$",
        :SmoothedDQN1 => "\$\\mathsf{DQN\\_1}\$",
        :Simple1 => "\$\\mathsf{SimpleDBN\\_1}\$",
        :SimplePPL1 => "\$\\mathsf{SimplePPL\\_1}\$",
    ]
    # DQN variants and Q-learning.
    model_free = [
        :DQN_SE => "\$\\mathsf{DQN\\_SE}\$",
        :DQN_AR => "\$\\mathsf{DQN\\_AR}\$",
        :DQN_CR => "\$\\mathsf{DQN\\_CR}\$",
        :QLearning => "\$\\mathsf{QLearning}\$",
    ]
    Dict(vcat(plain, suffix_100, suffix_1, model_free))
end

In [None]:
# Return a Dict mapping every key of `means` to the running (cumulative) sum of
# that entry's `mean_rewards`.
function get_cumsums(means)
    return Dict{Any,Any}(
        name => cumsum(entry.mean_rewards; dims=1) for (name, entry) in means
    )
end

In [None]:
# For every entry of `means`, find how many steps it takes until `mean_rewards`
# first reaches each reward checkpoint.
#
# Arguments
#   means       - mapping from a type key to an object with a `mean_rewards` vector.
#   checkpoints - iterable of reward thresholds. The default is the range
#                 -1:0.05:1 itself; wrapping it in a vector would yield a single
#                 range-valued checkpoint that cannot be compared with a reward.
#
# Returns a Dict `type => Dict(checkpoint => index)` where `index` is the first
# position whose reward is >= the checkpoint, or `NaN` if it is never reached.
# Found indices stay integers so that tables print them without a decimal point.
function get_sample_efficiency(means, checkpoints=-1:0.05:1)
    sample_eff = Dict()
    for (type, stats) in means
        rewards = stats.mean_rewards
        first_hits = Dict()
        for checkpoint in checkpoints
            index = findfirst(>=(checkpoint), rewards)
            first_hits[checkpoint] = isnothing(index) ? NaN : index
        end
        sample_eff[type] = first_hits
    end
    return sample_eff
end

In [None]:
# Compute, per entry of `means`, the asymptotic reward, the convergence speed at
# each checkpoint, and the curve length.
#
# Arguments
#   means       - mapping from a Symbol to an object with a `mean_rewards` vector.
#   checkpoints - fractions of the span between `min_reward` and the curve's
#                 maximum; for each one the first index reaching that level is
#                 reported.
#   min_reward  - lower end of the reward scale used for the thresholds.
#
# Returns `(asymptotic_rewards, convergence_speeds, lengths)`:
#   asymptotic_rewards[type]             - mean of the last tenth of the curve
#                                          (at least one element), NaN if empty.
#   convergence_speeds[type][checkpoint] - first index (Int) whose reward reaches
#                                          the threshold, or NaN if none does.
#   lengths[type]                        - number of elements in the curve.
function get_asymptotic_reward(means, checkpoints=[0.5,0.7,0.9,0.99]; min_reward=-1)
    asymptotic_rewards = Dict{Symbol, Float64}()
    # Values are either an index or NaN, so the value type has to admit both.
    convergence_speeds = Dict{Symbol, Dict{Float64, Union{Int, Float64}}}()
    lengths = Dict{Symbol, Int}()

    for (type, m) in means
        rewards = m.mean_rewards
        n = length(rewards)
        lengths[type] = n

        speeds = Dict{Float64, Union{Int, Float64}}()
        convergence_speeds[type] = speeds

        # Nothing can be measured on an empty curve.
        if n == 0
            asymptotic_rewards[type] = NaN
            for checkpoint in checkpoints
                speeds[checkpoint] = NaN
            end
            continue
        end

        # Average exactly the trailing `last_n` elements (never fewer than one).
        last_n = max(1, round(Int, n / 10))
        asymptotic_rewards[type] = mean(rewards[end-last_n+1:end])

        # The span is the same for every checkpoint, so compute it once.
        reward_range = maximum(rewards) - min_reward
        for checkpoint in checkpoints
            threshold_value = min_reward + checkpoint * reward_range
            index = findfirst(>=(threshold_value), rewards)
            speeds[checkpoint] = isnothing(index) ? NaN : index
        end
    end
    return asymptotic_rewards, convergence_speeds, lengths
end

In [None]:
# Row order for the generated LaTeX tables: each inner vector is one group of
# rows, and the table generators emit a horizontal rule after every group.
type_order = [[:None100, :Medium100, :Simple100,:SimplePPL100, :Softmax100,], [:Medium1, :Simple1, :SimplePPL1, :Softmax1], [:QLearning]]

In [None]:
using Printf

# Turn a numeric matrix into LaTeX cell strings, highlighting per column the best
# value in blue and the worst value in red.
#
# Arguments
#   values_matrix    - two-dimensional array; rows are compared within each column.
#                      `NaN` marks a missing value.
#   is_higher_better - when true the column maximum is "best", otherwise the minimum.
#
# Returns a `Matrix{String}` of the same size. Missing values become "-",
# floating-point values are rounded to 3 digits, other numbers are printed as-is.
# A value that is both best and worst (e.g. a single valid entry) is shown as best.
# A column without any valid value is rendered entirely as "-".
function format_best_worst_across_types(values_matrix; is_higher_better=true)
    rows, cols = size(values_matrix)
    formatted_matrix = Array{String}(undef, rows, cols)

    for col in 1:cols
        column_values = values_matrix[:, col]
        valid_values = filter(!isnan, column_values)

        # No value to rank: taking a maximum of nothing would throw.
        if isempty(valid_values)
            formatted_matrix[:, col] .= "-"
            continue
        end

        max_val = maximum(valid_values)
        min_val = minimum(valid_values)
        best, worst = is_higher_better ? (max_val, min_val) : (min_val, max_val)

        for row in 1:rows
            v = column_values[row]
            if isnan(v)
                formatted_matrix[row, col] = "-"
                continue
            end
            # Ranking uses the exact value; only the printed text is rounded.
            formatted_value = v isa AbstractFloat ? round(v, digits=3) : v
            formatted_matrix[row, col] = if v == best
                "\\color{blue}{\$$(formatted_value)\$}"
            elseif v == worst
                "\\color{red}{\$$(formatted_value)\$}"
            else
                "\$$(formatted_value)\$"
            end
        end
    end
    return formatted_matrix
end

# Generate the LaTeX table of cumulative rewards.
#
# Arguments
#   cumsums     - mapping from type symbol to its cumulative-reward vector.
#   checkpoints - episode counts; column `chk` shows the cumulative reward after
#                 `chk` episodes, i.e. element `chk` of the vector, or "-" when
#                 the vector does not reach that far.
#
# Rows follow the global `type_order`, one horizontal rule after each group, and
# are labelled via the global `latex_labels`. Types listed in `type_order` but
# absent from `cumsums` are skipped. Best/worst highlighting is computed over the
# displayed rows only, so a type that is not printed cannot take the highlight.
function generate_cumulative_rewards_table(cumsums, checkpoints)
    cols = length(checkpoints)

    groups = [filter(t -> haskey(cumsums, t), group) for group in type_order]
    types = reduce(vcat, groups; init=Symbol[])

    # One row per displayed type, one column per checkpoint.
    values_matrix = [1 <= chk <= length(cumsums[type]) ? cumsums[type][chk] : NaN for type in types, chk in checkpoints]

    formatted_matrix = format_best_worst_across_types(values_matrix, is_higher_better=true)

    header = "\\hline\n & \$" * join(checkpoints, "\$ & \$") * "\$ \\\\\n\\hline\n"

    row_index = Dict(type => i for (i, type) in enumerate(types))
    body = IOBuffer()
    for group in groups
        for type in group
            print(body, latex_labels[type], " & ", join(formatted_matrix[row_index[type], :], " & "), " \\\\\n")
        end
        print(body, "\\hline\n")
    end
    rows_text = String(take!(body))

    return "\\begin{tabular}{|l|" * "r"^cols * "|}\n\\hline\n" * header * rows_text * "\\hline\n\\end{tabular}"
end

# Generate the LaTeX table of sample efficiency.
#
# Arguments
#   sample_eff  - mapping `type => (checkpoint => first index reaching it, or NaN)`.
#   checkpoints - reward checkpoints to show as columns; each must be a key of
#                 every displayed type's inner mapping.
#   lengths     - mapping from type to curve length, shown in the last column.
#
# Rows follow the global `type_order`, one horizontal rule after each group, and
# are labelled via the global `latex_labels`. Types listed in `type_order` but
# absent from `sample_eff` are skipped. Best/worst highlighting (lower is better)
# is computed over the displayed rows only.
function generate_sample_efficiency_table(sample_eff, checkpoints, lengths)
    cols = length(checkpoints)

    groups = [filter(t -> haskey(sample_eff, t), group) for group in type_order]
    types = reduce(vcat, groups; init=Symbol[])

    # One row per displayed type, one column per checkpoint.
    values_matrix = [sample_eff[type][chk] for type in types, chk in checkpoints]

    formatted_matrix = format_best_worst_across_types(values_matrix, is_higher_better=false)

    header = "\\hline\n & \$" * join(checkpoints, "\$ & \$") * "\$ & Length \\\\\n\\hline\n"

    row_index = Dict(type => i for (i, type) in enumerate(types))
    body = IOBuffer()
    for group in groups
        for type in group
            print(body, latex_labels[type], " & ", join(formatted_matrix[row_index[type], :], " & "), " & \$", lengths[type], "\$ \\\\\n")
        end
        print(body, "\\hline\n")
    end
    rows_text = String(take!(body))

    return "\\begin{tabular}{|l|" * "r"^cols * "|r|}\n\\hline\n" * header * rows_text * "\\hline\n\\end{tabular}"
end

# Generate the LaTeX table of asymptotic rewards and convergence speeds.
#
# Arguments
#   asymptotic_rewards - mapping from type to its asymptotic reward.
#   convergence_speeds - mapping `type => (checkpoint => first index, or NaN)`.
#   checkpoints        - fractions shown as percentage columns; each must be a
#                        key of every displayed type's inner mapping.
#   lengths            - mapping from type to curve length, shown in the last column.
#
# Rows follow the global `type_order`, one horizontal rule after each group, and
# are labelled via the global `latex_labels`. Types listed in `type_order` but
# absent from `asymptotic_rewards` are skipped. Best/worst highlighting is computed
# over the displayed rows only (higher is better for the reward, lower is better
# for the convergence speed).
function generate_asymptotic_rewards_table(asymptotic_rewards, convergence_speeds, checkpoints, lengths)
    cols = length(checkpoints)

    groups = [filter(t -> haskey(asymptotic_rewards, t), group) for group in type_order]
    types = reduce(vcat, groups; init=Symbol[])

    conv_matrix = [convergence_speeds[type][chk] for type in types, chk in checkpoints]
    # Single-column matrix so it can go through the same formatter.
    asym_matrix = [asymptotic_rewards[type] for type in types, _ in 1:1]

    formatted_conv = format_best_worst_across_types(conv_matrix, is_higher_better=false)
    formatted_asym = format_best_worst_across_types(asym_matrix, is_higher_better=true)

    # Round instead of converting exactly: products such as 0.29 * 100 are not
    # whole numbers in floating point and an exact conversion would throw.
    percentages = ["\$$(round(Int, chk * 100))\\%\$" for chk in checkpoints]

    header = "\\hline\n & Asymptotic Reward & " * join(percentages, " & ") * " & Length \\\\\n\\hline\n"

    row_index = Dict(type => i for (i, type) in enumerate(types))
    body = IOBuffer()
    for group in groups
        for type in group
            i = row_index[type]
            print(body, latex_labels[type], " & {", formatted_asym[i, 1], "} & ", join(formatted_conv[i, :], " & "), " & \$", lengths[type], "\$ \\\\\n")
        end
        print(body, "\\hline\n")
    end
    rows_text = String(take!(body))

    return "\\begin{tabular}{|l|r|" * "r"^cols * "|r|}\n\\hline\n" * header * rows_text * "\\hline\n\\end{tabular}"
end




In [None]:
# Checkpoints are named once so that each metric and the table reporting it
# always use the same list.
cumulative_checkpoints = [10,100,150,400,1000,3000]
sample_eff_checkpoints = [-0.6,-0.5,-0.4,-0.3,-0.2,-0.1]
convergence_checkpoints = [0.6,0.7,0.8,0.9,0.99]

# Metrics derived from `rewards`.
cumsums = get_cumsums(rewards)
sample_eff = get_sample_efficiency(rewards, sample_eff_checkpoints)
asymptotic_rewards, convergence_speeds, lengths = get_asymptotic_reward(rewards, convergence_checkpoints)

# LaTeX source of the three result tables.
cumulative_table = generate_cumulative_rewards_table(cumsums, cumulative_checkpoints)
sample_eff_table = generate_sample_efficiency_table(sample_eff, sample_eff_checkpoints, lengths)
asymptotic_table = generate_asymptotic_rewards_table(asymptotic_rewards, convergence_speeds, convergence_checkpoints, lengths)

# The cumulative-reward table is printed in this cell.
println(cumulative_table)

In [None]:

# Print the LaTeX source of the sample-efficiency table.
println(sample_eff_table)

In [None]:

# Print the LaTeX source of the asymptotic-reward table.
println(asymptotic_table)