In [1]:
using HDF5
using DataFrames
using Statistics
using Plots
using Measures

include("/home/mw894/diss/gnm/gnm_utils.jl")

datasets = [
    "/store/DAMTPEGLEN/mw894/data/Charlesworth2015/ctx",
    "/store/DAMTPEGLEN/mw894/data/Charlesworth2015/hpc",
    "/store/DAMTPEGLEN/mw894/data/Demas2006"
]

3-element Vector{String}:
 "/store/DAMTPEGLEN/mw894/data/Charlesworth2015/ctx"
 "/store/DAMTPEGLEN/mw894/data/Charlesworth2015/hpc"
 "/store/DAMTPEGLEN/mw894/data/Demas2006"

# Create the results dataset

In [2]:
df_all = DataFrame[]
for in_dir in datasets
    res_files = filter(name -> endswith(name, ".res"), readdir(in_dir))
    res_files = map(name -> joinpath(in_dir, name), res_files)
    res_files

    df_dataset = DataFrame[]

    # Each result stores for one sample and one model
    for (i_res_files, res_file) in enumerate(res_files)
        file = h5open(res_file, "r")
        df_file = DataFrame()

        # read meta data for this sample model combi
        meta_group = file["meta"]
        div = read_attribute(meta_group, "group_id")
        sample_name = read_attribute(meta_group, "org_file_name")
        data_set_name = read_attribute(meta_group, "data_set_name")
        model_id = read_attribute(meta_group, "model_id")
        week = min(4, ceil.(Int, parse(Int, div) / 7))

        # read the data for this sample model combi
        K = read(file, "K")

        # read the paramter space
        param_space = read(file, "param_space")
        n_rows = size(param_space, 1)
        close(file)

        # store metadata for df
        df_file.model_id = repeat([model_id], n_rows)
        df_file.sample_name = repeat([sample_name], n_rows)
        df_file.div = repeat([div], n_rows)
        df_file.week = repeat([week], n_rows)
        df_file.data_set = repeat([data_set_name], n_rows)
        df_file.eta = param_space[:, 1]
        df_file.gamma = param_space[:, 2]
        df_file.KS_K = K[:, 1]
        df_file.KS_C = K[:, 2]
        df_file.KS_B = K[:, 3]
        df_file.KS_E = K[:, 4]

        push!(df_dataset, df_file)
    end

    df_dataset = vcat(df_dataset...)
    df_dataset.KS_MAX = map(maximum, eachrow(df_dataset[!, ["KS_B", "KS_C", "KS_E", "KS_K"]]));

    push!(df_all, df_dataset)
end

df_all = vcat(df_all...);

In [3]:
cols = [:model_id, :sample_name, :eta, :gamma]
size(df_all[findall(nonunique(df_all[!,cols])), :])

(0, 12)

## Best performing model across parameter combinations and anverafed across samples

In [4]:
plots = []
dset_names = []
for dset in unique(df_all.data_set)
    push!(dset_names, dset)

    # get dataset df
    df_dataset = df_all[df_all.data_set .== dset, :]

    # Get best performing parameter combination 
    top_model_sample_combs = combine(groupby(df_dataset, [:model_id, :sample_name]), :KS_MAX => minimum => :KS_MAX_best, :week => first => :week)

    # Compute the average across all samples
    avg_top_model_sample_combs = combine(groupby(top_model_sample_combs, [:model_id, :week]), :KS_MAX_best => mean => :KS_MAX_best_mean)

    # data for plot
    p_data = reshape(avg_top_model_sample_combs.KS_MAX_best_mean, 
     length(unique(avg_top_model_sample_combs.week)), 
     length(unique(avg_top_model_sample_combs.model_id)))'


    p = heatmap(p_data, yticks=(1:13, values(MODELS)), interpolate = false, c=:viridis, xticks=(1:4), xrotation = 45, fill_z=p_data, fmt=:pdf)

    # annotate
    nrow, ncol = size(p_data)
    fontsize = 10
    ann = [(j,i, text(round(p_data[i,j], digits=3), fontsize, :white, :center)) for i in 1:nrow for j in 1:ncol]
    annotate!(ann, linecolor=:white)

    push!(plots, p)
end


p = plot(plots...; format = grid(4, 4), fmt=:pdf, size = (1500, 1500), margin=5mm, title=["1" "2" "3" "4"]) 
savefig(p, "top_scores.png")

"/mhome/damtp/r/mw894/diss/gnm/analysis/top_scores.png"

In [5]:
top_model_sample_combs = combine(groupby(df_all, [:data_set, :model_id, :sample_name]), :KS_MAX => minimum => :KS_MAX_best, :week => first => :week)
avg_top_model_sample_combs = combine(groupby(top_model_sample_combs, [:data_set, :model_id, :week]), :KS_MAX_best => mean => :KS_MAX_best_mean)
avg_top_model_sample_combs;

## Heatmaps

In [6]:
for dset in unique(df_all.data_set)
    for week in unique(df_all.week)
        # prep plots and names
        plots = []
        dset_names = []

        for (model_id, model_name) in MODELS
            # get model df
            df_subset = filter(r-> (r.model_id == model_id) && (r.week == week) && (r.data_set == dset), df_all)

            # average across samples
            plot_data = combine(groupby(df_subset, [:eta, :gamma]), :KS_MAX => mean)

            # data for plot
            landscape = reshape(plot_data.KS_MAX_mean, (length(unique(plot_data.eta)), length(unique(plot_data.gamma))))
            p = heatmap(landscape, clim=(0, 1), c=:viridis, legend=:none, title="Model " * string(model_name), xticks=:none, yticks=:none)
            yflip!(true)
            push!(plots, p)
        end
        
        # combine and add bar
        push!(plots, plot())
        push!(plots, plot())
        push!(plots, plot())
        l = @layout[grid(4, 4) a{0.05w}]
        bar = heatmap((0:0.01:1) .* ones(101, 1), legend=:none, xticks=:none, yticks=(1:10:101, string.(0:0.1:1)), c=:viridis)
        p = plot(plots..., bar, layout=l, size=(2000, 1600))

        savefig(p, replace(dset, "/" => "_") * "_" * string(week) * "_heatmaps.png")
    end
end

ArgumentError: ArgumentError: range must be non-empty