In [14]:
using Pkg; Pkg.activate("..")
# Pkg.instantiate()
using Distributions
import StatsBase.weights
using Random
using RCall
using ProgressBars
using GraphFusedLasso
using FileIO
using DataFrames, CSV
using Printf

R"library('tidyverse')"

# Random.seed!(418916);

In [19]:
"""
    generate_trace(task, N)

Return a length-`N` vector of means built from three consecutive segments whose
boundaries are `N ÷ 3` and `2(N ÷ 3)`.

Four anchor values are drawn as `2(u - 0.5)` with `u` sampled from `Uniform()`.
Depending on `task`:

- `"smooth"`: every segment ramps linearly from its anchor towards the next one.
- `"constant"`: every segment is flat at its own anchor (the fourth is unused).
- `"mixed"`: each segment is, independently and with probability 1/2, either
  ramped as in `"smooth"` or flat as in `"constant"`.

# Throws
- `ArgumentError` if `task` is not one of the three names above.
"""
function generate_trace(task, N)
    # Validate first so that a bad task name fails before any random draw.
    if !(task in ("smooth", "constant", "mixed"))
        throw(ArgumentError(
            "unknown task $(repr(task)); expected \"smooth\", \"constant\" or \"mixed\""))
    end

    cuts = [N ÷ 3, 2(N ÷ 3)]
#     cuts = sort(sample(2:N-1, 2, replace=false))
    x1 = 1:cuts[1]
    x2 = (cuts[1] + 1):cuts[2]
    x3 = (cuts[2] + 1):N

    # Anchor values of the piecewise trace.
    values1 = 2.0(rand(Uniform(), 4) .- 0.5)

    if task == "smooth"
        y1 = values1[1] .+ (values1[2] - values1[1]) .* (x1 ./ cuts[1])
        y2 = values1[2] .+ (values1[3] - values1[2]) .* (x2 .- cuts[1]) ./ (cuts[2] - cuts[1])
        y3 = values1[3] .+ (values1[4] - values1[3]) .* (x3 .- cuts[2]) ./ (N - cuts[2])
    elseif task == "constant"
        y1 = fill(values1[1], cuts[1])
        y2 = fill(values1[2], cuts[2] - cuts[1])
        y3 = fill(values1[3], N - cuts[2])
    else  # task == "mixed"
        # Each segment keeps its slope only when its own coin flip succeeds.
        y1 = values1[1] .+ Int(rand() < 0.5) .* (values1[2] - values1[1]) .* (x1 ./ cuts[1])
        y2 = values1[2] .+ Int(rand() < 0.5) .* (values1[3] - values1[2]) .* (x2 .- cuts[1]) ./ (cuts[2] - cuts[1])
        y3 = values1[3] .+ Int(rand() < 0.5) .* (values1[4] - values1[3]) .* (x3 .- cuts[2]) ./ (N - cuts[2])
    end
    μ = [y1; y2; y3]
    return μ
end


"""
    generate_spt_task(task_space, task_time, N, pmiss; σ=0.3, outliers=false)

Simulate one benchmark data set on an `N × N` grid.

Two traces generated with `task_space` and one trace generated with `task_time`
are combined so that grid cell `[a, b]` has the component means
`(ts[b] + μ1s[a], ts[b] + μ2s[a])`. Each cell gets a two-component normal
mixture with standard deviation `σ`, and either 0 observations (weight `pmiss`)
or 10 observations (weight `1 - pmiss`) drawn from it.

When `outliers` is true, half of the observed cells (rounded down) are picked
without replacement and one observation in each is shifted by `+5` or `-5`.

# Returns
A `Dict` with the keys `"evalpts"`, `"dmodels"`, `"devals"`, `"y"`, `"ndata"`,
`"mean1"`, `"mean2"`, `"t"`, `"means"`, `"ptr"`, `"brks"` and `"istemporal"`.
`"ptr"` concatenates the linear cell indices of `2N` chains of length `N`,
`"brks"` holds the `2N + 1` chain start offsets into `"ptr"`, and
`"istemporal"` flags the first `N^2` entries `true` and the rest `false`.
"""
function generate_spt_task(task_space, task_time, N, pmiss; σ=0.3, outliers=false)
    μ1s = generate_trace(task_space, N)
    μ2s = generate_trace(task_space, N)
    ts = generate_trace(task_time, N)

    evalpts = collect(range(-2.5, 2.5, length=100))

    # First axis indexes the spatial traces, second axis indexes `ts`.
    μs = [(t + μ1, t + μ2) for (μ1, μ2) in zip(μ1s, μ2s), t in ts]
    dmodels = [MixtureModel([Normal(μ1, σ), Normal(μ2, σ)]) for (μ1, μ2) in μs]

    devals = [pdf.(d, evalpts) for d in dmodels]
    ndata = [sample([0, 10], weights([pmiss, 1.0 - pmiss])) for d in dmodels]
    y = [rand(d, n) for (d, n) in zip(dmodels, ndata)]

    if outliers
        observed = [i for (i, n) in enumerate(ndata) if n > 0]
        # Integer division: `sample` needs an integer sample size, which
        # `floor(Nobs * 0.5)` (a Float64) did not provide.
        K = length(observed) ÷ 2
        idx = sample(observed, K, replace=false)
        for i in idx
            j = rand(1:length(y[i]))
            y[i][j] += rand([-1, 1]) * 5.0
        end
    end

    # Build the chain description (ptr / brks) over the linear cell indices.
    xrange = collect(1:N)
    # First N chains: consecutive linear indices, i.e. one grid column each.
    # NOTE(review): with the layout of `μs` above these chains vary the spatial
    # index at a fixed entry of `ts`, yet they are flagged `istemporal = true`.
    # Confirm that this labelling is intended.
    ptr = Int[]
    brks = Int[1]
    for i in 1:N
        append!(ptr, xrange .+ (i - 1) * N)
        push!(brks, brks[end] + N)
    end
    istemporal = fill(true, N^2)
    # Last N chains: stride-N linear indices, i.e. one grid row each.
    xrange = [(i - 1) * N + 1 for i in 1:N]
    for i in 1:N
        append!(ptr, xrange .+ (i - 1))
        push!(brks, brks[end] + N)
    end
    append!(istemporal, fill(false, N^2))

    return Dict("evalpts" => evalpts,
                "dmodels" => dmodels,
                "devals" => devals,
                "y" => y,
                "ndata" => ndata,
                "mean1" => μ1s,
                "mean2" => μ2s,
                "t" => ts,
                "means" => μs,
                "ptr" => ptr,
                "brks" => brks,
                "istemporal" => istemporal)
end

# function for cross-validation fit

"""
    generate_cvsets(y, nsplits)

Split the indices of the non-empty entries of `y` into `nsplits` disjoint
cross-validation folds.

The observed indices are shuffled and cut into consecutive chunks of
`Nobs ÷ nsplits` indices; any remainder is left out of every fold. Returns a
vector of `nsplits` `Set{Int}`.
"""
function generate_cvsets(y, nsplits)
    # Indices that actually carry observations, in random order.
    candidates = [idx for (idx, obs) in enumerate(y) if !isempty(obs)]
    order = shuffle(candidates)

    foldsize = length(order) ÷ nsplits
    return [Set{Int}(order[(1 + (k - 1) * foldsize):(k * foldsize)])
            for k in 1:nsplits]
end
               


"""
    fit2(ytrain, ptr, brks, λ1, λ2, η1, η2, istemporal)

Fit a `DensityTree` on 33 equally spaced split points over `[-2.5, 2.5]`.

For every internal bin of the tree a `BinomialEnet` model is fitted to the
counts falling in the left half of the bin versus the whole bin. Entries
flagged in `istemporal` are penalised with `(η1, η2)`, all others with
`(λ1, λ2)`; both penalties are scaled by `1 + level * sqrt(2)` where `level`
is `floor(log2(j))` for the `j`-th bin. The fitted coefficients are stored in
`tree.beta` and the tree is returned.
"""
function fit2(ytrain, ptr, brks, λ1, λ2, η1, η2, istemporal)
    nobs = length(ytrain)
    l1_penalty = Float64[flag ? η1 : λ1 for flag in istemporal]
    l2_penalty = Float64[flag ? η2 : λ2 for flag in istemporal]

    # Build the tree on a fixed grid of split points.
    nsplit = 33
    grid = collect(range(-2.5, 2.5, length=nsplit))
    tree = DensityTree(grid)

    # Count, per observation vector, the points strictly inside every bin:
    # the tree's own bins plus all bins between adjacent split points.
    adjacent = [(i, i + 1) for i in 1:nsplit-1]
    counts = Dict()
    for (lo, hi) in [tree.bins; adjacent]
        a, b = grid[lo], grid[hi]
        counts[(lo, hi)] = [count(v -> a < v < b, obs) for obs in ytrain]
    end

    # One binomial model per tree bin: left half versus whole bin.
    coefs = zeros(nobs, nsplit - 2)
    for node in 1:(nsplit - 2)
        lo, hi = tree.bins[node]
        depth = floor(Int, log2(node))
        mid = (hi + lo) ÷ 2
        scale = 1 + depth * sqrt(2)

        # Small offsets keep the counts strictly positive.
        whole = counts[(lo, hi)] .+ 0.1
        left = counts[(lo, mid)] .+ 0.05

        model = BinomialEnet(ptr, brks, l1_penalty * scale, l2_penalty * scale;
                             abstol=0.0, reltol=1e-4)
        fit!(model, left, whole; steps=1000)

        coefs[:, node] = model.beta
    end
    tree.beta = coefs
    return tree
end
         
                
"""
    cv_fit2(y, evalpts, ptr, brks, istemporal, lambdas, cvsets, models)

Select hyperparameters by cross-validation and score the selected setting.

For every `(λ1, λ2, η1, η2)` in `lambdas` and every fold in `cvsets`, the
held-out entries of `y` are blanked, a tree is fitted with `fit2`, and the mean
held-out log density is accumulated. The setting with the highest average score
is then refitted on all of `y`, and its log density is evaluated on 100 fresh
draws from each entry of `models`.

`evalpts` is accepted for interface compatibility and is not used.

# Returns
A `Dict` with the keys `"best_lambdas"`, `"cv_loglikelihood"` and
`"val_loglikelihood"`.
"""
function cv_fit2(y, evalpts, ptr, brks, istemporal, lambdas, cvsets, models)
    nsplits = length(cvsets)
    test_loglikelihood = zeros(length(lambdas))

    for (k, (λ1, λ2, η1, η2)) in enumerate(lambdas)
        for heldout in cvsets
            # Training copy of the data with the held-out entries emptied.
            ytrain = [j in heldout ? Float64[] : yi
                      for (j, yi) in enumerate(y)]

            fold_tree = fit2(ytrain, ptr, brks, λ1, λ2, η1, η2, istemporal)

            # Out-of-sample log likelihood, averaged per held-out observation.
            # `predict(tree, values, j)` is taken to return the fitted density
            # of entry `j` at `values` (confirm against GraphFusedLasso).
            Ntest = 0.0
            loglikelihood = 0.0
            for j in heldout
                test_eval = y[j]
                Ntest += length(test_eval)
                ll = log.(predict(fold_tree, sort(test_eval), j) .+ 1e-12)
                loglikelihood += sum(ll)
            end
            loglikelihood /= Ntest

            test_loglikelihood[k] += loglikelihood / nsplits
        end
    end

    # Choose the best hyperparameters.
    best_loglikelihood, best_index = findmax(test_loglikelihood)
    best_lambdas = lambdas[best_index]

    # Trees fitted inside the loops are local to them, so refit once on the
    # full data with the selected hyperparameters before validating.
    best_tree = fit2(y, ptr, brks, best_lambdas..., istemporal)

    # Validation likelihood on fresh samples from the true models. The 1e-12
    # guard goes inside the log, as in the cross-validation score above.
    nsims = 100
    samples = [rand(model, nsims) for model in models]
    lls = [mean(log.(predict(best_tree, sort(x), j) .+ 1e-12))
           for (j, x) in enumerate(samples)]
    validation_loglikelihood = mean(lls)

    return Dict("best_lambdas" => best_lambdas,
                "cv_loglikelihood" => best_loglikelihood,
                "val_loglikelihood" => validation_loglikelihood)
end
            
"""
    get_hypers(method)

Return a random selection of `(λ1, λ2, η1, η2)` hyperparameter tuples.

A grid of penalty pairs is built for `method` (`"fl"`, `"kal"` or `"enet"`),
all ordered combinations of two pairs are formed, and 400 (`"enet"`) or 100
(otherwise) of them are drawn with `rand`. Any other `method` raises a
`KeyError`.
"""
function get_hypers(method)
    if method == "fl"
        grid = [(l, 1e-12) for l in range(1e-12, 3.0, length=20)]
    elseif method == "kal"
        grid = [(1e-12, 10.0^x) for x in range(-3.0, 1.5, length=20)]
    elseif method == "enet"
        grid = [(l1, 10.0^x) for l1 in range(1e-12, 3.0, length=10)
                             for x in range(-3.0, 1.5, length=10)]
    else
        throw(KeyError(method))
    end

    # The first comprehension variable varies fastest, so after `vec` the
    # second pair cycles inside the first pair.
    combos = vec([(a[1], a[2], b[1], b[2]) for b in grid, a in grid])

    ndraws = method == "enet" ? 400 : 100
    return rand(combos, ndraws)
end
           
"""
    run_benchmarks(N, pmiss; nsims=100, nsplits=5, tasks=("constant", "smooth", "mixed"))

Run the cross-validated density benchmark over every pair of spatial and
temporal tasks in `tasks` and every method in `("fl", "kal", "enet")`.

For each task pair, `nsims` data sets are generated with
`generate_spt_task(task_space, task_time, N, pmiss)`; outliers are enabled only
when both tasks are `"mixed"`. Each data set is scored with `cv_fit2` using
`nsplits` folds and the hyperparameters returned by `get_hypers(method)`.

# Returns
A vector with one `Dict{Symbol,Any}` per (task pair, method, data set) holding
the keys `:experiment`, `:task_space`, `:task_time`, `:method`,
`:cv_loglikelihood` and `:val_loglikelihood`.
"""
function run_benchmarks(N, pmiss;
                        nsims=100,
                        nsplits=5,
                        tasks=("constant", "smooth", "mixed"))
    experiment_results = Dict{Symbol,Any}[]
    for task_space in tasks
        for task_time in tasks
            outliers = task_space == "mixed" && task_time == "mixed"
            # The same simulated data sets are shared by all methods.
            data = [
                generate_spt_task(task_space, task_time, N, pmiss, outliers=outliers)
                for _ in 1:nsims
            ]
            for method in ("fl", "kal", "enet")
                println("Running task_space $task_space task_time $task_time for method $method")

                lambdas = get_hypers(method)
                for (l, D) in ProgressBar(enumerate(data))
                    # Flatten the grid-shaped fields to match the linear
                    # indices stored in `ptr`.
                    y = vec(D["y"])
                    models = vec(D["dmodels"])
                    ptr = D["ptr"]
                    brks = D["brks"]
                    evalpts = D["evalpts"]
                    istemporal = D["istemporal"]

                    cvsets = generate_cvsets(y, nsplits)

                    results = cv_fit2(y, evalpts, ptr, brks, istemporal, lambdas, cvsets, models)

                    new_result = Dict{Symbol,Any}(
                        :experiment => l,
                        :task_space => task_space,
                        :task_time => task_time,
                        :method => method,
                        :cv_loglikelihood => results["cv_loglikelihood"],
                        :val_loglikelihood => results["val_loglikelihood"])
                    push!(experiment_results, new_result)
                end
            end
        end
    end
    return experiment_results
end


run_benchmarks (generic function with 1 method)

In [18]:
# Small benchmark run: a single simulated data set per task pair.
# N is the length of every generated trace (the grid is N × N).
N = 30
# pmiss is the weight given to a grid cell receiving no observations.
pmiss = 0.1
nsims = 1
tasks = ("smooth", "constant", "mixed")

experiment_results = run_benchmarks(N, pmiss, nsims=nsims, tasks=tasks)

Running task_space smooth task_time smooth for method fl


InexactError: InexactError: Int64(1.584962500721156)

In [4]:
# Collect the benchmark records into a table, one row per record.
# NOTE(review): the columns declared here (task, likelihood, rmise, miae) do not
# match the keys of the records built in run_benchmarks (:task_space,
# :task_time, :cv_loglikelihood, :val_loglikelihood). Confirm the intended
# schema before relying on this cell.
df = DataFrame(experiment = Int[],
               task=String[],
               method=String[],
               likelihood=Float64[],
               rmise=Float64[],
               miae=Float64[])
for record in experiment_results
    push!(df, record)
end

UndefVarError: UndefVarError: experiment_results not defined

In [5]:
# Preview the first ten rows of the results table.
head(df, 10)

Unnamed: 0_level_0,experiment,task,method,likelihood,rmise,miae
Unnamed: 0_level_1,Int64,String,String,Float64,Float64,Float64


In [None]:
# Summarise the results in R: group by task and method, print one mean per group.
# The R variable `df` assigned below lives in the R session only.
# NOTE(review): the summary column is called likelihood_mean but is computed
# from rmise. Confirm which quantity is intended.
R"""
df = $df %>% 
    group_by(task, method) %>%
    summarize(likelihood_mean = mean(rmise))
print(df)
"""

# A tibble: 0 x 3
# Groups:   task [0]
# … with 3 variables: task <chr>, method <chr>, likelihood_mean <dbl>


In [None]:
# Save the Julia results table to a CSV file in the working directory.
CSV.write("benchmarks-results-3.csv", df)

In [7]:
# Generate a single example data set for inspection: "smooth" spatial traces,
# a "mixed" temporal trace, and a high weight (0.8) on cells with no data.
N=30
pmiss = 0.8
experiment = generate_spt_task("smooth", "mixed", N, pmiss; σ=0.3, outliers=false)

Dict{String,Array} with 12 entries:
  "mean2"      => [-0.25296, -0.125821, 0.00131861, 0.128458, 0.255598, 0.38273…
  "mean1"      => [-0.647122, -0.6495, -0.651879, -0.654257, -0.656636, -0.6590…
  "dmodels"    => MixtureModel{Univariate,Continuous,Normal{Float64}}[MixtureMo…
  "t"          => [-0.660662, -0.552596, -0.44453, -0.336464, -0.228398, -0.120…
  "ndata"      => [0 10 … 0 0; 0 10 … 0 0; … ; 0 0 … 0 10; 0 0 … 0 10]
  "y"          => Array{Float64,1}[[] [-1.10808, -0.637243, -1.39479, -1.48829,…
  "devals"     => Array{Float64,1}[[0.000247922, 0.000477488, 0.000894049, 0.00…
  "ptr"        => [1, 2, 3, 4, 5, 6, 7, 8, 9, 10  …  630, 660, 690, 720, 750, 7…
  "istemporal" => Bool[true, true, true, true, true, true, true, true, true, tr…
  "evalpts"    => [-2.5, -2.44949, -2.39899, -2.34848, -2.29798, -2.24747, -2.1…
  "brks"       => [1, 31, 61, 91, 121, 151, 181, 211, 241, 271  …  1531, 1561, …
  "means"      => Tuple{Float64,Float64}[(-1.30778, -0.913622) (-1.19972, -0.80…