## Intro

This is inspired by 
Article (likas2001probability) Likas, A. Probability density estimation using artificial neural networks Computer physics communications, Elsevier, 2001, 135, 167-175

But rather than estimating the working with a network, we will instead work with its derivitive.
This will let us replace their integration with a derivative.

Note that this method only works for compact supports



they use the PDF is given by $$p_h(x,p) = \dfrac{h(x,p)}{\int_S h(z,p) dz}$$
and in their case $h=N(x,p)$  a neural network with weight and bias parameters $p$.
Where $S$ is a compact support. (That means bounded)


But if instead we say $h=\frac{\partial N(x,p)}{\partial x}$,

then $$p_h(x,p) = \dfrac{h(x,p)}{\int_S h(z,p)}=\dfrac{\frac{\partial N(x,p)}{\partial x}}{N(max(S),p) - N(min(S), p)}$$

The denominator is ofcourse more complex for non-1D values of S.


The loss function given is the negative log-likelihood of the set of training samples $X$
$$L(p) = -\sum_\forall{x \in X} ln(h(x,p))  + |X| ln(\int_S h(z,p) dx)$$

Which befomes:

$$L(p) = -\sum_\forall{x \in X} log(\frac{\partial N(x,p)}{\partial x})  + |X| ln(\int_S h(z,p)) dx$$

In [7]:
using StatsBase
using Distributions

In [2]:
using TensorFlow
using MLDataUtils

In [3]:
using DensityEstimationML
using Plots

using MacroTools
macro plot(ex)
    @capture(ex, (x_, [ys__], tail__ )) 
    labels = repr.(ys)
    ys_expr = Expr(:hvect, ys...)
    labels_expr = Expr(:vect, repr.(ys)...)
    Expr(:call, :plot, x, ys_expr, tail..., Expr(:kw, :labels, labels_expr))
end

@plot (X, [Z, logistic.(Z)])

In [4]:
immutable NeuralDensityEstimator
    sess::Session
    
    #Network nodes
    optimizer::Tensor
    t::Tensor
    pdf::Tensor
    cdf::Tensor
end

In [5]:
abstract type NdeModel; end
immutable NdeNaive <: NdeModel; end
immutable NdeAsymptoticForced <: NdeModel; end

In [None]:
leaky_relu6(z) = 0.001z + nn.relu6(z)

NeuralDensityEstimator(prob_layer_sizes) = NeuralDensityEstimator(prob_layer_sizes, NdeNaive())

function NeuralDensityEstimator(prob_layer_sizes, model::NdeNaive)
    sess = Session(Graph())
    @tf begin
        t1 = placeholder(Float32, shape=[1, -1])
        t2 = placeholder(Float32, shape=[1, -1])
        countprob_between = placeholder(Float32, shape=[-1])
        
        zp1 = [t1]
        zp2 = [t2]
        layer_sizes= [1; prob_layer_sizes; 1]
        act_funs = Vector{Function}(length(layer_sizes)-1)
        act_funs[:] = nn.relu6
        act_funs[end] = nn.sigmoid
        for ii in 2:length(layer_sizes)
            below_size = layer_sizes[ii-1]
            above_size = layer_sizes[ii]
            
            act_fun = act_funs[ii-1]
            
            Wii = get_variable("W_$ii", [above_size, below_size], Float32)
            bii = get_variable("b_$ii", [above_size, 1], Float32)
            
            push!(zp1, act_fun(Wii*zp1[end] .+ bii))
            push!(zp2, act_fun(Wii*zp2[end] .+ bii))
        end
        
        cdf1 = zp1[end]
        cdf2 = zp2[end]
        #Assumes t2>t1
        cdf_between = cdf2 - cdf1
                
        
        losses = 0.5.*(cdf_between .- countprob_between).^2
        
        loss=reduce_mean(losses; axis=2)
        optimizer = train.minimize(train.AdamOptimizer(), loss)
        
        pdf1 = gradients(cdf1, t1)
    end
    
    run(sess, global_variables_initializer())
    
    NeuralDensityEstimator(sess, optimizer, t1, pdf1, cdf1)
end

In [None]:
tanh.(-1000:1000)

In [None]:
plot(x->logistic(x*logistic(10x-2)), xlims=(-10, 10))

In [None]:
function NeuralDensityEstimator(prob_layer_sizes, model::NdeAsymptoticForced)
    sess = Session(Graph())
    @tf begin
        t1 = placeholder(Float32, shape=[1, -1])
        t2 = placeholder(Float32, shape=[1, -1])
        countprob_between = placeholder(Float32, shape=[-1])
        
        zp1 = [t1]
        zp2 = [t2]
        layer_sizes= [1; prob_layer_sizes; 1]
        act_funs = Vector{Function}(length(layer_sizes)-1)
        act_funs[:] = nn.relu6
        act_funs[end] = TensorFlow.identity
        for ii in 2:length(layer_sizes)
            below_size = layer_sizes[ii-1]
            above_size = layer_sizes[ii]
            
            act_fun = act_funs[ii-1]
            
            Wii = get_variable("W_$ii", [above_size, below_size], Float32)
            bii = get_variable("b_$ii", [above_size, 1], Float32)
            
            push!(zp1, act_fun(Wii*zp1[end] .+ bii))
            push!(zp2, act_fun(Wii*zp2[end] .+ bii))
        end
        
        #forces assymtoptess    
        s_shift = get_variable([1], Float32) #support translate
        s_scale = get_variable([1], Float32) #support scale
        cft_fun(z, t) = nn.sigmoid(2(s_scale*t + s_shift)) + tanh(s_scale*t + s_shift)^2 * z
                    
        cdf1 = cft_fun(zp1[end], t1)
        cdf2 = cft_fun(zp2[end], t2)
        #Assumes t2>t1
        cdf_between = cdf2 - cdf1
                
        
        losses = 0.5.*(cdf_between .- countprob_between).^2
        
        loss=reduce_mean(losses; axis=2)
        optimizer = train.minimize(train.AdamOptimizer(), loss)
        
        pdf1 = gradients(cdf1, t1)
    end
    
    run(sess, global_variables_initializer())
    
    NeuralDensityEstimator(sess, optimizer, t1, pdf1, cdf1)
end

In [8]:
using StatsFuns

In [75]:
X=-2:0.01:2
Z=(0.4X-tanh.(X).^2)
@plot (X, [Z, logistic.(Z), Z.*Z, logistic.(Z.*Z)])

Stacktrace:
 [1] [1mdepwarn[22m[22m[1m([22m[22m::String, ::Symbol[1m)[22m[22m at [1m./deprecated.jl:70[22m[22m
 [2] [1m(::Base.##716#717)[22m[22m[1m([22m[22m::Array{Float64,1}, ::Array{Float64,1}[1m)[22m[22m at [1m./deprecated.jl:346[22m[22m
 [3] [1minclude_string[22m[22m[1m([22m[22m::String, ::String[1m)[22m[22m at [1m./loading.jl:515[22m[22m
 [4] [1minclude_string[22m[22m[1m([22m[22m::Module, ::String, ::String[1m)[22m[22m at [1m/home/uniwa/students2/students/20361362/linux/.julia/v0.6/Compat/src/Compat.jl:577[22m[22m
 [5] [1mexecute_request[22m[22m[1m([22m[22m::ZMQ.Socket, ::IJulia.Msg[1m)[22m[22m at [1m/home/uniwa/students2/students/20361362/linux/.julia/v0.6/IJulia/src/execute_request.jl:154[22m[22m
 [6] [1meventloop[22m[22m[1m([22m[22m::ZMQ.Socket[1m)[22m[22m at [1m/home/uniwa/students2/students/20361362/linux/.julia/v0.6/IJulia/src/eventloop.jl:8[22m[22m
 [7] [1m(::IJulia.##14#17)[22m[22m[1m([22m[22m[

In [83]:
plot(X,log(cosh(X)))


Stacktrace:
 [1] [1mdepwarn[22m[22m[1m([22m[22m::String, ::Symbol[1m)[22m[22m at [1m./deprecated.jl:70[22m[22m
 [2] [1mcosh[22m[22m[1m([22m[22m::StepRangeLen{Float64,Base.TwicePrecision{Float64},Base.TwicePrecision{Float64}}[1m)[22m[22m at [1m./deprecated.jl:57[22m[22m
 [3] [1minclude_string[22m[22m[1m([22m[22m::String, ::String[1m)[22m[22m at [1m./loading.jl:515[22m[22m
 [4] [1minclude_string[22m[22m[1m([22m[22m::Module, ::String, ::String[1m)[22m[22m at [1m/home/uniwa/students2/students/20361362/linux/.julia/v0.6/Compat/src/Compat.jl:577[22m[22m
 [5] [1mexecute_request[22m[22m[1m([22m[22m::ZMQ.Socket, ::IJulia.Msg[1m)[22m[22m at [1m/home/uniwa/students2/students/20361362/linux/.julia/v0.6/IJulia/src/execute_request.jl:154[22m[22m
 [6] [1meventloop[22m[22m[1m([22m[22m::ZMQ.Socket[1m)[22m[22m at [1m/home/uniwa/students2/students/20361362/linux/.julia/v0.6/IJulia/src/eventloop.jl:8[22m[22m
 [7] [1m(::IJulia.##14#17

In [79]:
Z

401-element Array{Float64,1}:
 -1.72935 
 -1.72397 
 -1.71857 
 -1.71315 
 -1.70769 
 -1.70221 
 -1.69671 
 -1.69117 
 -1.68561 
 -1.68001 
 -1.67439 
 -1.66874 
 -1.66306 
  ⋮       
 -0.156738
 -0.15439 
 -0.152013
 -0.149606
 -0.14717 
 -0.144706
 -0.142213
 -0.139694
 -0.137147
 -0.134574
 -0.131974
 -0.129349

In [None]:
aforce(f,t) = (tanh(t)^2)*f(t)

plot(t->aforce(logistic, t))

In [None]:
function Distributions.cdf(est::NeuralDensityEstimator, t::Real)
    gr = est.sess.graph
    ts = reshape([t], (1,1))
    run(est.sess, est.cdf, Dict(est.t=>ts))[1]
end

function Distributions.pdf(est::NeuralDensityEstimator, t::Real)
    gr = est.sess.graph
    ts = reshape([t], (1,1))
    run(est.sess, est.pdf, Dict(est.t=>ts))[1]
end



In [None]:
"""
Returns 3 vectors.
A vector of start points \$t1\$
A vector of end points \$t2\$
A vector of the counts of elements \$ti\$ between them, such that \$\left{t_i \mid t1 \le ti < t2 \right}\$
"""
function get_cdf_training_pairs(observations)
    observations = sort(observations)
    
    t1s = Float32[]
    t2s = Float32[]
    probs = Float32[]
    for (ii, t1) in enumerate(observations)
        for jj in ii+1 : length(observations)
            t2 = observations[jj]
            count = jj - ii           
            push!(t1s, t1)
            push!(t2s, t2)
            push!(probs, count/length(observations))
        end
    end
    
    t1s, t2s, probs
end

In [None]:
function StatsBase.fit!(estimator::NeuralDensityEstimator, observations;
    epochs = 20,
    batch_size = 1024)
    gr = estimator.sess.graph
    
    for ii in 1:epochs
        batch_losses = Float32[]
        for (t1_o, t2_o, probs_o) = eachbatch(shuffleobs(get_cdf_training_pairs(observations)), batch_size)
            loss_o, _, = run(estimator.sess, 
                [gr["loss"], estimator.optimizer],
                Dict(gr["t1"]=>t1_o', gr["t2"]=>t2_o', gr["countprob_between"]=>probs_o))
            push!(batch_losses, loss_o[1])
        end
        epoch_loss = mean(batch_losses)
        println("Epoch $ii: loss: $(epoch_loss)")
    end
    estimator
end

In [None]:
est = NeuralDensityEstimator([64, 64])
data = GenerateDatasets.magdon_ismail_and_atiya();


In [None]:
fit!(est, data; epochs=100)

In [None]:
histogram(data)

In [None]:
X=-50:0.1:100

plot(X, pdf.(est, X))

In [None]:
empirical_cdf(data, X) = [length(filter(i->i<x, data)) for x in X]./length(data)

In [None]:
plot(X, [cdf.(est, X), empirical_cdf(data, X)] , label=["Estimated" "Empirical"])

In [None]:
2π*sqrt(0.296/9.8)

In [None]:
2π*sqrt(0.296/11.7)