In [None]:
## Intro

This is inspired by 
Article (likas2001probability) Likas, A. Probability density estimation using artificial neural networks Computer physics communications, Elsevier, 2001, 135, 167-175

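But rather than working with the network's output directly, we will instead work with its derivative.
This lets us replace their integration with a simple difference of the network's values at the ends of the support, since the integral of a derivative is the function itself.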

Note that this method only works for compact supports



They define the PDF as $$p_h(x,p) = \dfrac{h(x,p)}{\int_S h(z,p) dz}$$
and in their case $h=N(x,p)$  a neural network with weight and bias parameters $p$.
Here $S$ is a compact support (that is, closed and bounded).


But if instead we say $h=\frac{\partial N(x,p)}{\partial x}$,

then $$p_h(x,p) = \dfrac{h(x,p)}{\int_S h(z,p)\,dz}=\dfrac{\frac{\partial N(x,p)}{\partial x}}{N(\max(S),p) - N(\min(S), p)}$$

The denominator is of course more complex when $S$ is not one-dimensional.


The loss function given is the negative log-likelihood of the set of training samples $X$
$$L(p) = -\sum_{x \in X} \ln(h(x,p))  + |X| \ln\left(\int_S h(z,p)\,dz\right)$$

Which becomes:

$$L(p) = -\sum_{x \in X} \ln\left(\frac{\partial N(x,p)}{\partial x}\right)  + |X| \ln\left(N(\max(S),p)-N(\min(S),p)\right)$$

In [1]:
using StatsBase
using Distributions
using Plots

In [2]:
using TensorFlow
using MLDataUtils

In [3]:
using DensityEstimationML

In [4]:
# A TensorFlow session bundled with the graph nodes that the rest of this file
# runs against it (see `fit!`, `condition!`, `pdf` and `loglikelihood`).
immutable NeuralDensityEstimator
    sess::Session        # session the nodes below are run in
    
    #Network nodes
    optimizer::Tensor    # node run on every step of `fit!`
    conditioner::Tensor  # node run on every step of `condition!`
    t::Tensor            # input placeholder that observations are fed into
    pdf::Tensor          # node fetched by `pdf` to get the estimated density
end

In [5]:
"""
    pdf(est::NeuralDensityEstimator, t::Real)

Evaluate the estimated density at the single point `t` and return it as a scalar.
"""
function Distributions.pdf(est::NeuralDensityEstimator, t::Real)
    # Delegate to the vector method. The point has to be wrapped in a `Vector`:
    # a 1×1 `Matrix` is not an `AbstractVector`, so it would not dispatch to the
    # vector method and the call would fail with a `MethodError`.
    return first(pdf(est, [t]))
end

"""
    pdf(est::NeuralDensityEstimator, ts::AbstractVector)

Evaluate the estimated density at every point in `ts` and return the values as a
vector.
"""
function Distributions.pdf(est::NeuralDensityEstimator, ts::AbstractVector)
    # The input placeholder is declared with shape [1, -1], so feed the points as a row.
    densities = run(est.sess, est.pdf, Dict(est.t => ts'))
    return vec(densities)
end

"""
    loglikelihood(est::NeuralDensityEstimator, ts::AbstractVector)

Run the graph node named `"loglikelihood"` with `ts` fed in as the observations and
return the fetched value flattened with `vec`.
"""
function Distributions.loglikelihood(est::NeuralDensityEstimator, ts::AbstractVector)
    session = est.sess
    node = session.graph["loglikelihood"]
    feed = Dict(est.t => ts')
    return vec(run(session, node, feed))
end


In [6]:
# Small hand-made series. The plot is drawn first; the extra point is appended to X
# only afterwards, so the cell's displayed value is the grown X.
X = Float64[20, 25, 27, 28]
Y = Float64[30, 30, 24, 27]
plot(X, Y)
push!(X, 25.0)


5-element Array{Float64,1}:
 20.0
 25.0
 27.0
 28.0
 25.0

In [7]:
# Default no-op callback for `fit!`: accepts the epoch number plus any further
# values and always returns `nothing`.
function ignore(epoch, arg...)
    return nothing
end

"""
    fit!(estimator::NeuralDensityEstimator, observations, callback=ignore; epochs=20)

Run the estimator's optimizer node `epochs` times, feeding `observations'` into the
input placeholder on every step, and return `estimator`.

On epochs 1, 101, 201, … `callback` is called with the epoch number followed by the
values fetched on that step, in order: `ysmin`, `ysmax`, `loglikelihood`,
`working_loss`, and the result of running the optimizer node.
"""
function StatsBase.fit!(estimator::NeuralDensityEstimator, observations, callback=ignore;
    epochs = 20)

    session = estimator.sess
    graph = session.graph
    # The same nodes are fetched on every step, so look them up once.
    fetches = [graph["ysmin"], graph["ysmax"], graph["loglikelihood"],
        graph["working_loss"], estimator.optimizer]

    for epoch in 1:epochs
        fetched = run(session, fetches, Dict(estimator.t => observations'))
        if epoch % 100 == 1
            callback(epoch, fetched...)
        end
    end
    return estimator
end

In [8]:
# Build a NeuralDensityEstimator over the interval spanned by `support`.
#
# `prob_layer_sizes` lists the hidden layer widths; one input unit and one output
# unit are added around them. The density node is the gradient of the network
# output with respect to its input, divided by the difference between the network
# outputs at the maximum and minimum of `support`.
#
# NOTE(review): `fit!` and `condition!` fetch nodes from the graph by name
# ("ysmin", "ysmax", "loglikelihood", "working_loss", "condition_loss"), so the
# variable names assigned inside the `@tf` block presumably become the node
# names -- confirm before renaming any of them.
function NeuralDensityEstimator(prob_layer_sizes, support)
    sess = Session(Graph())
    @tf begin
        # Input placeholder: a single row with any number of columns.
        t = placeholder(Float32, shape=[1, -1])
        # Lower and upper end of the support, each as a 1x1 constant.
        smin = constant(reshape([minimum(support)],(1,1)))
        smax = constant(reshape([maximum(support)],(1,1)))
        
        layer_sizes= [1; prob_layer_sizes; 1]
        
        # network_fun_stack[k] maps the network input to the output of layer k;
        # entry 1 is the input itself.
        network_fun_stack = Function[Base.identity]       
       
        for ii in 2:length(layer_sizes)
            below_size = layer_sizes[ii-1]
            above_size = layer_sizes[ii]
                       
            Wii = get_variable("W_$ii", [above_size, below_size], Float32)
            # The layer uses the weights multiplied by themselves rather than the raw
            # weights. Presumably this keeps the effective weights non-negative so the
            # network is monotone in its input -- TODO confirm.
            Wii2  = Ops.mul(Wii, Wii; name = "W_$(ii)_squared")
            
            #bii = get_variable("b_$ii", [above_size, 1], Float32)
            #act_fun = z -> nn.sigmoid(Wii2*z .+ bii)
            
            # Every layer except the last: sigmoid with a bias.
            # Last layer: exp, with no bias.
            act_fun = if ii!=length(layer_sizes)
                bii = get_variable("b_$ii", [above_size, 1], Float32)
                z -> nn.sigmoid(Wii2*z .+ bii)
            else
                z-> exp(Wii2*z)
            end
            # Compose this layer on top of everything below it.
            push!(network_fun_stack, z->act_fun(network_fun_stack[ii-1](z)))
        end
        
        # The full network is the composition up to the last layer.
        network = network_fun_stack[end]

        
        # Network output at the two ends of the support, and at the observations.
        ysmin = TensorFlow.identity(network(smin))
        ysmax = TensorFlow.identity(network(smax))
        yt = network(t)
        
        denominator = (ysmax-ysmin) #area
        # Derivative of the network output with respect to its input.
        numerator = gradients(yt,t)
        pdf = numerator/denominator
        
        
        # Number of observations = number of columns fed into `t`.
        n_points = TensorFlow.shape(t)[2]
        loglikelihood = reduce_sum(log(numerator)) - n_points.*log(denominator)
        
        # Training loss: negative log-likelihood plus a penalty (weight 0.1) on the
        # squared difference between the denominator and 1.
        area_loss = (1f0.-denominator)^2
        working_loss = -1*loglikelihood + 0.1*area_loss
        
        optimizer = train.minimize(train.AdamOptimizer(), working_loss)
        
        
        # Conditioning
        # Make sure that ysmin~=1, and ysmax~=2
        condition_loss = (1f0 - ysmin)^2 + (2f0 - ysmax)^2
        condition_optimiser = train.minimize(train.AdamOptimizer(;name="adam_cond"), condition_loss)
    end
    
    run(sess, global_variables_initializer())
    
    # Field order is (sess, optimizer, conditioner, t, pdf): `condition_optimiser`
    # becomes the `conditioner` field.
    NeuralDensityEstimator(sess, optimizer, condition_optimiser, t, pdf)
end

NeuralDensityEstimator

In [9]:
"""
    condition!(est::NeuralDensityEstimator, tol=1e-15, max_epochs=2_000)

"Condition" the neural density estimator so that the network maps the support
extrema to approximately 1 and 2.
This improves training by adjusting the area the network has to learn over.

Runs the conditioning optimizer for at most `max_epochs` steps and stops early as
soon as the conditioning loss drops below `tol`. Progress is shown on steps
1, 51, 101, …. Returns `nothing`.
"""
function condition!(est::NeuralDensityEstimator, tol = 1e-15, max_epochs=2_000)
    gr = est.sess.graph
    # The same nodes are fetched on every step, so look them up once.
    fetches = [est.conditioner, gr["ysmin"], gr["ysmax"], gr["condition_loss"]]
    for ii in 1:max_epochs
        _, ysmin, ysmax, condition_loss = run(est.sess, fetches)
        ii % 50 == 1 && @show (ii, ysmin, ysmax, condition_loss)
        # The loss is fetched as an array, so compare its first element.
        if condition_loss[1] < tol
            break
        end
    end
    return nothing
end


condition!

In [10]:
"""
    demonstration_plot(est, dataset, data=rand(dataset), args...; kwargs...)

Print the log-likelihood of `data` under `dataset` and under the estimator `est`,
then return a two-panel plot of the estimated density and a histogram of `data`.

The density is evaluated on a grid with step 0.01 spanning
`approximate_support(dataset)`. Extra `args` and `kwargs` are forwarded to `plot`.
"""
function demonstration_plot(est, dataset, data=rand(dataset), args...; kwargs...)
    support = approximate_support(dataset)
    X = minimum(support) : 0.01 : maximum(support)
    println("True loglikelihood      = $(loglikelihood(dataset, data))")
    println("Estimated loglikelihood = $(loglikelihood(est, data))")
    return plot([X], [pdf(est,X), data],
        #xlims= approximate_support(dataset),
        xlims= (first(X), last(X)),
        seriestype = [:path :histogram],
        layout=(2,1),
        legend=false,
        # one entry per series; the histogram gets one bin per ten data points
        nbins=[1  length(data)÷10],
        args...; kwargs...
    )
end

demonstration_plot (generic function with 2 methods)

In [11]:
"""
    demo(dataset, layers, epochs=20_000)

End-to-end demonstration on `dataset`: show the log-likelihood of its original
sample under `dataset`, build an estimator with hidden layer sizes `layers`,
condition it, fit it for `epochs` epochs, and return the demonstration plot.
"""
function demo(dataset, layers, epochs=20_000)
    data = original_sample(dataset)
    @show loglikelihood(dataset, data)

    support = approximate_support(dataset)
    estimator = NeuralDensityEstimator(layers, support)

    condition!(estimator)
    println("Conditioning Done")

    fit!(estimator, data; epochs=epochs)
    println("Fitting Done")

    return demonstration_plot(estimator, dataset, data)
end

demo (generic function with 2 methods)

In [12]:
# Full demo on the Likas1 dataset: hidden layers of 64 and 64 units, 20_000 fitting epochs.
demo(GenerateDatasets.Likas1(), [64,64], 20_000)

loglikelihood(dataset, data) = -10516.625172431352


2017-09-15 13:19:30.323654: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.1 instructions, but these are available on your machine and could speed up CPU computations.
2017-09-15 13:19:30.323686: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
2017-09-15 13:19:30.323692: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.
2017-09-15 13:19:30.507198: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:893] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2017-09-15 13:19:30.507737: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 0 with propert

(ii, ysmin, ysmax, condition_loss) = (1, [1.00271], [1.00271], [0.994586])
(ii, ysmin, ysmax, condition_loss) = (51, [1.16601], [1.16641], [0.722425])
(ii, ysmin, ysmax, condition_loss) = (101, [1.51874], [1.54791], [0.47348])
(ii, ysmin, ysmax, condition_loss) = (151, [1.49838], [1.61576], [0.396028])
(ii, ysmin, ysmax, condition_loss) = (201, [1.46333], [1.71975], [0.293216])
(ii, ysmin, ysmax, condition_loss) = (251, [1.42282], [1.79745], [0.219805])
(ii, ysmin, ysmax, condition_loss) = (301, [1.38944], [1.84106], [0.176923])
(ii, ysmin, ysmax, condition_loss) = (351, [1.36478], [1.86682], [0.150798])
(ii, ysmin, ysmax, condition_loss) = (401, [1.34577], [1.88398], [0.133016])
(ii, ysmin, ysmax, condition_loss) = (451, [1.33023], [1.89659], [0.119748])
(ii, ysmin, ysmax, condition_loss) = (501, [1.31692], [1.9065], [0.10918])
(ii, ysmin, ysmax, condition_loss) = (551, [1.3051], [1.91467], [0.100369])
(ii, ysmin, ysmax, condition_loss) = (601, [1.29434], [1.92165], [0.0927774])
(ii, 

In [None]:
# Full demo on the Likas2 dataset: hidden layers of 64 and 64 units, 20_000 fitting epochs.
demo(GenerateDatasets.Likas2(), [64,64], 20_000)

In [None]:
# Full demo on the MagdonIsmailAndAtiya dataset: one hidden layer of 32 units, 20_000 fitting epochs.
demo(GenerateDatasets.MagdonIsmailAndAtiya(), [32], 20_000)

In [None]:
# Full demo with an Arcsine(1,4) distribution as the dataset: hidden layers of 64 and 64 units.
# NOTE(review): relies on `original_sample` and `approximate_support` accepting a
# Distributions object -- confirm against DensityEstimationML.
demo(Arcsine(1,4), [64,64], 20_000)

In [None]:
# Same steps as `demo`, run at top level without the final plot, so that `dataset`,
# `data` and `est` stay available as globals for the inspection cells below.
dataset = GenerateDatasets.Likas1()
data = original_sample(dataset)
@show loglikelihood(dataset, data)
# One hidden layer of 64 units.
est = NeuralDensityEstimator([64], approximate_support(dataset))
condition!(est)
println("Conditioning Done")
fit!(est, data; epochs=10_000)
println("Fitting Done")

In [None]:
# Ad-hoc inspection: evaluate exp(6*W_2.^2 + b_2) using the current values of the
# "W_2" and "b_2" variables in the estimator's graph.
# NOTE(review): this resembles the first layer's pre-activation at input 6, but the
# network applies a sigmoid to that layer, not exp -- confirm the intent.
gr =est.sess.graph
run(est.sess, exp(6*gr["W_2"].^2 + gr["b_2"]))

In [None]:
# List every key of the estimator's graph (the names usable in `gr["..."]` lookups).
collect(keys(est.sess.graph))