In [1]:
# Return the one and only element of `itr`.
#
# Throws an `ArgumentError` when `itr` is empty or yields more than one element.
# Iteration stops as soon as a second element shows up, so the remainder of a long
# iterator is never consumed. A plain `for` loop is used so the function does not
# depend on any particular spelling of the low-level iteration protocol.
function only(itr)
    found = false
    local val
    for item in itr
        # Reaching here a second time violates the "exactly one element" contract.
        found && throw(ArgumentError("collection has multiple elements, expected exactly 1"))
        val = item
        found = true
    end
    found || throw(ArgumentError("collection is empty, expected exactly 1 element"))
    return val
end

only (generic function with 1 method)

In [2]:
using SymEngine
import SymEngine: as_numer_denom
using Expr2LaTeX
# Render a SymEngine expression by round-tripping it through Julia syntax: `repr` gives
# its printed form, `parse` turns that text into a Julia expression, and the `render`
# method that accepts the parsed result does the typesetting.
function Expr2LaTeX.render(symexpr::SymEngine.Basic)
    return render(parse(repr(symexpr)))
end

In [205]:
# Crude normal form for a symbolic fraction: split `f` into numerator and denominator,
# expand the reciprocal fraction `den/num`, and invert the expanded result again.
function ugly_simplify(f)
    top, bottom = as_numer_denom(f)
    reciprocal = expand(inv(top) * bottom)
    return inv(reciprocal)
end

# Run `ugly_simplify` on `f`, then split the result into numerator and denominator once
# more and return it as an explicit quotient of the two.
function basic_simplify(f)
    reduced = ugly_simplify(f)
    top, bottom = as_numer_denom(reduced)
    return top / bottom
end

# `a ≖ b` holds when both expressions come out identical after `ugly_simplify`.
function ≖(a, b)
    return ugly_simplify(a) == ugly_simplify(b)
end

### Tests
using Base.Test
@vars z
# Two algebraically equivalent ways of writing the logistic sigmoid of z.
s1 = (1 + exp(-z))^(-1)
s2 = exp(z)/(1+exp(z))

# Both forms, and their reciprocals, should be identified as equal by ≖.
@test s1 ≖ s2
@test inv(s1)≖inv(s2)

# Four expressions derived from s1/s2: substitute z => -z, or subtract from 1.
n1 = subs(s1, z=>-z)
n2 = 1-s1
n3 = subs(s2, z=>-z)
n4 = 1-s2
# ≖ is chained like other comparison operators: every adjacent pair has to match.
@test n1 ≖ n2 ≖ n3 ≖ n4
@test inv(n1)≖inv(n2) ≖ inv(n3) ≖ inv(n4)


[1m[32mTest Passed
[39m[22m

In [182]:
# Logistic sigmoid, 1/(1 + exp(-z)). Written with generic arithmetic only, so it accepts
# ordinary numbers as well as symbolic expressions.
function sigmoid(z)
    denom = 1 + exp(-z)
    return denom^(-1)
end
# Differentiate the sigmoid symbolically with respect to the symbol z.
@vars z
diff(sigmoid(z),z)

exp(-z)/(1 + exp(-z))^2

In [5]:
# Build an `output_size`×`input_size` matrix of symbols named `<name>_<row><col>`
# (for example `W_21` for row 2, column 1).
#
# Once either dimension reaches 10, gluing the two indices together is ambiguous
# (`W_111` could mean row 1/column 11 or row 11/column 1) and different entries would
# share a name. In that case an underscore is placed between the indices (`W_1_11`).
# Smaller matrices keep the compact names unchanged.
function weight_var(name, input_size, output_size)
    sep = max(input_size, output_size) >= 10 ? "_" : ""
    return [symbols("$(name)_$(j)$(sep)$(i)") for j in 1:output_size, i in 1:input_size]
end
# Build a length-`output_size` vector of symbols named `<name>_<index>`.
function bias_var(name, output_size)
    make_symbol(j) = symbols("$(name)_$j")
    return [make_symbol(j) for j in 1:output_size]
end



bias_var (generic function with 1 method)

## Basic Network

Single input single output
1 hidden layer

In [99]:
# Symbolic network: scalar input x, three sigmoid hidden units, one linear output.
@vars x
W = weight_var("W", 1, 3)
b = bias_var("b",3)
V = weight_var("V", 3, 1)

# Hidden activations, then the output; `only` unwraps the single-element product V*z.
z=(sigmoid.(W*x + b))
N=V*z |> only
render(N)

L"$(\frac{V_{11}}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))} + \frac{V_{12}}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))} + \frac{V_{13}}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))})$"

In [96]:
# Symbolic derivative of the network output N with respect to its input x.
dNdx = diff(N,x)
render(dNdx)

L"$(\frac{(\mathrm{exp}(-(b_{1} + (x * W_{11}))) * W_{11} * V_{11})}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))^{2}} + \frac{(\mathrm{exp}(-(b_{2} + (x * W_{21}))) * W_{21} * V_{12})}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))^{2}} + \frac{(\mathrm{exp}(-(b_{3} + (x * W_{31}))) * W_{31} * V_{13})}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))^{2}})$"

In [171]:
# I am pretty sure this is the right equation for the derivative of a plain neural network,
# but SymEngine is not strong enough to recognise that it matches the above.
# Each hidden unit contributes (input weight * output weight) times
# sigmoid(-(W*x + b)) * sigmoid(W*x + b); `first` unwraps the one-element product.
mdNdx = (W'.*V)*(sigmoid.(-(W*x + b)).*sigmoid.(W*x + b)) |> first
render.(mdNdx)

L"$(\frac{(W_{11} * V_{11})}{((1 + \mathrm{exp}(b_{1} + (x * W_{11}))) * (1 + \mathrm{exp}(-(b_{1} + (x * W_{11})))))} + \frac{(W_{21} * V_{12})}{((1 + \mathrm{exp}(b_{2} + (x * W_{21}))) * (1 + \mathrm{exp}(-(b_{2} + (x * W_{21})))))} + \frac{(W_{31} * V_{13})}{((1 + \mathrm{exp}(b_{3} + (x * W_{31}))) * (1 + \mathrm{exp}(-(b_{3} + (x * W_{31})))))})$"

In [169]:
# Element-wise product of each input weight with the matching output weight.
W'.*V

1×3 Array{SymEngine.Basic,2}:
 W_11*V_11  W_21*V_12  W_31*V_13

## Exponential Output net
Single input, single output still,
1 hidden layer,
exponential output.

In [101]:
# Same single-hidden-layer layout as before, but the output is passed through exp.
@vars x
W = weight_var("W", 1, 3)
b = bias_var("b",3)
V = weight_var("V", 3, 1)

# Hidden activations, then the exponentiated output; `only` unwraps the single element.
z=sigmoid.(W*x + b)
N=exp.(V*z) |> only
render(N)

L"$\mathrm{exp}(\frac{V_{11}}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))} + \frac{V_{12}}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))} + \frac{V_{13}}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))})$"

In [102]:
# Symbolic derivative of the exponential-output network with respect to x.
dNdx = diff(N,x)
render(dNdx)

L"$(\mathrm{exp}(\frac{V_{11}}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))} + \frac{V_{12}}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))} + \frac{V_{13}}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))}) * (\frac{(\mathrm{exp}(-(b_{1} + (x * W_{11}))) * W_{11} * V_{11})}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))^{2}} + \frac{(\mathrm{exp}(-(b_{2} + (x * W_{21}))) * W_{21} * V_{12})}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))^{2}} + \frac{(\mathrm{exp}(-(b_{3} + (x * W_{31}))) * W_{31} * V_{13})}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))^{2}}))$"

In [103]:
# I am pretty sure this is the right equation for the derivative of this network,
# as we just use the standard `d(exp(f(x)))/dx = f'(x)*exp(f(x))` rule.
# But SymEngine is not strong enough to recognise that it matches the above.
# `mdZm1dx` is the hand-derived derivative of the pre-exponential sum V*z, i.e. f'(x).
mdZm1dx = (W'.*V)*(sigmoid.(-(W*x + b)).*sigmoid.(W*x + b)) |> first
# Chain rule: multiply by the network output N = exp(f(x)).
mdNdx = mdZm1dx*N
render.(mdNdx)

L"$(\mathrm{exp}(\frac{V_{11}}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))} + \frac{V_{12}}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))} + \frac{V_{13}}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))}) * (\frac{(W_{11} * V_{11})}{((1 + \mathrm{exp}(b_{1} + (x * W_{11}))) * (1 + \mathrm{exp}(-(b_{1} + (x * W_{11})))))} + \frac{(W_{21} * V_{12})}{((1 + \mathrm{exp}(b_{2} + (x * W_{21}))) * (1 + \mathrm{exp}(-(b_{2} + (x * W_{21})))))} + \frac{(W_{31} * V_{13})}{((1 + \mathrm{exp}(b_{3} + (x * W_{31}))) * (1 + \mathrm{exp}(-(b_{3} + (x * W_{31})))))}))$"

## Deeper Exponential Output net
Single input, single output still,
2 hidden layers,
exponential output.

In [105]:
# Scalar input x, two sigmoid hidden layers of three units each, exponential output.
@vars x
W = weight_var("W", 1, 3)
b = bias_var("b",3)
U = weight_var("U", 3, 3)
c = bias_var("c",3)
V = weight_var("V", 3, 1)

# First and second hidden layers, then the exponentiated output unwrapped by `only`.
z1=sigmoid.(W*x + b)
z2=sigmoid.(U*z1 + c)
N=exp.(V*z2) |> only
render(N)

L"$\mathrm{exp}(\frac{V_{11}}{(1 + \mathrm{exp}(-(c_{1} + \frac{U_{11}}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))} + \frac{U_{12}}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))} + \frac{U_{13}}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))})))} + \frac{V_{12}}{(1 + \mathrm{exp}(-(c_{2} + \frac{U_{21}}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))} + \frac{U_{22}}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))} + \frac{U_{23}}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))})))} + \frac{V_{13}}{(1 + \mathrm{exp}(-(c_{3} + \frac{U_{31}}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))} + \frac{U_{32}}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))} + \frac{U_{33}}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))})))})$"

In [106]:
# Symbolic derivative of the two-hidden-layer network with respect to x.
dNdx = diff(N,x)
render(dNdx)

L"$(\mathrm{exp}(\frac{V_{11}}{(1 + \mathrm{exp}(-(c_{1} + \frac{U_{11}}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))} + \frac{U_{12}}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))} + \frac{U_{13}}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))})))} + \frac{V_{12}}{(1 + \mathrm{exp}(-(c_{2} + \frac{U_{21}}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))} + \frac{U_{22}}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))} + \frac{U_{23}}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))})))} + \frac{V_{13}}{(1 + \mathrm{exp}(-(c_{3} + \frac{U_{31}}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))} + \frac{U_{32}}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))} + \frac{U_{33}}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))})))}) * (\frac{(\mathrm{exp}(-(c_{1} + \frac{U_{11}}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))} + \frac{U_{12}}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))} + \frac{U_{13}}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))})) * V_{11} * (\frac{(\mathrm{exp}(-(b_{1} + (x * W_{11}))) * W_{11} * U_{11})}{(1 +

In [107]:
# TODO: work out the clean form of this

### Find the parts that contribute to the gradient

In [163]:
# `postwalk` and the `@capture` macro both come from MacroTools; a selective `using`
# only brings in the names it lists, so both have to be named here.
using MacroTools: postwalk, @capture
# Re-parse the printed derivative so it can be rewritten as a Julia expression tree.
tree = parse(repr(dNdx))

# Strip everything that merely rescales a term, leaving the bare weight products.
# Every exp(...) call becomes 1 ...
tree = postwalk(x -> @capture(x, exp(z_)) ? 1 : x, tree)
# ... which turns the squared sigmoid denominators into (1+1)^2; drop those too.
tree = postwalk(x -> @capture(x, (1+1)^2) ? 1 : x, tree)
# Clean up the trivial leftovers: division by 1 and multiplication by 1.
tree = postwalk(x -> @capture(x, z_/1) ? z : x, tree)
tree = postwalk(x -> @capture(x, 1*z_) ? z : x, tree)
tree = postwalk(x -> @capture(x, 1*z_*y_) ? :($z*$y) : x, tree)
tree |> render

L"$((V_{11} * ((W_{11} * U_{11}) + (W_{21} * U_{12}) + (W_{31} * U_{13}))) + (V_{12} * ((W_{11} * U_{21}) + (W_{21} * U_{22}) + (W_{31} * U_{23}))) + (V_{13} * ((W_{11} * U_{31}) + (W_{21} * U_{32}) + (W_{31} * U_{33}))))$"

In [165]:
# Plain product of the three weight matrices, for comparison with the stripped tree.
V*(U*W )|> only

V_11*(W_11*U_11 + W_21*U_12 + W_31*U_13) + V_12*(W_11*U_21 + W_21*U_22 + W_31*U_23) + V_13*(W_11*U_31 + W_21*U_32 + W_31*U_33)

Note that each product term is scaled by a sigmoidal term that varies with the values of the weights in that term and the corresponding biases, as well as with $x$.

# Proof of Ability to Approximate any PDF


In [207]:
@vars z
# Check that the derivative of the sigmoid equals sigmoid(z)*sigmoid(-z) under ≖.
@show diff(sigmoid(z),z) ≖ sigmoid(z)*sigmoid(-z)
# Render that product as written, and again after `ugly_simplify`.
sigmoid(z)*sigmoid(-z) |> render
(sigmoid(z)*sigmoid(-z)) |> ugly_simplify |> render

L"$\frac{1}{((1 + \mathrm{exp}(z)) * (1 + \mathrm{exp}(-(z))))}$"

L"$(2 + \mathrm{exp}(z) + \mathrm{exp}(-(z)))^{-1}$"

diff(sigmoid(z), z) ≖ sigmoid(z) * sigmoid(-z) = true


In [212]:
# Difference between sigmoid(z)*sigmoid(-z) and exp(-z^2), written as a single fraction.
basic_simplify((sigmoid(z)*sigmoid(-z)) - exp(-z^2)) |> render

L"$\frac{(((-1 - (2 * \mathrm{exp}(z))) - \mathrm{exp}(2 * z)) + \mathrm{exp}(z + z^{2}))}{((2 * \mathrm{exp}(z + z^{2})) + \mathrm{exp}((2 * z) + z^{2}) + \mathrm{exp}(z^{2}))}$"

Our approximation to a PDF is:
$$f(x)=\frac{h(x)}{\int_{S}h(z)\,dz}=\frac{\frac{\partial N(x,\tilde{p})}{\partial x}}{N(\max(S),\tilde{p})-N(\min(S),\tilde{p})}.$$

As we have shown above the numerator $\frac{\partial N(x,\tilde{p})}{\partial x} = g(x,\tilde{p}) \;N(x,\tilde{p})$.

Here $g(x,\tilde{p})$ is effectively a weighted sum of terms of a form similar to $\frac{\partial \sigma(z)}{\partial z} = \sigma(z)\sigma(-z) = \frac{1}{(1+\exp(z))(1 + \exp(-z))}$.
As shown below, that function defines a bell curve.

The weights and biases of the penultimate layers of the network ($W$, $U$) define the shape and position of that bell curve.
The weight component of the final exponential layer ($V$) determines the weighting of each bell curve (though it must also be scaled to counter the effects of the prior layer weights).


$$f(x)=\frac{h(x)}{\int_{S}h(z)\,dz} = \frac{g(x,\tilde{p}) \; N(x,\tilde{p})}{N(\max(S),\tilde{p})-N(\min(S),\tilde{p})}.$$
For now we will focus on $g(x,\tilde{p})$.


In [216]:
using Plots
# Plot sigmoid(x)*sigmoid(-x), the sigmoid's derivative, to show its bell shape.
plot(x->sigmoid(x)*sigmoid(-x))

It has been shown in [Bacharoglou, 2010, APPROXIMATION OF PROBABILITY DISTRIBUTIONS
BY CONVEX MIXTURES OF GAUSSIAN MEASURES](http://www.mathaware.org/journals/proc/2010-138-07/S0002-9939-10-10340-2/S0002-9939-10-10340-2.pdf)
that every PDF with compact support can be approximated by a non-negative weighted sum of Gaussians. 
(It actually does it with a restricted subset of Gaussians, but we don't need that.)


Bacharoglou, 2010, Theorem 1 is noted as applicable to mixtures of any approximate identity ([defn](http://people.math.gatech.edu/~heil/7338/fall09/approxid.pdf)).   
It is easy to construct an approximate identity from sequences of $\frac{\partial \sigma(Wz+b)}{\partial z}$
(though that depends on which form of approximate identity is used, as there are many definitions).

This directly gives us the result that $g(x, \tilde{p})$ can approximate any PDF with compact support.


In [217]:
# Declare x as a symbolic variable again.
@vars x

(x,)

In [219]:
# Scratch: list the methods that take a BitVector argument.
methodswith(BitVector)

  likely near In[219]:1
  likely near In[219]:1
  likely near In[219]:1
  likely near In[219]:1
  likely near In[219]:1
  likely near In[219]:1


In [221]:
# Scratch: show which method handles indexing into the result of `falses(3)`.
@which falses(3)[2]