In [1]:
# Return the single element held by `itr`.
# The first element is pulled with `next`; the assertion then checks that the
# iterator is exhausted, so an iterable with more than one element fails here.
function only(itr)
    val, state = next(itr, start(itr))
    @assert done(itr, state)
    return val
end

only (generic function with 1 method)

In [2]:
using SymEngine
import SymEngine: as_numer_denom
using Expr2LaTeX
# Render a SymEngine expression by going through its printed form: `repr`
# produces the text, `parse` reads that text back in, and the result is handed
# to the other `render` methods.
function Expr2LaTeX.render(symexpr::SymEngine.Basic)
    return render(parse(repr(symexpr)))
end

In [66]:
# Rewrite `f` into a normalised (not necessarily pretty) form: split it into
# numerator and denominator, expand the reciprocal denominator/numerator, and
# invert the expanded result again.
function ugly_simplify(f)
    top, bottom = as_numer_denom(f)
    reciprocal = expand(inv(top) * bottom)
    return inv(reciprocal)
end

# Simplify `f` with `ugly_simplify`, then rebuild it as an explicit
# numerator/denominator quotient.
#
# `f` is the expression to simplify. The simplified expression is what gets
# split by `as_numer_denom`; the function object `ugly_simplify` itself must
# not be passed there.
function basic_simplify(f)
    num2, den2 = as_numer_denom(ugly_simplify(f))
    return num2 / den2
end

# `a ≖ b` is true when both sides come out identical after `ugly_simplify`.
function ≖(a, b)
    return ugly_simplify(a) == ugly_simplify(b)
end

### Tests
using Base.Test
# Symbolic variable used by all of the checks below.
@vars z
# Two differently written forms of the same sigmoid expression.
s1 = (1 + exp(-z))^(-1)
s2 = exp(z)/(1+exp(z))

# Both the forms and their reciprocals should compare equal under `≖`.
@test s1 ≖ s2
@test inv(s1)≖inv(s2)

# Four expressions built from the forms above: each form with z replaced by
# -z, and each form subtracted from one.
n1 = subs(s1, z=>-z)
n2 = 1-s1
n3 = subs(s2, z=>-z)
n4 = 1-s2
# The chained `≖` checks each adjacent pair (presumably parsed as a comparison
# chain; confirm against the Julia operator table).
@test n1 ≖ n2 ≖ n3 ≖ n4
@test inv(n1)≖inv(n2) ≖ inv(n3) ≖ inv(n4)


[1m[32mTest Passed
[39m[22m

In [4]:
# Logistic sigmoid of `z`, computed as the reciprocal power of 1 + exp(-z).
function sigmoid(z)
    denom = 1 + exp(-z)
    return denom^(-1)
end

sigmoid (generic function with 1 method)

In [5]:
# Build an `output_size` × `input_size` array of symbols named
# "<name>_<row><column>". The two indices are concatenated without a
# separator, so the names are only unambiguous while both sizes stay below 10.
function weight_var(name, input_size, output_size)
    rows = 1:output_size
    cols = 1:input_size
    return [symbols("$(name)_$j$i") for j in rows, i in cols]
end
# Build a length-`output_size` vector of symbols named "<name>_<index>".
function bias_var(name, output_size)
    indices = 1:output_size
    return [symbols("$(name)_$j") for j in indices]
end



bias_var (generic function with 1 method)

## Basic Network

Single input single output
1 hidden layer

In [99]:
@vars x

# Symbolic parameters: hidden-layer weights and biases, then output weights.
W = weight_var("W", 1, 3)
b = bias_var("b", 3)
V = weight_var("V", 3, 1)

# Hidden activations, followed by the single network output.
z = sigmoid.(W*x + b)
N = only(V*z)
render(N)

L"$(\frac{V_{11}}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))} + \frac{V_{12}}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))} + \frac{V_{13}}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))})$"

In [96]:
# Let SymEngine differentiate the network output with respect to the input.
dNdx = diff(N, x)
render(dNdx)

L"$(\frac{(\mathrm{exp}(-(b_{1} + (x * W_{11}))) * W_{11} * V_{11})}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))^{2}} + \frac{(\mathrm{exp}(-(b_{2} + (x * W_{21}))) * W_{21} * V_{12})}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))^{2}} + \frac{(\mathrm{exp}(-(b_{3} + (x * W_{31}))) * W_{31} * V_{13})}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))^{2}})$"

In [97]:
# I am pretty sure this is the right equation for the derivative of a plain neural network,
# using sigmoid'(u) = sigmoid(-u) * sigmoid(u).
# But SymEngine is not strong enough to recognise that it matches the above.
mdNdx = only((W'.*V) * (sigmoid.(-(W*x + b)) .* sigmoid.(W*x + b)))
# `mdNdx` is a single expression, so `render` is called directly instead of being broadcast.
render(mdNdx)

L"$(\frac{(W_{11} * V_{11})}{((1 + \mathrm{exp}(b_{1} + (x * W_{11}))) * (1 + \mathrm{exp}(-(b_{1} + (x * W_{11})))))} + \frac{(W_{21} * V_{12})}{((1 + \mathrm{exp}(b_{2} + (x * W_{21}))) * (1 + \mathrm{exp}(-(b_{2} + (x * W_{21})))))} + \frac{(W_{31} * V_{13})}{((1 + \mathrm{exp}(b_{3} + (x * W_{31}))) * (1 + \mathrm{exp}(-(b_{3} + (x * W_{31})))))})$"

## Exponential Output Net
Single input, single output still,
1 hidden layer,
exponential output.

In [101]:
@vars x

# Symbolic parameters: hidden-layer weights and biases, then output weights.
W = weight_var("W", 1, 3)
b = bias_var("b", 3)
V = weight_var("V", 3, 1)

# Hidden activations, followed by the exponential of the output-layer sum.
z = sigmoid.(W*x + b)
N = only(exp.(V*z))
render(N)

L"$\mathrm{exp}(\frac{V_{11}}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))} + \frac{V_{12}}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))} + \frac{V_{13}}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))})$"

In [102]:
# Let SymEngine differentiate the exponential-output network with respect to the input.
dNdx = diff(N, x)
render(dNdx)

L"$(\mathrm{exp}(\frac{V_{11}}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))} + \frac{V_{12}}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))} + \frac{V_{13}}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))}) * (\frac{(\mathrm{exp}(-(b_{1} + (x * W_{11}))) * W_{11} * V_{11})}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))^{2}} + \frac{(\mathrm{exp}(-(b_{2} + (x * W_{21}))) * W_{21} * V_{12})}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))^{2}} + \frac{(\mathrm{exp}(-(b_{3} + (x * W_{31}))) * W_{31} * V_{13})}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))^{2}}))$"

In [103]:
# I am pretty sure this is the right equation for the derivative of this network,
# as we just use the standard `d(exp(f(x)))/dx = f'(x)*exp(f(x))` rule.
# But SymEngine is not strong enough to recognise that it matches the above.

# Derivative of the pre-exponential sum, i.e. f'(x) in the rule above.
mdZm1dx = only((W'.*V) * (sigmoid.(-(W*x + b)) .* sigmoid.(W*x + b)))
mdNdx = mdZm1dx*N
# `mdNdx` is a single expression, so `render` is called directly instead of being broadcast.
render(mdNdx)

L"$(\mathrm{exp}(\frac{V_{11}}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))} + \frac{V_{12}}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))} + \frac{V_{13}}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))}) * (\frac{(W_{11} * V_{11})}{((1 + \mathrm{exp}(b_{1} + (x * W_{11}))) * (1 + \mathrm{exp}(-(b_{1} + (x * W_{11})))))} + \frac{(W_{21} * V_{12})}{((1 + \mathrm{exp}(b_{2} + (x * W_{21}))) * (1 + \mathrm{exp}(-(b_{2} + (x * W_{21})))))} + \frac{(W_{31} * V_{13})}{((1 + \mathrm{exp}(b_{3} + (x * W_{31}))) * (1 + \mathrm{exp}(-(b_{3} + (x * W_{31})))))}))$"

## Deeper Exponential Output Net
Single input, single output still,
2 hidden layers,
exponential output.

In [105]:
@vars x

# First hidden layer parameters.
W = weight_var("W", 1, 3)
b = bias_var("b", 3)
# Second hidden layer parameters.
U = weight_var("U", 3, 3)
c = bias_var("c", 3)
# Output layer weights.
V = weight_var("V", 3, 1)

# Two stacked sigmoid layers, followed by the exponential output.
z1 = sigmoid.(W*x + b)
z2 = sigmoid.(U*z1 + c)
N = only(exp.(V*z2))
render(N)

L"$\mathrm{exp}(\frac{V_{11}}{(1 + \mathrm{exp}(-(c_{1} + \frac{U_{11}}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))} + \frac{U_{12}}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))} + \frac{U_{13}}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))})))} + \frac{V_{12}}{(1 + \mathrm{exp}(-(c_{2} + \frac{U_{21}}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))} + \frac{U_{22}}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))} + \frac{U_{23}}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))})))} + \frac{V_{13}}{(1 + \mathrm{exp}(-(c_{3} + \frac{U_{31}}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))} + \frac{U_{32}}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))} + \frac{U_{33}}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))})))})$"

In [106]:
# Let SymEngine differentiate the two-hidden-layer network with respect to the input.
dNdx = diff(N, x)
render(dNdx)

L"$(\mathrm{exp}(\frac{V_{11}}{(1 + \mathrm{exp}(-(c_{1} + \frac{U_{11}}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))} + \frac{U_{12}}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))} + \frac{U_{13}}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))})))} + \frac{V_{12}}{(1 + \mathrm{exp}(-(c_{2} + \frac{U_{21}}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))} + \frac{U_{22}}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))} + \frac{U_{23}}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))})))} + \frac{V_{13}}{(1 + \mathrm{exp}(-(c_{3} + \frac{U_{31}}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))} + \frac{U_{32}}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))} + \frac{U_{33}}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))})))}) * (\frac{(\mathrm{exp}(-(c_{1} + \frac{U_{11}}{(1 + \mathrm{exp}(-(b_{1} + (x * W_{11}))))} + \frac{U_{12}}{(1 + \mathrm{exp}(-(b_{2} + (x * W_{21}))))} + \frac{U_{13}}{(1 + \mathrm{exp}(-(b_{3} + (x * W_{31}))))})) * V_{11} * (\frac{(\mathrm{exp}(-(b_{1} + (x * W_{11}))) * W_{11} * U_{11})}{(1 +

In [107]:
# TODO: work out the clean form of this

### Find the parts that contribute to the gradient

In [151]:
# `@capture` has to be imported explicitly; `using MacroTools: postwalk` alone
# leaves it out of scope.
using MacroTools: postwalk, @capture
# Turn the printed derivative back into a Julia expression tree.
tree = parse(repr(dNdx))

# Strip the tree down step by step:
#   1. every `exp(...)` call becomes 1,
#   2. every `(1+1)^2` left over from step 1 becomes 1,
#   3. every division by 1 is dropped.
# The lambda argument is called `node` so it does not shadow the symbol `x`.
tree = postwalk(node -> @capture(node, exp(z_)) ? 1 : node, tree)
tree = postwalk(node -> @capture(node, (1+1)^2) ? 1 : node, tree)
tree = postwalk(node -> @capture(node, z_/1) ? z : node, tree)
tree |> render

L"$(1 * ((1 * V_{11} * ((1 * W_{11} * U_{11}) + (1 * W_{21} * U_{12}) + (1 * W_{31} * U_{13}))) + (1 * V_{12} * ((1 * W_{11} * U_{21}) + (1 * W_{21} * U_{22}) + (1 * W_{31} * U_{23}))) + (1 * V_{13} * ((1 * W_{11} * U_{31}) + (1 * W_{21} * U_{32}) + (1 * W_{31} * U_{33})))))$"

In [154]:
# Expanded product of the three weight arrays.
expand(only(V*U*W))

W_11*V_11*U_11 + W_11*V_12*U_21 + W_11*V_13*U_31 + W_21*V_11*U_12 + W_21*V_12*U_22 + W_21*V_13*U_32 + W_31*V_11*U_13 + W_31*V_12*U_23 + W_31*V_13*U_33