### References:

- https://discourse.julialang.org/t/slow-arbitrary-base-exponentiation-a-b/25386/21
- https://docs.julialang.org/en/v1/manual/performance-tips/#man-performance-annotations-1
- http://llvm.org/docs/LangRef.html#fast-math-flags

In [4]:
using Revise, BernsteinExpansions, DynamicPolynomials, BenchmarkTools, StaticArrays, Test

┌ Info: Precompiling BernsteinExpansions [afc6e731-2783-597c-a3b4-470ee96642a0]
└ @ Base loading.jl:1273


In [5]:
using BernsteinExpansions: fastpow # DiffEqBase implementation

In [2]:
# https://github.com/etheory/fastapprox/blob/master/fastapprox/src/fastlog.h
function fastlog2(x::Float32)::Float32
    y = Float32(reinterpret(Int32, x))
    y *= 1.1920928955078125f-7
    y - 126.94269504f0
end
function fastlog2(x::Float64)::Float32
   fastlog2(Float32(x))
end

# https://github.com/etheory/fastapprox/blob/master/fastapprox/src/fastexp.h
function _fastpow2(x::Float32)::Float32
    clipp = x < -126.0f0 ? -126.0f0 : x
    clipp = min(126f0, max(-126f0, x))
    reinterpret(Float32, UInt32((1 << 23) * (clipp + 126.94269504f0)))
end
function _fastpow2(x::Float64)::Float32
   _fastpow2(Float32(x))
end

# https://github.com/etheory/fastapprox/blob/master/fastapprox/src/fastpow.h
function fastpow2(x::Real, y::Real)::Real
    _fastpow2(y * fastlog2(x))
end

fastpow2 (generic function with 1 method)

In [6]:
@btime fastpow2(2.0, 4)

  0.016 ns (0 allocations: 0 bytes)


18.750488f0

In [7]:
@btime fastpow(2.0, 4)

  0.015 ns (0 allocations: 0 bytes)


16.0f0

In [9]:
Float64(fastpow2(2.0, 4))

18.75048828125

In [10]:
Float64(fastpow(2.0, 4))

16.0

In [47]:
function uni_1(k::Integer, l::Integer, low::N, high::N) where {N}
    coeffs = Vector{N}(undef, l+1)
    for i in 0:l
        coeffs[i+1] = low^(k-i) * high^i
    end
    return coeffs
end

# using Julia's ^, no fastmath
function uni_2(k::Integer, l::Integer, low::N, high::N) where {N}
    coeffs = Vector{N}(undef, l+1)
    for i in 0:l
        @inbounds coeffs[i+1] = low^(k-i) * high^i
    end
    return coeffs
end

function uni_3(k::Integer, l::Integer, low::N, high::N) where {N}
    coeffs = zeros(N, l+1)
    @simd for i in 0:l
        @inbounds coeffs[i+1] = low^(k-i) * high^i
    end
    return coeffs
end

# using Julia's ^ with fastmath
function uni_4(k::Integer, l::Integer, low::N, high::N) where {N}
    coeffs = Vector{N}(undef, l+1)
    @fastmath for i in 0:l
        @inbounds coeffs[i+1] = low^(k-i) * high^i
    end
    return coeffs
end

function uni_5(k::Integer, l::Integer, low::N, high::N) where {N}
    coeffs = Vector{N}(undef, l+1)
    @fastmath for i in 0:l
        @inbounds coeffs[i+1] = low^(k-i) * high^i
    end
    return coeffs
end

function uni_6(k::Integer, l::Integer, low::N, high::N) where {N}
    coeffs = MVector{l+1, Float64}(undef)
    @fastmath for i in 0:l
        @inbounds coeffs[i+1] = low^(k-i) * high^i
    end
    return coeffs
end

function uni_7(k::Integer, l::Integer, low::N, high::N) where {N}
    coeffs = Vector{N}(undef, l+1)
    @fastmath for i in 0:l
        @inbounds coeffs[i+1] = low^(k-i) * high^i
    end
    return coeffs
end

# using fastpow and fastmath
function uni_8(k::Integer, l::Integer, low::N, high::N) where {N}
    coeffs = Vector{N}(undef, l+1)
    @fastmath for i in 0:l
        @inbounds coeffs[i+1] = fastpow(low, k-i) * fastpow(high, i)
    end
    return coeffs
end

# using fastpow and not fastmath
function uni_9(k::Integer, l::Integer, low::N, high::N) where {N}
    coeffs = Vector{N}(undef, l+1)
    for i in 0:l
        @inbounds coeffs[i+1] = fastpow(low, k-i) * fastpow(high, i)
    end
    return coeffs
end

function uni_10(k::Integer, l::Integer, low::N, high::N) where {N}
    coeffs = Vector{N}(undef, l+1)
    for i in 0:l
        @inbounds coeffs[i+1] = fastpow2(low, k-i) * fastpow2(high, i)
    end
    return coeffs
end

uni_10 (generic function with 1 method)

In [48]:
@btime uni_1(3, 3, 1.0, 2.0)
@btime uni_2(3, 3, 1.0, 2.0) # using Julia's ^, no fastmath
@btime uni_3(3, 3, 1.0, 2.0)
@btime uni_4(3, 3, 1.0, 2.0) # using Julia's ^ and fastmath << FASTEST
@btime uni_5(3, 3, 1.0, 2.0)
@btime uni_6(3, 3, 1.0, 2.0)

  97.147 ns (1 allocation: 112 bytes)
  99.278 ns (1 allocation: 112 bytes)
  104.506 ns (1 allocation: 112 bytes)
  39.151 ns (1 allocation: 112 bytes)
  38.942 ns (1 allocation: 112 bytes)
  1.093 μs (18 allocations: 816 bytes)


4-element MArray{Tuple{4},Float64,1,4} with indices SOneTo(4):
 1.0
 2.0
 4.0
 8.0

In [49]:
@btime uni_7(3, 3, 1.0, 2.0)
@btime uni_8(3, 3, 1.0, 2.0) # using fastpow and fastmath 
@btime uni_9(3, 3, 1.0, 2.0) # using fastpow and NOT fastmath << twice as slow than uni_4, *but* faster than uni_2
@btime uni_10(3, 3, 1.0, 2.0)

  38.769 ns (1 allocation: 112 bytes)
  80.393 ns (1 allocation: 112 bytes)
  80.183 ns (1 allocation: 112 bytes)
  58.393 ns (1 allocation: 112 bytes)


4-element Array{Float64,1}:
 1.0826728343963623
 2.1146087646484375
 4.229248046875    
 8.661382675170898 

In [50]:
uni_10(3, 3, 1.0, 2.0)

4-element Array{Float64,1}:
 1.0826728343963623
 2.1146087646484375
 4.229248046875    
 8.661382675170898 

## Test

In [2]:
using Revise, BernsteinExpansions, DynamicPolynomials, BenchmarkTools, StaticArrays, Test

┌ Info: Precompiling BernsteinExpansions [afc6e731-2783-597c-a3b4-470ee96642a0]
└ @ Base loading.jl:1273


In [3]:
@polyvar x
m = x^3

x³

In [6]:
@btime univariate($m, 3, 1..2)

  43.479 ns (1 allocation: 112 bytes)


4-element Array{Float64,1}:
 1.0
 2.0
 4.0
 8.0

In [8]:
m = x^5
@btime univariate($m, 2, 0..1)

  39.891 ns (1 allocation: 112 bytes)


3-element Array{Float64,1}:
 0.0
 0.0
 0.0

In [13]:
@btime univariate(x^2, 2, 0..4)

  105.720 ns (4 allocations: 336 bytes)


3-element Array{Float64,1}:
  0.0
  0.0
 16.0

In [28]:
coeffs = Vector{Float64}(undef, 4)
using BernsteinExpansions: _univariate!

In [29]:
@btime _univariate!($coeffs, 4, 3, 1.0, 4.0, Val(:fastmath))
@btime _univariate!($coeffs, 4, 3, 1.0, 4.0, Val(:fastpow))
@btime _univariate!($coeffs, 4, 3, 1.0, 4.0, Val(:base))

  21.285 ns (0 allocations: 0 bytes)
  63.144 ns (0 allocations: 0 bytes)
  87.559 ns (0 allocations: 0 bytes)


4-element Array{Float64,1}:
  1.0
  4.0
 16.0
 64.0

In [26]:
m = x^4
@btime univariate($m, 3, 1..4)

  43.859 ns (1 allocation: 112 bytes)


4-element Array{Float64,1}:
  1.0
  4.0
 16.0
 64.0

In [27]:
univariate(x^4, 3, 1..4)

4-element Array{Float64,1}:
  1.0
  4.0
 16.0
 64.0

In [31]:
@which 2.0^4