From 0cb173695ba2083812d9d6ddba665ee0f5702dce Mon Sep 17 00:00:00 2001
From: Chris Elrod <elrodc@gmail.com>
Date: Tue, 25 Dec 2018 02:01:07 -0600
Subject: [PATCH 1/7] Added support for SIMD.jl.

---
 Manifest.toml |  39 +++++
 Project.toml  |   9 +-
 src/SLEEF.jl  |  22 +++
 src/double.jl | 111 +++++++-------
 src/exp.jl    |  95 ++++++------
 src/hyp.jl    |  63 ++++----
 src/log.jl    |  86 ++++++-----
 src/misc.jl   | 104 ++++++-------
 src/priv.jl   | 214 ++++++++++++++-------------
 src/trig.jl   | 395 +++++++++++++++++++++++++++++---------------------
 src/utils.jl  |  49 +++++--
 11 files changed, 690 insertions(+), 497 deletions(-)
 create mode 100644 Manifest.toml

diff --git a/Manifest.toml b/Manifest.toml
new file mode 100644
index 0000000..ee50dd2
--- /dev/null
+++ b/Manifest.toml
@@ -0,0 +1,39 @@
+# This file is machine-generated - editing it directly is not advised
+
+[[Base64]]
+uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
+
+[[Distributed]]
+deps = ["Random", "Serialization", "Sockets"]
+uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
+
+[[InteractiveUtils]]
+deps = ["Markdown"]
+uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
+
+[[Logging]]
+uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
+
+[[Markdown]]
+deps = ["Base64"]
+uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
+
+[[Random]]
+deps = ["Serialization"]
+uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+
+[[SIMD]]
+deps = ["InteractiveUtils", "Test"]
+git-tree-sha1 = "c6f6f4a8103e69c66b06def75f04904f72111c51"
+uuid = "fdea26ae-647d-5447-a871-4b548cad5224"
+version = "2.2.0"
+
+[[Serialization]]
+uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
+
+[[Sockets]]
+uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
+
+[[Test]]
+deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
+uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/Project.toml b/Project.toml
index 123e1f5..7dbd6b1 100644
--- a/Project.toml
+++ b/Project.toml
@@ -2,11 +2,14 @@ name = "SLEEF"
 uuid = "3e6341c9-01f6-5312-b33f-f8894a2e817a"
 license = "MIT"
 repo = "https://github.com/musm/SLEEF.jl.git"
-version = "v0.5.1"
+version = "0.5.1"
+
+[deps]
+SIMD = "fdea26ae-647d-5447-a871-4b548cad5224"
 
 [extras]
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test","Printf"]
+test = ["Test", "Printf"]
diff --git a/src/SLEEF.jl b/src/SLEEF.jl
index 23d8d6b..0065690 100644
--- a/src/SLEEF.jl
+++ b/src/SLEEF.jl
@@ -8,6 +8,28 @@ module SLEEF
 
 using Base.Math: uinttype, @horner, exponent_bias, exponent_mask, significand_bits, IEEEFloat, exponent_raw_max
 
+using SIMD
+using SIMD: vifelse
+
+const FloatType64 = Union{Float64,Vec{<:Any,Float64}}
+const FloatType32 = Union{Float32,Vec{<:Any,Float32}}
+const FloatType = Union{FloatType64,FloatType32}
+const IntegerType64 = Union{Int64,Vec{<:Any,Int64}}
+const IntegerType32 = Union{Int32,Vec{<:Any,Int32}}
+const IntegerType = Union{IntegerType64,IntegerType32}
+
+EquivalentInteger(::Type{Float64}) = Int64
+EquivalentInteger(::Type{Float32}) = Int32
+EquivalentInteger(::Type{Vec{N,Float64}}) where N = Vec{N,Int64}
+EquivalentInteger(::Type{Vec{N,Float32}}) where N = Vec{N,Int32}
+
+@generated function Base.unsafe_trunc(::Type{I}, x::Vec{N,T}) where {N,T,I}
+    quote
+        $(Expr(:meta,:inline))
+        Vec{$N,$I}($(Expr(:tuple, [:(Core.VecElement(unsafe_trunc($I, x[$n]))) for n ∈ 1:N]...)))
+    end
+end
+
 ## constants
 
 const MLN2  = 6.931471805599453094172321214581765680755001343602552541206800094933936219696955e-01 # log(2)
diff --git a/src/double.jl b/src/double.jl
index d073ef8..c376c77 100644
--- a/src/double.jl
+++ b/src/double.jl
@@ -1,18 +1,27 @@
 import Base: -, <, copysign, flipsign, convert
 
-struct Double{T<:IEEEFloat} <: Number
+const vIEEEFloat = Union{IEEEFloat,Vec{<:Any,<:IEEEFloat}}
+
+struct Double{T<:vIEEEFloat} <: Number
     hi::T
     lo::T
 end
-Double(x::T) where {T<:IEEEFloat} = Double(x, zero(T))
+Double(x::T) where {T<:vIEEEFloat} = Double(x, zero(T))
 
-(::Type{T})(x::Double{T}) where {T<:IEEEFloat} = x.hi + x.lo
+(::Type{T})(x::Double{T}) where {T<:vIEEEFloat} = x.hi + x.lo
 
 
 @inline trunclo(x::Float64) = reinterpret(Float64, reinterpret(UInt64, x) & 0xffff_ffff_f800_0000) # clear lower 27 bits (leave upper 26 bits)
 @inline trunclo(x::Float32) = reinterpret(Float32, reinterpret(UInt32, x) & 0xffff_f000) # clear lowest 12 bits (leave upper 12 bits)
 
-@inline function splitprec(x::IEEEFloat)
+@inline function trunclo(x::Vec{N,Float64}) where N
+    reinterpret(Vec{N,Float64}, reinterpret(Vec{N,UInt64}, x) & 0xffff_ffff_f800_0000) # clear lower 27 bits (leave upper 26 bits)
+end
+@inline function trunclo(x::Vec{N,Float32}) where N
+    reinterpret(Vec{N,Float32}, reinterpret(Vec{N,UInt32}, x) & 0xffff_f000) # clear lowest 12 bits (leave upper 12 bits)
+end
+
+@inline function splitprec(x::vIEEEFloat)
     hx = trunclo(x)
     hx, x - hx
 end
@@ -22,171 +31,175 @@ end
     Double(r, (x.hi - r) + x.lo)
 end
 
-@inline flipsign(x::Double{T}, y::T) where {T<:IEEEFloat} = Double(flipsign(x.hi, y), flipsign(x.lo, y))
+@inline flipsign(x::Double{<:vIEEEFloat}, y::vIEEEFloat) = Double(flipsign(x.hi, y), flipsign(x.lo, y))
 
-@inline scale(x::Double{T}, s::T) where {T<:IEEEFloat} = Double(s * x.hi, s * x.lo)
+@inline scale(x::Double{<:vIEEEFloat}, s::vIEEEFloat) = Double(s * x.hi, s * x.lo)
 
 
-@inline (-)(x::Double{T}) where {T<:IEEEFloat} = Double(-x.hi, -x.lo)
+@inline (-)(x::Double{T}) where {T<:vIEEEFloat} = Double(-x.hi, -x.lo)
 
-@inline function (<)(x::Double{T}, y::Double{T}) where {T<:IEEEFloat}
+@inline function (<)(x::Double{<:vIEEEFloat}, y::Double{<:vIEEEFloat})
     x.hi < y.hi
 end
 
-@inline function (<)(x::Double{T}, y::Number) where {T<:IEEEFloat}
+@inline function (<)(x::Double{<:vIEEEFloat}, y::Union{Number,Vec})
     x.hi < y
 end
 
-@inline function (<)(x::Number, y::Double{T}) where {T<:IEEEFloat}
+@inline function (<)(x::Union{Number,Vec}, y::Double{<:vIEEEFloat})
     x < y.hi
 end
 
 # quick-two-sum x+y
-@inline function dadd(x::T, y::T) where {T<:IEEEFloat} #WARNING |x| >= |y|
+@inline function dadd(x::vIEEEFloat, y::vIEEEFloat) #WARNING |x| >= |y|
     s = x + y
     Double(s, (x - s) + y)
 end
 
-@inline function dadd(x::T, y::Double{T}) where {T<:IEEEFloat} #WARNING |x| >= |y|
+@inline function dadd(x::vIEEEFloat, y::Double{<:vIEEEFloat}) #WARNING |x| >= |y|
     s = x + y.hi
     Double(s, (x - s) + y.hi + y.lo)
 end
 
-@inline function dadd(x::Double{T}, y::T) where {T<:IEEEFloat} #WARNING |x| >= |y|
+@inline function dadd(x::Double{<:vIEEEFloat}, y::vIEEEFloat) #WARNING |x| >= |y|
     s = x.hi + y
     Double(s, (x.hi - s) + y + x.lo)
 end
 
-@inline function dadd(x::Double{T}, y::Double{T}) where {T<:IEEEFloat} #WARNING |x| >= |y|
+@inline function dadd(x::Double{<:vIEEEFloat}, y::Double{<:vIEEEFloat}) #WARNING |x| >= |y|
     s = x.hi + y.hi
     Double(s, (x.hi - s) + y.hi + y.lo + x.lo)
 end
 
-@inline function dsub(x::Double{T}, y::Double{T}) where {T<:IEEEFloat} #WARNING |x| >= |y|
+@inline function dsub(x::Double{<:vIEEEFloat}, y::Double{<:vIEEEFloat}) #WARNING |x| >= |y|
     s = x.hi - y.hi
     Double(s, (x.hi - s) - y.hi - y.lo + x.lo)
 end
 
-@inline function dsub(x::Double{T}, y::T) where {T<:IEEEFloat} #WARNING |x| >= |y|
+@inline function dsub(x::Double{<:vIEEEFloat}, y::vIEEEFloat) #WARNING |x| >= |y|
     s = x.hi - y
     Double(s, (x.hi - s) - y + x.lo)
 end
 
-@inline function dsub(x::T, y::Double{T}) where {T<:IEEEFloat} #WARNING |x| >= |y|
+@inline function dsub(x::vIEEEFloat, y::Double{<:vIEEEFloat}) #WARNING |x| >= |y|
     s = x - y.hi
     Double(s, (x - s) - y.hi - y.lo)
 end
 
-@inline function dsub(x::T, y::T) where {T<:IEEEFloat} #WARNING |x| >= |y|
+@inline function dsub(x::vIEEEFloat, y::vIEEEFloat) #WARNING |x| >= |y|
     s = x - y
     Double(s, (x - s) - y)
 end
 
 
 # two-sum x+y  NO BRANCH
-@inline function dadd2(x::T, y::T) where {T<:IEEEFloat}
+@inline function dadd2(x::vIEEEFloat, y::vIEEEFloat)
     s = x + y
     v = s - x
     Double(s, (x - (s - v)) + (y - v))
 end
 
-@inline function dadd2(x::T, y::Double{T}) where {T<:IEEEFloat}
+@inline function dadd2(x::vIEEEFloat, y::Double{<:vIEEEFloat})
     s = x + y.hi
     v = s - x
     Double(s, (x - (s - v)) + (y.hi - v) + y.lo)
 end
 
-@inline dadd2(x::Double{T}, y::T) where {T<:IEEEFloat} = dadd2(y, x)
+@inline dadd2(x::Double{<:vIEEEFloat}, y::vIEEEFloat) = dadd2(y, x)
 
-@inline function dadd2(x::Double{T}, y::Double{T}) where {T<:IEEEFloat}
+@inline function dadd2(x::Double{<:vIEEEFloat}, y::Double{<:vIEEEFloat}) where {T<:vIEEEFloat}
     s = x.hi + y.hi
     v = s - x.hi
     Double(s, (x.hi - (s - v)) + (y.hi - v) + x.lo + y.lo)
 end
 
-@inline function dsub2(x::T, y::T) where {T<:IEEEFloat}
+@inline function dsub2(x::vIEEEFloat, y::vIEEEFloat)
     s = x - y
     v = s - x
     Double(s, (x - (s - v)) + (-y - v))
 end
 
-@inline function dsub2(x::T, y::Double{T}) where {T<:IEEEFloat}
+@inline function dsub2(x::vIEEEFloat, y::Double{<:vIEEEFloat})
     s = x - y.hi
     v = s - x
     Double(s, (x - (s - v)) + (-y.hi - v) - y.lo)
 end
 
-@inline function dsub2(x::Double{T}, y::T) where {T<:IEEEFloat}
+@inline function dsub2(x::Double{<:vIEEEFloat}, y::vIEEEFloat)
     s = x.hi - y
     v = s - x.hi
     Double(s, (x.hi - (s - v)) + (-y - v) + x.lo)
 end
 
-@inline function dsub2(x::Double{T}, y::Double{T}) where {T<:IEEEFloat}
+@inline function dsub2(x::Double{<:vIEEEFloat}, y::Double{<:vIEEEFloat})
     s = x.hi - y.hi
     v = s - x.hi
     Double(s, (x.hi - (s - v)) + (-y.hi - v) + x.lo - y.lo)
 end
 
-
+@inline (::Type{Vec{N,T}})(x::Vec{N,T}) where {N,T} = x
+@inline function SIMD.vifelse(b::Vec{N,Bool}, x::Double{T1}, y::Double{T2}) where {N,T<:Union{Float32,Float64},T1<:Union{T,Vec{N,T}},T2<:Union{T,Vec{N,T}}}
+    V = Vec{N,T}
+    Double(vifelse(b, V(x.hi), V(y.hi)), vifelse(b, V(x.lo), V(y.lo)))
+end
 
 if FMA_FAST
 
     # two-prod-fma
-    @inline function dmul(x::T, y::T) where {T<:IEEEFloat}
+    @inline function dmul(x::vIEEEFloat, y::vIEEEFloat)
         z = x * y
         Double(z, fma(x, y, -z))
     end
 
-    @inline function dmul(x::Double{T}, y::T) where {T<:IEEEFloat}
+    @inline function dmul(x::Double{<:vIEEEFloat}, y::vIEEEFloat)
         z = x.hi * y
         Double(z, fma(x.hi, y, -z) + x.lo * y)
     end
 
-    @inline dmul(x::T, y::Double{T}) where {T<:IEEEFloat} = dmul(y, x)
+    @inline dmul(x::vIEEEFloat, y::Double{<:vIEEEFloat}) = dmul(y, x)
 
-    @inline function dmul(x::Double{T}, y::Double{T}) where {T<:IEEEFloat}
+    @inline function dmul(x::Double{<:vIEEEFloat}, y::Double{<:vIEEEFloat})
         z = x.hi * y.hi
         Double(z, fma(x.hi, y.hi, -z) + x.hi * y.lo + x.lo * y.hi)
     end
 
     # x^2
-    @inline function dsqu(x::T) where {T<:IEEEFloat}
+    @inline function dsqu(x::T) where {T<:vIEEEFloat}
         z = x * x
         Double(z, fma(x, x, -z))
     end
 
-    @inline function dsqu(x::Double{T}) where {T<:IEEEFloat}
+    @inline function dsqu(x::Double{T}) where {T<:vIEEEFloat}
         z = x.hi * x.hi
         Double(z, fma(x.hi, x.hi, -z) + x.hi * (x.lo + x.lo))
     end
 
     # sqrt(x)
-    @inline function dsqrt(x::Double{T}) where {T<:IEEEFloat}
+    @inline function dsqrt(x::Double{T}) where {T<:vIEEEFloat}
         zhi = _sqrt(x.hi)
         Double(zhi, (x.lo + fma(-zhi, zhi, x.hi)) / (zhi + zhi))
     end
 
     # x/y
-    @inline function ddiv(x::Double{T}, y::Double{T}) where {T<:IEEEFloat}
+    @inline function ddiv(x::Double{<:vIEEEFloat}, y::Double{<:vIEEEFloat})
         invy = 1 / y.hi
         zhi = x.hi * invy
         Double(zhi, (fma(-zhi, y.hi, x.hi) + fma(-zhi, y.lo, x.lo)) * invy)
     end
 
-    @inline function ddiv(x::T, y::T) where {T<:IEEEFloat}
+    @inline function ddiv(x::vIEEEFloat, y::vIEEEFloat)
         ry = 1 / y
         r = x * ry
         Double(r, fma(-r, y, x) * ry)
     end
 
     # 1/x
-    @inline function drec(x::T) where {T<:IEEEFloat}
+    @inline function drec(x::vIEEEFloat)
         zhi = 1 / x
         Double(zhi, fma(-zhi, x, one(T)) * zhi)
     end
 
-    @inline function drec(x::Double{T}) where {T<:IEEEFloat}
+    @inline function drec(x::Double{<:vIEEEFloat})
         zhi = 1 / x.hi
         Double(zhi, (fma(-zhi, x.hi, one(T)) + -zhi * x.lo) * zhi)
     end
@@ -194,23 +207,23 @@ if FMA_FAST
 else
 
     #two-prod x*y
-    @inline function dmul(x::T, y::T) where {T<:IEEEFloat}
+    @inline function dmul(x::vIEEEFloat, y::vIEEEFloat)
         hx, lx = splitprec(x)
         hy, ly = splitprec(y)
         z = x * y
         Double(z, ((hx * hy - z) + lx * hy + hx * ly) + lx * ly)
     end
 
-    @inline function dmul(x::Double{T}, y::T) where {T<:IEEEFloat}
+    @inline function dmul(x::Double{<:vIEEEFloat}, y::vIEEEFloat)
         hx, lx = splitprec(x.hi)
         hy, ly = splitprec(y)
         z = x.hi * y
         Double(z, (hx * hy - z) + lx * hy + hx * ly + lx * ly + x.lo * y)
     end
 
-    @inline dmul(x::T, y::Double{T}) where {T<:IEEEFloat} = dmul(y, x)
+    @inline dmul(x::vIEEEFloat, y::Double{<:vIEEEFloat}) = dmul(y, x)
 
-    @inline function dmul(x::Double{T}, y::Double{T}) where {T<:IEEEFloat}
+    @inline function dmul(x::Double{<:vIEEEFloat}, y::Double{<:vIEEEFloat})
         hx, lx = splitprec(x.hi)
         hy, ly = splitprec(y.hi)
         z = x.hi * y.hi
@@ -218,34 +231,34 @@ else
     end
 
     # x^2
-    @inline function dsqu(x::T) where {T<:IEEEFloat}
+    @inline function dsqu(x::T) where {T<:vIEEEFloat}
         hx, lx = splitprec(x)
         z = x * x
         Double(z, (hx * hx - z) + lx * (hx + hx) + lx * lx)
     end
 
-    @inline function dsqu(x::Double{T}) where {T<:IEEEFloat}
+    @inline function dsqu(x::Double{T}) where {T<:vIEEEFloat}
         hx, lx = splitprec(x.hi)
         z = x.hi * x.hi
         Double(z, (hx * hx - z) + lx * (hx + hx) + lx * lx + x.hi * (x.lo + x.lo))
     end
 
     # sqrt(x)
-    @inline function dsqrt(x::Double{T}) where {T<:IEEEFloat}
+    @inline function dsqrt(x::Double{T}) where {T<:vIEEEFloat}
         c = _sqrt(x.hi)
         u = dsqu(c)
         Double(c, (x.hi - u.hi - u.lo + x.lo) / (c + c))
     end
 
     # x/y
-    @inline function ddiv(x::Double{T}, y::Double{T}) where {T<:IEEEFloat}
+    @inline function ddiv(x::Double{<:vIEEEFloat}, y::Double{<:vIEEEFloat})
         invy = 1 / y.hi
         c = x.hi * invy
         u = dmul(c, y.hi)
         Double(c, ((((x.hi - u.hi) - u.lo) + x.lo) - c * y.lo) * invy)
     end
 
-    @inline function ddiv(x::T, y::T) where {T<:IEEEFloat}
+    @inline function ddiv(x::vIEEEFloat, y::vIEEEFloat)
         ry = 1 / y
         r = x * ry
         hx, lx = splitprec(r)
@@ -255,13 +268,13 @@ else
 
 
     # 1/x
-    @inline function drec(x::T) where {T<:IEEEFloat}
+    @inline function drec(x::T) where {T<:vIEEEFloat}
         c = 1 / x
         u = dmul(c, x)
         Double(c, (one(T) - u.hi - u.lo) * c)
     end
 
-    @inline function drec(x::Double{T}) where {T<:IEEEFloat}
+    @inline function drec(x::Double{T}) where {T<:vIEEEFloat}
         c = 1 / x.hi
         u = dmul(c, x.hi)
         Double(c, (one(T) - u.hi - u.lo - c * x.lo) * c)
diff --git a/src/exp.jl b/src/exp.jl
index 3e5820a..4870c59 100644
--- a/src/exp.jl
+++ b/src/exp.jl
@@ -5,7 +5,7 @@
 
 Computes `a × 2^n`
 """
-ldexp(x::Union{Float32,Float64}, q::Int) = ldexpk(x, q)
+ldexp(x::FloatType, q::IntegerType) = ldexpk(x, q)
 
 
 const max_exp2(::Type{Float64}) = 1024
@@ -14,7 +14,7 @@ const max_exp2(::Type{Float32}) = 128f0
 const min_exp2(::Type{Float64}) = -1075
 const min_exp2(::Type{Float32}) = -150f0
 
-@inline function exp2_kernel(x::Float64)
+@inline function exp2_kernel(x::FloatType64)
     c11 = 0.4434359082926529454e-9
     c10 = 0.7073164598085707425e-8
     c9  = 0.1017819260921760451e-6
@@ -26,17 +26,17 @@ const min_exp2(::Type{Float32}) = -150f0
     c3  = 0.5550410866482046596e-1
     c2  = 0.2402265069591012214
     c1  = 0.6931471805599452862
-    return @horner x c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11
+    @horner x c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11
 end
 
-@inline function exp2_kernel(x::Float32)
+@inline function exp2_kernel(x::FloatType32)
     c6 = 0.1535920892f-3
     c5 = 0.1339262701f-2
     c4 = 0.9618384764f-2
     c3 = 0.5550347269f-1
     c2 = 0.2402264476f0
     c1 = 0.6931471825f0
-    return @horner x c1 c2 c3 c4 c5 c6
+    @horner x c1 c2 c3 c4 c5 c6
 end
 
 """
@@ -44,30 +44,31 @@ end
 
 Compute the base-`2` exponential of `x`, that is `2ˣ`.
 """
-function exp2(d::T) where {T<:Union{Float32,Float64}}
+function exp2(d::V) where V <: FloatType
+    T = eltype(d)
     q = round(d)
-    qi = unsafe_trunc(Int, q)
+    qi = unsafe_trunc(EquivalentInteger(T), q)
 
     s = d - q
 
     u = exp2_kernel(s)
-    u = T(dnormalize(dadd(T(1.0), dmul(u,s))))
+    u = V(dnormalize(dadd(T(1.0), dmul(u,s))))
 
     u = ldexp2k(u, qi)
 
-    d > max_exp2(T) && (u = T(Inf))
-    d < min_exp2(T) && (u = T(0.0))
-    return u
+    u = vifelse(d > max_exp2(T), T(Inf), u)
+    u = vifelse(d < min_exp2(T), T(0.0), u)
+    u
 end
 
 
-const max_exp10(::Type{Float64}) = 3.08254715559916743851e2 # log 2^1023*(2-2^-52)
-const max_exp10(::Type{Float32}) = 38.531839419103626f0 # log 2^127 *(2-2^-23) 
+const max_exp10(::Type{<:FloatType64}) = 3.08254715559916743851e2 # log 2^1023*(2-2^-52)
+const max_exp10(::Type{<:FloatType32}) = 38.531839419103626f0 # log 2^127 *(2-2^-23)
 
-const min_exp10(::Type{Float64}) = -3.23607245338779784854769e2 # log10 2^-1075
-const min_exp10(::Type{Float32}) = -45.15449934959718f0         # log10 2^-150
+const min_exp10(::Type{<:FloatType64}) = -3.23607245338779784854769e2 # log10 2^-1075
+const min_exp10(::Type{<:FloatType32}) = -45.15449934959718f0         # log10 2^-150
 
-@inline function exp10_kernel(x::Float64)
+@inline function exp10_kernel(x::FloatType64)
     c11 = 0.2411463498334267652e-3
     c10 = 0.1157488415217187375e-2
     c9  = 0.5013975546789733659e-2
@@ -82,7 +83,7 @@ const min_exp10(::Type{Float32}) = -45.15449934959718f0         # log10 2^-150
     return @horner x c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11
 end
 
-@inline function exp10_kernel(x::Float32)
+@inline function exp10_kernel(x::FloatType32)
     c6 = 0.2064004987f0
     c5 = 0.5417877436f0
     c4 = 0.1171286821f1
@@ -97,52 +98,54 @@ end
 
 Compute the base-`10` exponential of `x`, that is `10ˣ`.
 """
-function exp10(d::T) where {T<:Union{Float32,Float64}}
+function exp10(d::V) where V <: FloatType
+    T = eltype(d)
     q = round(T(MLOG10_2) * d)
-    qi = unsafe_trunc(Int, q)
+    qi = unsafe_trunc(EquivalentInteger(T), q)
 
     s = muladd(q, -L10U(T), d)
     s = muladd(q, -L10L(T), s)
 
     u = exp10_kernel(s)
-    u = T(dnormalize(dadd(T(1.0), dmul(u,s))))
+    u = V(dnormalize(dadd(T(1.0), dmul(u,s))))
 
     u = ldexp2k(u, qi)
 
-    d > max_exp10(T) && (u = T(Inf))
-    d < min_exp10(T) && (u = T(0.0))
+    u = vifelse(d > max_exp10(T), T(Inf), u)
+    u = vifelse(d < min_exp10(T), T(0.0), u)
 
-    return u
+    u
 end
 
 
-const max_expm1(::Type{Float64}) = 7.09782712893383996732e2 # log 2^1023*(2-2^-52)
-const max_expm1(::Type{Float32}) = 88.72283905206835f0 # log 2^127 *(2-2^-23)
+const max_expm1(::Type{<:FloatType64}) = 7.09782712893383996732e2 # log 2^1023*(2-2^-52)
+const max_expm1(::Type{<:FloatType32}) = 88.72283905206835f0 # log 2^127 *(2-2^-23)
 
-const min_expm1(::Type{Float64}) = -37.42994775023704434602223
-const min_expm1(::Type{Float32}) = -17.3286790847778338076068394f0
+const min_expm1(::Type{<:FloatType64}) = -37.42994775023704434602223
+const min_expm1(::Type{<:FloatType32}) = -17.3286790847778338076068394f0
 
 """
     expm1(x)
 
 Compute `eˣ- 1` accurately for small values of `x`.
 """
-function expm1(x::T) where {T<:Union{Float32,Float64}}
+function expm1(x::FloatType)
+    T = eltype(x)
     u = T(dadd2(expk2(Double(x)), -T(1.0)))
-    x > max_expm1(T) && (u = T(Inf))
-    x < min_expm1(T) && (u = -T(1.0))
-    isnegzero(x) && (u = T(-0.0))
+    u = vifelse(x > max_expm1(T), T(Inf), u)
+    u = vifelse(x < min_expm1(T), T(-1.0), u)
+    u = vifelse(isnegzero(x), T(-0.0), u)
     return u
 end
 
 
-const max_exp(::Type{Float64}) = 709.78271114955742909217217426  # log 2^1023*(2-2^-52)
-const max_exp(::Type{Float32}) = 88.72283905206835f0             # log 2^127 *(2-2^-23)
+const max_exp(::Type{<:FloatType64}) = 709.78271114955742909217217426  # log 2^1023*(2-2^-52)
+const max_exp(::Type{<:FloatType32}) = 88.72283905206835f0             # log 2^127 *(2-2^-23)
 
-const min_exp(::Type{Float64}) = -7.451332191019412076235e2 # log 2^-1075
-const min_exp(::Type{Float32}) = -103.97208f0               # ≈ log 2^-150
+const min_exp(::Type{<:FloatType64}) = -7.451332191019412076235e2 # log 2^-1075
+const min_exp(::Type{<:FloatType32}) = -103.97208f0               # ≈ log 2^-150
 
-@inline function exp_kernel(x::Float64)
+@inline function exp_kernel(x::FloatType64)
     c11 = 2.08860621107283687536341e-09
     c10 = 2.51112930892876518610661e-08
     c9  = 2.75573911234900471893338e-07
@@ -154,17 +157,17 @@ const min_exp(::Type{Float32}) = -103.97208f0               # ≈ log 2^-150
     c3  = 0.0416666666666665047591422
     c2  = 0.166666666666666851703837
     c1  = 0.50
-    return @horner x c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11
+    @horner x c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11
 end
 
-@inline function exp_kernel(x::Float32)
+@inline function exp_kernel(x::FloatType32)
     c6 = 0.000198527617612853646278381f0
     c5 = 0.00139304355252534151077271f0
     c4 = 0.00833336077630519866943359f0
     c3 = 0.0416664853692054748535156f0
     c2 = 0.166666671633720397949219f0
     c1 = 0.5f0
-    return @horner x c1 c2 c3 c4 c5 c6
+    @horner x c1 c2 c3 c4 c5 c6
 end
 
 """
@@ -172,20 +175,20 @@ end
 
 Compute the base-`e` exponential of `x`, that is `eˣ`.
 """
-function exp(d::T) where {T<:Union{Float32,Float64}}
+function exp(d::FloatType)
+    T = eltype(d)
     q = round(T(MLN2E) * d)
-    qi = unsafe_trunc(Int, q)
+    qi = unsafe_trunc(EquivalentInteger(T), q)
 
     s = muladd(q, -L2U(T), d)
     s = muladd(q, -L2L(T), s)
 
     u = exp_kernel(s)
-
-    u = s * s * u + s + 1
+    u = muladd(s * s, u, s) + 1
     u = ldexp2k(u, qi)
 
-    d > max_exp(T) && (u = T(Inf))
-    d < min_exp(T) && (u = T(0))
+    u = vifelse(d > max_exp(T), T(Inf), u)
+    u = vifelse(d < min_exp(T), T(0), u)
 
-    return u
+    u
 end
diff --git a/src/hyp.jl b/src/hyp.jl
index 4e30def..84f4a4f 100644
--- a/src/hyp.jl
+++ b/src/hyp.jl
@@ -8,15 +8,16 @@ over_sch(::Type{Float32}) = 89f0
 
 Compute hyperbolic sine of `x`.
 """
-function sinh(x::T) where {T<:Union{Float32,Float64}}
+function sinh(x::FloatType)
+    T = eltype(x)
     u = abs(x)
     d = expk2(Double(u))
     d = dsub(d, drec(d))
     u = T(d) * T(0.5)
-    u = abs(x) > over_sch(T) ? T(Inf) : u
-    u = isnan(u) ? T(Inf) : u
+    u = vifelse(abs(x) > over_sch(T), T(Inf), u)
+    u = vifelse(isnan(u), T(Inf), u)
     u = flipsign(u, x)
-    u = isnan(x) ? T(NaN) : u
+    u = vifelse(isnan(x), T(NaN), u)
     return u
 end
 
@@ -27,14 +28,15 @@ end
 
 Compute hyperbolic cosine of `x`.
 """
-function cosh(x::T) where {T<:Union{Float32,Float64}}
+function cosh(x::FloatType)
+    T = eltype(x)
     u = abs(x)
     d = expk2(Double(u))
     d = dadd(d, drec(d))
     u = T(d) * T(0.5)
-    u = abs(x) > over_sch(T) ? T(Inf) : u
-    u = isnan(u) ? T(Inf) : u
-    u = isnan(x) ? T(NaN) : u
+    u = vifelse(abs(x) > over_sch(T), T(Inf), u)
+    u = vifelse(isnan(u), T(Inf), u)
+    u = vifelse(isnan(x), T(NaN), u)
     return u
 end
 
@@ -48,16 +50,17 @@ over_th(::Type{Float32}) = 18.714973875f0
 
 Compute hyperbolic tangent of `x`.
 """
-function tanh(x::T) where {T<:Union{Float32,Float64}}
+function tanh(x::FloatType)
+    T = eltype(x)
     u = abs(x)
     d = expk2(Double(u))
     e = drec(d)
     d = ddiv(dsub(d, e), dadd(d, e))
     u = T(d)
-    u = abs(x) > over_th(T) ? T(1.0) : u
-    u = isnan(u) ? T(1) : u
+    u = vifelse(abs(x) > over_th(T), T(1.0), u)
+    u = vifelse(isnan(u), T(1), u)
     u = flipsign(u, x)
-    u = isnan(x) ? T(NaN) : u
+    u = vifelse(isnan(x), T(NaN), u)
     return u
 end
 
@@ -68,20 +71,22 @@ end
 
 Compute the inverse hyperbolic sine of `x`.
 """
-function asinh(x::T) where {T<:Union{Float32,Float64}}
+function asinh(x::V) where V <: FloatType
+    T = eltype(x)
     y = abs(x)
 
-    d = y > 1 ? drec(x) : Double(y, T(0.0))
+    yg1 = y > 1
+    d = vifelse(yg1, drec(x), Double(y, V(0.0)))
     d = dsqrt(dadd2(dsqu(d), T(1.0)))
-    d = y > 1 ? dmul(d, y) : d
+    d = vifelse(yg1, dmul(d, y), d)
 
     d = logk2(dnormalize(dadd(d, x)))
     y = T(d)
 
-    y = (abs(x) > SQRT_MAX(T) || isnan(y)) ? flipsign(T(Inf), x) : y
-    y = isnan(x) ? T(NaN) : y
-    y = isnegzero(x) ? T(-0.0) : y
-    
+    y = vifelse(((abs(x) > SQRT_MAX(T)) | isnan(y)), flipsign(T(Inf), x), y)
+    y = vifelse(isnan(x), T(NaN), y)
+    y = vifelse(isnegzero(x), T(-0.0), y)
+
     return y
 end
 
@@ -92,14 +97,15 @@ end
 
 Compute the inverse hyperbolic cosine of `x`.
 """
-function acosh(x::T) where {T<:Union{Float32,Float64}}
+function acosh(x::FloatType)
+    T = eltype(x)
     d = logk2(dadd2(dmul(dsqrt(dadd2(x, T(1.0))), dsqrt(dsub2(x, T(1.0)))), x))
     y = T(d)
 
-    y = (x > SQRT_MAX(T) || isnan(y)) ? T(Inf) : y
-    y = x == T(1.0) ? T(0.0) : y
-    y = x < T(1.0) ? T(NaN) : y
-    y = isnan(x) ? T(NaN) : y
+    y = vifelse(((x > SQRT_MAX(T)) | isnan(y)), T(Inf), y)
+    y = vifelse(x == T(1.0), T(0.0), y)
+    y = vifelse(x < T(1.0), T(NaN), y)
+    y = vifelse(isnan(x), T(NaN), y)
 
     return y
 end
@@ -111,14 +117,15 @@ end
 
 Compute the inverse hyperbolic tangent of `x`.
 """
-function atanh(x::T) where {T<:Union{Float32,Float64}}
+function atanh(x::FloatType)
+    T = eltype(x)
     u = abs(x)
     d = logk2(ddiv(dadd2(T(1.0), u), dsub2(T(1.0), u)))
-    u = u > T(1.0) ? T(NaN) : (u == T(1.0) ? T(Inf) : T(d) * T(0.5))
+    u = vifelse(u > T(1.0), T(NaN), vifelse(u == T(1.0), T(Inf), T(d) * T(0.5)))
 
-    u = isinf(x) || isnan(u) ? T(NaN) : u
+    u = vifelse(isinf(x) | isnan(u), T(NaN), u)
     u = flipsign(u, x)
-    u = isnan(x) ? T(NaN) : u
+    u = vifelse(isnan(x), T(NaN), u)
 
     return u
 end
diff --git a/src/log.jl b/src/log.jl
index 8f4ca1a..24b255c 100644
--- a/src/log.jl
+++ b/src/log.jl
@@ -19,11 +19,11 @@ where `significand ∈ [1, 2)`.
     * `x = ±Inf`  returns `INT_MAX`
     * `x = NaN`  returns `FP_ILOGBNAN`
 """
-function ilogb(x::T) where {T<:Union{Float32,Float64}}
+function ilogb(x::FloatType)
     e = ilogbk(abs(x))
-    x == 0 && (e = FP_ILOGB0)
-    isnan(x) && (e = FP_ILOGBNAN)
-    isinf(x) && (e = INT_MAX)
+    e = vifelse(x == 0, FP_ILOGB0, e)
+    e = vifelse(isnan(x), FP_ILOGBNAN, e)
+    e = vifelse(isinf(x), INT_MAX, e)
     return e
 end
 
@@ -34,12 +34,13 @@ end
 
 Returns the base `10` logarithm of `x`.
 """
-function log10(a::T) where {T<:Union{Float32,Float64}}
-    x = T(dmul(logk(a), MDLN10E(T)))
+function log10(a::V) where V <: FloatType
+    T = eltype(a)
+    x = V(dmul(logk(a), MDLN10E(T)))
 
-    isinf(a) && (x = T(Inf))
-    (a < 0 || isnan(a)) && (x = T(NaN))
-    a == 0 && (x = T(-Inf))
+    x = vifelse(isinf(a), T(Inf), x)
+    x = vifelse((a < 0) | isnan(a), T(NaN), x)
+    x = vifelse(a == 0, T(-Inf), x)
 
     return x
 end
@@ -51,12 +52,13 @@ end
 
 Returns the base `2` logarithm of `x`.
 """
-function log2(a::T) where {T<:Union{Float32,Float64}}
-    u = T(dmul(logk(a), MDLN2E(T)))
+function log2(a::V) where V <: FloatType
+    T = eltype(a)
+    u = V(dmul(logk(a), MDLN2E(T)))
 
-    isinf(a) && (u = T(Inf))
-    (a < 0 || isnan(a)) && (u = T(NaN))
-    a == 0 && (u = T(-Inf))
+    u = vifelse(isinf(a), T(Inf), u)
+    u = vifelse((a < 0) | isnan(a), T(NaN), u)
+    u = vifelse(a == 0, T(-Inf), u)
 
     return u
 end
@@ -71,20 +73,21 @@ const over_log1p(::Type{Float32}) = 1f38
 
 Accurately compute the natural logarithm of 1+x.
 """
-function log1p(a::T) where {T<:Union{Float32,Float64}}
+function log1p(a::V) where {V<:FloatType}
+    T = eltype(a)
     x = T(logk2(dadd2(a, T(1.0))))
 
-    a > over_log1p(T) && (x = T(Inf))
-    a < -1 && (x = T(NaN))
-    a == -1 && (x = T(-Inf))
-    isnegzero(a) && (x = T(-0.0))
+    x = vifelse(a > over_log1p(T), T(Inf), x)
+    x = vifelse(a < -1, T(NaN), x)
+    x = vifelse(a == -1, T(-Inf), x)
+    x = vifelse(isnegzero(a), T(-0.0), x)
 
     return x
 end
 
 
 
-@inline function log_kernel(x::Float64)
+@inline function log_kernel(x::FloatType64)
     c7 = 0.1532076988502701353
     c6 = 0.1525629051003428716
     c5 = 0.1818605932937785996
@@ -95,7 +98,7 @@ end
     return @horner x c1 c2 c3 c4 c5 c6 c7
 end
 
-@inline function log_kernel(x::Float32)
+@inline function log_kernel(x::FloatType32)
     c3 = 0.3027294874f0
     c2 = 0.3996108174f0
     c1 = 0.6666694880f0
@@ -108,27 +111,28 @@ end
 Compute the natural logarithm of `x`. The inverse of the natural logarithm is
 the natural expoenential function `exp(x)`
 """
-function log(d::T) where {T<:Union{Float32,Float64}}
+function log(d::V) where {V <: FloatType}
+    T = eltype(d)
     o = d < floatmin(T)
-    o && (d *= T(Int64(1) << 32) * T(Int64(1) << 32))
+    d = vifelse(o, d * T(Int64(1) << 32) * T(Int64(1) << 32), d)
 
     e = ilogb2k(d * T(1.0/0.75))
     m = ldexp3k(d, -e)
-    o && (e -= 64)
+    e = vifelse(o, e - 64, e)
 
     x  = ddiv(dadd2(T(-1.0), m), dadd2(T(1.0), m))
     x2 = x.hi*x.hi
 
     t = log_kernel(x2)
 
-    s = dmul(MDLN2(T), T(e))
+    s = dmul(MDLN2(T), convert(V,e))
     s = dadd(s, scale(x, T(2.0)))
     s = dadd(s, x2*x.hi*t)
-    r = T(s)
+    r = V(s)
 
-    isinf(d) && (r = T(Inf))
-    (d < 0 || isnan(d)) && (r = T(NaN))
-    d == 0 && (r = -T(Inf))
+    r = vifelse(isinf(d), T(Inf), r)
+    r = vifelse((d < 0) | isnan(d), T(NaN), r)
+    r = vifelse(d == 0, T(-Inf), r)
 
     return r
 end
@@ -151,7 +155,7 @@ end
 # 1, we multiply  `m` by `2` and subtract 1 for the exponent `e` when `m` is
 # less than `sqrt(2)/2`
 
-@inline function log_fast_kernel(x::Float64)
+@inline function log_fast_kernel(x::FloatType64)
     c8 = 0.153487338491425068243146
     c7 = 0.152519917006351951593857
     c6 = 0.181863266251982985677316
@@ -163,7 +167,7 @@ end
     return @horner x c1 c2 c3 c4 c5 c6 c7 c8
 end
 
-@inline function log_fast_kernel(x::Float32)
+@inline function log_fast_kernel(x::FloatType32)
     c5 = 0.2392828464508056640625f0
     c4 = 0.28518211841583251953125f0
     c3 = 0.400005877017974853515625f0
@@ -178,24 +182,26 @@ end
 Compute the natural logarithm of `x`. The inverse of the natural logarithm is
 the natural expoenential function `exp(x)`
 """
-function log_fast(d::T) where {T<:Union{Float32,Float64}}
+function log_fast(d::FloatType)
+    T = eltype(d)
     o = d < floatmin(T)
-    o && (d *= T(Int64(1) << 32) * T(Int64(1) << 32))
+    d = vifelse(o, d * T(Int64(1) << 32) * T(Int64(1) << 32), d)
 
     e = ilogb2k(d * T(1.0/0.75))
     m = ldexp3k(d, -e)
-    o && (e -= 64)
+    e = vifelse(o, e - 64, e)
 
     x  = (m - 1) / (m + 1)
     x2 = x * x
 
     t = log_fast_kernel(x2)
-    
-    x = x * t + T(MLN2) * e
-    
-    isinf(d) && (x = T(Inf))
-    (d < 0 || isnan(d)) && (x = T(NaN))
-    d == 0 && (x = -T(Inf))
+
+    x = muladd(x, t, T(MLN2) * e)
+
+
+    x = vifelse(isinf(d), T(Inf), x)
+    x = vifelse((d < 0) | isnan(d), T(NaN), x)
+    x = vifelse(d == 0, T(-Inf), x)
 
     return x
 end
diff --git a/src/misc.jl b/src/misc.jl
index 72c4bf4..8e6c401 100644
--- a/src/misc.jl
+++ b/src/misc.jl
@@ -4,68 +4,71 @@
 
 Exponentiation operator, returns `x` raised to the power `y`.
 """
-function pow(x::T, y::T) where {T<:Union{Float32,Float64}}
-    yi = unsafe_trunc(Int, y)
+function pow(x::V, y::V) where {V <: FloatType}
+    T = eltype(x)
+    yi = unsafe_trunc(EquivalentInteger(T), y)
     yisint = yi == y
-    yisodd = isodd(yi) && yisint
-    
+    yisodd = isodd(yi) & yisint
+
     result = expk(dmul(logk(abs(x)), y))
 
-    result = isnan(result) ? T(Inf) : result
-    result *= (x > 0 ? T(1.0) : (!yisint ? T(NaN) : (yisodd ? -T(1.0) : T(1.0))))
+    result = vifelse(isnan(result), T(Inf), result)
+    result = vifelse(x > 0, result, vifelse(!yisint, T(NaN), vifelse(yisodd, -result, result)))
 
     efx = flipsign(abs(x) - 1, y)
-    isinf(y) && (result = efx < 0 ? T(0.0) : (efx == 0 ? T(1.0) : T(Inf)))
-    (isinf(x) || x == 0) && (result = (yisodd ? _sign(x) : T(1.0)) * ((x == 0 ? -y : y) < 0 ? T(0.0) : T(Inf)))
-    (isnan(x) || isnan(y)) && (result = T(NaN))
-    (y == 0 || x == 1) && (result = T(1.0))
+    result = vifelse(isinf(y), vifelse(efx < 0, T(0.0), vifelse(efx == 0, T(1.0), T(Inf))), result)
+    result = vifelse(isinf(x) | (x == 0), vifelse(yisodd, _sign(x), T(1.0)) * vifelse(vifelse(x == 0, -y, y) < 0, T(0.0), T(Inf)), result)
+    result = vifelse(isnan(x) | isnan(y), T(NaN), result)
+    result = vifelse((y == 0) | (x == 1), T(1.0), result)
 
     return result
 end
 
 
 
-let
-global cbrt_fast
-global cbrt
 
-c6d = -0.640245898480692909870982
-c5d = 2.96155103020039511818595
-c4d = -5.73353060922947843636166
-c3d = 6.03990368989458747961407
-c2d = -3.85841935510444988821632
-c1d = 2.2307275302496609725722
 
-c6f = -0.601564466953277587890625f0
-c5f =  2.8208892345428466796875f0
-c4f = -5.532182216644287109375f0
-c3f =  5.898262500762939453125f0
-c2f = -3.8095417022705078125f0
-c1f =  2.2241256237030029296875f0
 
-global @inline cbrt_kernel(x::Float64) = @horner x c1d c2d c3d c4d c5d c6d
-global @inline cbrt_kernel(x::Float32) = @horner x c1f c2f c3f c4f c5f c6f
+@inline function cbrt_kernel(x::FloatType64)
+    c6d = -0.640245898480692909870982
+    c5d = 2.96155103020039511818595
+    c4d = -5.73353060922947843636166
+    c3d = 6.03990368989458747961407
+    c2d = -3.85841935510444988821632
+    c1d = 2.2307275302496609725722
+    @horner x c1d c2d c3d c4d c5d c6d
+end
+@inline function cbrt_kernel(x::FloatType32)
+    c6f = -0.601564466953277587890625f0
+    c5f =  2.8208892345428466796875f0
+    c4f = -5.532182216644287109375f0
+    c3f =  5.898262500762939453125f0
+    c2f = -3.8095417022705078125f0
+    c1f =  2.2241256237030029296875f0
+    @horner x c1f c2f c3f c4f c5f c6f
+end
 
 """
     cbrt_fast(x)
 
 Return `x^{1/3}`.
 """
-function cbrt_fast(d::T) where {T<:Union{Float32,Float64}}
+function cbrt_fast(d::V) where V <: FloatType
+    T  = eltype(d)
     e  = ilogbk(abs(d)) + 1
     d  = ldexp2k(d, -e)
     r  = (e + 6144) % 3
-    q  = r == 1 ? T(M2P13) : T(1)
-    q  = r == 2 ? T(M2P23) : q
+    q  = vifelse(r == 1, V(M2P13), V(1))
+    q  = vifelse(r == 2, V(M2P23), q)
     q  = ldexp2k(q, (e + 6144) ÷ 3 - 2048)
     q  = flipsign(q, d)
     d  = abs(d)
     x  = cbrt_kernel(d)
     y  = x * x
     y  = y * y
-    x -= (d * y - x) * T(1 / 3)
+    x  = muladd(T(-1/3), muladd(d, y, - x), x)
     y  = d * x * x
-    y  = (y - T(2 / 3) * y * (y * x - 1)) * q
+    y  = (y - T(2 / 3) * y * muladd(y, x, T(- 1))) * q
 end
 
 
@@ -74,34 +77,34 @@ end
 
 Return `x^{1/3}`. The prefix operator `∛` is equivalent to `cbrt`.
 """
-function cbrt(d::T) where {T<:Union{Float32,Float64}}
+function cbrt(d::V) where V <: FloatType
+    T  = eltype(d)
     e  = ilogbk(abs(d)) + 1
     d  = ldexp2k(d, -e)
     r  = (e + 6144) % 3
-    q2 = r == 1 ? MD2P13(T) : Double(T(1))
-    q2 = r == 2 ? MD2P23(T) : q2
+    q2 = vifelse(r == 1, MD2P13(T), Double(T(1)))
+    q2 = vifelse(r == 2, MD2P23(T), q2)
     q2 = flipsign(q2, d)
     d  = abs(d)
     x  = cbrt_kernel(d)
     y  = x * x
     y  = y * y
-    x -= (d * y - x) * T(1 / 3)
+    x  = muladd(T(-1 / 3), muladd(d, y, - x), x)
     z  = x
     u  = dsqu(x)
     u  = dsqu(u)
     u  = dmul(u, d)
     u  = dsub(u, x)
-    y  = T(u)
-    y  = -T(2 / 3) * y * z
+    y  = V(u)
+    y  = T(-2 / 3) * y * z
     v  = dadd(dsqu(z), y)
     v  = dmul(v, d)
     v  = dmul(v, q2)
-    z  = ldexp2k(T(v), (e + 6144) ÷ 3 - 2048)
-    isinf(d) && (z = flipsign(T(Inf), q2.hi))
-    d == 0   && (z = flipsign(T(0), q2.hi))
+    z  = ldexp2k(V(v), (e + 6144) ÷ 3 - 2048)
+    z  = vifelse(isinf(d), flipsign(T(Inf), q2.hi), z)
+    z  = vifelse(d == 0, flipsign(T(0), q2.hi), z)
     return z
 end
-end
 
 
 
@@ -110,12 +113,13 @@ end
 
 Compute the hypotenuse `\\sqrt{x^2+y^2}` avoiding overflow and underflow.
 """
-function hypot(x::T, y::T) where {T<:IEEEFloat}
-    x = abs(x)
-    y = abs(y)
-    if x < y
-       x, y = y, x
-    end
-    r = (x == 0) ? y : y / x
-    x * sqrt(T(1.0) + r * r)
+function hypot(x::T, y::T) where {T<:vIEEEFloat}
+    a = abs(x)
+    b = abs(y)
+
+    x = min(a,b)
+    y = max(a,b)
+
+    r = vifelse(x == 0, y, y / x)
+    x * _sqrt(muladd(r, r, T(1.0)))
 end
diff --git a/src/priv.jl b/src/priv.jl
index 5652cdb..78af53f 100644
--- a/src/priv.jl
+++ b/src/priv.jl
@@ -23,39 +23,50 @@ exponent amount back in.
     q = q - (m << offset)
     m, q
 end
-@inline split_exponent(::Type{Float64}, q::Int) = _split_exponent(q, UInt(9), UInt(31), UInt(2))
-@inline split_exponent(::Type{Float32}, q::Int) = _split_exponent(q, UInt(6), UInt(31), UInt(2))
+@inline split_exponent(::Type{Float64}, q::IntegerType) = _split_exponent(q, UInt(9), UInt(31), UInt(2))
 
+@inline split_exponent(::Type{Float32}, q::IntegerType) = _split_exponent(q, UInt(6), UInt(31), UInt(2))
 """
     ldexpk(a, n)
 
 Computes `a × 2^n`.
 """
-@inline function ldexpk(x::T, q::Int) where {T<:Union{Float32,Float64}}
+@inline function ldexpk(x::FloatType, q::IntegerType)
+    T = eltype(x)
     bias = exponent_bias(T)
     emax = exponent_raw_max(T)
     m, q = split_exponent(T, q)
     m += bias
-    m = ifelse(m < 0, 0, m)
-    m = ifelse(m > emax, emax, m)
+    m = vifelse(m < 0, 0, m)
+    m = vifelse(m > emax, emax, m)
     q += bias
     u = integer2float(T, m)
-    x = x * u * u * u * u
+    u² = u * u
+    x = x * u² * u²
     u = integer2float(T, q)
     x * u
 end
 
-@inline function ldexp2k(x::T, e::Int) where {T<:Union{Float32,Float64}}
-    x * pow2i(T, e >> 1) * pow2i(T, e - (e >> 1))
+@inline function ldexp2k(x::FloatType, e::IntegerType)
+    x * pow2i(eltype(x), e >> 1) * pow2i(eltype(x), e - (e >> 1))
 end
 
 @inline function ldexp3k(x::T, e::Int) where {T<:Union{Float32,Float64}}
     reinterpret(T, reinterpret(Unsigned, x) + (Int64(e) << significand_bits(T)) % uinttype(T))
 end
+@generated function ldexp3k(x::Vec{N,T}, e::IntegerType) where {N,T<:Union{Float32,Float64}}
+    UT = Base.uinttype(T)
+    quote
+        $(Expr(:meta,:inline))
+        reinterpret(Vec{$N,$T}, reinterpret(Vec{$N,$UT}, x) +
+            Vec{$N,$UT}($(Expr(:tuple, [:((Int64(e[$n]) << $(significand_bits(T))) % $UT) for n ∈ 1:N]...)))
+        )
+    end
+end
 
 # threshold values for `ilogbk`
-const threshold_exponent(::Type{Float64}) = 300
-const threshold_exponent(::Type{Float32}) = 64
+threshold_exponent(::Type{Float64}) = 300
+threshold_exponent(::Type{Float32}) = 64
 
 """
     ilogbk(x) -> Int
@@ -67,90 +78,85 @@ words this returns the binary exponent of `x` so that
 
 where `significand ∈ [1, 2)`.
 """
-@inline function ilogbk(d::T) where {T<:Union{Float32,Float64}}
-    m = d < T(2)^-threshold_exponent(T)
-    d = ifelse(m, d * T(2)^threshold_exponent(T), d)
+@inline function ilogbk(d::FloatType)
+    T = eltype(d)
+    m = d < (T(2)^-threshold_exponent(T))
+    d = vifelse(m, d * T(2)^threshold_exponent(T), d)
     q = float2integer(d) & exponent_raw_max(T)
-    q = ifelse(m, q - (threshold_exponent(T) + exponent_bias(T)), q - exponent_bias(T))
+    q = vifelse(m, q - (threshold_exponent(T) + exponent_bias(T)), q - exponent_bias(T))
 end
 
 # similar to ilogbk, but argument has to be a normalized float value
-@inline function ilogb2k(d::T) where {T<:Union{Float32,Float64}}
-    (float2integer(d) & exponent_raw_max(T)) - exponent_bias(T)
+@inline function ilogb2k(d::FloatType)
+    T = eltype(d)
+    I = EquivalentInteger(T)
+    (float2integer(d) & I(exponent_raw_max(T))) - I(exponent_bias(T))
 end
 
+const c20d =  1.06298484191448746607415e-05
+const c19d = -0.000125620649967286867384336
+const c18d =  0.00070557664296393412389774
+const c17d = -0.00251865614498713360352999
+const c16d =  0.00646262899036991172313504
+const c15d = -0.0128281333663399031014274
+const c14d =  0.0208024799924145797902497
+const c13d = -0.0289002344784740315686289
+const c12d =  0.0359785005035104590853656
+const c11d = -0.041848579703592507506027
+const c10d =  0.0470843011653283988193763
+const c9d  = -0.0524914210588448421068719
+const c8d  =  0.0587946590969581003860434
+const c7d  = -0.0666620884778795497194182
+const c6d  =  0.0769225330296203768654095
+const c5d  = -0.0909090442773387574781907
+const c4d  =  0.111111108376896236538123
+const c3d  = -0.142857142756268568062339
+const c2d  =  0.199999999997977351284817
+const c1d =  -0.333333333333317605173818
+
+const c9f = -0.00176397908944636583328247f0
+const c8f =  0.0107900900766253471374512f0
+const c7f = -0.0309564601629972457885742f0
+const c6f =  0.0577365085482597351074219f0
+const c5f = -0.0838950723409652709960938f0
+const c4f =  0.109463557600975036621094f0
+const c3f = -0.142626821994781494140625f0
+const c2f =  0.199983194470405578613281f0
+const c1f = -0.333332866430282592773438f0
+
+@inline atan2k_fast_kernel(x::FloatType64) = @horner x c1d c2d c3d c4d c5d c6d c7d c8d c9d c10d c11d c12d c13d c14d c15d c16d c17d c18d c19d c20d
+@inline atan2k_fast_kernel(x::FloatType32) = @horner x c1f c2f c3f c4f c5f c6f c7f c8f c9f
+
+@inline function atan2k_fast(y::V, x::V) where {T<:Union{Float32,Float64},V<:Union{T,Vec{<:Any,T}}}
+    xl0 = x < 0
+    q = vifelse(xl0, -2, 0)
+    x = vifelse(xl0, -x, x)
+    ygx = y > x
+    t = x
+    x = vifelse(ygx, y, x)
+    y = vifelse(ygx, -t, y)
+    q = vifelse(ygx, q+1, q)
 
-let
-global atan2k_fast
-global atan2k
-
-c20d =  1.06298484191448746607415e-05
-c19d = -0.000125620649967286867384336
-c18d =  0.00070557664296393412389774
-c17d = -0.00251865614498713360352999
-c16d =  0.00646262899036991172313504
-c15d = -0.0128281333663399031014274
-c14d =  0.0208024799924145797902497
-c13d = -0.0289002344784740315686289
-c12d =  0.0359785005035104590853656
-c11d = -0.041848579703592507506027
-c10d =  0.0470843011653283988193763
-c9d  = -0.0524914210588448421068719
-c8d  =  0.0587946590969581003860434
-c7d  = -0.0666620884778795497194182
-c6d  =  0.0769225330296203768654095
-c5d  = -0.0909090442773387574781907
-c4d  =  0.111111108376896236538123
-c3d  = -0.142857142756268568062339
-c2d  =  0.199999999997977351284817
-c1d =  -0.333333333333317605173818
-
-c9f = -0.00176397908944636583328247f0
-c8f =  0.0107900900766253471374512f0
-c7f = -0.0309564601629972457885742f0
-c6f =  0.0577365085482597351074219f0
-c5f = -0.0838950723409652709960938f0
-c4f =  0.109463557600975036621094f0
-c3f = -0.142626821994781494140625f0
-c2f =  0.199983194470405578613281f0
-c1f = -0.333332866430282592773438f0
-
-global @inline atan2k_fast_kernel(x::Float64) = @horner x c1d c2d c3d c4d c5d c6d c7d c8d c9d c10d c11d c12d c13d c14d c15d c16d c17d c18d c19d c20d
-global @inline atan2k_fast_kernel(x::Float32) = @horner x c1f c2f c3f c4f c5f c6f c7f c8f c9f
-
-@inline function atan2k_fast(y::T, x::T) where {T<:Union{Float32,Float64}}
-    q = 0
-    if x < 0
-        x = -x
-        q = -2
-    end
-    if y > x
-        t = x; x = y
-        y = -t
-        q += 1
-    end
     s = y / x
     t = s * s
     u = atan2k_fast_kernel(t)
-    t = u * t * s + s
-    q * T(PI_2) + t
+    t = muladd(u, t * s, s)
+    muladd(q, T(PI_2), t)
 end
 
 
-global @inline atan2k_kernel(x::Double{Float64}) = @horner x.hi c1d c2d c3d c4d c5d c6d c7d c8d c9d c10d c11d c12d c13d c14d c15d c16d c17d c18d c19d c20d
-global @inline atan2k_kernel(x::Double{Float32}) = dadd(c1f, x.hi * (@horner x.hi c2f c3f c4f c5f c6f c7f c8f c9f))
+@inline atan2k_kernel(x::Double{<:FloatType64}) = @horner x.hi c1d c2d c3d c4d c5d c6d c7d c8d c9d c10d c11d c12d c13d c14d c15d c16d c17d c18d c19d c20d
+@inline atan2k_kernel(x::Double{<:FloatType32}) = dadd(c1f, x.hi * (@horner x.hi c2f c3f c4f c5f c6f c7f c8f c9f))
 
-@inline function atan2k(y::Double{T}, x::Double{T}) where {T<:Union{Float32,Float64}}
-    q = 0
-    if x < 0
-        x = -x
-        q = -2
-    end
-    if y > x
-        t = x; x = y
-        y = -t
-        q += 1
-    end
+@inline function atan2k(y::Double{V}, x::Double{V}) where {T,V<:Union{T,Vec{<:Any,T}}}
+    xl0 = x < 0
+    q = vifelse(xl0, -2, 0)
+    x = vifelse(xl0, -x, x)
+    ygx = y > x
+    t = x
+    x = vifelse(ygx, y, x)
+    y = vifelse(ygx, -t, y)
+    q = vifelse(ygx, q+1, q)
 
     s = ddiv(y, x)
     t = dsqu(s)
@@ -160,18 +166,17 @@ global @inline atan2k_kernel(x::Double{Float32}) = dadd(c1f, x.hi * (@horner x.h
 
     t = dmul(t, u)
     t = dmul(s, dadd(T(1.0), t))
-    T <: Float64 && abs(s.hi) < 1e-200 && (t = s)
-    t = dadd(dmul(T(q), MDPI2(T)), t)
+    T <: Float64 && (t = vifelse(abs(s.hi) < 1e-200, s, t))
+    t = dadd(dmul(V(q), MDPI2(T)), t)
     return t
 end
-end
 
 
 
 const under_expk(::Type{Float64}) = -1000.0
 const under_expk(::Type{Float32}) = -104f0
 
-@inline function expk_kernel(x::Float64)
+@inline function expk_kernel(x::FloatType64)
     c10 = 2.51069683420950419527139e-08
     c9  = 2.76286166770270649116855e-07
     c8  = 2.75572496725023574143864e-06
@@ -185,7 +190,7 @@ const under_expk(::Type{Float32}) = -104f0
     return @horner x c1 c2 c3 c4 c5 c6 c7 c8 c9 c10
 end
 
-@inline function  expk_kernel(x::Float32)
+@inline function  expk_kernel(x::FloatType32)
     c5 = 0.00136324646882712841033936f0
     c4 = 0.00836596917361021041870117f0
     c3 = 0.0416710823774337768554688f0
@@ -194,28 +199,28 @@ end
     return @horner x c1 c2 c3 c4 c5
 end
 
-@inline function expk(d::Double{T}) where {T<:Union{Float32,Float64}}
-    q = round(T(d) * T(MLN2E))
-    qi = unsafe_trunc(Int, q)
+@inline function expk(d::Double{V}) where {T<:Union{Float32,Float64},V<:Union{T,Vec{<:Any,T}}}
+    q = round(V(d) * V(MLN2E))
+    qi = unsafe_trunc(EquivalentInteger(T), q)
 
     s = dadd(d, -q * L2U(T))
     s = dadd(s, -q * L2L(T))
 
     s = dnormalize(s)
 
-    u = expk_kernel(T(s))
+    u = expk_kernel(V(s))
 
     t = dadd(s, dmul(dsqu(s), u))
     t = dadd(T(1.0), t)
-    u = ldexpk(T(t), qi)
+    u = ldexpk(V(t), qi)
 
-    (d.hi < under_expk(T)) && (u = T(0.0))
+    u = vifelse(d.hi < under_expk(T), V(0.0), u)
     return u
 end
 
 
 
-@inline function expk2_kernel(x::Double{Float64})
+@inline function expk2_kernel(x::Double{<:FloatType64})
     c11 = 0.1602472219709932072e-9
     c10  = 0.2092255183563157007e-8
     c9  = 0.2505230023782644465e-7
@@ -231,7 +236,7 @@ end
     return dadd(dmul(x, u), c1)
 end
 
-@inline function  expk2_kernel(x::Double{Float32})
+@inline function  expk2_kernel(x::Double{<:FloatType32})
     c5 = 0.1980960224f-3
     c4 = 0.1394256484f-2
     c3 = 0.8333456703f-2
@@ -241,7 +246,7 @@ end
     return dadd(dmul(x, u), c1)
 end
 
-@inline function expk2(d::Double{T}) where {T<:Union{Float32,Float64}}
+@inline function expk2(d::Double{V}) where {T<:Union{Float32,Float64},V<:Union{T,Vec{<:Any,T}}}
     q = round(T(d) * T(MLN2E))
     qi = unsafe_trunc(Int, q)
 
@@ -255,13 +260,13 @@ end
 
     t = Double(ldexp2k(t.hi, qi), ldexp2k(t.lo, qi))
 
-    (d.hi < under_expk(T)) && (t = Double(T(0.0)))
+    t = vifelse(d.hi < under_expk(T), Double(V(0.0)), t)
     return t
 end
 
 
 
-@inline function logk2_kernel(x::Float64)
+@inline function logk2_kernel(x::FloatType64)
     c8 = 0.13860436390467167910856
     c7 = 0.131699838841615374240845
     c6 = 0.153914168346271945653214
@@ -273,7 +278,7 @@ end
     return @horner x c1 c2 c3 c4 c5 c6 c7 c8
 end
 
-@inline function logk2_kernel(x::Float32)
+@inline function logk2_kernel(x::FloatType32)
     c4 = 0.240320354700088500976562f0
     c3 = 0.285112679004669189453125f0
     c2 = 0.400007992982864379882812f0
@@ -281,7 +286,7 @@ end
     return @horner x c1 c2 c3 c4
 end
 
-@inline function logk2(d::Double{T}) where {T<:Union{Float32,Float64}}
+@inline function logk2(d::Double{V}) where {T<:Union{Float32,Float64},V<:Union{T,Vec{<:Any,T}}}
     e  = ilogbk(d.hi * T(1.0/0.75))
     m  = scale(d, pow2i(T, -e))
 
@@ -298,7 +303,7 @@ end
 
 
 
-@inline function logk_kernel(x::Double{Float64})
+@inline function logk_kernel(x::Double{<:FloatType64})
     c10 = 0.116255524079935043668677
     c9 = 0.103239680901072952701192
     c8 = 0.117754809412463995466069
@@ -312,22 +317,23 @@ end
     dadd2(dmul(x, @horner x.hi c2 c3 c4 c5 c6 c7 c8 c9 c10), c1)
 end
 
-@inline function logk_kernel(x::Double{Float32})
+@inline function logk_kernel(x::Double{<:FloatType32})
     c4 = 0.240320354700088500976562f0
     c3 = 0.285112679004669189453125f0
     c2 = 0.400007992982864379882812f0
-    c1 = Double(0.66666662693023681640625f0, 3.69183861259614332084311f-9)    
+    c1 = Double(0.66666662693023681640625f0, 3.69183861259614332084311f-9)
     dadd2(dmul(x, @horner x.hi c2 c3 c4), c1)
 end
 
-@inline function logk(d::T) where {T<:Union{Float32,Float64}}
+@inline function logk(d::FloatType)
+    T = eltype(d)
     o = d < floatmin(T)
-    o && (d *= T(Int64(1) << 32) * T(Int64(1) << 32))
+    d = vifelse(o, d * T(Int64(1) << 32) * T(Int64(1) << 32), d)
 
     e  = ilogb2k(d * T(1.0/0.75))
     m  = ldexp3k(d, -e)
 
-    o && (e -= 64)
+    e = vifelse(o, e - 64, e)
 
     x  = ddiv(dsub2(m, T(1.0)), dadd2(T(1.0), m))
     x2 = dsqu(x)
diff --git a/src/trig.jl b/src/trig.jl
index e81f2fe..8a4b223 100644
--- a/src/trig.jl
+++ b/src/trig.jl
@@ -14,7 +14,7 @@ Compute the cosine of `x`, where the output is in radians.
 """
 function cos end
 
-@inline function sincos_kernel(x::Double{Float64})
+@inline function sincos_kernel(x::Double{<:FloatType64})
     c8 =  2.72052416138529567917983e-15
     c7 = -7.64292594113954471900203e-13
     c6 =  1.60589370117277896211623e-10
@@ -26,7 +26,7 @@ function cos end
     return dadd(c1, x.hi * (@horner x.hi c2 c3 c4 c5 c6 c7 c8))
 end
 
-@inline function sincos_kernel(x::Double{Float32})
+@inline function sincos_kernel(x::Double{<:FloatType32})
     c4 =  2.6083159809786593541503f-06
     c3 = -0.0001981069071916863322258f0
     c2 =  0.00833307858556509017944336f0
@@ -34,7 +34,8 @@ end
     return dadd(c1, x.hi * (@horner x.hi c2 c3 c4))
 end
 
-function sin(d::T) where {T<:Float64}
+function sin(d::V) where V <: FloatType64
+    T = eltype(d)
     qh = trunc(d * (T(M_1_PI) / (1 << 24)))
     ql = round(d * T(M_1_PI) - qh * (1 << 24))
 
@@ -51,17 +52,19 @@ function sin(d::T) where {T<:Float64}
 
     w = sincos_kernel(s)
 
-    v = dmul(t, dadd(T(1.0), dmul(w, s)))
-    u = T(v)
+    v = dmul(t, dadd(V(1.0), dmul(w, s)))
+    u = V(v)
 
-    qli = unsafe_trunc(Int, ql)
-    qli & 1 != 0 && (u = -u)
-    !isinf(d) && (isnegzero(d) || abs(d) > TRIG_MAX(T)) && (u = T(-0.0))
+    qli = unsafe_trunc(EquivalentInteger(T), ql)
+    u = vifelse(qli & 1 != 0, -u, u)
+    u = vifelse((!isinf(d)) & (isnegzero(d) | (abs(d) > TRIG_MAX(T))), T(-0.0), u)
 
     return u
 end
 
-function sin(d::T) where {T<:Float32}
+function sin(d::V) where V <: FloatType32
+    T = eltype(d)
+
     q = round(d * T(M_1_PI))
 
     s = dadd2(d, q * -PI_A(T))
@@ -75,16 +78,17 @@ function sin(d::T) where {T<:Float32}
     w = sincos_kernel(s)
 
     v = dmul(t, dadd(T(1.0), dmul(w, s)))
-    u = T(v)
+    u = V(v)
 
-    qi = unsafe_trunc(Int, q)
-    qi & 1 != 0 && (u = -u)
-    !isinf(d) && (isnegzero(d) || abs(d) > TRIG_MAX(T)) && (u = T(-0.0))
+    qi = unsafe_trunc(EquivalentInteger(T), q)
+    u = vifelse(qi & 1 != 0, -u, u)
+    u = vifelse((!isinf(d)) & (isnegzero(d) | (abs(d) > TRIG_MAX(T))), T(-0.0), u)
 
     return u
 end
 
-function cos(d::T) where {T<:Float64}
+function cos(d::V) where V <: FloatType64
+    T = eltype(d)
     d = abs(d)
 
     qh = trunc(d * (T(M_1_PI) / (1 << 23)) - T(0.5) * (T(M_1_PI) / (1 << 23)))
@@ -105,16 +109,17 @@ function cos(d::T) where {T<:Float64}
 
     v = dmul(t, dadd(T(1.0), dmul(w, s)))
 
-    u = T(v)
+    u = V(v)
 
-    qli = unsafe_trunc(Int, ql)
-    qli & 2 == 0 && (u = -u)
-    !isinf(d) && (d > TRIG_MAX(T)) && (u = T(0.0))
+    qli = unsafe_trunc(EquivalentInteger(T), ql)
+    u = vifelse(qli & 2 == 0, -u, u)
+    u = vifelse(!isinf(d) & (d > TRIG_MAX(T)), T(0.0), u)
 
     return u
 end
 
-function cos(d::T) where {T<:Float32}
+function cos(d::V) where V <: FloatType32
+    T = eltype(d)
     d = abs(d)
 
     q = 1 + 2*round(d * T(M_1_PI) - T(0.5))
@@ -131,11 +136,11 @@ function cos(d::T) where {T<:Float32}
 
     v = dmul(t, dadd(T(1.0), dmul(w, s)))
 
-    u = T(v)
+    u = V(v)
 
-    qi = unsafe_trunc(Int, q)
-    qi & 2 == 0 && (u = -u)
-    !isinf(d) && (d > TRIG_MAX(T)) && (u = T(0.0))
+    qi = unsafe_trunc(EquivalentInteger(T), q)
+    u = vifelse(qi & 2 == 0, -u, u)
+    u = vifelse(!isinf(d) & (d > TRIG_MAX(T)), T(0.0), u)
 
     return u
 end
@@ -162,7 +167,7 @@ function cos_fast end
 # we are now in the negative branch of sin(x). Recall that q is just the integer
 # part of d/π and thus we can determine the correct sign using this information.
 
-@inline function sincos_fast_kernel(x::Float64)
+@inline function sincos_fast_kernel(x::FloatType64)
     c9 = -7.97255955009037868891952e-18
     c8 =  2.81009972710863200091251e-15
     c7 = -7.64712219118158833288484e-13
@@ -174,15 +179,16 @@ function cos_fast end
     c1 = -0.166666666666666657414808
     return @horner x c1 c2 c3 c4 c5 c6 c7 c8 c9
 end
-@inline function sincos_fast_kernel(x::Float32)
+@inline function sincos_fast_kernel(x::FloatType32)
     c4 =  2.6083159809786593541503f-06
     c3 = -0.0001981069071916863322258f0
     c2 =  0.00833307858556509017944336f0
-    c1 = -0.166666597127914428710938f0   
+    c1 = -0.166666597127914428710938f0
     return @horner x c1 c2 c3 c4
 end
 
-function sin_fast(d::T) where {T<:Float64}
+function sin_fast(d::FloatType64)
+    T = eltype(d)
     t = d
 
     qh = trunc(d * (T(M_1_PI) / (1 << 24)))
@@ -197,19 +203,20 @@ function sin_fast(d::T) where {T<:Float64}
     d = muladd(qh * (1 << 24) + ql, -PI_D(T), d)
 
     s = d * d
-    
-    qli = unsafe_trunc(Int, ql)
-    qli & 1 != 0 && (d = -d)
-    
+
+    qli = unsafe_trunc(EquivalentInteger(T), ql)
+    d = vifelse(qli & 1 != 0, -d, d)
+
     u = sincos_fast_kernel(s)
     u = muladd(s, u * d, d)
-    
-    !isinf(t) && (isnegzero(t) || abs(t) > TRIG_MAX(T)) && (u = T(-0.0))
+
+    u = vifelse((!isinf(t)) & (isnegzero(t) | (abs(t) > TRIG_MAX(T))), T(-0.0), u)
 
     return u
 end
 
-function sin_fast(d::T) where {T<:Float32}
+function sin_fast(d::FloatType32)
+    T = eltype(d)
     t = d
 
     q = round(d * T(M_1_PI))
@@ -220,25 +227,26 @@ function sin_fast(d::T) where {T<:Float32}
     d = muladd(q , -PI_D(T), d)
 
     s = d * d
-    
-    qli = unsafe_trunc(Int, q)
-    qli & 1 != 0 && (d = -d)
-    
+
+    qli = unsafe_trunc(EquivalentInteger(T), q)
+    d = vifelse(qli & 1 != 0, -d, d)
+
     u = sincos_fast_kernel(s)
 
     u = muladd(s, u * d, d)
-    
-    !isinf(t) && (isnegzero(t) || abs(t) > TRIG_MAX(T)) && (u = T(-0.0))
+
+    u = vifelse((!isinf(t)) & (isnegzero(t) | (abs(t) > TRIG_MAX(T))), T(-0.0), u)
 
     return u
 end
 
-function cos_fast(d::T) where {T<:Float64}
+function cos_fast(d::FloatType64)
+    T = eltype(d)
     t = d
 
     qh = trunc(d * (T(M_1_PI) / (1 << 23)) - T(0.5) * (T(M_1_PI) / (1 << 23)))
     ql = 2*round(d * T(M_1_PI) - T(0.5) - qh * (1 << 23)) + 1
-    
+
     d = muladd(qh , -PI_A(T) * T(0.5) * (1 << 24) , d)
     d = muladd(ql , -PI_A(T) * T(0.5)             , d)
     d = muladd(qh , -PI_B(T) * T(0.5) * (1 << 24) , d)
@@ -249,22 +257,23 @@ function cos_fast(d::T) where {T<:Float64}
 
     s = d * d
 
-    qli = unsafe_trunc(Int, ql)
-    qli & 2 == 0 && (d = -d)
+    qli = unsafe_trunc(EquivalentInteger(T), ql)
+    d = vifelse(qli & 2 == 0, -d, d)
 
     u = sincos_fast_kernel(s)
     u = muladd(s, u * d, d)
 
-    !isinf(t) && (abs(t) > TRIG_MAX(T)) && (u = T(0.0))
+    u = vifelse(!isinf(t) & (abs(t) > TRIG_MAX(T)), T(0.0), u)
 
     return u
 end
 
-function cos_fast(d::T) where {T<:Float32}
+function cos_fast(d::FloatType32)
+    T = eltype(d)
     t = d
 
     q = 1 + 2*round(d * T(M_1_PI) - T(0.5))
-    
+
     d = muladd(q, -PI_A(T) * T(0.5), d)
     d = muladd(q, -PI_B(T) * T(0.5), d)
     d = muladd(q, -PI_C(T) * T(0.5), d)
@@ -272,14 +281,14 @@ function cos_fast(d::T) where {T<:Float32}
 
     s = d * d
 
-    qi = unsafe_trunc(Int, q)
-    qi & 2 == 0 && (d = -d)
+    qi = unsafe_trunc(EquivalentInteger(T), q)
+    d = vifelse(qi & 2 == 0, -d, d)
 
     u = sincos_fast_kernel(s)
 
     u = muladd(s, u * d, d)
 
-    !isinf(t) && (abs(t) > TRIG_MAX(T)) && (u = T(0.0))
+    u = vifelse(!isinf(t) & (abs(t) > TRIG_MAX(T)), T(0.0), u)
 
     return u
 end
@@ -302,7 +311,7 @@ radians, returning a tuple.
 """
 function sincos_fast end
 
-@inline function sincos_a_kernel(x::Float64)
+@inline function sincos_a_kernel(x::FloatType64)
     a6 =  1.58938307283228937328511e-10
     a5 = -2.50506943502539773349318e-08
     a4 =  2.75573131776846360512547e-06
@@ -312,14 +321,14 @@ function sincos_fast end
     return @horner x a1 a2 a3 a4 a5 a6
 end
 
-@inline function sincos_a_kernel(x::Float32)
+@inline function sincos_a_kernel(x::FloatType32)
     a3 = -0.000195169282960705459117889f0
     a2 =  0.00833215750753879547119141f0
     a1 = -0.166666537523269653320312f0
     return @horner x a1 a2 a3
 end
 
-@inline function sincos_b_kernel(x::Float64)
+@inline function sincos_b_kernel(x::FloatType64)
     b7 = -1.13615350239097429531523e-11
     b6 =  2.08757471207040055479366e-09
     b5 = -2.75573144028847567498567e-07
@@ -330,7 +339,7 @@ end
     return @horner x b1 b2 b3 b4 b5 b6 b7
 end
 
-@inline function sincos_b_kernel(x::Float32)
+@inline function sincos_b_kernel(x::FloatType32)
     b5 = -2.71811842367242206819355f-07
     b4 =  2.47990446951007470488548f-05
     b3 = -0.00138888787478208541870117f0
@@ -339,7 +348,8 @@ end
     return @horner x b1 b2 b3 b4 b5
 end
 
-function sincos_fast(d::T) where {T<:Float64}
+function sincos_fast(d::FloatType64)
+    T = eltype(d)
     s = d
 
     qh = trunc(d * ((2 * T(M_1_PI)) / (1 << 24)))
@@ -362,24 +372,32 @@ function sincos_fast(d::T) where {T<:Float64}
 
     rx = t + u
 
-    isnegzero(d) && (rx = T(-0.0))
+    rx = vifelse(isnegzero(d), T(-0.0), rx)
 
     u = sincos_b_kernel(s)
 
-    ry = u * s + T(1.0)
+    ry = muladd(u, s, T(1.0))
 
-    qli = unsafe_trunc(Int, ql)
-    qli & 1 != 0 && (s = ry; ry = rx; rx = s)
-    qli & 2 != 0 && (rx = -rx)
-    (qli + 1) & 2 != 0 && (ry = -ry)
+    qli = unsafe_trunc(EquivalentInteger(T), ql)
+    qli_odd = qli & 1 != 0
+    s = vifelse(qli_odd, ry, s)
+    ry = vifelse(qli_odd, rx, ry)
+    rx = vifelse(qli_odd, s, rx)
+    rx = vifelse(qli & 2 != 0, -rx, rx)
+    ry = vifelse((qli + 1) & 2 != 0, -ry, ry)
 
-    abs(d) > TRIG_MAX(T) && (rx = ry = T(0.0))
-    isinf(d) && (rx = ry = T(NaN))
+    absd_g_trig_max = abs(d) > TRIG_MAX(T)
+    rx = vifelse(absd_g_trig_max, T(0.0), rx)
+    ry = vifelse(absd_g_trig_max, T(0.0), ry)
+    isinfd = isinf(d)
+    rx = vifelse(isinfd, T(NaN), rx)
+    ry = vifelse(isinfd, T(NaN), ry)
 
     return (rx, ry)
 end
 
-function sincos_fast(d::T) where {T<:Float32}
+function sincos_fast(d::FloatType32)
+    T = eltype(d)
     s = d
 
     q = round(d * (2 * T(M_1_PI)))
@@ -398,24 +416,33 @@ function sincos_fast(d::T) where {T<:Float32}
 
     rx = t + u
 
-    isnegzero(d) && (rx = T(-0.0))
+    rx = vifelse(isnegzero(d), T(-0.0), rx)
 
     u = sincos_b_kernel(s)
 
-    ry = u * s + T(1.0)
+    ry = muladd(u, s, T(1.0))
+
+    qi = unsafe_trunc(EquivalentInteger(T), q)
+    qi_isodd = qi & 1 != 0
+    s = vifelse(qi_isodd, ry, s)
+    ry = vifelse(qi_isodd, rx, ry)
+    rx = vifelse(qi_isodd, s, rx)
+    rx = vifelse(qi & 2 != 0, -rx, rx)
+    ry = vifelse((qi + 1) & 2 != 0, -ry, ry)
 
-    qi = unsafe_trunc(Int, q)
-    qi & 1 != 0 && (s = ry; ry = rx; rx = s)
-    qi & 2 != 0 && (rx = -rx)
-    (qi + 1) & 2 != 0 && (ry = -ry)
+    absd_g_trig_max = abs(d) > TRIG_MAX(T)
+    rx = vifelse(absd_g_trig_max, T(0.0), rx)
+    ry = vifelse(absd_g_trig_max, T(0.0), ry)
 
-    abs(d) > TRIG_MAX(T) && (rx = ry = T(0.0))
-    isinf(d) && (rx = ry = T(NaN))
+    isinfd = isinf(d)
+    rx = vifelse(isinfd, T(NaN), rx)
+    ry = vifelse(isinfd, T(NaN), ry)
 
     return (rx, ry)
 end
 
-function sincos(d::T) where {T<:Float64}
+function sincos(d::V) where V <: FloatType64
+    T = eltype(d)
     qh = trunc(d * ((2 * T(M_1_PI)) / (1 << 24)))
     ql = round(d * (2 * T(M_1_PI)) - qh * (1 << 24))
 
@@ -429,35 +456,44 @@ function sincos(d::T) where {T<:Float64}
 
     t  = s
     s  = dsqu(s)
-    sx = T(s)
+    sx = V(s)
 
     u  = sincos_a_kernel(sx)
 
     u *= sx * t.hi
 
     v  = dadd(t, u)
-    rx = T(v)
+    rx = V(v)
 
-    isnegzero(d) && (rx = T(-0.0))
+    rx = vifelse(isnegzero(d), T(-0.0), rx)
 
     u  = sincos_b_kernel(sx)
 
     v  = dadd(T(1.0), dmul(sx, u))
-    ry = T(v)
+    ry = V(v)
 
-    qli = unsafe_trunc(Int, ql)
-    qli & 1 != 0 && (u = ry; ry = rx; rx = u)
-    qli & 2 != 0 && (rx = -rx)
-    (qli + 1) & 2 != 0 && (ry = -ry)
+    qli = unsafe_trunc(EquivalentInteger(T), ql)
+    qli_odd = qli & 1 != 0
+    u = vifelse(qli_odd, ry, u)
+    ry = vifelse(qli_odd, rx, ry)
+    rx = vifelse(qli_odd, u, rx)
+    rx = vifelse(qli & 2 != 0, -rx, rx)
+    ry = vifelse((qli + 1) & 2 != 0, -ry, ry)
 
-    abs(d) > TRIG_MAX(T) && (rx = ry = T(0.0))
-    isinf(d) && (rx = ry = T(NaN))
+    absd_g_trig_max = abs(d) > TRIG_MAX(T)
+    rx = vifelse(absd_g_trig_max, T(0.0), rx)
+    ry = vifelse(absd_g_trig_max, T(0.0), ry)
+
+    isinfd = isinf(d)
+    rx = vifelse(isinfd, T(NaN), rx)
+    ry = vifelse(isinfd, T(NaN), ry)
 
     return (rx, ry)
 end
 
 
-function sincos(d::T) where {T<:Float32}
+function sincos(d::V) where V <: FloatType32
+    T = eltype(d)
     q = round(d * (2 * T(M_1_PI)))
 
     s = dadd2(d, q * (-PI_A(T) * T(0.5)))
@@ -467,29 +503,38 @@ function sincos(d::T) where {T<:Float32}
 
     t  = s
     s  = dsqu(s)
-    sx = T(s)
+    sx = V(s)
 
     u  = sincos_a_kernel(sx)
 
     u *= sx * t.hi
 
     v  = dadd(t, u)
-    rx = T(v)
+    rx = V(v)
 
-    isnegzero(d) && (rx = T(-0.0))
+    rx = vifelse(isnegzero(d), T(-0.0), rx)
 
     u  = sincos_b_kernel(sx)
 
     v  = dadd(T(1.0), dmul(sx, u))
-    ry = T(v)
+    ry = V(v)
+
+
+    qli = unsafe_trunc(EquivalentInteger(T), q)
+    qli_odd = qli & 1 != 0
+    u = vifelse(qli_odd, ry, u)
+    ry = vifelse(qli_odd, rx, ry)
+    rx = vifelse(qli_odd, u, rx)
+    rx = vifelse(qli & 2 != 0, -rx, rx)
+    ry = vifelse((qli + 1) & 2 != 0, -ry, ry)
 
-    qi = unsafe_trunc(Int, q)
-    qi & 1 != 0 && (u = ry; ry = rx; rx = u)
-    qi & 2 != 0 && (rx = -rx)
-    (qi + 1) & 2 != 0 && (ry = -ry)
+    absd_g_trig_max = abs(d) > TRIG_MAX(T)
+    rx = vifelse(absd_g_trig_max, T(0.0), rx)
+    ry = vifelse(absd_g_trig_max, T(0.0), ry)
 
-    abs(d) > TRIG_MAX(T) && (rx = ry = T(0.0))
-    isinf(d) && (rx = ry = T(NaN))
+    isinfd = isinf(d)
+    rx = vifelse(isinfd, T(NaN), rx)
+    ry = vifelse(isinfd, T(NaN), ry)
 
     return (rx, ry)
 end
@@ -509,7 +554,7 @@ Compute the tangent of `x`, where the output is in radians.
 """
 function tan_fast end
 
-@inline function tan_fast_kernel(x::Float64)
+@inline function tan_fast_kernel(x::FloatType64)
     c16 =  9.99583485362149960784268e-06
     c15 = -4.31184585467324750724175e-05
     c14 =  0.000103573238391744000389851
@@ -529,7 +574,7 @@ function tan_fast end
     return @horner x c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 c16
 end
 
-@inline function tan_fast_kernel(x::Float32)
+@inline function tan_fast_kernel(x::FloatType32)
     c7 =  0.00446636462584137916564941f0
     c6 = -8.3920182078145444393158f-05
     c5 =  0.0109639242291450500488281f0
@@ -540,7 +585,8 @@ end
     return @horner x c1 c2 c3 c4 c5 c6 c7
 end
 
-function tan_fast(d::T) where {T<:Float64}
+function tan_fast(d::FloatType64)
+    T = eltype(d)
     qh = trunc(d * (2 * T(M_1_PI)) / (1 << 24))
     ql = round(d * (2 * T(M_1_PI)) - qh * (1 << 24))
 
@@ -554,25 +600,30 @@ function tan_fast(d::T) where {T<:Float64}
 
     s = x * x
 
-    qli = unsafe_trunc(Int, ql)
-    qli & 1 != 0 && (x = -x)
+    qli = unsafe_trunc(EquivalentInteger(T), ql)
+    qli_odd = qli & 1 != 0
+    x = vifelse(qli_odd, -x, x)
 
     u = tan_fast_kernel(s)
 
     u = muladd(s, u * x, x)
 
-    qli & 1 != 0 && (u = T(1.0) / u)
+    u = vifelse(qli_odd, inv(u), u)
 
-    !isinf(d) && (isnegzero(d) || abs(d) > TRIG_MAX(T)) && (u = T(-0.0))
+    u = vifelse(
+        (!isinf(d)) & (isnegzero(d) | (abs(d) > TRIG_MAX(T))),
+        T(-0.0), u
+    )
 
     return u
 end
 
-function tan_fast(d::T) where {T<:Float32}
+function tan_fast(d::FloatType32)
+    T = eltype(d)
     q = round(d * (2 * T(M_1_PI)))
 
     x = d
-    
+
     x = muladd(q, -PI_A(T) * T(0.5), x)
     x = muladd(q, -PI_B(T) * T(0.5), x)
     x = muladd(q, -PI_C(T) * T(0.5), x)
@@ -580,16 +631,20 @@ function tan_fast(d::T) where {T<:Float32}
 
     s = x * x
 
-    qi = unsafe_trunc(Int, q)
-    qi & 1 != 0 && (x = -x)
+    qli = unsafe_trunc(EquivalentInteger(T), q)
+    qli_odd = qli & 1 != 0
+    x = vifelse(qli_odd, -x, x)
 
     u = tan_fast_kernel(s)
 
     u = muladd(s, u * x, x)
 
-    qi & 1 != 0 && (u = T(1.0) / u)
+    u = vifelse(qli_odd, inv(u), u)
 
-    !isinf(d) && (isnegzero(d) || abs(d) > TRIG_MAX(T)) && (u = T(-0.0))
+    u = vifelse(
+        (!isinf(d)) & (isnegzero(d) | (abs(d) > TRIG_MAX(T))),
+        T(-0.0), u
+    )
 
     return u
 end
@@ -614,7 +669,7 @@ end
     return dadd(c1, x.hi * (@horner x.hi c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15))
 end
 
-@inline function tan_kernel(x::Double{Float32})
+@inline function tan_kernel(x::Double{<:FloatType32})
     c7 =  0.00446636462584137916564941f0
     c6 = -8.3920182078145444393158f-05
     c5 =  0.0109639242291450500488281f0
@@ -625,7 +680,8 @@ end
     return dadd(c1,  x.hi * (@horner x.hi c2 c3 c4 c5 c6 c7))
 end
 
-function tan(d::T) where {T<:Float64}
+function tan(d::FloatType64)
+    T = eltype(d)
     qh = trunc(d * (T(M_2_PI)) / (1 << 24))
     s = dadd2(dmul(Double(T(M_2_PI_H), T(M_2_PI_L)), d), (d < 0 ? T(-0.5) : T(0.5)) - qh * (1 << 24))
     ql = trunc(T(s))
@@ -638,8 +694,9 @@ function tan(d::T) where {T<:Float64}
     s = dadd2(s, ql * (-PI_C(T) * T(0.5)            ))
     s = dadd2(s, (qh * (1 << 24) + ql) * (-PI_D(T) * T(0.5)))
 
-    qli = unsafe_trunc(Int, ql)
-    qli & 1 != 0 && (s = -s)
+    qli = unsafe_trunc(EquivalentInteger(T), ql)
+    qli_odd = qli & 1 != 0
+    s = vifelse(qli_odd, -s, s)
 
     t = s
     s = dsqu(s)
@@ -649,16 +706,20 @@ function tan(d::T) where {T<:Float64}
     x = dadd(T(1.0), dmul(u, s))
     x = dmul(t, x)
 
-    qli & 1 != 0 && (x = drec(x))
+    x = vifelse(qli_odd, drec(x), x)
 
     v = T(x)
-    
-    !isinf(d) && (isnegzero(d) || abs(d) > TRIG_MAX(T)) && (v = T(-0.0))
+
+    v = vifelse(
+        (!isinf(d)) & (isnegzero(d) | (abs(d) > TRIG_MAX(T))),
+        T(-0.0), v
+    )
 
     return v
 end
 
-function tan(d::T) where {T<:Float32}
+function tan(d::FloatType32)
+    T = eltype(d)
     q = round(d * (T(M_2_PI)))
 
     s = dadd2(d, q * -PI_A(T) * T(0.5))
@@ -667,8 +728,9 @@ function tan(d::T) where {T<:Float32}
     s = dadd2(s, q * -PI_XD(T) * T(0.5))
     s = dadd2(s, q * -PI_XE(T) * T(0.5))
 
-    qi = unsafe_trunc(Int, q)
-    qi & 1 != 0 && (s = -s)
+    qli = unsafe_trunc(EquivalentInteger(T), q)
+    qli_odd = qli & 1 != 0
+    s = vifelse(qli_odd, -s, s)
 
     t = s
     s = dsqu(s)
@@ -679,11 +741,14 @@ function tan(d::T) where {T<:Float32}
     x = dadd(T(1.0), dmul(u, s))
     x = dmul(t, x)
 
-    qi & 1 != 0 && (x = drec(x))
+    x = vifelse(qli_odd, drec(x), x)
 
     v = T(x)
-    
-    !isinf(d) && (isnegzero(d) || abs(d) > TRIG_MAX(T)) && (v = T(-0.0))
+
+    v = vifelse(
+        (!isinf(d)) & (isnegzero(d) | (abs(d) > TRIG_MAX(T))),
+        T(-0.0), v
+    )
 
     return v
 end
@@ -694,9 +759,9 @@ end
 
 Compute the inverse tangent of `x`, where the output is in radians.
 """
-function atan(x::T) where {T<:Union{Float32,Float64}}
+function atan(x::T) where {T<:FloatType}
     u = T(atan2k(Double(abs(x)), Double(T(1))))
-    isinf(x) && (u = T(PI_2))
+    u = vifelse(isinf(x), T(PI_2), u)
     flipsign(u, x)
 end
 
@@ -724,7 +789,7 @@ end
     return @horner x c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 c16 c17 c18 c19
 end
 
-@inline function atan_fast_kernel(x::Float32)
+@inline function atan_fast_kernel(x::FloatType32)
     c8 =  0.00282363896258175373077393f0
     c7 = -0.0159569028764963150024414f0
     c6 =  0.0425049886107444763183594f0
@@ -741,21 +806,22 @@ end
 
 Compute the inverse tangent of `x`, where the output is in radians.
 """
-function atan_fast(x::T) where {T<:Union{Float32,Float64}}
-    q = 0
-    if signbit(x)
-        x = -x
-        q = 2
-    end
-    if x > 1
-        x = 1 / x
-        q |= 1
-    end
+function atan_fast(x::T) where {T<:FloatType}
+    TI = EquivalentInteger(T)
+
+    q = vifelse(signbit(x), TI(2), TI(0))
+    x = abs(x)
+
+    xg1 = x > 1
+    x = vifelse(xg1, 1 / x, x)
+    q = vifelse(xg1, q | 1, q)
+
     t = x * x
     u = atan_fast_kernel(t)
-    t = x + x * t * u
-    q & 1 != 0 && (t = T(PI_2) - t)
-    q & 2 != 0 && (t = -t)
+    # t = x + x * t * u
+    t = muladd(x * t, u, x)
+    t = vifelse(q & 1 != 0, T(PI_2) - t, t)
+    t = vifelse(q & 2 != 0, -t, t)
     return t
 end
 
@@ -768,21 +834,21 @@ const under_atan2(::Type{Float32}) = 2.9387372783541830947f-39
 
 Compute the inverse tangent of `x/y`, using the signs of both `x` and `y` to determine the quadrant of the return value.
 """
-function atan(x::T, y::T) where {T<:Union{Float32,Float64}}
-    abs(y) < under_atan2(T) && (x *= T(Int64(1) << 53); y *= T(Int64(1) << 53))
+function atan(x::T, y::T) where {T<:FloatType}
+    absy_under_atan = abs(y) < under_atan2(eltype(y))
+    x = vifelse(absy_under_atan, x * T(Int64(1) << 53), x)
+    y = vifelse(absy_under_atan, y * T(Int64(1) << 53), y)
+
     r = T(atan2k(Double(abs(x)), Double(y)))
 
     r = flipsign(r, y)
-    if isinf(y) || y == 0
-        r = T(PI_2) - (isinf(y) ? _sign(y) * T(PI_2) : T(0.0))
-    end
-    if isinf(x)
-        r = T(PI_2) - (isinf(y) ? _sign(y) * T(PI_4) : T(0.0))
-    end
-    if x == 0
-        r = _sign(y) == -1 ? T(M_PI) : T(0.0)
-    end
-    return isnan(y) || isnan(x) ? T(NaN) : flipsign(r, x)
+    isinfy = isinf(y)
+    r = vifelse(isinfy, T(PI_2) - _sign(y) * T(PI_2), r)
+    r = vifelse(y == 0, T(PI_2), r)
+    r = vifelse(isinf(x), vifelse(isinfy, T(PI_2) - _sign(y) * T(PI_4), T(PI_2)), r)
+    r = vifelse(x == 0, vifelse(_sign(y) == -1, T(M_PI), T(0.0)), r)
+
+    return vifelse(isnan(y) | isnan(x), T(NaN), flipsign(r,x))
 end
 
 
@@ -791,19 +857,16 @@ end
 
 Compute the inverse tangent of `x/y`, using the signs of both `x` and `y` to determine the quadrant of the return value.
 """
-function atan_fast(x::T, y::T) where {T<:Union{Float32,Float64}}
+function atan_fast(x::T, y::T) where {T<:FloatType}
     r = atan2k_fast(abs(x), y)
     r = flipsign(r, y)
-    if isinf(y) || y == 0
-        r = T(PI_2) - (isinf(y) ? _sign(y) * T(PI_2) : T(0))
-    end
-    if isinf(x)
-        r = T(PI_2) - (isinf(y) ? _sign(y) * T(PI_4) : T(0))
-    end
-    if x == 0
-        r = _sign(y) == -1 ? T(M_PI) : T(0)
-    end
-    return isnan(y) || isnan(x) ? T(NaN) : flipsign(r, x)
+    r = vifelse(y == 0, T(PI_2), r)
+    infy = isinf(y)
+    r = vifelse(infy, T(PI_2) - _sign(y) * T(PI_2), r)
+    r = vifelse(isinf(x), vifelse(infy, T(PI_2) - _sign(y) * T(PI_4), T(PI_2)), r)
+    r = vifelse(x == 0, vifelse(_sign(y) == -1, T(M_PI), T(0)), r)
+
+    return vifelse(isnan(y) | isnan(x), T(NaN), flipsign(r, x))
 end
 
 
@@ -813,10 +876,10 @@ end
 
 Compute the inverse sine of `x`, where the output is in radians.
 """
-function asin(x::T) where {T<:Union{Float32,Float64}}
+function asin(x::T) where {T<:FloatType}
     d = atan2k(Double(abs(x)), dsqrt(dmul(dadd(T(1), x), dsub(T(1), x))))
     u = T(d)
-    abs(x) == 1 && (u = T(PI_2))
+    u = vifelse(abs(x) == 1, T(PI_2), u)
     flipsign(u, x)
 end
 
@@ -826,7 +889,7 @@ end
 
 Compute the inverse sine of `x`, where the output is in radians.
 """
-function asin_fast(x::T) where {T<:Union{Float32,Float64}}
+function asin_fast(x::T) where {T<:FloatType}
     flipsign(atan2k_fast(abs(x), _sqrt((1 + x) * (1 - x))), x)
 end
 
@@ -837,11 +900,11 @@ end
 
 Compute the inverse cosine of `x`, where the output is in radians.
 """
-function acos(x::T) where {T<:Union{Float32,Float64}}
+function acos(x::T) where {T<:FloatType}
     d = atan2k(dsqrt(dmul(dadd(T(1), x), dsub(T(1), x))), Double(abs(x)))
     d = flipsign(d, x)
-    abs(x) == 1 && (d = Double(T(0)))
-    signbit(x) && (d = dadd(MDPI(T), d))
+    d = vifelse(abs(x) == 1, Double(T(0)), d)
+    d = vifelse(signbit(x), dadd(MDPI(T), d), d)
     return T(d)
 end
 
@@ -852,5 +915,5 @@ end
 Compute the inverse cosine of `x`, where the output is in radians.
 """
 function acos_fast(x::T) where {T<:Union{Float32,Float64}}
-    flipsign(atan2k_fast(_sqrt((1 + x) * (1 - x)), abs(x)), x) + (signbit(x) ? T(M_PI) : T(0))
+    flipsign(atan2k_fast(_sqrt((1 + x) * (1 - x)), abs(x)), x) + vifelse(signbit(x), T(M_PI), T(0))
 end
diff --git a/src/utils.jl b/src/utils.jl
index 3e55f2d..df339f8 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -6,25 +6,52 @@ for T in (Float32, Float64)
 end
 const FMA_FAST = is_fma_fast(Float64) && is_fma_fast(Float32)
 
-@inline isnegzero(x::T) where {T<:AbstractFloat} = x === T(-0.0)
-
-@inline ispinf(x::T) where {T<:AbstractFloat} = x == T(Inf)
-@inline isninf(x::T) where {T<:AbstractFloat} = x == T(-Inf)
+@inline isnegzero(x::T) where {T<:Union{Float32,Float64}} = x === T(-0.0)
+# Disabling the check for performance when vecterized.
+# My initial attempt at vectorizing the function failed.
+@inline isnegzero(x::Vec{N}) where N = Vec{N,Bool}(false)
+# @inline function isnegzero(x::Vec{N,T}) where {N,T<:Union{Float32,Float64}}
+#     Vec{N,Bool}(ntuple(Val(N)) do n
+#         x[n] === T(-0.0)
+#     end)
+# end
+
+@inline ispinf(x::T) where {T<:FloatType} = x == T(Inf)
+@inline isninf(x::T) where {T<:FloatType} = x == T(-Inf)
 
 # _sign emits better native code than sign but does not properly handle the Inf/NaN cases
-@inline _sign(d::T) where {T<:AbstractFloat} = flipsign(one(T), d)
+@inline _sign(d::T) where {T<:FloatType} = flipsign(one(T), d)
 
 @inline integer2float(::Type{Float64}, m::Int) = reinterpret(Float64, (m % Int64) << significand_bits(Float64))
 @inline integer2float(::Type{Float32}, m::Int) = reinterpret(Float32, (m % Int32) << significand_bits(Float32))
 
+@inline function integer2float(::Type{<:Union{Vec{N,Float64},Float64}}, m::Vec{N,Int}) where N
+    reinterpret(Vec{N,Float64}, Vec{N,Int64}(ntuple(Val(N)) do n
+        Core.VecElement(m[n] % Int64)
+    end) << significand_bits(Float64))
+end
+@inline function integer2float(::Type{<:Union{Vec{N,Float32},Float32}}, m::Vec{N,Int32}) where N
+    reinterpret(Vec{N,Float32}, Vec{N,Int32}(ntuple(Val(N)) do n
+        Core.VecElement(m[n] % Int32)
+    end) << Int32(significand_bits(Float32)))
+end
+
 @inline float2integer(d::Float64) = (reinterpret(Int64, d) >> significand_bits(Float64)) % Int
 @inline float2integer(d::Float32) = (reinterpret(Int32, d) >> significand_bits(Float32)) % Int
 
-@inline pow2i(::Type{T}, q::Int) where {T<:Union{Float32,Float64}} = integer2float(T, q + exponent_bias(T))
+@inline function float2integer(d::Vec{N,Float64}) where N
+    Vec{N,Int64}(ntuple(Val(N)) do n
+        Core.VecElement((reinterpret(Int64, d[n]) >> significand_bits(Float64)) % Int)
+    end)
+end
+@inline function float2integer(d::Vec{N,Float32}) where N
+    Vec{N,Int32}(ntuple(Val(N)) do n
+        Core.VecElement((reinterpret(Int32, d[n]) >> Int32(significand_bits(Float32))) % Int32)
+    end)
+end
+
+@inline pow2i(::Type{T}, q::I) where {T<:Union{Float32,Float64},I<:IntegerType} = integer2float(T, q + exponent_bias(T))
 
 # sqrt without the domain checks which we don't need since we handle the checks ourselves
-if VERSION < v"0.7-"
-    _sqrt(x::T) where {T<:Union{Float32,Float64}} = Base.sqrt_llvm_fast(x)
-else
-    _sqrt(x::T) where {T<:Union{Float32,Float64}} = Base.sqrt_llvm(x)
-end
+@inline _sqrt(x::T) where {T<:Union{Float32,Float64}} = Base.sqrt_llvm(x)
+@inline _sqrt(x::Vec) = sqrt(x)

From 8b83a5ac5c51cd3da68625f0de93640d29783b9e Mon Sep 17 00:00:00 2001
From: Chris Elrod <elrodc@gmail.com>
Date: Tue, 25 Dec 2018 02:41:11 -0600
Subject: [PATCH 2/7] Bug fixes to the vectorized versions of a few hyp and
 trig functions.

---
 src/hyp.jl  | 22 +++++++++++-----------
 src/priv.jl | 14 +++++++++-----
 src/trig.jl | 19 ++++++++++---------
 3 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/src/hyp.jl b/src/hyp.jl
index 84f4a4f..49a6cb5 100644
--- a/src/hyp.jl
+++ b/src/hyp.jl
@@ -8,12 +8,12 @@ over_sch(::Type{Float32}) = 89f0
 
 Compute hyperbolic sine of `x`.
 """
-function sinh(x::FloatType)
+function sinh(x::V) where V <: FloatType
     T = eltype(x)
     u = abs(x)
     d = expk2(Double(u))
     d = dsub(d, drec(d))
-    u = T(d) * T(0.5)
+    u = V(d) * T(0.5)
     u = vifelse(abs(x) > over_sch(T), T(Inf), u)
     u = vifelse(isnan(u), T(Inf), u)
     u = flipsign(u, x)
@@ -28,12 +28,12 @@ end
 
 Compute hyperbolic cosine of `x`.
 """
-function cosh(x::FloatType)
+function cosh(x::V) where V <: FloatType
     T = eltype(x)
     u = abs(x)
     d = expk2(Double(u))
     d = dadd(d, drec(d))
-    u = T(d) * T(0.5)
+    u = V(d) * T(0.5)
     u = vifelse(abs(x) > over_sch(T), T(Inf), u)
     u = vifelse(isnan(u), T(Inf), u)
     u = vifelse(isnan(x), T(NaN), u)
@@ -50,13 +50,13 @@ over_th(::Type{Float32}) = 18.714973875f0
 
 Compute hyperbolic tangent of `x`.
 """
-function tanh(x::FloatType)
+function tanh(x::V) where V <: FloatType
     T = eltype(x)
     u = abs(x)
     d = expk2(Double(u))
     e = drec(d)
     d = ddiv(dsub(d, e), dadd(d, e))
-    u = T(d)
+    u = V(d)
     u = vifelse(abs(x) > over_th(T), T(1.0), u)
     u = vifelse(isnan(u), T(1), u)
     u = flipsign(u, x)
@@ -81,7 +81,7 @@ function asinh(x::V) where V <: FloatType
     d = vifelse(yg1, dmul(d, y), d)
 
     d = logk2(dnormalize(dadd(d, x)))
-    y = T(d)
+    y = V(d)
 
     y = vifelse(((abs(x) > SQRT_MAX(T)) | isnan(y)), flipsign(T(Inf), x), y)
     y = vifelse(isnan(x), T(NaN), y)
@@ -97,10 +97,10 @@ end
 
 Compute the inverse hyperbolic cosine of `x`.
 """
-function acosh(x::FloatType)
+function acosh(x::V) where V <: FloatType
     T = eltype(x)
     d = logk2(dadd2(dmul(dsqrt(dadd2(x, T(1.0))), dsqrt(dsub2(x, T(1.0)))), x))
-    y = T(d)
+    y = V(d)
 
     y = vifelse(((x > SQRT_MAX(T)) | isnan(y)), T(Inf), y)
     y = vifelse(x == T(1.0), T(0.0), y)
@@ -117,11 +117,11 @@ end
 
 Compute the inverse hyperbolic tangent of `x`.
 """
-function atanh(x::FloatType)
+function atanh(x::V) where V <: FloatType
     T = eltype(x)
     u = abs(x)
     d = logk2(ddiv(dadd2(T(1.0), u), dsub2(T(1.0), u)))
-    u = vifelse(u > T(1.0), T(NaN), vifelse(u == T(1.0), T(Inf), T(d) * T(0.5)))
+    u = vifelse(u > T(1.0), T(NaN), vifelse(u == T(1.0), T(Inf), V(d) * T(0.5)))
 
     u = vifelse(isinf(x) | isnan(u), T(NaN), u)
     u = flipsign(u, x)
diff --git a/src/priv.jl b/src/priv.jl
index 78af53f..8641641 100644
--- a/src/priv.jl
+++ b/src/priv.jl
@@ -150,7 +150,11 @@ end
 
 @inline function atan2k(y::Double{V}, x::Double{V}) where {T,V<:Union{T,Vec{<:Any,T}}}
     xl0 = x < 0
-    q = vifelse(xl0, -2, 0)
+    if V <: Vec
+        q = vifelse(xl0, Vec{length(x.hi),EquivalentInteger(T)}(-2), 0)
+    else
+        q = vifelse(xl0, -2, 0)
+    end
     x = vifelse(xl0, -x, x)
     ygx = y > x
     t = x
@@ -167,7 +171,7 @@ end
     t = dmul(t, u)
     t = dmul(s, dadd(T(1.0), t))
     T <: Float64 && (t = vifelse(abs(s.hi) < 1e-200, s, t))
-    t = dadd(dmul(V(q), MDPI2(T)), t)
+    t = dadd(dmul(convert(V,q), MDPI2(T)), t)
     return t
 end
 
@@ -247,8 +251,8 @@ end
 end
 
 @inline function expk2(d::Double{V}) where {T<:Union{Float32,Float64},V<:Union{T,Vec{<:Any,T}}}
-    q = round(T(d) * T(MLN2E))
-    qi = unsafe_trunc(Int, q)
+    q = round(V(d) * T(MLN2E))
+    qi = unsafe_trunc(EquivalentInteger(T), q)
 
     s = dadd(d, -q * L2U(T))
     s = dadd(s, -q * L2L(T))
@@ -295,7 +299,7 @@ end
 
     t  = logk2_kernel(x2.hi)
 
-    s = dmul(MDLN2(T), T(e))
+    s = dmul(MDLN2(T), convert(V,e))
     s = dadd(s, scale(x, T(2.0)))
     s = dadd(s, dmul(dmul(x2, x), t))
     return s
diff --git a/src/trig.jl b/src/trig.jl
index 8a4b223..adc265d 100644
--- a/src/trig.jl
+++ b/src/trig.jl
@@ -650,7 +650,7 @@ function tan_fast(d::FloatType32)
 end
 
 
-@inline function tan_kernel(x::Double{Float64})
+@inline function tan_kernel(x::Double{<:FloatType64})
     c15 = 1.01419718511083373224408e-05
     c14 = -2.59519791585924697698614e-05
     c13 = 5.23388081915899855325186e-05
@@ -680,11 +680,11 @@ end
     return dadd(c1,  x.hi * (@horner x.hi c2 c3 c4 c5 c6 c7))
 end
 
-function tan(d::FloatType64)
+function tan(d::V) where V <: FloatType64
     T = eltype(d)
     qh = trunc(d * (T(M_2_PI)) / (1 << 24))
-    s = dadd2(dmul(Double(T(M_2_PI_H), T(M_2_PI_L)), d), (d < 0 ? T(-0.5) : T(0.5)) - qh * (1 << 24))
-    ql = trunc(T(s))
+    s = dadd2(dmul(Double(T(M_2_PI_H), T(M_2_PI_L)), d), vifelse(d < 0, V(-0.5), V(0.5)) - qh * (1 << 24))
+    ql = trunc(V(s))
 
     s = dadd2(d, qh * (-PI_A(T) * T(0.5) * (1 << 24)))
     s = dadd2(s, ql * (-PI_A(T) * T(0.5)            ))
@@ -708,7 +708,7 @@ function tan(d::FloatType64)
 
     x = vifelse(qli_odd, drec(x), x)
 
-    v = T(x)
+    v = V(x)
 
     v = vifelse(
         (!isinf(d)) & (isnegzero(d) | (abs(d) > TRIG_MAX(T))),
@@ -718,7 +718,7 @@ function tan(d::FloatType64)
     return v
 end
 
-function tan(d::FloatType32)
+function tan(d::V) where V <: FloatType32
     T = eltype(d)
     q = round(d * (T(M_2_PI)))
 
@@ -743,7 +743,7 @@ function tan(d::FloatType32)
 
     x = vifelse(qli_odd, drec(x), x)
 
-    v = T(x)
+    v = V(x)
 
     v = vifelse(
         (!isinf(d)) & (isnegzero(d) | (abs(d) > TRIG_MAX(T))),
@@ -900,12 +900,13 @@ end
 
 Compute the inverse cosine of `x`, where the output is in radians.
 """
-function acos(x::T) where {T<:FloatType}
+function acos(x::V) where {V<:FloatType}
+    T = eltype(x)
     d = atan2k(dsqrt(dmul(dadd(T(1), x), dsub(T(1), x))), Double(abs(x)))
     d = flipsign(d, x)
     d = vifelse(abs(x) == 1, Double(T(0)), d)
     d = vifelse(signbit(x), dadd(MDPI(T), d), d)
-    return T(d)
+    return V(d)
 end
 
 

From faac78d895c033ffee6eb64088b0773d95715179 Mon Sep 17 00:00:00 2001
From: Chris Elrod <elrodc@gmail.com>
Date: Tue, 25 Dec 2018 03:13:33 -0600
Subject: [PATCH 3/7] Attempt to fix code for 32 bit Windows.

---
 src/priv.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/priv.jl b/src/priv.jl
index 8641641..5d0ffdf 100644
--- a/src/priv.jl
+++ b/src/priv.jl
@@ -51,7 +51,7 @@ end
     x * pow2i(eltype(x), e >> 1) * pow2i(eltype(x), e - (e >> 1))
 end
 
-@inline function ldexp3k(x::T, e::Int) where {T<:Union{Float32,Float64}}
+@inline function ldexp3k(x::T, e::Integer) where {T<:Union{Float32,Float64}}
     reinterpret(T, reinterpret(Unsigned, x) + (Int64(e) << significand_bits(T)) % uinttype(T))
 end
 @generated function ldexp3k(x::Vec{N,T}, e::IntegerType) where {N,T<:Union{Float32,Float64}}
@@ -89,7 +89,7 @@ end
 # similar to ilogbk, but argument has to be a normalized float value
 @inline function ilogb2k(d::FloatType)
     T = eltype(d)
-    I = EquivalentInteger(T)
+    I = Int == Int32 ? Int32 : EquivalentInteger(T)
     (float2integer(d) & I(exponent_raw_max(T))) - I(exponent_bias(T))
 end
 

From 3f04bcc3b3262ec2bd1dcdda36e9d7930ef0c6a0 Mon Sep 17 00:00:00 2001
From: Chris Elrod <elrodc@gmail.com>
Date: Tue, 25 Dec 2018 03:28:16 -0600
Subject: [PATCH 4/7] Second attempt at fixing for 64bit Windows, via defining
 EquivalentInteger(::Type{T}) to always defaut to 32 bit integers on 32 bit
 systems.

---
 src/SLEEF.jl | 4 ++--
 src/priv.jl  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/SLEEF.jl b/src/SLEEF.jl
index 0065690..bc531ac 100644
--- a/src/SLEEF.jl
+++ b/src/SLEEF.jl
@@ -18,9 +18,9 @@ const IntegerType64 = Union{Int64,Vec{<:Any,Int64}}
 const IntegerType32 = Union{Int32,Vec{<:Any,Int32}}
 const IntegerType = Union{IntegerType64,IntegerType32}
 
-EquivalentInteger(::Type{Float64}) = Int64
+EquivalentInteger(::Type{Float64}) = Int == Int32 ? Int32 : Int64
 EquivalentInteger(::Type{Float32}) = Int32
-EquivalentInteger(::Type{Vec{N,Float64}}) where N = Vec{N,Int64}
+EquivalentInteger(::Type{Vec{N,Float64}}) where N = Int == Int32 ? Vec{N,Int32} : Vec{N,Int64}
 EquivalentInteger(::Type{Vec{N,Float32}}) where N = Vec{N,Int32}
 
 @generated function Base.unsafe_trunc(::Type{I}, x::Vec{N,T}) where {N,T,I}
diff --git a/src/priv.jl b/src/priv.jl
index 5d0ffdf..a4f29b0 100644
--- a/src/priv.jl
+++ b/src/priv.jl
@@ -89,7 +89,7 @@ end
 # similar to ilogbk, but argument has to be a normalized float value
 @inline function ilogb2k(d::FloatType)
     T = eltype(d)
-    I = Int == Int32 ? Int32 : EquivalentInteger(T)
+    I = EquivalentInteger(T)
     (float2integer(d) & I(exponent_raw_max(T))) - I(exponent_bias(T))
 end
 

From 3aee0ef83615d5ae660541d4b4743202d7d70232 Mon Sep 17 00:00:00 2001
From: Chris Elrod <elrodc@gmail.com>
Date: Tue, 25 Dec 2018 13:28:52 -0600
Subject: [PATCH 5/7] Made changes reflecting comments: restored a let block,
 reverted muladd changes, and removed a commented out function.

---
 Manifest.toml | 39 -------------------------
 src/exp.jl    |  2 +-
 src/log.jl    |  3 +-
 src/priv.jl   | 79 +++++++++++++++++++++++++++------------------------
 src/trig.jl   |  6 ++--
 src/utils.jl  |  7 +----
 6 files changed, 48 insertions(+), 88 deletions(-)
 delete mode 100644 Manifest.toml

diff --git a/Manifest.toml b/Manifest.toml
deleted file mode 100644
index ee50dd2..0000000
--- a/Manifest.toml
+++ /dev/null
@@ -1,39 +0,0 @@
-# This file is machine-generated - editing it directly is not advised
-
-[[Base64]]
-uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
-
-[[Distributed]]
-deps = ["Random", "Serialization", "Sockets"]
-uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
-
-[[InteractiveUtils]]
-deps = ["Markdown"]
-uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
-
-[[Logging]]
-uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
-
-[[Markdown]]
-deps = ["Base64"]
-uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
-
-[[Random]]
-deps = ["Serialization"]
-uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-
-[[SIMD]]
-deps = ["InteractiveUtils", "Test"]
-git-tree-sha1 = "c6f6f4a8103e69c66b06def75f04904f72111c51"
-uuid = "fdea26ae-647d-5447-a871-4b548cad5224"
-version = "2.2.0"
-
-[[Serialization]]
-uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
-
-[[Sockets]]
-uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
-
-[[Test]]
-deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
-uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/src/exp.jl b/src/exp.jl
index 4870c59..739660d 100644
--- a/src/exp.jl
+++ b/src/exp.jl
@@ -184,7 +184,7 @@ function exp(d::FloatType)
     s = muladd(q, -L2L(T), s)
 
     u = exp_kernel(s)
-    u = muladd(s * s, u, s) + 1
+    u = s * s * u + s + 1
     u = ldexp2k(u, qi)
 
     u = vifelse(d > max_exp(T), T(Inf), u)
diff --git a/src/log.jl b/src/log.jl
index 24b255c..87d4b34 100644
--- a/src/log.jl
+++ b/src/log.jl
@@ -196,8 +196,7 @@ function log_fast(d::FloatType)
 
     t = log_fast_kernel(x2)
 
-    x = muladd(x, t, T(MLN2) * e)
-
+    x = x * t + T(MLN2) * e
 
     x = vifelse(isinf(d), T(Inf), x)
     x = vifelse((d < 0) | isnan(d), T(NaN), x)
diff --git a/src/priv.jl b/src/priv.jl
index a4f29b0..f51793a 100644
--- a/src/priv.jl
+++ b/src/priv.jl
@@ -93,39 +93,43 @@ end
     (float2integer(d) & I(exponent_raw_max(T))) - I(exponent_bias(T))
 end
 
-const c20d =  1.06298484191448746607415e-05
-const c19d = -0.000125620649967286867384336
-const c18d =  0.00070557664296393412389774
-const c17d = -0.00251865614498713360352999
-const c16d =  0.00646262899036991172313504
-const c15d = -0.0128281333663399031014274
-const c14d =  0.0208024799924145797902497
-const c13d = -0.0289002344784740315686289
-const c12d =  0.0359785005035104590853656
-const c11d = -0.041848579703592507506027
-const c10d =  0.0470843011653283988193763
-const c9d  = -0.0524914210588448421068719
-const c8d  =  0.0587946590969581003860434
-const c7d  = -0.0666620884778795497194182
-const c6d  =  0.0769225330296203768654095
-const c5d  = -0.0909090442773387574781907
-const c4d  =  0.111111108376896236538123
-const c3d  = -0.142857142756268568062339
-const c2d  =  0.199999999997977351284817
-const c1d =  -0.333333333333317605173818
-
-const c9f = -0.00176397908944636583328247f0
-const c8f =  0.0107900900766253471374512f0
-const c7f = -0.0309564601629972457885742f0
-const c6f =  0.0577365085482597351074219f0
-const c5f = -0.0838950723409652709960938f0
-const c4f =  0.109463557600975036621094f0
-const c3f = -0.142626821994781494140625f0
-const c2f =  0.199983194470405578613281f0
-const c1f = -0.333332866430282592773438f0
-
-@inline atan2k_fast_kernel(x::FloatType64) = @horner x c1d c2d c3d c4d c5d c6d c7d c8d c9d c10d c11d c12d c13d c14d c15d c16d c17d c18d c19d c20d
-@inline atan2k_fast_kernel(x::FloatType32) = @horner x c1f c2f c3f c4f c5f c6f c7f c8f c9f
+let
+global atan2k_fast
+global atan2k
+
+c20d =  1.06298484191448746607415e-05
+c19d = -0.000125620649967286867384336
+c18d =  0.00070557664296393412389774
+c17d = -0.00251865614498713360352999
+c16d =  0.00646262899036991172313504
+c15d = -0.0128281333663399031014274
+c14d =  0.0208024799924145797902497
+c13d = -0.0289002344784740315686289
+c12d =  0.0359785005035104590853656
+c11d = -0.041848579703592507506027
+c10d =  0.0470843011653283988193763
+c9d  = -0.0524914210588448421068719
+c8d  =  0.0587946590969581003860434
+c7d  = -0.0666620884778795497194182
+c6d  =  0.0769225330296203768654095
+c5d  = -0.0909090442773387574781907
+c4d  =  0.111111108376896236538123
+c3d  = -0.142857142756268568062339
+c2d  =  0.199999999997977351284817
+c1d =  -0.333333333333317605173818
+
+c9f = -0.00176397908944636583328247f0
+c8f =  0.0107900900766253471374512f0
+c7f = -0.0309564601629972457885742f0
+c6f =  0.0577365085482597351074219f0
+c5f = -0.0838950723409652709960938f0
+c4f =  0.109463557600975036621094f0
+c3f = -0.142626821994781494140625f0
+c2f =  0.199983194470405578613281f0
+c1f = -0.333332866430282592773438f0
+
+global @inline atan2k_fast_kernel(x::FloatType64) = @horner x c1d c2d c3d c4d c5d c6d c7d c8d c9d c10d c11d c12d c13d c14d c15d c16d c17d c18d c19d c20d
+global @inline atan2k_fast_kernel(x::FloatType32) = @horner x c1f c2f c3f c4f c5f c6f c7f c8f c9f
 
 @inline function atan2k_fast(y::V, x::V) where {T<:Union{Float32,Float64},V<:Union{T,Vec{<:Any,T}}}
     xl0 = x < 0
@@ -140,13 +144,13 @@ const c1f = -0.333332866430282592773438f0
     s = y / x
     t = s * s
     u = atan2k_fast_kernel(t)
-    t = muladd(u, t * s, s)
-    muladd(q, T(PI_2), t)
+    t = u * t * s + s
+    q * T(PI_2) + t
 end
 
 
-@inline atan2k_kernel(x::Double{<:FloatType64}) = @horner x.hi c1d c2d c3d c4d c5d c6d c7d c8d c9d c10d c11d c12d c13d c14d c15d c16d c17d c18d c19d c20d
-@inline atan2k_kernel(x::Double{<:FloatType32}) = dadd(c1f, x.hi * (@horner x.hi c2f c3f c4f c5f c6f c7f c8f c9f))
+global @inline atan2k_kernel(x::Double{<:FloatType64}) = @horner x.hi c1d c2d c3d c4d c5d c6d c7d c8d c9d c10d c11d c12d c13d c14d c15d c16d c17d c18d c19d c20d
+global @inline atan2k_kernel(x::Double{<:FloatType32}) = dadd(c1f, x.hi * (@horner x.hi c2f c3f c4f c5f c6f c7f c8f c9f))
 
 @inline function atan2k(y::Double{V}, x::Double{V}) where {T,V<:Union{T,Vec{<:Any,T}}}
     xl0 = x < 0
@@ -174,6 +178,7 @@ end
     t = dadd(dmul(convert(V,q), MDPI2(T)), t)
     return t
 end
+end
 
 
 
diff --git a/src/trig.jl b/src/trig.jl
index adc265d..8be7a47 100644
--- a/src/trig.jl
+++ b/src/trig.jl
@@ -376,7 +376,7 @@ function sincos_fast(d::FloatType64)
 
     u = sincos_b_kernel(s)
 
-    ry = muladd(u, s, T(1.0))
+    ry = u * s + T(1.0)
 
     qli = unsafe_trunc(EquivalentInteger(T), ql)
     qli_odd = qli & 1 != 0
@@ -420,7 +420,7 @@ function sincos_fast(d::FloatType32)
 
     u = sincos_b_kernel(s)
 
-    ry = muladd(u, s, T(1.0))
+    ry = u * s + T(1.0)
 
     qi = unsafe_trunc(EquivalentInteger(T), q)
     qi_isodd = qi & 1 != 0
@@ -819,7 +819,7 @@ function atan_fast(x::T) where {T<:FloatType}
     t = x * x
     u = atan_fast_kernel(t)
     # t = x + x * t * u
-    t = muladd(x * t, u, x)
+    t = x * t * u + x
     t = vifelse(q & 1 != 0, T(PI_2) - t, t)
     t = vifelse(q & 2 != 0, -t, t)
     return t
diff --git a/src/utils.jl b/src/utils.jl
index df339f8..a78349d 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -8,13 +8,8 @@ const FMA_FAST = is_fma_fast(Float64) && is_fma_fast(Float32)
 
 @inline isnegzero(x::T) where {T<:Union{Float32,Float64}} = x === T(-0.0)
 # Disabling the check for performance when vecterized.
-# My initial attempt at vectorizing the function failed.
+# A PR succesfully vectorizing the check is welcome.
 @inline isnegzero(x::Vec{N}) where N = Vec{N,Bool}(false)
-# @inline function isnegzero(x::Vec{N,T}) where {N,T<:Union{Float32,Float64}}
-#     Vec{N,Bool}(ntuple(Val(N)) do n
-#         x[n] === T(-0.0)
-#     end)
-# end
 
 @inline ispinf(x::T) where {T<:FloatType} = x == T(Inf)
 @inline isninf(x::T) where {T<:FloatType} = x == T(-Inf)

From 99ec7ecfc86030b37b7feeb7bab460df96fa2f43 Mon Sep 17 00:00:00 2001
From: Chris Elrod <elrodc@gmail.com>
Date: Tue, 25 Dec 2018 13:32:59 -0600
Subject: [PATCH 6/7] Removed the rest of the added muladds.

---
 src/misc.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/misc.jl b/src/misc.jl
index 8e6c401..2a32e81 100644
--- a/src/misc.jl
+++ b/src/misc.jl
@@ -66,9 +66,9 @@ function cbrt_fast(d::V) where V <: FloatType
     x  = cbrt_kernel(d)
     y  = x * x
     y  = y * y
-    x  = muladd(T(-1/3), muladd(d, y, - x), x)
+    x -= (d * y - x) * T(1 / 3)
     y  = d * x * x
-    y  = (y - T(2 / 3) * y * muladd(y, x, T(- 1))) * q
+    y = (y - T(2 / 3) * y * (y * x - 1)) * q
 end
 
 
@@ -89,7 +89,7 @@ function cbrt(d::V) where V <: FloatType
     x  = cbrt_kernel(d)
     y  = x * x
     y  = y * y
-    x  = muladd(T(-1 / 3), muladd(d, y, - x), x)
+    x -= (d * y - x) * T(1 / 3)
     z  = x
     u  = dsqu(x)
     u  = dsqu(u)
@@ -121,5 +121,5 @@ function hypot(x::T, y::T) where {T<:vIEEEFloat}
     y = max(a,b)
 
     r = vifelse(x == 0, y, y / x)
-    x * _sqrt(muladd(r, r, T(1.0)))
+    x * sqrt(T(1.0) + r * r)
 end

From e57ed3c1891a5438078974a4f8fc01936a31200c Mon Sep 17 00:00:00 2001
From: Chris Elrod <elrodc@gmail.com>
Date: Tue, 25 Dec 2018 19:30:48 -0600
Subject: [PATCH 7/7] Style updates.

---
 src/SLEEF.jl  | 12 ++++++++----
 src/double.jl |  4 ++--
 src/exp.jl    | 25 +++++++++++++------------
 src/hyp.jl    | 12 ++++++------
 src/log.jl    |  4 ++--
 src/misc.jl   |  6 +++---
 src/priv.jl   |  8 ++++----
 src/trig.jl   | 34 +++++++++++++++++-----------------
 src/utils.jl  | 10 +++++-----
 9 files changed, 60 insertions(+), 55 deletions(-)

diff --git a/src/SLEEF.jl b/src/SLEEF.jl
index bc531ac..4f200e5 100644
--- a/src/SLEEF.jl
+++ b/src/SLEEF.jl
@@ -18,10 +18,14 @@ const IntegerType64 = Union{Int64,Vec{<:Any,Int64}}
 const IntegerType32 = Union{Int32,Vec{<:Any,Int32}}
 const IntegerType = Union{IntegerType64,IntegerType32}
 
-EquivalentInteger(::Type{Float64}) = Int == Int32 ? Int32 : Int64
-EquivalentInteger(::Type{Float32}) = Int32
-EquivalentInteger(::Type{Vec{N,Float64}}) where N = Int == Int32 ? Vec{N,Int32} : Vec{N,Int64}
-EquivalentInteger(::Type{Vec{N,Float32}}) where N = Vec{N,Int32}
+fpinttype(::Type{Float64}) = Int == Int32 ? Int32 : Int64
+fpinttype(::Type{Float32}) = Int32
+function fpinttype(::Type{Vec{N,Float64}}) where {N}
+    Int == Int32 ? Vec{N,Int32} : Vec{N,Int64}
+end
+function fpinttype(::Type{Vec{N,Float32}}) where {N}
+    Vec{N,Int32}
+end
 
 @generated function Base.unsafe_trunc(::Type{I}, x::Vec{N,T}) where {N,T,I}
     quote
diff --git a/src/double.jl b/src/double.jl
index c376c77..ba3aef4 100644
--- a/src/double.jl
+++ b/src/double.jl
@@ -14,10 +14,10 @@ Double(x::T) where {T<:vIEEEFloat} = Double(x, zero(T))
 @inline trunclo(x::Float64) = reinterpret(Float64, reinterpret(UInt64, x) & 0xffff_ffff_f800_0000) # clear lower 27 bits (leave upper 26 bits)
 @inline trunclo(x::Float32) = reinterpret(Float32, reinterpret(UInt32, x) & 0xffff_f000) # clear lowest 12 bits (leave upper 12 bits)
 
-@inline function trunclo(x::Vec{N,Float64}) where N
+@inline function trunclo(x::Vec{N,Float64}) where {N}
     reinterpret(Vec{N,Float64}, reinterpret(Vec{N,UInt64}, x) & 0xffff_ffff_f800_0000) # clear lower 27 bits (leave upper 26 bits)
 end
-@inline function trunclo(x::Vec{N,Float32}) where N
+@inline function trunclo(x::Vec{N,Float32}) where {N}
     reinterpret(Vec{N,Float32}, reinterpret(Vec{N,UInt32}, x) & 0xffff_f000) # clear lowest 12 bits (leave upper 12 bits)
 end
 
diff --git a/src/exp.jl b/src/exp.jl
index 739660d..aee7228 100644
--- a/src/exp.jl
+++ b/src/exp.jl
@@ -26,7 +26,7 @@ const min_exp2(::Type{Float32}) = -150f0
     c3  = 0.5550410866482046596e-1
     c2  = 0.2402265069591012214
     c1  = 0.6931471805599452862
-    @horner x c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11
+    return @horner x c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11
 end
 
 @inline function exp2_kernel(x::FloatType32)
@@ -36,7 +36,7 @@ end
     c3 = 0.5550347269f-1
     c2 = 0.2402264476f0
     c1 = 0.6931471825f0
-    @horner x c1 c2 c3 c4 c5 c6
+    return @horner x c1 c2 c3 c4 c5 c6
 end
 
 """
@@ -44,10 +44,10 @@ end
 
 Compute the base-`2` exponential of `x`, that is `2ˣ`.
 """
-function exp2(d::V) where V <: FloatType
+function exp2(d::V) where {V <: FloatType}
     T = eltype(d)
     q = round(d)
-    qi = unsafe_trunc(EquivalentInteger(T), q)
+    qi = unsafe_trunc(fpinttype(T), q)
 
     s = d - q
 
@@ -58,7 +58,8 @@ function exp2(d::V) where V <: FloatType
 
     u = vifelse(d > max_exp2(T), T(Inf), u)
     u = vifelse(d < min_exp2(T), T(0.0), u)
-    u
+
+    return u
 end
 
 
@@ -98,10 +99,10 @@ end
 
 Compute the base-`10` exponential of `x`, that is `10ˣ`.
 """
-function exp10(d::V) where V <: FloatType
+function exp10(d::V) where {V <: FloatType}
     T = eltype(d)
     q = round(T(MLOG10_2) * d)
-    qi = unsafe_trunc(EquivalentInteger(T), q)
+    qi = unsafe_trunc(fpinttype(T), q)
 
     s = muladd(q, -L10U(T), d)
     s = muladd(q, -L10L(T), s)
@@ -114,7 +115,7 @@ function exp10(d::V) where V <: FloatType
     u = vifelse(d > max_exp10(T), T(Inf), u)
     u = vifelse(d < min_exp10(T), T(0.0), u)
 
-    u
+    return u
 end
 
 
@@ -157,7 +158,7 @@ const min_exp(::Type{<:FloatType32}) = -103.97208f0               # ≈ log 2^-1
     c3  = 0.0416666666666665047591422
     c2  = 0.166666666666666851703837
     c1  = 0.50
-    @horner x c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11
+    return @horner x c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11
 end
 
 @inline function exp_kernel(x::FloatType32)
@@ -167,7 +168,7 @@ end
     c3 = 0.0416664853692054748535156f0
     c2 = 0.166666671633720397949219f0
     c1 = 0.5f0
-    @horner x c1 c2 c3 c4 c5 c6
+    return @horner x c1 c2 c3 c4 c5 c6
 end
 
 """
@@ -178,7 +179,7 @@ Compute the base-`e` exponential of `x`, that is `eˣ`.
 function exp(d::FloatType)
     T = eltype(d)
     q = round(T(MLN2E) * d)
-    qi = unsafe_trunc(EquivalentInteger(T), q)
+    qi = unsafe_trunc(fpinttype(T), q)
 
     s = muladd(q, -L2U(T), d)
     s = muladd(q, -L2L(T), s)
@@ -190,5 +191,5 @@ function exp(d::FloatType)
     u = vifelse(d > max_exp(T), T(Inf), u)
     u = vifelse(d < min_exp(T), T(0), u)
 
-    u
+    return u
 end
diff --git a/src/hyp.jl b/src/hyp.jl
index 49a6cb5..397ac6d 100644
--- a/src/hyp.jl
+++ b/src/hyp.jl
@@ -8,7 +8,7 @@ over_sch(::Type{Float32}) = 89f0
 
 Compute hyperbolic sine of `x`.
 """
-function sinh(x::V) where V <: FloatType
+function sinh(x::V) where {V <: FloatType}
     T = eltype(x)
     u = abs(x)
     d = expk2(Double(u))
@@ -28,7 +28,7 @@ end
 
 Compute hyperbolic cosine of `x`.
 """
-function cosh(x::V) where V <: FloatType
+function cosh(x::V) where {V <: FloatType}
     T = eltype(x)
     u = abs(x)
     d = expk2(Double(u))
@@ -50,7 +50,7 @@ over_th(::Type{Float32}) = 18.714973875f0
 
 Compute hyperbolic tangent of `x`.
 """
-function tanh(x::V) where V <: FloatType
+function tanh(x::V) where {V <: FloatType}
     T = eltype(x)
     u = abs(x)
     d = expk2(Double(u))
@@ -71,7 +71,7 @@ end
 
 Compute the inverse hyperbolic sine of `x`.
 """
-function asinh(x::V) where V <: FloatType
+function asinh(x::V) where {V <: FloatType}
     T = eltype(x)
     y = abs(x)
 
@@ -97,7 +97,7 @@ end
 
 Compute the inverse hyperbolic cosine of `x`.
 """
-function acosh(x::V) where V <: FloatType
+function acosh(x::V) where {V <: FloatType}
     T = eltype(x)
     d = logk2(dadd2(dmul(dsqrt(dadd2(x, T(1.0))), dsqrt(dsub2(x, T(1.0)))), x))
     y = V(d)
@@ -117,7 +117,7 @@ end
 
 Compute the inverse hyperbolic tangent of `x`.
 """
-function atanh(x::V) where V <: FloatType
+function atanh(x::V) where {V <: FloatType}
     T = eltype(x)
     u = abs(x)
     d = logk2(ddiv(dadd2(T(1.0), u), dsub2(T(1.0), u)))
diff --git a/src/log.jl b/src/log.jl
index 87d4b34..515a5aa 100644
--- a/src/log.jl
+++ b/src/log.jl
@@ -34,7 +34,7 @@ end
 
 Returns the base `10` logarithm of `x`.
 """
-function log10(a::V) where V <: FloatType
+function log10(a::V) where {V <: FloatType}
     T = eltype(a)
     x = V(dmul(logk(a), MDLN10E(T)))
 
@@ -52,7 +52,7 @@ end
 
 Returns the base `2` logarithm of `x`.
 """
-function log2(a::V) where V <: FloatType
+function log2(a::V) where {V <: FloatType}
     T = eltype(a)
     u = V(dmul(logk(a), MDLN2E(T)))
 
diff --git a/src/misc.jl b/src/misc.jl
index 2a32e81..3ea5551 100644
--- a/src/misc.jl
+++ b/src/misc.jl
@@ -6,7 +6,7 @@ Exponentiation operator, returns `x` raised to the power `y`.
 """
 function pow(x::V, y::V) where {V <: FloatType}
     T = eltype(x)
-    yi = unsafe_trunc(EquivalentInteger(T), y)
+    yi = unsafe_trunc(fpinttype(T), y)
     yisint = yi == y
     yisodd = isodd(yi) & yisint
 
@@ -53,7 +53,7 @@ end
 
 Return `x^{1/3}`.
 """
-function cbrt_fast(d::V) where V <: FloatType
+function cbrt_fast(d::V) where {V <: FloatType}
     T  = eltype(d)
     e  = ilogbk(abs(d)) + 1
     d  = ldexp2k(d, -e)
@@ -77,7 +77,7 @@ end
 
 Return `x^{1/3}`. The prefix operator `∛` is equivalent to `cbrt`.
 """
-function cbrt(d::V) where V <: FloatType
+function cbrt(d::V) where {V <: FloatType}
     T  = eltype(d)
     e  = ilogbk(abs(d)) + 1
     d  = ldexp2k(d, -e)
diff --git a/src/priv.jl b/src/priv.jl
index f51793a..5472ee0 100644
--- a/src/priv.jl
+++ b/src/priv.jl
@@ -89,7 +89,7 @@ end
 # similar to ilogbk, but argument has to be a normalized float value
 @inline function ilogb2k(d::FloatType)
     T = eltype(d)
-    I = EquivalentInteger(T)
+    I = fpinttype(T)
     (float2integer(d) & I(exponent_raw_max(T))) - I(exponent_bias(T))
 end
 
@@ -155,7 +155,7 @@ global @inline atan2k_kernel(x::Double{<:FloatType32}) = dadd(c1f, x.hi * (@horn
 @inline function atan2k(y::Double{V}, x::Double{V}) where {T,V<:Union{T,Vec{<:Any,T}}}
     xl0 = x < 0
     if V <: Vec
-        q = vifelse(xl0, Vec{length(x.hi),EquivalentInteger(T)}(-2), 0)
+        q = vifelse(xl0, Vec{length(x.hi),fpinttype(T)}(-2), 0)
     else
         q = vifelse(xl0, -2, 0)
     end
@@ -210,7 +210,7 @@ end
 
 @inline function expk(d::Double{V}) where {T<:Union{Float32,Float64},V<:Union{T,Vec{<:Any,T}}}
     q = round(V(d) * V(MLN2E))
-    qi = unsafe_trunc(EquivalentInteger(T), q)
+    qi = unsafe_trunc(fpinttype(T), q)
 
     s = dadd(d, -q * L2U(T))
     s = dadd(s, -q * L2L(T))
@@ -257,7 +257,7 @@ end
 
 @inline function expk2(d::Double{V}) where {T<:Union{Float32,Float64},V<:Union{T,Vec{<:Any,T}}}
     q = round(V(d) * T(MLN2E))
-    qi = unsafe_trunc(EquivalentInteger(T), q)
+    qi = unsafe_trunc(fpinttype(T), q)
 
     s = dadd(d, -q * L2U(T))
     s = dadd(s, -q * L2L(T))
diff --git a/src/trig.jl b/src/trig.jl
index 8be7a47..4f48f21 100644
--- a/src/trig.jl
+++ b/src/trig.jl
@@ -55,7 +55,7 @@ function sin(d::V) where V <: FloatType64
     v = dmul(t, dadd(V(1.0), dmul(w, s)))
     u = V(v)
 
-    qli = unsafe_trunc(EquivalentInteger(T), ql)
+    qli = unsafe_trunc(fpinttype(T), ql)
     u = vifelse(qli & 1 != 0, -u, u)
     u = vifelse((!isinf(d)) & (isnegzero(d) | (abs(d) > TRIG_MAX(T))), T(-0.0), u)
 
@@ -80,7 +80,7 @@ function sin(d::V) where V <: FloatType32
     v = dmul(t, dadd(T(1.0), dmul(w, s)))
     u = V(v)
 
-    qi = unsafe_trunc(EquivalentInteger(T), q)
+    qi = unsafe_trunc(fpinttype(T), q)
     u = vifelse(qi & 1 != 0, -u, u)
     u = vifelse((!isinf(d)) & (isnegzero(d) | (abs(d) > TRIG_MAX(T))), T(-0.0), u)
 
@@ -111,7 +111,7 @@ function cos(d::V) where V <: FloatType64
 
     u = V(v)
 
-    qli = unsafe_trunc(EquivalentInteger(T), ql)
+    qli = unsafe_trunc(fpinttype(T), ql)
     u = vifelse(qli & 2 == 0, -u, u)
     u = vifelse(!isinf(d) & (d > TRIG_MAX(T)), T(0.0), u)
 
@@ -138,7 +138,7 @@ function cos(d::V) where V <: FloatType32
 
     u = V(v)
 
-    qi = unsafe_trunc(EquivalentInteger(T), q)
+    qi = unsafe_trunc(fpinttype(T), q)
     u = vifelse(qi & 2 == 0, -u, u)
     u = vifelse(!isinf(d) & (d > TRIG_MAX(T)), T(0.0), u)
 
@@ -204,7 +204,7 @@ function sin_fast(d::FloatType64)
 
     s = d * d
 
-    qli = unsafe_trunc(EquivalentInteger(T), ql)
+    qli = unsafe_trunc(fpinttype(T), ql)
     d = vifelse(qli & 1 != 0, -d, d)
 
     u = sincos_fast_kernel(s)
@@ -228,7 +228,7 @@ function sin_fast(d::FloatType32)
 
     s = d * d
 
-    qli = unsafe_trunc(EquivalentInteger(T), q)
+    qli = unsafe_trunc(fpinttype(T), q)
     d = vifelse(qli & 1 != 0, -d, d)
 
     u = sincos_fast_kernel(s)
@@ -257,7 +257,7 @@ function cos_fast(d::FloatType64)
 
     s = d * d
 
-    qli = unsafe_trunc(EquivalentInteger(T), ql)
+    qli = unsafe_trunc(fpinttype(T), ql)
     d = vifelse(qli & 2 == 0, -d, d)
 
     u = sincos_fast_kernel(s)
@@ -281,7 +281,7 @@ function cos_fast(d::FloatType32)
 
     s = d * d
 
-    qi = unsafe_trunc(EquivalentInteger(T), q)
+    qi = unsafe_trunc(fpinttype(T), q)
     d = vifelse(qi & 2 == 0, -d, d)
 
     u = sincos_fast_kernel(s)
@@ -378,7 +378,7 @@ function sincos_fast(d::FloatType64)
 
     ry = u * s + T(1.0)
 
-    qli = unsafe_trunc(EquivalentInteger(T), ql)
+    qli = unsafe_trunc(fpinttype(T), ql)
     qli_odd = qli & 1 != 0
     s = vifelse(qli_odd, ry, s)
     ry = vifelse(qli_odd, rx, ry)
@@ -422,7 +422,7 @@ function sincos_fast(d::FloatType32)
 
     ry = u * s + T(1.0)
 
-    qi = unsafe_trunc(EquivalentInteger(T), q)
+    qi = unsafe_trunc(fpinttype(T), q)
     qi_isodd = qi & 1 != 0
     s = vifelse(qi_isodd, ry, s)
     ry = vifelse(qi_isodd, rx, ry)
@@ -472,7 +472,7 @@ function sincos(d::V) where V <: FloatType64
     v  = dadd(T(1.0), dmul(sx, u))
     ry = V(v)
 
-    qli = unsafe_trunc(EquivalentInteger(T), ql)
+    qli = unsafe_trunc(fpinttype(T), ql)
     qli_odd = qli & 1 != 0
     u = vifelse(qli_odd, ry, u)
     ry = vifelse(qli_odd, rx, ry)
@@ -520,7 +520,7 @@ function sincos(d::V) where V <: FloatType32
     ry = V(v)
 
 
-    qli = unsafe_trunc(EquivalentInteger(T), q)
+    qli = unsafe_trunc(fpinttype(T), q)
     qli_odd = qli & 1 != 0
     u = vifelse(qli_odd, ry, u)
     ry = vifelse(qli_odd, rx, ry)
@@ -600,7 +600,7 @@ function tan_fast(d::FloatType64)
 
     s = x * x
 
-    qli = unsafe_trunc(EquivalentInteger(T), ql)
+    qli = unsafe_trunc(fpinttype(T), ql)
     qli_odd = qli & 1 != 0
     x = vifelse(qli_odd, -x, x)
 
@@ -631,7 +631,7 @@ function tan_fast(d::FloatType32)
 
     s = x * x
 
-    qli = unsafe_trunc(EquivalentInteger(T), q)
+    qli = unsafe_trunc(fpinttype(T), q)
     qli_odd = qli & 1 != 0
     x = vifelse(qli_odd, -x, x)
 
@@ -694,7 +694,7 @@ function tan(d::V) where V <: FloatType64
     s = dadd2(s, ql * (-PI_C(T) * T(0.5)            ))
     s = dadd2(s, (qh * (1 << 24) + ql) * (-PI_D(T) * T(0.5)))
 
-    qli = unsafe_trunc(EquivalentInteger(T), ql)
+    qli = unsafe_trunc(fpinttype(T), ql)
     qli_odd = qli & 1 != 0
     s = vifelse(qli_odd, -s, s)
 
@@ -728,7 +728,7 @@ function tan(d::V) where V <: FloatType32
     s = dadd2(s, q * -PI_XD(T) * T(0.5))
     s = dadd2(s, q * -PI_XE(T) * T(0.5))
 
-    qli = unsafe_trunc(EquivalentInteger(T), q)
+    qli = unsafe_trunc(fpinttype(T), q)
     qli_odd = qli & 1 != 0
     s = vifelse(qli_odd, -s, s)
 
@@ -807,7 +807,7 @@ end
 Compute the inverse tangent of `x`, where the output is in radians.
 """
 function atan_fast(x::T) where {T<:FloatType}
-    TI = EquivalentInteger(T)
+    TI = fpinttype(T)
 
     q = vifelse(signbit(x), TI(2), TI(0))
     x = abs(x)
diff --git a/src/utils.jl b/src/utils.jl
index a78349d..1b346da 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -9,7 +9,7 @@ const FMA_FAST = is_fma_fast(Float64) && is_fma_fast(Float32)
 @inline isnegzero(x::T) where {T<:Union{Float32,Float64}} = x === T(-0.0)
 # Disabling the check for performance when vecterized.
 # A PR succesfully vectorizing the check is welcome.
-@inline isnegzero(x::Vec{N}) where N = Vec{N,Bool}(false)
+@inline isnegzero(x::Vec{N}) where {N} = Vec{N,Bool}(false)
 
 @inline ispinf(x::T) where {T<:FloatType} = x == T(Inf)
 @inline isninf(x::T) where {T<:FloatType} = x == T(-Inf)
@@ -20,12 +20,12 @@ const FMA_FAST = is_fma_fast(Float64) && is_fma_fast(Float32)
 @inline integer2float(::Type{Float64}, m::Int) = reinterpret(Float64, (m % Int64) << significand_bits(Float64))
 @inline integer2float(::Type{Float32}, m::Int) = reinterpret(Float32, (m % Int32) << significand_bits(Float32))
 
-@inline function integer2float(::Type{<:Union{Vec{N,Float64},Float64}}, m::Vec{N,Int}) where N
+@inline function integer2float(::Type{<:Union{Vec{N,Float64},Float64}}, m::Vec{N,Int}) where {N}
     reinterpret(Vec{N,Float64}, Vec{N,Int64}(ntuple(Val(N)) do n
         Core.VecElement(m[n] % Int64)
     end) << significand_bits(Float64))
 end
-@inline function integer2float(::Type{<:Union{Vec{N,Float32},Float32}}, m::Vec{N,Int32}) where N
+@inline function integer2float(::Type{<:Union{Vec{N,Float32},Float32}}, m::Vec{N,<:Integer}) where {N}
     reinterpret(Vec{N,Float32}, Vec{N,Int32}(ntuple(Val(N)) do n
         Core.VecElement(m[n] % Int32)
     end) << Int32(significand_bits(Float32)))
@@ -34,12 +34,12 @@ end
 @inline float2integer(d::Float64) = (reinterpret(Int64, d) >> significand_bits(Float64)) % Int
 @inline float2integer(d::Float32) = (reinterpret(Int32, d) >> significand_bits(Float32)) % Int
 
-@inline function float2integer(d::Vec{N,Float64}) where N
+@inline function float2integer(d::Vec{N,Float64}) where {N}
     Vec{N,Int64}(ntuple(Val(N)) do n
         Core.VecElement((reinterpret(Int64, d[n]) >> significand_bits(Float64)) % Int)
     end)
 end
-@inline function float2integer(d::Vec{N,Float32}) where N
+@inline function float2integer(d::Vec{N,Float32}) where {N}
     Vec{N,Int32}(ntuple(Val(N)) do n
         Core.VecElement((reinterpret(Int32, d[n]) >> Int32(significand_bits(Float32))) % Int32)
     end)