diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c143632
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+*.jl.cov
+*.jl.*.cov
+*.jl.mem
+
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..c7b3940
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,17 @@
+# Documentation: http://docs.travis-ci.com/user/languages/julia/
+language: julia
+os:
+  - linux
+  - osx
+julia:
+  - 0.6
+  - nightly
+notifications:
+  email: false
+# uncomment the following lines to override the default test script
+#script:
+#  - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
+ # - julia -e 'versioninfo(); Pkg.clone(pwd()); Pkg.build("Sleef"); Pkg.test("Sleef"; coverage=true)'
+after_success:
+  # push coverage results
+  - julia -e 'cd(Pkg.dir("Sleef")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder()); Codecov.submit(Codecov.process_folder())'
diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..bac3d25
--- /dev/null
+++ b/LICENSE.md
@@ -0,0 +1,28 @@
+The Sleef.jl package is licensed under the MIT "Expat" License:
+
+> Copyright (c) 2016: Mustafa Mohamad and other contributors:
+> 
+> https://github.com/musm/Sleef.jl/graphs/contributors
+> 
+> Permission is hereby granted, free of charge, to any person obtaining a copy
+> of this software and associated documentation files (the "Software"), to deal
+> in the Software without restriction, including without limitation the rights
+> to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+> copies of the Software, and to permit persons to whom the Software is
+> furnished to do so, subject to the following conditions:
+> 
+> The above copyright notice and this permission notice shall be included in all
+> copies or substantial portions of the Software.
+> 
+> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+> FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+> AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+> LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+> OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+> SOFTWARE.
+> 
+
+Sleef.jl includes ported code from the following project
+
+- [sleef](https://github.com/shibatch/sleef) [public domain] Author Naoki Shibata
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1e85d58
--- /dev/null
+++ b/README.md
@@ -0,0 +1,61 @@
+<div align="center"> <img
+src="https://rawgit.com/musm/Sleef.jl/master/doc/src/assets/logo.svg"
+alt="SLEEF Logo" width="420"></img> </div>
+
+[![Travis Build Status](https://travis-ci.org/musm/Sleef.jl.svg?branch=master)](https://travis-ci.org/musm/Sleef.jl)
+[![Appveyor Build Status](https://ci.appveyor.com/api/projects/status/j7lpafn4uf1trlfi/branch/master?svg=true)](https://ci.appveyor.com/project/musm/sleef-jl/branch/master)
+[![Coverage Status](https://coveralls.io/repos/github/musm/Sleef.jl/badge.svg?branch=master)](https://coveralls.io/github/musm/Sleef.jl?branch=master)
+[![codecov.io](http://codecov.io/github/musm/Sleef.jl/coverage.svg?branch=master)](http://codecov.io/github/musm/Sleef.jl?branch=master)
+
+A port of the [SLEEF math library](https://github.com/shibatch/sleef) (author Naoki Shibata) in pure Julia. This port includes a few extras including an `exp10` function and many bug fixes over the original code ([see](https://github.com/JuliaMath/Sleef.jl/issues) for a list of remanining issues that have not been fixed upstream). The library supports `Float32` and `Float64` types.
+
+Based on SLEEF version 2.80 (with additional bugfixes).
+
+# Usage
+
+We recommend running Julia with `-O3` for maximal performance using `Sleef.jl` and to also build a custom system image by running
+```julia
+# Pkg.add("WinRPM"); WinRPM.install("gcc")  # on Windows please first run this line
+julia> include(joinpath(dirname(JULIA_HOME), "share", "julia", "build_sysimg.jl"))
+julia> build_sysimg(force=true)
+```
+and then to restart `julia`; this will ensure you are taking full advantage of hardware [FMA](https://en.wikipedia.org/wiki/FMA_instruction_set)  if your CPU supports it.
+
+
+To use  `Sleef.jl`
+```julia
+julia> Pkg.clone("https://github.com/JuliaMath/Sleef.jl.git")
+
+julia> using Sleef
+
+julia> Sleef.sin(2.3)
+0.7457052121767203
+
+julia> Sleef.sin(2.3f0)
+0.74570525f0
+
+julia> Sleef.exp(3.0)
+20.085536923187668
+
+julia> Sleef.exp(3f0)
+20.085537f0
+```
+
+The available functions include (within 1 ulp)
+```julia
+sin, cos, tan, asin, acos, atan, atan2, sincos, sinh, cosh, tanh,
+    asinh, acosh, atanh, log, log2, log10, log1p, ilog2, exp, exp2, exp10, expm1, ldexp, cbrt, pow
+ ```
+ Faster variants include (within 3 ulp)
+
+ ```julia
+sin_fast, cos_fast, tan_fast, sincos_fast, asin_fast, acos_fast, atan_fast, atan2_fast, log_fast, cbrt_fast
+```
+
+# Benchmarks
+
+You can benchmark the performance of the Sleef.jl math library on your machine by running
+
+```julia
+include(joinpath(Pkg.dir("Sleef"), "benchmark", "benchmark.jl"))
+```
diff --git a/REQUIRE b/REQUIRE
new file mode 100644
index 0000000..137767a
--- /dev/null
+++ b/REQUIRE
@@ -0,0 +1 @@
+julia 0.6
diff --git a/appveyor.yml b/appveyor.yml
new file mode 100644
index 0000000..ca76fcf
--- /dev/null
+++ b/appveyor.yml
@@ -0,0 +1,32 @@
+environment:
+  matrix:
+  - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.6/julia-0.6-latest-win32.exe"
+  - JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.6/julia-0.6-latest-win64.exe"
+  - JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x86/julia-latest-win32.exe"
+  - JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x64/julia-latest-win64.exe"
+
+branches:
+  only:
+    - master
+    - /release-.*/
+
+notifications:
+  - provider: Email
+    on_build_success: false
+    on_build_failure: false
+    on_build_status_changed: false
+
+install:
+  - ps: "[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.SecurityProtocolType]::Tls12"
+# Download most recent Julia Windows binary
+  - ps: (new-object net.webclient).DownloadFile($env:JULIA_URL, "C:\projects\julia-binary.exe")
+# Run installer silently, output to C:\projects\julia
+  - C:\projects\julia-binary.exe /S /D=C:\projects\julia
+
+build_script:
+# Need to convert from shallow to complete for Pkg.clone to work
+  - IF EXIST .git\shallow (git fetch --unshallow)
+  - C:\projects\julia\bin\julia -e "versioninfo(); Pkg.clone(pwd(), \"Sleef\"); Pkg.build(\"Sleef\")"
+
+test_script:
+  - C:\projects\julia\bin\julia -e "Pkg.test(\"Sleef\")"
\ No newline at end of file
diff --git a/benchmark/benchmark.jl b/benchmark/benchmark.jl
new file mode 100644
index 0000000..3c53c1a
--- /dev/null
+++ b/benchmark/benchmark.jl
@@ -0,0 +1,132 @@
+using Libm
+using BenchmarkTools
+using JLD, DataStructures
+
+RETUNE  = false
+VERBOSE = true
+DETAILS = false
+
+test_types = (Float64, Float32) # Which types do you want to bench?
+
+const bench = ("Base","Libm")
+const suite = BenchmarkGroup()
+for n in bench
+    suite[n] = BenchmarkGroup([n])
+end
+
+
+bench_reduce(f::Function, X) = mapreduce(x -> reinterpret(Unsigned,x), |, f(x) for x in X)
+
+typealias IEEEFloat Union{Float32,Float64}
+MRANGE(::Type{Float64}) = 10000000
+MRANGE(::Type{Float32}) = 10000
+IntF(::Type{Float64}) = Int64
+IntF(::Type{Float32}) = Int32
+x_trig{T<:IEEEFloat}(::Type{T}) = begin
+    x_trig = T[]
+    for i = 1:10000
+        s = reinterpret(T, reinterpret(IntF(T), T(pi)/4 * i) - IntF(T)(20))
+        e = reinterpret(T, reinterpret(IntF(T), T(pi)/4 * i) + IntF(T)(20))
+        d = s
+        while d <= e 
+            append!(x_trig, d)
+            d = reinterpret(T, reinterpret(IntF(T), d) + IntF(T)(1))
+        end
+    end
+    x_trig = append!(x_trig, -10:0.0002:10)
+    x_trig = append!(x_trig, -MRANGE(T):200.1:MRANGE(T))
+end
+x_exp{T<:IEEEFloat}(::Type{T})        = map(T, vcat(-10:0.0002:10, -1000:0.1:1000))
+x_exp2{T<:IEEEFloat}(::Type{T})       = map(T, vcat(-10:0.0002:10, -120:0.023:1000, -1000:0.02:2000))
+x_exp10{T<:IEEEFloat}(::Type{T})      = map(T, vcat(-10:0.0002:10, -35:0.023:1000, -300:0.01:300))
+x_expm1{T<:IEEEFloat}(::Type{T})      = map(T, vcat(-10:0.0002:10, -1000:0.021:1000, -1000:0.023:1000, 10.0.^-(0:0.02:300), -10.0.^-(0:0.02:300), 10.0.^(0:0.021:300), -10.0.^-(0:0.021:300)))
+x_log{T<:IEEEFloat}(::Type{T})        = map(T, vcat(0.0001:0.0001:10, 0.001:0.1:10000, 1.1.^(-1000:1000), 2.1.^(-1000:1000)))
+x_log10{T<:IEEEFloat}(::Type{T})      = map(T, vcat(0.0001:0.0001:10, 0.0001:0.1:10000))
+x_log1p{T<:IEEEFloat}(::Type{T})      = map(T, vcat(0.0001:0.0001:10, 0.0001:0.1:10000, 10.0.^-(0:0.02:300), -10.0.^-(0:0.02:300)))
+x_atrig{T<:IEEEFloat}(::Type{T})      = map(T, vcat(-1:0.00002:1))
+x_atan{T<:IEEEFloat}(::Type{T})       = map(T, vcat(-10:0.0002:10, -10000:0.2:10000, -10000:0.201:10000))
+x_cbrt{T<:IEEEFloat}(::Type{T})       = map(T, vcat(-10000:0.2:10000, 1.1.^(-1000:1000), 2.1.^(-1000:1000)))
+x_trigh{T<:IEEEFloat}(::Type{T})      = map(T, vcat(-10:0.0002:10, -1000:0.02:1000))
+x_asinhatanh{T<:IEEEFloat}(::Type{T}) = map(T, vcat(-10:0.0002:10, -1000:0.02:1000))
+x_acosh{T<:IEEEFloat}(::Type{T})      = map(T, vcat(1:0.0002:10, 1:0.02:1000))
+x_pow{T<:IEEEFloat}(::Type{T}) = begin
+    xx1 = map(Tuple{T,T}, [(x,y) for x = -100:0.20:100, y = 0.1:0.20:100])[:]
+    xx2 = map(Tuple{T,T}, [(x,y) for x = -100:0.21:100, y = 0.1:0.22:100])[:]
+    xx3 = map(Tuple{T,T}, [(x,y) for x = 2.1, y = -1000:0.1:1000])
+    xx = vcat(xx1, xx2, xx2)
+end
+
+import Base.atanh
+for f in (:atanh,)
+    @eval begin
+        ($f)(x::Float64) = ccall(($(string(f)),Base.libm_name), Float64, (Float64,), x)
+        ($f)(x::Float32) = ccall(($(string(f,"f")),Base.libm_name), Float32, (Float32,), x)
+    end
+end
+
+const micros = OrderedDict(
+    "sin"   => x_trig,
+    "cos"   => x_trig,
+    "tan"   => x_trig,
+    "asin"  => x_atrig,
+    "acos"  => x_atrig,
+    "atan"  => x_atan,
+    "exp"   => x_exp,
+    "exp2"  => x_exp2,
+    "exp10" => x_exp10,
+    "expm1" => x_expm1,
+    "log"   => x_log,
+    "log2"  => x_log10,
+    "log10" => x_log10,
+    "log1p" => x_log1p,
+    "sinh"  => x_trigh,
+    "cosh"  => x_trigh,
+    "tanh"  => x_trigh,
+    "asinh" => x_asinhatanh,
+    "acosh" => x_acosh,
+    "atanh" => x_asinhatanh,
+    "cbrt"  => x_cbrt
+    )
+
+for n in ("Base","Libm")
+    for (f,x) in micros
+        suite[n][f] = BenchmarkGroup([f])
+        for T in test_types
+            fex = Expr(:., Symbol(n), QuoteNode(Symbol(f)))
+            suite[n][f][string(T)] = @benchmarkable bench_reduce($fex, $(x(T)))
+        end
+    end
+end
+
+
+tune_params = joinpath(dirname(@__FILE__), "params.jld")
+if !isfile(tune_params) || RETUNE
+    tune!(suite; verbose=VERBOSE, seconds = 2)
+    save(tune_params, "suite", params(suite))
+    println("Saving tuned parameters.")
+else
+    println("Loading pretuned parameters.")
+    loadparams!(suite, load(tune_params, "suite"), :evals, :samples)
+end
+
+println("Running micro benchmarks...")
+results = run(suite; verbose=VERBOSE, seconds = 2)
+
+print_with_color(:blue, "Benchmarks: median ratio Libm/Base\n")
+for f in keys(micros)
+    print_with_color(:magenta, string(f))
+    for T in test_types
+        println()
+        print("time: ", )
+        tratio = ratio(median(results["Libm"][f][string(T)]), median(results["Base"][f][string(T)])).time
+        tcolor = tratio > 3 ? :red : tratio < 1.5 ? :green : :blue
+        print_with_color(tcolor, @sprintf("%.2f",tratio), " ", string(T))
+        if DETAILS
+            print_with_color(:blue, "details Libm/Base\n")
+            println(results["Libm"][f][string(T)])
+            println(results["Base"][f][string(T)])
+            println()
+        end
+    end
+    println("\n")
+end
diff --git a/doc/src/assets/logo.svg b/doc/src/assets/logo.svg
new file mode 100644
index 0000000..7eb3f3a
--- /dev/null
+++ b/doc/src/assets/logo.svg
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Generator: Adobe Illustrator 21.0.0, SVG Export Plug-In . SVG Version: 6.00 Build 0)  -->
+<svg version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px"
+	 viewBox="0 0 400.6 164.6" enable-background="new 0 0 400.6 164.6" xml:space="preserve">
+<polygon fill="#FF6464" points="36.4,50.5 343.1,50.9 343,89 36.3,88.6 "/>
+<g>
+	<rect x="36.3" y="50.7" display="none" fill="#FF5F5F" width="306.8" height="38.1"/>
+	<path fill="#666666" d="M111.2,60.3v3H99.3v-3.7c0-10.1-3.8-16-12.8-16c-9,0-12.8,5.9-12.8,15.8c0,25.1,37.6,27.5,37.6,59
+		c0,17-8.2,28-25.3,28c-17.1,0-25.1-11-25.1-28v-6.4h11.7v7c0,10.1,4.2,16,13.1,16c9,0,13.1-5.9,13.1-16c0-25-37.4-27.4-37.4-58.9
+		c0-17.3,8-28,24.8-28C103.2,32.3,111.2,43.3,111.2,60.3z"/>
+	<path fill="#666666" d="M118.2,145.4v-112h12.5v100.6h32v11.4H118.2z"/>
+	<path fill="#666666" d="M208,83v11.2h-27.4v39.8h33.6v11.4h-46.1v-112h46.1v11.4h-33.6V83H208z"/>
+	<path fill="#666666" d="M261.6,83v11.2h-27.4v39.8h33.6v11.4h-46.1v-112h46.1v11.4h-33.6V83H261.6z"/>
+	<path fill="#666666" d="M313.6,85.6v11.4h-25.8v48.5h-12.5v-112h44.5v11.4h-32v40.8H313.6z"/>
+</g>
+<g>
+	<defs>
+		
+			<rect id="SVGID_1_" x="158.2" y="-96.2" transform="matrix(1.881049e-03 -1 1 1.881049e-03 132.183 246.7499)" width="63" height="306.8"/>
+	</defs>
+	<clipPath id="SVGID_2_">
+		<use xlink:href="#SVGID_1_"  overflow="visible"/>
+	</clipPath>
+	<g clip-path="url(#SVGID_2_)" enable-background="new    ">
+		<path fill="#353535" d="M111.2,60.3v3H99.3v-3.7c0-10.1-3.8-16-12.8-16c-9,0-12.8,5.9-12.8,15.8c0,25.1,37.6,27.5,37.6,59
+			c0,17-8.2,28-25.3,28c-17.1,0-25.1-11-25.1-28v-6.4h11.7v7c0,10.1,4.2,16,13.1,16c9,0,13.1-5.9,13.1-16c0-25-37.4-27.4-37.4-58.9
+			c0-17.3,8-28,24.8-28C103.2,32.3,111.2,43.3,111.2,60.3z"/>
+		<path fill="#353535" d="M118.2,145.4v-112h12.5v100.6h32v11.4H118.2z"/>
+		<path fill="#353535" d="M208,83v11.2h-27.4v39.8h33.6v11.4h-46.1v-112h46.1v11.4h-33.6V83H208z"/>
+		<path fill="#353535" d="M261.6,83v11.2h-27.4v39.8h33.6v11.4h-46.1v-112h46.1v11.4h-33.6V83H261.6z"/>
+		<path fill="#353535" d="M313.6,85.6v11.4h-25.8v48.5h-12.5v-112h44.5v11.4h-32v40.8H313.6z"/>
+	</g>
+</g>
+</svg>
diff --git a/src/Sleef.jl b/src/Sleef.jl
new file mode 100644
index 0000000..3c17936
--- /dev/null
+++ b/src/Sleef.jl
@@ -0,0 +1,88 @@
+__precompile__()
+module Sleef
+
+# export sin, cos, tan, asin, acos, atan, atan2, sincos, sinh, cosh, tanh,
+#     asinh, acosh, atanh, log, log2, log10, log1p, ilog2, exp, exp2, exp10, expm1, ldexp, cbrt, pow
+
+# fast variants (within 3 ulp)
+# export sin_fast, cos_fast, tan_fast, sincos_fast, asin_fast, acos_fast, atan_fast, atan2_fast, log_fast, cbrt_fast
+
+using Base: Math.@horner, Math.exponent_bias, exponent_mask, Math.significand_bits, Math.IEEEFloat
+
+## constants (refactor later to all use dispatch)
+
+const MLN2  = 6.931471805599453094172321214581765680755001343602552541206800094933936219696955e-01 # log(2)
+const MLN2E = 1.442695040888963407359924681001892137426645954152985934135449406931109219181187     # log2(e)
+
+const MPI  = 3.141592653589793238462643383279502884197169399375105820974944592307816406286198     # pi
+const MPI2 = 1.570796326794896619231321691639751442098584699687552910487472296153908203143099     # pi/2
+const MPI4 = 7.853981633974483096156608458198757210492923498437764552437361480769541015715495e-01 # pi/4
+const M1PI = 3.183098861837906715377675267450287240689192914809128974953346881177935952684543e-01 # 1/pi
+const M2PI = 6.366197723675813430755350534900574481378385829618257949906693762355871905369086e-01 # 2/pi
+const M4PI = 1.273239544735162686151070106980114896275677165923651589981338752471174381073817     # 4/pi
+
+const M1SQRT2 = 7.07106781186547524400844362104849039284835937688474036588339868995366239231051e-01 # 1/sqrt(2)
+
+const M2P13 = 1.259921049894873164767210607278228350570251464701507980081975112155299676513956 # 2^1/3
+const M2P23 = 1.587401051968199474751705639272308260391493327899853009808285761825216505624206 # 2^2/3
+
+## constants (dispatch)
+
+MDLN10E(::Type{Float64}) = Double(0.4342944819032518, 1.098319650216765e-17) # log10(e)
+MDLN10E(::Type{Float32}) = Double(0.4342945f0, -1.010305f-8)
+
+MDLN2E(::Type{Float64}) = Double(1.4426950408889634, 2.0355273740931033e-17) # log2(e)
+MDLN2E(::Type{Float32}) = Double(1.442695f0, 1.925963f-8)
+
+MDLN10(::Type{Float64}) = Double(2.302585092994046, -2.1707562233822494e-16) # log(10)
+MDLN10(::Type{Float32}) = Double(2.3025851f0, -3.1975436f-8)
+
+MDLN2(::Type{Float64}) = Double(0.6931471805599453, 2.3190468138462996e-17)  # log(2)
+MDLN2(::Type{Float32}) = Double(0.6931472f0, -1.9046542f-9)
+
+MDPI(::Type{Float64})  = Double(3.141592653589793, 1.2246467991473532e-16) # pi
+MDPI(::Type{Float32})  = Double(3.1415927f0, -8.742278f-8)
+MDPI2(::Type{Float64}) = Double(1.5707963267948966, 6.123233995736766e-17) # pi/2
+MDPI2(::Type{Float32}) = Double(1.5707964f0, -4.371139f-8)
+
+MD2P13(::Type{Float64}) = Double(1.2599210498948732, -2.589933375300507e-17) # 2^1/3
+MD2P13(::Type{Float32}) = Double(1.2599211f0, -2.4018702f-8)
+
+MD2P23(::Type{Float64}) = Double(1.5874010519681996, -1.0869008194197823e-16) # 2^2/3
+MD2P23(::Type{Float32}) = Double(1.587401f0, 1.9520385f-8)
+
+# Split 4/pi into four parts (each is 26 bits)
+PI4A(::Type{Float64}) = 0.78539816290140151978 
+PI4B(::Type{Float64}) = 4.9604678871439933374e-10
+PI4C(::Type{Float64}) = 1.1258708853173288931e-18
+PI4D(::Type{Float64}) = 1.7607799325916000908e-27
+
+PI4A(::Type{Float32}) = 0.78515625f0
+PI4B(::Type{Float32}) = 0.00024187564849853515625f0
+PI4C(::Type{Float32}) = 3.7747668102383613586f-08
+PI4D(::Type{Float32}) = 1.2816720341285448015f-12
+
+# Split log(2) into upper and lower parts
+LN2U(::Type{Float64}) = 0.69314718055966295651160180568695068359375
+LN2L(::Type{Float64}) = 0.28235290563031577122588448175013436025525412068e-12
+
+LN2U(::Type{Float32}) = 0.693145751953125f0
+LN2L(::Type{Float32}) = 1.428606765330187045f-06
+
+include("utils.jl")  # utility functions
+include("double.jl") # Dekker style Double implementation
+include("priv.jl")   # private math functions
+include("exp.jl")    # exponential functions
+include("log.jl")    # logarithmic functions
+include("trig.jl")   # trigonometric and inverse trigonometric functions
+include("hyp.jl")    # hyperbolic and inverse hyperbolic functions
+include("misc.jl")   # miscallenous math functions including pow and cbrt
+
+# fallback definitions #FIXME add all
+for func in (:sin, :cos, :tan, :asin, :acos, :atan, :sinh, :cosh, :tanh,  :log, :log2, :log10, :log1p, :exp, :exp2, :exp10, :expm1)
+    @eval begin
+        $func(x::Real) = $func(float(x))
+    end
+end
+
+end
diff --git a/src/double.jl b/src/double.jl
new file mode 100644
index 0000000..ccf4dd3
--- /dev/null
+++ b/src/double.jl
@@ -0,0 +1,271 @@
+import Base: -, <, copysign, flipsign, normalize, convert
+
+immutable Double{T<:IEEEFloat} <: Number
+    hi::T
+    lo::T
+end
+Double{T<:IEEEFloat}(x::T) = Double(x, zero(T))
+
+convert{T<:IEEEFloat}(::Type{Tuple{T,T}}, x::Double{T}) = (x.hi, x.lo)
+convert{T<:IEEEFloat}(::Type{T}, x::Double) = x.hi + x.lo
+
+
+@inline trunclo(x::Float64) = reinterpret(Float64, reinterpret(UInt64, x) & 0xffff_ffff_f800_0000) # clear lower 27 bits (leave upper 26 bits)
+@inline trunclo(x::Float32) = reinterpret(Float32, reinterpret(UInt32, x) & 0xffff_f000) # clear lowest 12 bits (leave upper 12 bits)
+
+@inline function splitprec(x::IEEEFloat)
+    hx = trunclo(x)
+    hx, x-hx
+end
+
+@inline function normalize{T}(x::Double{T})
+    r = x.hi + x.lo
+    Double(r, (x.hi - r) + x.lo)
+end
+
+@inline flipsign{T<:IEEEFloat}(x::Double{T}, y::T) = Double(flipsign(x.hi, y), flipsign(x.lo, y))
+
+@inline scale{T<:IEEEFloat}(x::Double{T}, s::T) = Double(s*x.hi, s*x.lo)
+
+
+@inline -{T<:IEEEFloat}(x::Double{T}) = Double(-x.hi,-x.lo)
+
+@inline function <{T<:IEEEFloat}(x::Double{T}, y::Double{T})
+    x.hi < y.hi
+end
+
+@inline function <{T<:IEEEFloat}(x::Double{T}, y::Number)
+    x.hi < y
+end
+
+@inline function <{T<:IEEEFloat}(x::Number, y::Double{T})
+    x < y.hi
+end
+
+# quick-two-sum x+y
+@inline function dadd{T<:IEEEFloat}(x::T, y::T) #WARNING |x| >= |y|
+    s = x + y
+    Double(s, (x - s) + y)
+end
+
+@inline function dadd{T<:IEEEFloat}(x::T, y::Double{T}) #WARNING |x| >= |y|
+    s = x + y.hi
+    Double(s, (x - s) + y.hi + y.lo)
+end
+
+@inline function dadd{T<:IEEEFloat}(x::Double{T}, y::T) #WARNING |x| >= |y|
+    s = x.hi + y
+    Double(s, (x.hi - s) + y + x.lo)
+end
+
+@inline function dadd{T<:IEEEFloat}(x::Double{T}, y::Double{T}) #WARNING |x| >= |y|
+    s = x.hi + y.hi
+    Double(s, (x.hi - s) + y.hi + y.lo + x.lo)
+end
+
+@inline function dsub{T<:IEEEFloat}(x::Double{T}, y::Double{T}) #WARNING |x| >= |y|
+    s = x.hi - y.hi
+    Double(s, (x.hi - s) - y.hi - y.lo + x.lo)
+end
+
+@inline function dsub{T<:IEEEFloat}(x::Double{T}, y::T) #WARNING |x| >= |y|
+    s = x.hi - y
+    Double(s, (x.hi - s) - y + x.lo)
+end
+
+@inline function dsub{T<:IEEEFloat}(x::T, y::Double{T}) #WARNING |x| >= |y|
+    s = x - y.hi
+    Double(s, (x - s) - y.hi - y.lo)
+end
+
+@inline function dsub{T<:IEEEFloat}(x::T, y::T) #WARNING |x| >= |y|
+    s = x - y
+    Double(s, (x - s) - y)
+end
+
+
+# two-sum x+y  NO BRANCH
+@inline function dadd2{T<:IEEEFloat}(x::T, y::T)
+    s = x + y
+    v = s - x
+    Double(s, (x - (s - v)) + (y - v))
+end
+
+@inline function dadd2{T<:IEEEFloat}(x::T, y::Double{T})
+    s = x + y.hi
+    v = s - x
+    Double(s, (x - (s - v)) + (y.hi - v) + y.lo)
+end
+
+@inline dadd2{T<:IEEEFloat}(x::Double{T}, y::T) = dadd2(y,x)
+
+@inline function dadd2{T<:IEEEFloat}(x::Double{T}, y::Double{T})
+    s = x.hi + y.hi
+    v = s - x.hi
+    Double(s,(x.hi - (s - v)) + (y.hi - v) + x.lo + y.lo)
+end
+
+@inline function dsub2{T<:IEEEFloat}(x::T, y::T)
+    s = x - y
+    v = s - x
+    Double(s, (x - (s - v)) + (-y - v))
+end
+
+@inline function dsub2{T<:IEEEFloat}(x::T, y::Double{T})
+    s = x - y.hi
+    v = s - x
+    Double(s, (x - (s - v)) + (-y.hi - v) - y.lo)
+end
+
+@inline function dsub2{T<:IEEEFloat}(x::Double{T}, y::T)
+    s = x.hi - y
+    v = s - x.hi
+    Double(s,(x.hi - (s - v)) + (-y - v) + x.lo)
+end
+
+@inline function dsub2{T<:IEEEFloat}(x::Double{T}, y::Double{T})
+    s = x.hi - y.hi
+    v = s - x.hi
+    Double(s,(x.hi - (s - v)) + (-y.hi - v) + x.lo - y.lo)
+end
+
+
+
+if FMA_FAST
+
+    # two-prod-fma
+    @inline function dmul{T<:IEEEFloat}(x::T, y::T)
+        z = x*y
+        Double(z, fma(x, y, -z))
+    end
+
+    @inline function dmul{T<:IEEEFloat}(x::Double{T}, y::T)
+        z = x.hi*y
+        Double(z, fma(x.hi, y, -z) + x.lo*y)
+    end
+
+    @inline dmul{T<:IEEEFloat}(x::T, y::Double{T}) = dmul(y,x)
+
+    @inline function dmul{T<:IEEEFloat}(x::Double{T}, y::Double{T})
+        z = x.hi*y.hi
+        Double(z, fma(x.hi, y.hi, -z) + x.hi*y.lo + x.lo*y.hi)
+    end
+
+    # x^2
+    @inline function dsqu{T<:IEEEFloat}(x::T)
+        z = x*x
+        Double(z, fma(x,x,-z))
+    end
+
+    @inline function dsqu{T<:IEEEFloat}(x::Double{T})
+        z = x.hi*x.hi
+        Double(z, fma(x.hi, x.hi, -z) + x.hi*(x.lo+x.lo))
+    end
+
+    # sqrt(x)
+    @inline function dsqrt{T<:IEEEFloat}(x::Double{T})
+        zhi = _sqrt(x.hi)
+        Double(zhi, (x.lo + fma(-zhi, zhi, x.hi))/(zhi+zhi))
+    end
+
+    # x/y
+    @inline function ddiv{T<:IEEEFloat}(x::Double{T}, y::Double{T})
+        invy = 1/y.hi
+        zhi = x.hi*invy
+        Double(zhi, (fma(-zhi, y.hi, x.hi) + fma(-zhi, y.lo, x.lo))*invy)
+    end
+
+    @inline function ddiv{T<:IEEEFloat}(x::T, y::T)
+        ry = 1/y
+        r = x*ry
+        Double(r, fma(-r,y,x)*ry)
+    end
+
+    # 1/x
+    @inline function ddrec{T<:IEEEFloat}(x::T)
+        zhi = 1/x
+        Double(zhi, fma(-zhi, x, one(T))*zhi)
+    end
+
+    @inline function ddrec{T<:IEEEFloat}(x::Double{T})
+        zhi = 1/x.hi
+        Double(zhi, (fma(-zhi, x.hi, one(T)) + -zhi*x.lo)*zhi)
+    end
+
+else
+
+    #two-prod x*y
+    @inline function dmul{T<:IEEEFloat}(x::T, y::T)
+        hx, lx = splitprec(x)
+        hy, ly = splitprec(y)
+        z = x*y
+        Double(z, ((hx*hy-z) + lx*hy + hx*ly) + lx*ly)
+    end
+
+    @inline function dmul{T<:IEEEFloat}(x::Double{T}, y::T)
+        hx, lx = splitprec(x.hi)
+        hy, ly = splitprec(y)
+        z = x.hi*y
+        Double(z, (hx*hy-z) + lx*hy + hx*ly + lx*ly + x.lo*y)
+    end
+
+    @inline dmul{T<:IEEEFloat}(x::T, y::Double{T}) = dmul(y,x)
+
+    @inline function dmul{T<:IEEEFloat}(x::Double{T}, y::Double{T})
+        hx, lx = splitprec(x.hi)
+        hy, ly = splitprec(y.hi)
+        z = x.hi*y.hi
+        Double(z, (((hx*hy-z) + lx*hy + hx*ly) + lx*ly) + x.hi*y.lo + x.lo*y.hi)
+    end
+
+    # x^2
+    @inline function dsqu{T<:IEEEFloat}(x::T)
+        hx, lx = splitprec(x)
+        z = x*x
+        Double(z, (hx*hx-z) + lx*(hx + hx) + lx*lx)
+    end
+
+    @inline function dsqu{T<:IEEEFloat}(x::Double{T})
+        hx, lx = splitprec(x.hi)
+        z = x.hi*x.hi
+        Double(z, (hx*hx-z) + lx*(hx+hx) + lx*lx + x.hi*(x.lo+x.lo))
+    end
+
+    # sqrt(x)
+    @inline function dsqrt{T<:IEEEFloat}(x::Double{T})
+        c = _sqrt(x.hi)
+        u = dsqu(c)
+        Double(c, (x.hi - u.hi - u.lo + x.lo)/(c+c))
+    end
+
+    # x/y
+    @inline function ddiv{T<:IEEEFloat}(x::Double{T}, y::Double{T})
+        invy = 1/y.hi
+        c = x.hi*invy
+        u = dmul(c, y.hi)
+        Double(c,((((x.hi - u.hi) - u.lo) + x.lo) - c*y.lo)*invy)
+    end
+
+    @inline function ddiv{T<:IEEEFloat}(x::T, y::T)
+        ry = 1/y
+        r = x*ry
+        hx, lx = splitprec(r)
+        hy, ly = splitprec(y)
+        Double(r, (((-hx*hy+r*y) - lx*hy - hx*ly) - lx*ly)*ry)
+    end
+
+
+    # 1/x
+    @inline function ddrec{T<:IEEEFloat}(x::T)
+        c = 1/x
+        u = dmul(c,x)
+        Double(c, (one(T) - u.hi - u.lo)*c)
+    end
+
+    @inline function ddrec{T<:IEEEFloat}(x::Double{T})
+        c = 1/x.hi
+        u = dmul(c,x.hi)
+        Double(c, (one(T) - u.hi - u.lo - c*x.lo)*c)
+    end
+
+end
diff --git a/src/exp.jl b/src/exp.jl
new file mode 100644
index 0000000..f81a21b
--- /dev/null
+++ b/src/exp.jl
@@ -0,0 +1,106 @@
+# exported exponential functions
+
+"""
+    ldexp(a, n::Int) -> IEEEFloat
+
+Computes `a × 2^n`
+"""
+ldexp(x::IEEEFloat, q::Int) = ldexpk(x,q)
+
+
+
+over_e2(::Type{Float64}) = 1024
+over_e2(::Type{Float32}) = 128f0
+
+"""
+    exp2(x)
+
+Compute the base-`2` exponential of `x`, that is `2ˣ`.
+"""
+function exp2{T<:IEEEFloat}(x::T)
+    u = expk(dmul(MDLN2(T), x))
+    x > over_e2(T) && (u = T(Inf))
+    isninf(x) && (u = T(0))
+    return u
+end
+
+
+
+over_e10(::Type{Float64}) = 308
+over_e10(::Type{Float32}) = 38f0
+
+"""
+    exp10(x)
+
+Compute the base-`10` exponential of `x`, that is `10ˣ`.
+"""
+function exp10{T<:IEEEFloat}(x::T)
+    u = expk(dmul(MDLN10(T), x))
+    x > over_e10(T) && (u = T(Inf))
+    isninf(x) && (u = T(0))
+    return u
+end
+
+
+
+over_em1(::Type{Float64}) = 700.0
+over_em1(::Type{Float32}) = 88f0
+under_em1(::Type{Float64}) = -0.36043653389117156089696070315825181539851971360337e2
+under_em1(::Type{Float32}) = -0.15942385152878742116596338793538061065739925620174f2
+
+"""
+    expm1(x)
+
+Compute `eˣ- 1` accurately for small values of `x`.
+"""
+function expm1{T<:IEEEFloat}(x::T)
+    u = T(dadd2(expk2(Double(x)), -T(1)))
+    x > over_em1(T) && (u = T(Inf))
+    x < under_em1(T) && (u = -T(1))
+    return u
+end
+
+
+
+"""
+    exp(x)
+
+Compute the base-`e` exponential of `x`, that is `eˣ`.
+"""
+function exp end
+
+let
+global exp
+
+const c11d = 2.08860621107283687536341e-09
+const c10d = 2.51112930892876518610661e-08
+const c9d  = 2.75573911234900471893338e-07
+const c8d  = 2.75572362911928827629423e-06
+const c7d  = 2.4801587159235472998791e-05
+const c6d  = 0.000198412698960509205564975
+const c5d  = 0.00138888888889774492207962
+const c4d  = 0.00833333333331652721664984
+const c3d  = 0.0416666666666665047591422
+const c2d  = 0.166666666666666851703837
+const c1d  = 0.50
+
+const c5f = 0.00136324646882712841033936f0
+const c4f = 0.00836596917361021041870117f0
+const c3f = 0.0416710823774337768554688f0
+const c2f = 0.166665524244308471679688f0
+const c1f = 0.499999850988388061523438f0
+
+global @inline _exp(x::Float64) = @horner x c1d c2d c3d c4d c5d c6d c7d c8d c9d c10d c11d  
+global @inline _exp(x::Float32) = @horner x c1f c2f c3f c4f c5f
+
+function exp{T<:IEEEFloat}(x::T)
+    q = roundi(T(MLN2E)*x)
+    s = muladd(q, -LN2U(T), x)
+    s = muladd(q, -LN2L(T), s)
+    u =_exp(s)
+    u = s*s*u + s + 1
+    u = ldexpk(u,q)
+    isninf(x) && (u = T(0))
+    return u
+end
+end
diff --git a/src/hyp.jl b/src/hyp.jl
new file mode 100644
index 0000000..8c30aa7
--- /dev/null
+++ b/src/hyp.jl
@@ -0,0 +1,113 @@
+# exported hyperbolic functions
+
+over_sch(::Type{Float64}) = 710.0
+over_sch(::Type{Float32}) = 89f0
+
+"""
+    sinh(x)
+
+Compute hyperbolic sine of `x`.
+"""
+function sinh{T<:IEEEFloat}(x::T)
+    u = abs(x)
+    d = expk2(Double(u))
+    d = dsub(d, ddrec(d))
+    u = T(d)*T(0.5)
+    u = abs(x) > over_sch(T) ? T(Inf) : u
+    u = isnan(u) ? T(Inf) : u
+    u = flipsign(u,x)
+    u = isnan(x) ? T(NaN) : u
+    return u
+end
+
+
+
+"""
+    cosh(x)
+
+Compute hyperbolic cosine of `x`.
+"""
+function cosh{T<:IEEEFloat}(x::T)
+    u = abs(x)
+    d = expk2(Double(u))
+    d = dadd(d, ddrec(d))
+    u = T(d)*T(0.5)
+    u = abs(x) > over_sch(T) ? T(Inf) : u
+    u = isnan(u) ? T(Inf) : u
+    u = isnan(x) ? T(NaN) : u
+    return u
+end
+
+
+
+over_th(::Type{Float64}) = 18.714973875
+over_th(::Type{Float32}) = 8.664339742f0
+
+"""
+    tanh(x)
+
+Compute hyperbolic tangent of `x`.
+"""
+function tanh{T<:IEEEFloat}(x::T)
+    u = abs(x)
+    d = expk2(Double(u))
+    e = ddrec(d)
+    d = ddiv(dsub(d,e), dadd(d,e))
+    u = T(d)
+    u = abs(x) > over_th(T) ? T(1) : u
+    u = isnan(u) ? T(1) : u
+    u = flipsign(u,x)
+    u = isnan(x) ? T(NaN) : u
+    return u
+end
+
+
+
+"""
+    asinh(x)
+
+Compute the inverse hyperbolic sine of `x`.
+"""
+function asinh{T<:IEEEFloat}(x::T)
+    u = abs(x)
+    d = logk2(dadd(dsqrt(dadd2(dsqu(u), T(1))), u))
+    u = T(d)
+    u = isinf(x) || isnan(u) ? T(Inf) : u
+    u = flipsign(u,x)
+    u = isnan(x) ? T(NaN) : u
+    return u
+end
+
+
+
+"""
+    acosh(x)
+
+Compute the inverse hyperbolic cosine of `x`.
+"""
+function acosh{T<:IEEEFloat}(x::T)
+    d = logk2(dadd2(dsqrt(dsub2(dsqu(x), T(1))), x))
+    u = T(d)
+    u = isinf(x) || isnan(u) ? T(Inf) : u
+    u = x == T(1) ? T(0) : u
+    u = x < T(1) ? T(NaN) : u
+    u = isnan(x) ? T(NaN) : u
+    return u
+end
+
+
+
+"""
+    atanh(x)
+
+Compute the inverse hyperbolic tangent of `x`.
+"""
+function atanh{T<:IEEEFloat}(x::T)
+    u = abs(x)
+    d = logk2(ddiv(dadd2(T(1), u), dsub2(T(1), u)))
+    u = u > T(1) ? T(NaN) : (u == T(1) ? T(Inf) : T(d)*T(0.5))
+    u = isinf(x) || isnan(u) ? T(NaN) : u
+    u = flipsign(u,x)
+    u = isnan(x) ? T(NaN) : u
+    return u
+end
diff --git a/src/log.jl b/src/log.jl
new file mode 100644
index 0000000..d2708e8
--- /dev/null
+++ b/src/log.jl
@@ -0,0 +1,139 @@
+# exported logarithmic functions
+
+"""
+    ilog2(x)
+
+Returns the integral part of the logarithm of `abs(x)`, using base 2 for the
+logarithm. In other words, this computes the binary exponent of `x` such that
+
+    x = significand × 2^exponent,
+
+where `significand ∈ [1, 2)`.
+
+* Exceptional cases (where `Int` is the machine wordsize)
+    * `x = 0`    returns `typemin(Int)`
+    * `x = Inf`  returns `typemax(Int)`
+    * `x = NaN`  returns `typemax(Int)`
+"""
+function ilog2{T<:IEEEFloat}(x::T)
+    e = ilog2k(abs(x)) - 1
+    isnan(x) && (e = typemax(Int))
+    isinf(x) && (e = typemax(Int))
+    x == 0 && (e = typemin(Int))
+    return e
+end
+
+
+"""
+    log10(x)
+
+Returns the base `10` logarithm of `x`.
+"""
+function log10{T<:IEEEFloat}(x::T)
+    u = T(dmul(logk(x), MDLN10E(T)))
+    isinf(x) && (u = T(Inf))
+    x < 0  && (u = T(NaN))
+    x == 0 && (u = T(-Inf))
+    return u
+end
+
+
+"""
+    log2(x)
+
+Returns the base `2` logarithm of `x`.
+"""
+function log2{T<:IEEEFloat}(x::T)
+    u = T(dmul(logk(x), MDLN2E(T)))
+    isinf(x) && (u = T(Inf))
+    x < 0  && (u = T(NaN))
+    x == 0 && (u = T(-Inf))
+    return u
+end
+
+
+"""
+    log1p(x)
+
+Accurately compute the natural logarithm of 1+x.
+"""
+function log1p{T<:IEEEFloat}(x::T)
+    u = T(logk2(dadd2(x, T(1))))
+    isinf(x) && (u = T(Inf))
+    x < -1 && (u = T(NaN))
+    x == -1 && (u = -T(Inf))
+    return copysign(u,x) # return correct sign for -0.0
+end
+
+
+"""
+    log(x)
+
+Compute the natural logarithm of `x`. The inverse of the natural logarithm is
+the natural expoenential function `exp(x)`
+"""
+function log{T<:IEEEFloat}(x::T)
+    u = T(logk(x))
+    isinf(x) && (u = T(Inf))
+    x < 0  && (u = T(NaN))
+    x == 0 && (u = -T(Inf))
+    return u
+end
+
+# First we split the argument to its mantissa `m` and integer exponent `e` so
+# that `d = m \times 2^e`, where `m \in [0.5, 1)` then we apply the polynomial
+# approximant on this reduced argument `m` before putting back the exponent
+# in. This first part is done with the help of the private function
+# `ilog2k(x)` and we put the exponent back using
+
+#     `\log(m \times 2^e) = \log(m) + \log 2^e =  \log(m) + e\times MLN2
+
+# The polynomial we evaluate is based on coefficients from
+
+#     `log_2(x) = 2\sum_{n=0}^\infty \frac{1}{2n+1} \bigl(\frac{x-1}{x+1}^{2n+1}\bigr)`
+
+# That being said, since this converges faster when the argument is close to
+# 1, we multiply  `m` by `2` and subtract 1 for the exponent `e` when `m` is
+# less than `sqrt(2)/2`
+"""
+    log_fast(x)
+
+Compute the natural logarithm of `x`. The inverse of the natural logarithm is
+the natural expoenential function `exp(x)`
+"""
+function log_fast end
+
+let
+global log_fast
+
+const c8d = 0.148197055177935105296783
+const c7d = 0.153108178020442575739679
+const c6d = 0.181837339521549679055568
+const c5d = 0.22222194152736701733275
+const c4d = 0.285714288030134544449368
+const c3d = 0.399999999989941956712869
+const c2d = 0.666666666666685503450651
+const c1d = 2.0
+
+const c5f = 0.2371599674224853515625f0
+const c4f = 0.285279005765914916992188f0
+const c3f = 0.400005519390106201171875f0
+const c2f = 0.666666567325592041015625f0
+const c1f = 2f0
+
+global @inline _log_fast(x::Float64) = @horner x c1d c2d c3d c4d c5d c6d c7d c8d
+global @inline _log_fast(x::Float32) = @horner x c1f c2f c3f c4f c5f
+
+function log_fast{T<:IEEEFloat}(x::T)
+    e  = ilog2k(T(M1SQRT2)*x)
+    m  = ldexpk(x,-e)
+    u  = (m-1)/(m+1)
+    u2 = u*u
+    t  =_log_fast(u2)
+    u  = muladd(u, t, T(MLN2)*e)
+    isinf(x) && (u = T(Inf))
+    x < 0 && (u = T(NaN))
+    x == 0 && (u = -T(Inf))
+    return u
+end
+end
diff --git a/src/misc.jl b/src/misc.jl
new file mode 100644
index 0000000..0a20ea4
--- /dev/null
+++ b/src/misc.jl
@@ -0,0 +1,119 @@
+
+"""
+  pow(x, y)
+
+Exponentiation operator, returns `x` raised to the power `y`.
+"""
+function pow{T<:IEEEFloat}(x::T, y::T)
+    yi = unsafe_trunc(Int,y)
+    yint = yi == y
+    yodd = isodd(yi) && yint
+    z = expk(dmul(logk(abs(x)), y))
+    z = isnan(z) ? T(Inf) : z
+    z *= (x >= 0 ? T(1) : (!yint ? T(NaN) : (yodd ? -T(1) : T(1))))
+    efx = flipsign(abs(x) - 1, y)
+    isinf(y) && (z = efx < 0 ? T(0) : (efx == 0 ? T(1) : T(Inf)))
+    if isinf(x) || x == 0
+        z = (yodd ? _sign(x) : T(1)) * ((x == 0 ? -y : y) < 0 ? T(0) : T(Inf))
+    end
+    (isnan(x) || isnan(y)) && (z = T(NaN))
+    (y == 0 || x == 1) && (z = T(1))
+    return z
+end
+
+
+
+let
+global cbrt_fast
+global cbrt
+
+const c6d = -0.640245898480692909870982
+const c5d = 2.96155103020039511818595
+const c4d = -5.73353060922947843636166
+const c3d = 6.03990368989458747961407
+const c2d = -3.85841935510444988821632
+const c1d = 2.2307275302496609725722
+
+const c6f = -0.601564466953277587890625f0
+const c5f =  2.8208892345428466796875f0
+const c4f = -5.532182216644287109375f0
+const c3f =  5.898262500762939453125f0
+const c2f = -3.8095417022705078125f0
+const c1f =  2.2241256237030029296875f0
+
+global @inline _cbrt(x::Float64) = @horner x c1d c2d c3d c4d c5d c6d
+global @inline _cbrt(x::Float32) = @horner x c1f c2f c3f c4f c5f c6f
+
+"""
+    cbrt_fast(x)
+
+Return `x^{1/3}`.
+"""
+function cbrt_fast{T<:IEEEFloat}(d::T)
+    e  = ilog2k(d)
+    d  = ldexpk(d,-e)
+    r  = (e + 6144) % 3
+    q  = r == 1 ? T(M2P13) : T(1)
+    q  = r == 2 ? T(M2P23) : q
+    q  = ldexpk(q, (e + 6144) ÷ 3 - 2048)
+    q  = flipsign(q,d)
+    d  = abs(d)
+    x  =_cbrt(d)
+    y  = x*x
+    y  = y*y
+    x -= (d*y - x)*T(1/3)
+    y  = d*x*x
+    y  = (y - T(2/3)*y*(y*x - 1))*q
+end
+
+
+"""
+    cbrt(x)
+
+Return `x^{1/3}`. The prefix operator `∛` is equivalent to `cbrt`.
+"""
+function cbrt{T<:IEEEFloat}(d::T)
+    e  = ilog2k(d)
+    d  = ldexpk(d,-e)
+    r  = (e + 6144) % 3
+    q2 = r == 1 ? MD2P13(T) : Double(T(1))
+    q2 = r == 2 ? MD2P23(T) : q2
+    q2 = flipsign(q2,d)
+    d  = abs(d)
+    x  =_cbrt(d)
+    y  = x*x
+    y  = y*y
+    x -= (d*y - x)*T(1/3)
+    z  = x
+    u  = dsqu(x)
+    u  = dsqu(u)
+    u  = dmul(u, d)
+    u  = dsub(u,x)
+    y  = T(u)
+    y  = -T(2/3)*y*z
+    v  = dadd(dsqu(z), y)
+    v  = dmul(v,d)
+    v  = dmul(v,q2)
+    z  = ldexp(T(v), (e + 6144) ÷ 3 - 2048)
+    isinf(d) && (z = flipsign(T(Inf), q2.hi))
+    d == 0   && (z = flipsign(T(0), q2.hi))
+    return z
+end
+end
+
+
+
+"""
+    hypot(x,y)
+
+Compute the hypotenuse `\sqrt{x^2+y^2}` avoiding overflow and underflow.
+"""
+function hypot{T<:IEEEFloat}(x::T, y::T)
+    x = abs(x)
+    y = abs(y)
+    if x < y
+       x, y = y, x
+    end
+    r = (x == 0) ? y : y/x
+    x*sqrt(T(1) + r*r)
+end
diff --git a/src/priv.jl b/src/priv.jl
new file mode 100644
index 0000000..17fae2c
--- /dev/null
+++ b/src/priv.jl
@@ -0,0 +1,231 @@
+# private math functions
+
+"""
+A helper function for `ldexpk`
+
+First note that `r = (q >> n) << n` clears the lowest n bits of q, i.e. returns 2^n where n is the
+largest integer such that q >= 2^n
+
+For numbers q less than 2^m the following code does the same as the above snippet
+    `r = ( (q>>v + q) >> n - q>>v ) << n`
+For numbers larger than or equal to 2^v this subtracts 2^n from q for q>>n times.
+
+The function returns q(input) := q(output) + offset*r
+
+In the code for ldexpk we actually use
+    `m = ( (m>>n + m) >> n -  m>>m ) << (n-2)`.
+So that x has to be multplied by u four times `x = x*u*u*u*u` to put the value  of the offset
+exponent amount back in.
+"""
+@inline function _split_exponent(q, n, v, offset)
+    m = q >> v
+    m = (((m + q) >> n) - m) << (n-offset)
+    q = q - (m << offset)
+    m, q
+end
+@inline split_exponent(::Type{Float64}, q::Int) = _split_exponent(q, UInt(9), UInt(31), UInt(2))
+@inline split_exponent(::Type{Float32}, q::Int) = _split_exponent(q, UInt(6), UInt(31), UInt(2))
+
+"""
+    ldexpk(a::IEEEFloat, n::Int) -> IEEEFloat
+
+Computes `a × 2^n`.
+"""
+@inline function ldexpk{T<:IEEEFloat}(x::T, q::Int)
+    bias = exponent_bias(T)
+    emax = exponent_max(T)
+    m, q = split_exponent(T,q)
+    m += bias
+    m = ifelse(m < 0, 0, m)
+    m = ifelse(m > emax, emax, m)
+    q += bias
+    u = integer2float(T, m)
+    x = x*u*u*u*u
+    u = integer2float(T, q)
+    x*u
+end
+
+@inline ldexpk2{T<:IEEEFloat}(x::T, e::Int) = x * pow2i(T, e >> UInt(1)) * pow2i(T, e - (e >> UInt(1)))
+
+
+
+# The following define threshold values for `ilog2k`
+threshold_exponent(::Type{Float64}) = 300
+threshold_exponent(::Type{Float32}) = 64
+
+"""
+    ilog2k(x::IEEEFloat) -> Int
+
+Returns the integral part of the logarithm of `|x|`, using 2 as base for the logarithm; in other
+words this returns the binary exponent of `x` so that
+
+    x = significand × 2^exponent
+
+where `significand ∈ [0.5, 1)`.
+"""
+@inline function ilog2k{T<:IEEEFloat}(d::T)
+    m = d < T(2)^-threshold_exponent(T)
+    d = ifelse(m, d*T(2)^threshold_exponent(T), d)
+    q = float2integer(d) & exponent_max(T)
+    q = ifelse(m, q - (threshold_exponent(T) + exponent_bias(T) - 1), q - (exponent_bias(T) - 1)) # subtract 1 since we want 2^q
+end
+
+
+let
+const c20d =  1.06298484191448746607415e-05
+const c19d = -0.000125620649967286867384336
+const c18d =  0.00070557664296393412389774
+const c17d = -0.00251865614498713360352999
+const c16d =  0.00646262899036991172313504
+const c15d = -0.0128281333663399031014274
+const c14d =  0.0208024799924145797902497
+const c13d = -0.0289002344784740315686289
+const c12d =  0.0359785005035104590853656
+const c11d = -0.041848579703592507506027
+const c10d =  0.0470843011653283988193763
+const c9d  = -0.0524914210588448421068719
+const c8d  =  0.0587946590969581003860434
+const c7d  = -0.0666620884778795497194182
+const c6d  =  0.0769225330296203768654095
+const c5d  = -0.0909090442773387574781907
+const c4d  =  0.111111108376896236538123
+const c3d  = -0.142857142756268568062339
+const c2d  =  0.199999999997977351284817
+const c1d =  -0.333333333333317605173818
+
+const c9f = -0.00176397908944636583328247f0
+const c8f =  0.0107900900766253471374512f0
+const c7f = -0.0309564601629972457885742f0
+const c6f =  0.0577365085482597351074219f0
+const c5f = -0.0838950723409652709960938f0
+const c4f =  0.109463557600975036621094f0
+const c3f = -0.142626821994781494140625f0
+const c2f =  0.199983194470405578613281f0
+const c1f = -0.333332866430282592773438f0
+
+global @inline _atan2k_fast(x::Float64) = @horner x c1d c2d c3d c4d c5d c6d c7d c8d c9d c10d c11d c12d c13d c14d c15d c16d c17d c18d c19d c20d
+global @inline _atan2k_fast(x::Float32) = @horner x c1f c2f c3f c4f c5f c6f c7f c8f c9f
+
+global @inline function atan2k_fast{T<:IEEEFloat}(y::T, x::T)
+    q = 0
+    if x < 0
+        x = -x
+        q = -2
+    end
+    if y > x
+        t = x; x = y
+        y = -t
+        q += 1
+    end
+    s = y/x
+    t = s*s
+    u =_atan2k_fast(t)
+    t = u*t*s + s
+    q*T(MPI2) + t
+end
+
+
+global @inline _atan2k(x::Double{Float64}) = @horner x.hi c1d c2d c3d c4d c5d c6d c7d c8d c9d c10d c11d c12d c13d c14d c15d c16d c17d c18d c19d c20d
+global @inline _atan2k(x::Double{Float32}) = dadd(c1f, x.hi*(@horner x.hi c2f c3f c4f c5f c6f c7f c8f c9f))
+
+global @inline function atan2k{T<:IEEEFloat}(y::Double{T}, x::Double{T})
+    q = 0
+    if x < 0
+        x = -x
+        q = -2
+    end
+    if y > x
+        t = x; x = y
+        y = -t
+        q += 1
+    end
+    s = ddiv(y,x)
+    t = dsqu(s)
+    u =_atan2k(t)
+    t = dmul(s, dadd(T(1), dmul(t, u)))
+    dadd(dmul(T(q), MDPI2(T)), t)
+end
+end
+
+
+let
+const c10d = 2.51069683420950419527139e-08
+const c9d  = 2.76286166770270649116855e-07
+const c8d  = 2.75572496725023574143864e-06
+const c7d  = 2.48014973989819794114153e-05
+const c6d  = 0.000198412698809069797676111
+const c5d  = 0.0013888888939977128960529
+const c4d  = 0.00833333333332371417601081
+const c3d  = 0.0416666666665409524128449
+const c2d  = 0.166666666666666740681535
+const c1d  = 0.500000000000000999200722
+
+const c5f = 0.00136324646882712841033936f0
+const c4f = 0.00836596917361021041870117f0
+const c3f = 0.0416710823774337768554688f0
+const c2f = 0.166665524244308471679688f0
+const c1f = 0.499999850988388061523438f0
+
+global @inline _expk(x::Float64) = @horner x c1d c2d c3d c4d c5d c6d c7d c8d c9d c10d
+global @inline _expk(x::Float32) = @horner x c1f c2f c3f c4f c5f
+
+global @inline function expk{T<:IEEEFloat}(d::Double{T})
+    q = round(T(d)*T(MLN2E))
+    s = dadd(d, -q*LN2U(T))
+    s = dadd(s, -q*LN2L(T))
+    u =_expk(T(s))
+    t = dadd(s, dmul(dsqu(s), u))
+    t = dadd(T(1), t)
+    ldexpk(T(t), unsafe_trunc(Int,q))
+end
+
+
+global @inline function expk2{T<:IEEEFloat}(d::Double{T})
+    q = round(T(d)*T(MLN2E))
+    s = dadd(d, -q*LN2U(T))
+    s = dadd(s, -q*LN2L(T))
+    u =_expk(s.hi)
+    t = dadd(s, dmul(dsqu(s), u))
+    t = dadd(T(1), t)
+    scale(t, pow2i(T, unsafe_trunc(Int,q)))
+end
+end
+
+
+let
+const c8d = 0.134601987501262130076155
+const c7d = 0.132248509032032670243288
+const c6d = 0.153883458318096079652524
+const c5d = 0.181817427573705403298686
+const c4d = 0.222222231326187414840781
+const c3d = 0.285714285651261412873718
+const c2d = 0.400000000000222439910458
+const c1d = 0.666666666666666371239645
+
+const c4f = 0.2371599674224853515625f0
+const c3f = 0.285279005765914916992188f0
+const c2f = 0.400005519390106201171875f0
+const c1f = 0.666666567325592041015625f0
+
+global @inline _logk(x::Float64) = @horner x c1d c2d c3d c4d c5d c6d c7d c8d
+global @inline _logk(x::Float32) = @horner x c1f c2f c3f c4f
+
+global @inline function logk{T<:IEEEFloat}(d::T)
+    e  = ilog2k(d*T(M1SQRT2))
+    m  = ldexpk(d,-e)
+    x  = ddiv(dsub2(m, T(1)), dadd2(T(1), m))
+    x2 = dsqu(x)
+    t  =_logk(x2.hi)
+    dadd(dmul(MDLN2(T), T(e)), dadd(scale(x, T(2)), dmul(dmul(x2, x), t)))
+end
+
+
+global @inline function logk2{T<:IEEEFloat}(d::Double{T})
+    e  = ilog2k(d.hi*T(M1SQRT2))
+    m  = scale(d, pow2i(T,-e))
+    x  = ddiv(dsub2(m, T(1)), dadd2(m, T(1)))
+    x2 = dsqu(x)
+    t  =_logk(x2.hi)
+    dadd(dmul(MDLN2(T), T(e)), dadd(scale(x, T(2)), dmul(dmul(x2, x), t)))
+end
+end
diff --git a/src/trig.jl b/src/trig.jl
new file mode 100644
index 0000000..0621572
--- /dev/null
+++ b/src/trig.jl
@@ -0,0 +1,493 @@
+# exported trigonometric functions
+
+"""
+    sin(x)
+
+Compute the sine of `x`, where the output is in radians.
+"""
+function sin end
+
+"""
+    cos(x)
+
+Compute the cosine of `x`, where the output is in radians.
+"""
+function cos end
+
+let
+global sin
+global cos
+
+const c8d =  2.72052416138529567917983e-15
+const c7d = -7.64292594113954471900203e-13
+const c6d =  1.60589370117277896211623e-10
+const c5d = -2.5052106814843123359368e-08
+const c4d =  2.75573192104428224777379e-06
+const c3d = -0.000198412698412046454654947
+const c2d =  0.00833333333333318056201922
+const c1d = -0.166666666666666657414808
+
+const c4f =  2.6083159809786593541503f-06
+const c3f = -0.0001981069071916863322258f0
+const c2f =  0.00833307858556509017944336f0
+const c1f = -0.166666597127914428710938f0
+
+global @inline _sincos(x::Double{Float64}) = dadd(c1d, x.hi*(@horner x.hi c2d c3d c4d c5d c6d c7d c8d))
+global @inline _sincos(x::Double{Float32}) = dadd(c1f, x.hi*(@horner x.hi c2f c3f c4f))
+
+function sin{T<:IEEEFloat}(x::T)
+    d = abs(x)
+    q = round(d*T(M1PI))
+    s = dsub2(d, q*PI4A(T)*4)
+    s = dsub2(s, q*PI4B(T)*4)
+    s = dsub2(s, q*PI4C(T)*4)
+    s = dsub2(s, q*PI4D(T)*4)
+    t = s
+    s = dsqu(s)
+    w =_sincos(s)
+    v = dmul(t, dadd(T(1), dmul(w,s)))
+    u = T(v)
+    qi = unsafe_trunc(Int,q)
+    qi & 1 != 0 && (u = -u)
+    return flipsign(u,x)
+end
+
+function cos{T<:IEEEFloat}(x::T)
+    x = abs(x)
+    q = muladd(T(2), round(x*T(M1PI) - T(0.5)), T(1))
+    s = dsub2(x, q*PI4A(T)*2)
+    s = dsub2(s, q*PI4B(T)*2)
+    s = dsub2(s, q*PI4C(T)*2)
+    s = dsub2(s, q*PI4D(T)*2)
+    t = s
+    s = dsqu(s)
+    w =_sincos(s)
+    v = dmul(t, dadd(T(1), dmul(w,s)))
+    u = T(v)
+    qi = unsafe_trunc(Int,q)
+    qi & 2 == 0 && (u = -u)
+    return u
+end
+end
+
+
+"""
+    sin_fast(x)
+
+Compute the sine of `x`, where the output is in radians.
+"""
+function sin_fast end
+
+"""
+    cos_fast(x)
+
+Compute the cosine of `x`, where the output is in radians.
+"""
+function cos_fast end
+
+let 
+global sin_fast
+global cos_fast
+
+const c9d = -7.97255955009037868891952e-18
+const c8d =  2.81009972710863200091251e-15
+const c7d = -7.64712219118158833288484e-13
+const c6d =  1.60590430605664501629054e-10
+const c5d = -2.50521083763502045810755e-08
+const c4d =  2.75573192239198747630416e-06
+const c3d = -0.000198412698412696162806809
+const c2d =  0.00833333333333332974823815
+const c1d = -0.166666666666666657414808
+
+# c5f is 0f0 to handle Inf32 case, Float64 doesn't need this since it comes
+# out properly (add another neg constant and remove this zero constant)
+const c5f =  0f0
+const c4f =  2.6083159809786593541503f-06
+const c3f = -0.0001981069071916863322258f0
+const c2f =  0.00833307858556509017944336f0
+const c1f = -0.166666597127914428710938f0
+
+# Argument is first reduced to the domain 0 < s < π/4
+
+# We return the correct sign using `q & 1 != 0` i.e. q is odd (this works for
+# positive and negative q) and if this condition is true we flip the sign since
+# we are now in the negative branch of sin(x). Recall that q is just the integer
+# part of d/π and thus we can determine the correct sign using this information.
+
+global @inline _sincos_fast(x::Float64) = @horner x c1d c2d c3d c4d c5d c6d c7d c8d c9d
+global @inline _sincos_fast(x::Float32) = @horner x c1f c2f c3f c4f c5f
+
+function sin_fast{T<:IEEEFloat}(x::T)
+    d = abs(x)
+    q = roundi(d*T(M1PI))
+    d = muladd(q, -PI4A(T)*4, d)
+    d = muladd(q, -PI4B(T)*4, d)
+    d = muladd(q, -PI4C(T)*4, d)
+    d = muladd(q, -PI4D(T)*4, d)
+    s = d*d
+    q & 1 != 0 && (d = -d)
+    u =_sincos_fast(s)
+    u = muladd(s, u*d, d)
+    flipsign(u,x)
+end
+
+function cos_fast{T<:IEEEFloat}(x::T)
+    q = muladd(2, roundi(x*T(M1PI)-T(0.5)), 1)
+    x = muladd(q, -PI4A(T)*2, x)
+    x = muladd(q, -PI4B(T)*2, x)
+    x = muladd(q, -PI4C(T)*2, x)
+    x = muladd(q, -PI4D(T)*2, x)
+    s = x*x
+    q & 2 == 0 && (x = -x)
+    u =_sincos_fast(s)
+    muladd(s, u*x, x)
+end
+end
+
+
+
+"""
+    sincos(x)
+
+Compute the sin and cosine of `x` simultaneously, where the output is in
+radians, returning a tuple.
+"""
+function sincos end
+
+"""
+    sincos_fast(x)
+
+Compute the sin and cosine of `x` simultaneously, where the output is in
+radians, returning a tuple.
+"""
+function sincos_fast end
+
+let
+global sincos
+global sincos_fast
+
+const a6d =  1.58938307283228937328511e-10
+const a5d = -2.50506943502539773349318e-08
+const a4d =  2.75573131776846360512547e-06
+const a3d = -0.000198412698278911770864914
+const a2d =  0.0083333333333191845961746
+const a1d = -0.166666666666666130709393
+
+const a3f = -0.000195169282960705459117889f0
+const a2f =  0.00833215750753879547119141f0
+const a1f = -0.166666537523269653320312f0
+
+const b7d = -1.13615350239097429531523e-11
+const b6d =  2.08757471207040055479366e-09
+const b5d = -2.75573144028847567498567e-07
+const b4d =  2.48015872890001867311915e-05
+const b3d = -0.00138888888888714019282329
+const b2d =  0.0416666666666665519592062
+const b1d = -0.50
+
+const b5f = -2.71811842367242206819355f-07
+const b4f =  2.47990446951007470488548f-05
+const b3f = -0.00138888787478208541870117f0
+const b2f =  0.0416666641831398010253906f0
+const b1f = -0.5f0
+
+global @inline _sincos_a(x::Float64) = @horner x a1d a2d a3d a4d a5d a6d
+global @inline _sincos_a(x::Float32) = @horner x a1f a2f a3f
+global @inline _sincos_b(x::Float64) = @horner x b1d b2d b3d b4d b5d b6d b7d
+global @inline _sincos_b(x::Float32) = @horner x b1f b2f b3f b4f b5f
+
+function sincos_fast{T<:IEEEFloat}(x::T)
+    d  = abs(x)
+    q  = roundi(d*T(M2PI))
+    s  = d
+    s  = muladd(q, -PI4A(T)*2, s)
+    s  = muladd(q, -PI4B(T)*2, s)
+    s  = muladd(q, -PI4C(T)*2, s)
+    s  = muladd(q, -PI4D(T)*2, s)
+    t  = s
+    s  = s*s
+    u  =_sincos_a(s)
+    u  = u * s * t
+    rx = t + u
+    u =_sincos_b(s)
+    ry = u * s + T(1)
+    q & 1 != 0 && (s = ry; ry = rx; rx = s)
+    q & 2 != 0 && (rx = -rx)
+    (q+1) & 2 != 0 && (ry = -ry)
+    isinf(d) && (rx = ry = T(NaN))
+    convert(Tuple{T,T}, Double(flipsign(rx,x), ry))
+end
+
+function sincos{T<:IEEEFloat}(x::T)
+    d  = abs(x)
+    q  = roundi(d*2*T(M1PI))
+    s  = dsub2(d, q*PI4A(T)*2)
+    s  = dsub2(s, q*PI4B(T)*2)
+    s  = dsub2(s, q*PI4C(T)*2)
+    s  = dsub2(s, q*PI4D(T)*2)
+    t  = s
+    s  = dsqu(s)
+    sx = T(s)
+    u  =_sincos_a(sx)
+    u *= sx * t.hi
+    v  = dadd(t, u)
+    rx = T(v)
+    u  =_sincos_b(sx)
+    v  = dadd(T(1), dmul(sx, u))
+    ry = T(v)
+    q & 1 != 0 && (u = ry; ry = rx; rx = u)
+    q & 2 != 0 && (rx = -rx)
+    (q+1) & 2 != 0 && (ry = -ry)
+    isinf(d) && (rx = ry = T(NaN))
+    convert(Tuple{T,T}, Double(flipsign(rx, x), ry))
+end
+end
+
+
+"""
+    tan(x)
+
+Compute the tangent of `x`, where the output is in radians.
+"""
+function tan end
+
+"""
+    tan_fast(x)
+
+Compute the tangent of `x`, where the output is in radians.
+"""
+function tan_fast end
+
+let
+global tan
+global tan_fast
+
+const c15d =  1.01419718511083373224408e-05
+const c14d = -2.59519791585924697698614e-05
+const c13d =  5.23388081915899855325186e-05
+const c12d = -3.05033014433946488225616e-05
+const c11d =  7.14707504084242744267497e-05
+const c10d =  8.09674518280159187045078e-05
+const c9d  =  0.000244884931879331847054404
+const c8d  =  0.000588505168743587154904506
+const c7d  =  0.001456127889228124279788480
+const c6d  =  0.003592087438369066191429240
+const c5d  =  0.008863239443624016181133560
+const c4d  =  0.021869488285384638959207800
+const c3d  =  0.053968253978129841763600200
+const c2d  =  0.133333333333125941821962000
+const c1d  =  0.333333333333334980164153000
+
+const c7f =  0.00446636462584137916564941f0
+const c6f = -8.3920182078145444393158f-05
+const c5f =  0.0109639242291450500488281f0
+const c4f =  0.0212360303848981857299805f0
+const c3f =  0.0540687143802642822265625f0
+const c2f =  0.133325666189193725585938f0
+const c1f =  0.33333361148834228515625f0
+
+global @inline _tan_fast(x::Float64) = @horner_split x c1d c2d c3d c4d c5d c6d c7d c8d c9d c10d c11d c12d c13d c14d c15d
+global @inline _tan_fast(x::Float32) = @horner x c1f c2f c3f c4f c5f c6f c7f
+
+function tan_fast{T<:IEEEFloat}(d::T)
+    w = abs(d)
+    q = roundi(w*T(M2PI))
+    x = muladd(q, -PI4A(T)*2, w)
+    x = muladd(q, -PI4B(T)*2, x)
+    x = muladd(q, -PI4C(T)*2, x)
+    x = muladd(q, -PI4D(T)*2, x)
+    q & 1 != 0 && (x = -x)
+    s = x*x
+    u =_tan_fast(s)
+    u = muladd(s, u*x, x)
+    q & 1 != 0 && (u = 1/u)
+    isinf(w) && (u = T(NaN))
+    return flipsign(u,d)
+end
+
+global @inline _tan(x::Double{Float64}) = dadd(c1d, x.hi*(@horner_split x.hi c2d c3d c4d c5d c6d c7d c8d c9d c10d c11d c12d c13d c14d c15d))
+global @inline _tan(x::Double{Float32}) = dadd(c1f, dmul(x, @horner x.hi c2f c3f c4f c5f c6f c7f))
+# global @inline _tan(x::Double{Float32}) = dadd(c1f, dmul(x.hi, dadd(c2f, x.hi*(@horner x.hi c3f c4f c5f c6f c7f))))
+
+function tan{T<:IEEEFloat}(d::T)
+    w = abs(d)
+    q = round(w*T(M2PI))
+    x = dsub2(w, q*PI4A(T)*2)
+    x = dsub2(x, q*PI4B(T)*2)
+    x = dsub2(x, q*PI4C(T)*2)
+    x = dsub2(x, q*PI4D(T)*2)
+    qi = unsafe_trunc(Int,q)
+    qi & 1 != 0 && (x = -x)
+    s = dsqu(x)
+    u =_tan(s)
+    u = dmul(x, dadd(T(1), dmul(u, s)))
+    qi & 1 != 0 && (u = ddrec(u))
+    return flipsign(T(u),d)
+end
+end
+
+
+
+"""
+    atan(x)
+
+Compute the inverse tangent of `x`, where the output is in radians.
+"""
+function atan{T<:IEEEFloat}(x::T)
+    u = T(atan2k(Double(abs(x)), Double(T(1))))
+    isinf(x) && (u = T(MPI2))
+    flipsign(u,x)
+end
+
+
+"""
+    atan_fast(x)
+
+Compute the inverse tangent of `x`, where the output is in radians.
+"""
+function atan_fast end
+let 
+global atan_fast
+
+const c19d = -1.88796008463073496563746e-05
+const c18d =  0.000209850076645816976906797
+const c17d = -0.00110611831486672482563471
+const c16d =  0.00370026744188713119232403
+const c15d = -0.00889896195887655491740809
+const c14d =  0.016599329773529201970117
+const c13d = -0.0254517624932312641616861
+const c12d =  0.0337852580001353069993897
+const c11d = -0.0407629191276836500001934
+const c10d =  0.0466667150077840625632675
+const c9d  = -0.0523674852303482457616113
+const c8d  =  0.0587666392926673580854313
+const c7d  = -0.0666573579361080525984562
+const c6d  =  0.0769219538311769618355029
+const c5d  = -0.090908995008245008229153
+const c4d  =  0.111111105648261418443745
+const c3d  = -0.14285714266771329383765
+const c2d  =  0.199999999996591265594148
+const c1d  = -0.333333333333311110369124
+
+const c8f =  0.00282363896258175373077393f0
+const c7f = -0.0159569028764963150024414f0
+const c6f =  0.0425049886107444763183594f0
+const c5f = -0.0748900920152664184570312f0
+const c4f =  0.106347933411598205566406f0
+const c3f = -0.142027363181114196777344f0
+const c2f =  0.199926957488059997558594f0
+const c1f = -0.333331018686294555664062f0
+
+global @inline _atan_fast(x::Float64) = @horner_split x c1d c2d c3d c4d c5d c6d c7d c8d c9d c10d c11d c12d c13d c14d c15d c16d c17d c18d c19d
+global @inline _atan_fast(x::Float32) = @horner x c1f c2f c3f c4f c5f c6f c7f c8f
+
+function atan_fast{T<:IEEEFloat}(x::T)
+    q = 0
+    if x < 0
+        x = -x
+        q = 2
+    end
+    if x > 1
+        x = 1/x
+        q |= 1
+    end
+    t = x*x
+    u =_atan_fast(t)
+    t = x + x*t*u
+    q & 1 != 0 && (t = T(MPI2) - t)
+    q & 2 != 0 && (t = -t)
+    return t
+end
+end
+
+
+
+"""
+    atan2(x, y)
+
+Compute the inverse tangent of `x/y`, using the signs of both `x` and `y` to determine the quadrant of the return value.
+"""
+function atan2{T<:IEEEFloat}(x::T, y::T)
+    r = T(atan2k(Double(abs(x)), Double(y)))
+    r = flipsign(r,y)
+    if isinf(y) || y == 0
+        r = T(MPI2) - (isinf(y) ? _sign(y)*T(MPI2) : T(0))
+    end
+    if isinf(x)
+        r = T(MPI2) - (isinf(y) ? _sign(y)*T(MPI4) : T(0))
+    end
+    if x == 0
+        r = _sign(y) == -1 ? T(MPI) : T(0)
+    end
+    return isnan(y) || isnan(x) ? T(NaN) : flipsign(r,x)
+end
+
+
+"""
+    atan2_fast(x, y)
+
+Compute the inverse tangent of `x/y`, using the signs of both `x` and `y` to determine the quadrant of the return value.
+"""
+function atan2_fast{T<:IEEEFloat}(x::T, y::T)
+    r = atan2k_fast(abs(x), y)
+    r = flipsign(r,y)
+    if isinf(y) || y == 0
+        r = T(MPI2) - (isinf(y) ? _sign(y)*T(MPI2) : T(0))
+    end
+    if isinf(x)
+        r = T(MPI2) - (isinf(y) ? _sign(y)*T(MPI4) : T(0))
+    end
+    if x == 0
+        r = _sign(y) == -1 ? T(MPI) : T(0)
+    end
+    return isnan(y) || isnan(x) ? T(NaN) : flipsign(r,x)
+end
+
+
+
+"""
+    asin(x)
+
+Compute the inverse sine of `x`, where the output is in radians.
+"""
+function asin{T<:IEEEFloat}(x::T)
+    d = atan2k(Double(abs(x)), dsqrt(dmul(dadd(T(1), x), dsub(T(1), x))))
+    u = T(d)
+    abs(x) == 1 && (u = T(MPI2))
+    flipsign(u,x)
+end
+
+
+"""
+    asin_fast(x)
+
+Compute the inverse sine of `x`, where the output is in radians.
+"""
+function asin_fast{T<:IEEEFloat}(x::T)
+    flipsign(atan2k_fast(abs(x), _sqrt((1+x)*(1-x))), x)
+end
+
+
+
+"""
+    acos(x)
+
+Compute the inverse cosine of `x`, where the output is in radians.
+"""
+function acos{T<:IEEEFloat}(x::T)
+    d = atan2k(dsqrt(dmul(dadd(T(1), x), dsub(T(1), x))), Double(abs(x)))
+    d = flipsign(d,x)
+    abs(x) == 1 && (d = Double(T(0)))
+    x < 0 && (d = dadd(MDPI(T), d))
+    return T(d)
+end
+
+
+"""
+    acos_fast(x)
+
+Compute the inverse cosine of `x`, where the output is in radians.
+"""
+function acos_fast{T<:IEEEFloat}(x::T)
+    flipsign(atan2k_fast(_sqrt((1+x)*(1-x)), abs(x)), x) + (x < 0 ? T(MPI) : T(0))
+end
diff --git a/src/utils.jl b/src/utils.jl
new file mode 100644
index 0000000..9f76db7
--- /dev/null
+++ b/src/utils.jl
@@ -0,0 +1,63 @@
+## utility functions mainly used by the private math functions in priv.jl
+
+function is_fma_fast end
+for T in (Float32, Float64)
+    @eval is_fma_fast(::Type{$T}) = $(muladd(nextfloat(one(T)),nextfloat(one(T)),-nextfloat(one(T),2)) != zero(T))
+end
+const FMA_FAST = is_fma_fast(Float64) && is_fma_fast(Float32)
+
+
+@inline exponent_max{T<:IEEEFloat}(::Type{T}) = Int(exponent_mask(T) >> significand_bits(T))
+
+# _sign emits better native code than sign but does not properly handle the Inf/NaN cases
+@inline _sign{T<:IEEEFloat}(d::T) = flipsign(one(T), d)
+
+@inline roundi{T<:IEEEFloat}(x::T) = unsafe_trunc(Int, round(x))
+
+@inline integer2float(::Type{Float64}, m::Int) = reinterpret(Float64, (m % Int64) << significand_bits(Float64))
+@inline integer2float(::Type{Float32}, m::Int) = reinterpret(Float32, (m % Int32) << significand_bits(Float32))
+
+@inline float2integer(d::Float64) = (reinterpret(Int64, d) >> significand_bits(Float64)) % Int
+@inline float2integer(d::Float32) = (reinterpret(Int32, d) >> significand_bits(Float32)) % Int
+
+@inline pow2i{T<:IEEEFloat}(::Type{T}, q::Int) = integer2float(T, q + exponent_bias(T))
+
+# sqrt without the domain checks which we don't need since we handle the checks ourselves
+_sqrt{T<:IEEEFloat}(x::T) = Base.sqrt_llvm_fast(x)
+
+@inline ispinf{T<:IEEEFloat}(x::T) = x ==  T(Inf)
+@inline isninf{T<:IEEEFloat}(x::T) = x == -T(Inf)
+
+# Similar to @horner, but split into even and odd coefficients. This is typically less
+# accurate, but faster due to out of order execution.
+macro horner_split(x,p...)
+    t1 = gensym("x1")
+    t2 = gensym("x2")
+    blk = quote
+        $t1 = $(esc(x))
+        $t2 = $(esc(x)) * $(esc(x))
+    end
+    n = length(p)
+    p0 = esc(p[1])
+    if isodd(n)
+        ex_o = esc(p[end-1])
+        ex_e = esc(p[end])
+        for i = n-3:-2:2
+            ex_o = :(muladd($(t2), $ex_o, $(esc(p[i]))))
+        end
+        for i = n-2:-2:2
+            ex_e = :(muladd($(t2), $ex_e, $(esc(p[i]))))
+        end
+    elseif iseven(n)
+        ex_o = esc(p[end])
+        ex_e = esc(p[end-1])
+        for i = n-2:-2:2
+            ex_o = :(muladd($(t2), $ex_o, $(esc(p[i]))))
+        end
+        for i = n-3:-2:2
+            ex_e = :(muladd($(t2), $ex_e, $(esc(p[i]))))
+        end
+    end
+    push!(blk.args,:($(p0) + $(t1)*$(ex_o) + $(t2)*$(ex_e)))
+    blk
+end
diff --git a/test/accuracy.jl b/test/accuracy.jl
new file mode 100644
index 0000000..c7e335b
--- /dev/null
+++ b/test/accuracy.jl
@@ -0,0 +1,177 @@
+
+MRANGE(::Type{Float64}) = 10000000
+MRANGE(::Type{Float32}) = 10000
+IntF(::Type{Float64}) = Int64
+IntF(::Type{Float32}) = Int32
+
+
+@testset "Accuracy (max error in ulp) for $T" for T in (Float64, Float32)
+    println("Accuracy tests for $T")
+
+
+    xx = map(T, vcat(-10:0.0002:10, -1000:0.1:1000))
+    fun_table = Dict(Sleef.exp => Base.exp)
+    tol = 1
+    test_acc(T, fun_table, xx, tol)
+
+
+
+    # xx = map(T, vcat(-10:0.0002:10, -1000:0.02:1000))
+    # fun_table = Dict(Sleef.asinh => Base.asinh, Sleef.atanh => Base.atanh)
+    # tol = 1
+    # test_acc(T, fun_table, xx, tol)
+
+
+    # xx = map(T, vcat(1:0.0002:10, 1:0.02:1000))
+    # fun_table = Dict(Sleef.acosh => Base.acosh)
+    # tol = 1
+    # test_acc(T, fun_table, xx, tol)
+
+
+    # xx = T[]
+    # for i = 1:10000
+    #     s = reinterpret(T, reinterpret(IntF(T), T(pi)/4 * i) - IntF(T)(20))
+    #     e = reinterpret(T, reinterpret(IntF(T), T(pi)/4 * i) + IntF(T)(20))
+    #     d = s
+    #     while d <= e 
+    #         append!(xx, d)
+    #         d = reinterpret(T, reinterpret(IntF(T), d) + IntF(T)(1))
+    #     end
+    # end
+    # xx = append!(xx, -10:0.0002:10)
+    # xx = append!(xx, -MRANGE(T):200.1:MRANGE(T))
+
+    # fun_table = Dict(Sleef.sin => Base.sin, Sleef.cos => Base.cos, Sleef.tan => Base.tan)
+    # tol = 1
+    # test_acc(T, fun_table, xx, tol)
+
+    # fun_table = Dict(Sleef.sin_fast => Base.sin, Sleef.cos_fast => Base.cos, Sleef.tan_fast => Base.tan)
+    # tol = 4
+    # test_acc(T, fun_table, xx, tol)
+
+
+    # sin_sincos_fast(x) = Sleef.sincos_fast(x)[1]
+    # cos_sincos_fast(x) = Sleef.sincos_fast(x)[2]
+    # fun_table = Dict(sin_sincos_fast => Base.sin, cos_sincos_fast => Base.cos)
+    # tol = 4
+    # test_acc(T, fun_table, xx, tol) 
+
+    # sin_sincos(x) = Sleef.sincos(x)[1]
+    # cos_sincos(x) = Sleef.sincos(x)[2]
+    # fun_table = Dict(sin_sincos => Base.sin, cos_sincos => Base.cos)
+    # tol = 1
+    # test_acc(T, fun_table, xx, tol) 
+    
+    
+    # xx = map(T, vcat(-1:0.00002:1))
+    # fun_table = Dict(Sleef.asin_fast => Base.asin, Sleef.acos_fast => Base.acos)
+    # tol = 3
+    # test_acc(T, fun_table, xx, tol)
+
+    # fun_table = Dict(Sleef.asin => asin, Sleef.acos => Base.acos)
+    # tol = 1
+    # test_acc(T, fun_table, xx, tol)
+
+
+    # xx = map(T, vcat(-10:0.0002:10, -10000:0.2:10000, -10000:0.201:10000))
+    # fun_table = Dict(Sleef.atan_fast => Base.atan)
+    # tol = 3
+    # test_acc(T, fun_table, xx, tol)
+
+    # fun_table = Dict(Sleef.atan => Base.atan)
+    # tol = 1
+    # test_acc(T, fun_table, xx, tol)
+
+
+    # xx1 = map(Tuple{T,T}, [zip(-10:0.050:10, -10:0.050:10)...])
+    # xx2 = map(Tuple{T,T}, [zip(-10:0.051:10, -10:0.052:10)...])
+    # xx3 = map(Tuple{T,T}, [zip(-100:0.51:100, -100:0.51:100)...])
+    # xx4 = map(Tuple{T,T}, [zip(-100:0.51:100, -100:0.52:100)...])
+    # xx = vcat(xx1, xx2, xx3, xx4)
+
+    # fun_table = Dict(Sleef.atan2_fast => Base.atan2)
+    # tol = 2.5
+    # test_acc(T, fun_table, xx, tol)
+
+    # fun_table = Dict(Sleef.atan2 => Base.atan2)
+    # tol = 1
+    # test_acc(T, fun_table, xx, tol)
+
+
+    # xx = map(T, vcat(0.0001:0.0001:10, 0.001:0.1:10000, 1.1.^(-1000:1000), 2.1.^(-1000:1000)))
+    # fun_table = Dict(Sleef.log_fast => Base.log)
+    # tol = 3
+    # test_acc(T, fun_table, xx, tol)
+
+    # fun_table = Dict(Sleef.log => Base.log)
+    # tol = 1
+    # test_acc(T, fun_table, xx, tol)
+
+
+    # xx = map(T, vcat(0.0001:0.0001:10, 0.0001:0.1:10000))
+    # fun_table = Dict(Sleef.log10 => Base.log10, Sleef.log2 => Base.log2)
+    # tol = 1
+    # test_acc(T, fun_table, xx, tol)
+
+
+    # xx = map(T, vcat(0.0001:0.0001:10, 0.0001:0.1:10000, 10.0.^-(0:0.02:300), -10.0.^-(0:0.02:300)))
+    # fun_table = Dict(Sleef.log1p => Base.log1p)
+    # tol = 1
+    # test_acc(T, fun_table, xx, tol)
+
+
+    # xx1 = map(Tuple{T,T}, [(x,y) for x = -100:0.20:100, y = 0.1:0.20:100])[:]
+    # xx2 = map(Tuple{T,T}, [(x,y) for x = -100:0.21:100, y = 0.1:0.22:100])[:]
+    # xx3 = map(Tuple{T,T}, [(x,y) for x = 2.1, y = -1000:0.1:1000])
+    # xx = vcat(xx1, xx2, xx2)
+    # fun_table = Dict(Sleef.pow => Base.:^)
+    # tol = 1
+    # test_acc(T, fun_table, xx, tol)
+
+
+    # xx = map(T, vcat(-10000:0.2:10000, 1.1.^(-1000:1000), 2.1.^(-1000:1000)))
+    # fun_table = Dict(Sleef.cbrt_fast => Base.cbrt)
+    # tol = 2
+    # test_acc(T, fun_table, xx, tol)
+
+    # fun_table = Dict(Sleef.cbrt => Base.cbrt)
+    # tol = 1
+    # test_acc(T, fun_table, xx, tol)
+
+
+
+    # xx = map(T, vcat(-10:0.0002:10, -120:0.023:1000, -1000:0.02:2000))
+    # fun_table = Dict(Sleef.exp2 => Base.exp2)
+    # tol = 1
+    # test_acc(T, fun_table, xx, tol)
+
+
+    # xx = map(T, vcat(-10:0.0002:10, -35:0.023:1000, -300:0.01:300))
+    # fun_table = Dict(Sleef.exp10 => Base.exp10)
+    # tol = 1
+    # test_acc(T, fun_table, xx, tol)
+
+
+    # xx = map(T, vcat(-10:0.0002:10, -1000:0.021:1000, -1000:0.023:1000, 
+    #     10.0.^-(0:0.02:300), -10.0.^-(0:0.02:300), 10.0.^(0:0.021:300), -10.0.^-(0:0.021:300)))
+    # fun_table = Dict(Sleef.expm1 => Base.expm1)
+    # tol = 2
+    # test_acc(T, fun_table, xx, tol)
+
+
+    # xx = map(T, vcat(-10:0.0002:10, -1000:0.02:1000))   
+    # fun_table = Dict(Sleef.sinh => Base.sinh, Sleef.cosh => Base.cosh, Sleef.tanh => Base.tanh)
+    # tol = 1
+    # test_acc(T, fun_table, xx, tol)
+
+
+
+     @testset "xilog2 at arbitrary values" begin
+        xd = Dict{T,Int}(T(1e-30) => -100, T(2.31e-11) => -36, T(-1.0) => 0, T(1.0) => 0, 
+                    T(2.31e11) => 37,  T(1e30) => 99)
+        for (i,j) in xd
+            @test Sleef.ilog2(i)  === j
+        end
+    end
+    
+end
diff --git a/test/dnml_nan.jl b/test/dnml_nan.jl
new file mode 100644
index 0000000..4dd1a24
--- /dev/null
+++ b/test/dnml_nan.jl
@@ -0,0 +1,437 @@
+@testset "denormal/nonnumber $T" for T in (Float32, Float64)
+
+@testset "denormal/nonnumber $xatan2" for xatan2 in (Sleef.atan2_fast, Sleef.atan2)
+
+    @test xatan2(T(0.0),  T(-0.0)) === T(pi)
+    @test xatan2(T(-0.0), T(-0.0)) === -T(pi)
+    @test ispzero(xatan2(T(0.0), T(0.0)))
+    @test isnzero(xatan2(T(-0.0), T(0.0)))
+    @test xatan2( T(Inf), -T(Inf)) === T(3*pi/4)
+    @test xatan2(-T(Inf), -T(Inf)) === T(-3*pi/4)
+    @test xatan2( T(Inf),  T(Inf))  === T(pi/4)
+    @test xatan2(-T(Inf),  T(Inf))  === T(-pi/4)
+
+
+    y = T(0.0)
+    xa = T[-100000.5, -100000, -3, -2.5, -2, -1.5, -1.0, -0.5]
+    for x in xa
+        @test xatan2(y,x) === T(pi)
+    end
+
+
+    y = T(-0.0)
+    xa = T[-100000.5, -100000, -3, -2.5, -2, -1.5, -1.0, -0.5]
+    for x in xa
+        @test xatan2(y,x) === T(-pi)
+    end
+
+
+    ya = T[-100000.5, -100000, -3, -2.5, -2, -1.5, -1.0, -0.5]
+    xa = T[T(0.0), T(-0.0)]
+    for x in xa, y in ya
+        @test xatan2(y,x) === T(-pi/2)
+    end
+
+
+    ya = T[100000.5, 100000, 3, 2.5, 2, 1.5, 1.0, 0.5]
+    xa = T[T(0.0), T(-0.0)]
+    for x in xa, y in ya
+        @test xatan2(y,x) === T(pi/2)
+    end
+
+
+    y = T(Inf)
+    xa = T[-100000.5, -100000, -3, -2.5, -2, -1.5, -1.0, -0.5, -0.0, +0.0, 0.5, 1.5, 2.0, 2.5, 3.0, 100000, 100000.5]
+    for x in xa
+        @test xatan2(y,x) === T(pi/2)
+    end
+
+
+    y = T(-Inf)
+    xa = T[-100000.5, -100000, -3, -2.5, -2, -1.5, -1.0, -0.5, -0.0, +0.0, 0.5, 1.5, 2.0, 2.5, 3.0, 100000, 100000.5]
+    for x in xa
+        @test xatan2(y,x) === T(-pi/2)
+    end
+
+
+    ya = T[0.5, 1.5, 2.0, 2.5, 3.0, 100000, 100000.5]
+    x = T(Inf)
+    for y in ya
+        @test ispzero(xatan2(y,x))
+    end
+
+
+    ya = T[-0.5, -1.5, -2.0, -2.5, -3.0, -100000, -100000.5]
+    x = T(Inf)
+    for y in ya
+        @test isnzero(xatan2(y,x))
+    end
+
+
+    ya = T[-100000.5, -100000, -3, -2.5, -2, -1.5, -1.0, -0.5, -0.0, +0.0, 0.5, 1.5, 2.0, 2.5, 3.0, 100000, 100000.5, NaN]
+    x = T(NaN)
+    for y in ya
+        @test isnan(xatan2(y,x))
+    end
+
+
+    y = T(NaN)
+    xa = T[-100000.5, -100000, -3, -2.5, -2, -1.5, -1.0, -0.5, -0.0, +0.0, 0.5, 1.5, 2.0, 2.5, 3.0, 100000, 100000.5, NaN]
+    for x in xa
+        @test isnan(xatan2(y,x))
+    end
+
+end # denormal/nonumber atan2
+
+
+@testset "denormal/nonnumber xpow" begin
+
+    @test Sleef.pow(T(1), T(NaN))   === T(1)
+    @test Sleef.pow( T(NaN), T(0))  === T(1)
+    @test Sleef.pow(-T(1), T(Inf))  === T(1)
+    @test Sleef.pow(-T(1), T(-Inf)) === T(1)
+
+
+    xa = T[-100000.5, -100000, -3, -2.5, -2, -1.5, -1.0, -0.5]
+    ya = T[-100000.5, -2.5, -1.5, -0.5, 0.5, 1.5, 2.5, 100000.5]
+    for x in xa, y in ya
+        @test isnan(Sleef.pow(x,y))
+    end
+
+
+    x = T(NaN)
+    ya = T[-100000.5, -100000, -3, -2.5, -2, -1.5, -1.0, -0.5, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 100000, 100000.5]
+    for y in ya
+        @test isnan(Sleef.pow(x,y))
+    end
+
+
+    xa = T[-100000.5, -100000, -3, -2.5, -2, -1.5, -1.0, -0.5, -0.0, +0.0, 0.5, 1.5, 2.0, 2.5, 3.0, 100000, 100000.5]
+    y = T(NaN)
+    for x in xa
+        @test isnan(Sleef.pow(x,y))
+    end
+
+
+    x = T(0.0)
+    ya = T[1, 3, 5, 7, 100001]
+    for y in ya
+        @test ispzero(Sleef.pow(x,y))
+    end
+
+
+    x = T(-0.0)
+    ya = T[1, 3, 5, 7, 100001]
+    for y in ya
+        @test isnzero(Sleef.pow(x,y))
+    end
+
+
+    xa = T[0.0, -0.0]
+    ya = T[0.5, 1.5, 2.0, 2.5, 4.0, 100000, 100000.5]
+    for x in xa, y in ya
+        @test ispzero(Sleef.pow(x,y))
+    end
+
+
+    xa = T[-0.999, -0.5, -0.0, +0.0, +0.5, +0.999]
+    y = T(-Inf)
+    for x in xa
+        @test Sleef.pow(x,y) === T(Inf)
+    end
+
+
+    xa = T[-100000.5, -100000, -3, -2.5, -2, -1.5, 1.5, 2.0, 2.5, 3.0, 100000, 100000.5]
+    y = T(-Inf)
+    for x in xa
+        @test ispzero(Sleef.pow(x,y))
+    end
+
+
+    xa = T[-0.999, -0.5, -0.0, +0.0, +0.5, +0.999]
+    y = T(Inf)
+    for x in xa
+        @test ispzero(Sleef.pow(x,y))
+    end
+
+
+    xa = T[-100000.5, -100000, -3, -2.5, -2, -1.5, 1.5, 2.0, 2.5, 3.0, 100000, 100000.5]
+    y = T(Inf)
+    for x in xa
+        @test Sleef.pow(x,y) === T(Inf)
+    end
+
+
+    x = T(-Inf)
+    ya = T[-100001, -5, -3, -1]
+    for y in ya
+        @test isnzero(Sleef.pow(x,y))
+    end
+
+
+    x = T(-Inf)
+    ya = T[-100000.5, -100000, -4, -2.5, -2, -1.5, -0.5]
+    for y in ya
+        @test ispzero(Sleef.pow(x,y))
+    end
+
+
+    x = T(-Inf)
+    ya = T[1, 3, 5, 7, 100001]
+    for y in ya
+        @test Sleef.pow(x,y) === T(-Inf)
+    end
+
+
+    x = T(-Inf)
+    ya = T[0.5, 1.5, 2, 2.5, 3.5, 4, 100000, 100000.5]
+    for y in ya
+        @test Sleef.pow(x,y) === T(Inf)
+    end
+
+
+    x = T(Inf)
+    ya = T[-100000.5, -100000, -3, -2.5, -2, -1.5, -1.0, -0.5]
+    for y in ya
+        @test ispzero(Sleef.pow(x,y))
+    end
+
+
+    x = T(Inf)
+    ya = T[0.5, 1, 1.5, 2.0, 2.5, 3.0, 100000, 100000.5]
+    for y in ya
+        @test Sleef.pow(x,y) === T(Inf)
+    end
+
+
+    x = T(0.0)
+    ya = T[-100001, -5, -3, -1]
+    for y in ya
+        @test Sleef.pow(x,y) ===  T(Inf)
+    end
+
+
+    x = T(-0.0)
+    ya = T[-100001, -5, -3, -1]
+    for y in ya
+        @test Sleef.pow(x,y) ===  T(-Inf)
+    end
+
+
+    xa = T[0.0, -0.0]
+    ya = T[-100000.5, -100000, -4, -2.5, -2, -1.5, -0.5]
+    for x in xa, y in ya
+        @test Sleef.pow(x,y) === T(Inf)
+    end
+
+
+    xa = T[1000, -1000]
+    ya = T[1000, 1000.5, 1001]
+    for x in xa, y in ya
+        @test cmpdenorm(Sleef.pow(x,y), Base.:^(BigFloat(x),BigFloat(y)))
+    end
+
+end # denormal/nonumber pow
+
+
+fun_table = Dict(Sleef.sin_fast => Base.sin, Sleef.sin => Base.sin)
+@testset "denormal/nonnumber $xtrig" for (xtrig, trig) in fun_table
+    xa = T[NaN, -0.0, 0.0, Inf, -Inf]
+    for x in xa
+        @test cmpdenorm(xtrig(x), trig(BigFloat(x)))
+    end
+end
+
+
+fun_table = Dict(Sleef.cos_fast => Base.cos, Sleef.cos => Base.cos)
+@testset "denormal/nonnumber $xtrig" for (xtrig, trig) in fun_table
+    xa = T[NaN, Inf, -Inf]
+    for x in xa
+        @test cmpdenorm(xtrig(x), trig(BigFloat(x)))
+    end
+end
+
+
+@testset "denormal/nonnumber sin in $xsincos"for xsincos in (Sleef.sincos_fast, Sleef.sincos)
+    xa = T[NaN, -0.0, 0.0, Inf, -Inf]
+    for x in xa
+        q = xsincos(x)[1]
+        @test cmpdenorm(q, Base.sin(BigFloat(x)))
+    end
+end
+
+
+@testset "denormal/nonnumber cos in $xsincos"for xsincos in (Sleef.sincos_fast, Sleef.sincos)
+    xa = T[NaN, Inf, -Inf]
+    for x in xa
+        q = xsincos(x)[2]
+        @test cmpdenorm(q, Base.cos(BigFloat(x)))
+    end
+end
+
+
+@testset "denormal/nonnumber $xtan" for xtan in (Sleef.tan_fast, Sleef.tan)
+    xa = T[NaN, Inf, -Inf, -0.0, 0.0, pi/2, -pi/2]
+    for x in xa
+        @test cmpdenorm(xtan(x), Base.tan(BigFloat(x)))
+    end
+end
+
+
+fun_table = Dict(Sleef.asin => Base.asin, Sleef.asin_fast => Base.asin, Sleef.acos => Base.acos, Sleef.acos_fast => Base.acos)
+@testset "denormal/nonnumber $xatrig" for (xatrig, atrig) in fun_table
+    xa = T[NaN, Inf, -Inf, 2, -2, 1, -1]
+    for x in xa
+        @test cmpdenorm(xatrig(x), atrig(BigFloat(x)))
+    end
+end
+
+
+@testset "denormal/nonnumber $xatan" for xatan in (Sleef.atan, Sleef.atan_fast)
+    xa = T[NaN, Inf, -Inf]
+    for x in xa
+        @test cmpdenorm(xatan(x), Base.atan(BigFloat(x)))
+    end
+end
+
+
+@testset "denormal/nonnumber exp" begin
+    xa = T[NaN, Inf, -Inf, 10000, -10000]
+    for x in xa
+        @test cmpdenorm(Sleef.exp(x), Base.exp(BigFloat(x)))
+    end
+end
+
+
+@testset "denormal/nonnumber sinh" begin
+    xa = T[NaN, 0.0, -0.0, Inf, -Inf, 10000, -10000]
+    for x in xa
+        @test cmpdenorm(Sleef.sinh(x), Base.sinh(BigFloat(x)))
+    end
+end
+
+
+@testset "denormal/nonnumber cosh" begin
+    xa = T[NaN, 0.0, -0.0, Inf, -Inf, 10000, -10000]
+    for x in xa
+        @test cmpdenorm(Sleef.cosh(x),  Base.cosh(BigFloat(x)))
+    end
+end
+
+
+@testset "denormal/nonnumber tanh" begin
+    xa = T[NaN, 0.0, -0.0, Inf, -Inf, 10000, -10000]
+    for x in xa
+        @test cmpdenorm(Sleef.tanh(x),  Base.tanh(BigFloat(x)))
+    end
+end
+
+
+@testset "denormal/nonnumber asinh" begin
+    xa = T[NaN, 0.0, -0.0, Inf, -Inf, 10000, -10000]
+    for x in xa
+        @test cmpdenorm(Sleef.asinh(x), Base.asinh(BigFloat(x)))
+    end
+end
+
+
+@testset "denormal/nonnumber acosh" begin
+    xa = T[NaN, 0.0, -0.0, 1.0, Inf, -Inf, 10000, -10000]
+    for x in xa
+        @test cmpdenorm(Sleef.acosh(x), Base.acosh(BigFloat(x)))
+    end
+end
+
+
+@testset "denormal/nonnumber atanh" begin
+    xa = T[NaN, 0.0, -0.0, 1.0, -1.0, Inf, -Inf, 10000, -10000]
+    for x in xa
+        @test cmpdenorm(Sleef.atanh(x), Base.atanh(BigFloat(x)))
+    end
+end
+
+
+@testset "denormal/nonnumber $xcbrt" for xcbrt = (Sleef.cbrt, Sleef.cbrt_fast)
+    xa = T[NaN, Inf, -Inf, 0.0, -0.0]
+    for x in xa
+        @test cmpdenorm(Sleef.cbrt(x), Base.cbrt(BigFloat(x)))
+    end
+end
+
+
+@testset "denormal/nonnumber exp2" begin
+    xa = T[NaN, Inf, -Inf]
+    for x in xa
+        @test cmpdenorm(Sleef.exp2(x), Base.exp2(BigFloat(x)))
+    end
+end
+
+
+@testset "denormal/nonnumber exp10" begin
+    xa = T[NaN, Inf, -Inf]
+    for x in xa
+        @test cmpdenorm(Sleef.exp10(x), Base.exp10(BigFloat(x)))
+    end
+end
+
+
+@testset "denormal/nonnumber expm1" begin
+    xa = T[NaN, Inf, -Inf]
+    for x in xa
+        @test cmpdenorm(Sleef.expm1(x), Base.expm1(BigFloat(x)))
+    end
+end
+
+
+
+@testset "denormal/nonnumber $xlog" for xlog in (Sleef.log, Sleef.log_fast)
+    xa = T[NaN, Inf, -Inf, 0, -1]
+    for x in xa
+        @test cmpdenorm(xlog(x), Base.log(BigFloat(x)))
+    end
+end
+
+
+@testset "denormal/nonnumber log10" begin
+    xa = T[NaN, Inf, -Inf, 0, -1]
+    for x in xa
+        @test cmpdenorm(Sleef.log10(x), Base.log10(BigFloat(x)))
+    end
+end
+
+
+@testset "denormal/nonnumber log2" begin
+    xa = T[NaN, Inf, -Inf, 0, -1]
+    for x in xa
+        @test cmpdenorm(Sleef.log2(x), Base.log2(BigFloat(x)))
+    end
+end
+
+
+@testset "denormal/nonnumber log1p" begin
+    xa = T[NaN, Inf, -Inf, 0.0, -0.0, -1.0, -2.0]
+    for x in xa
+        @test cmpdenorm(Sleef.log1p(x), Base.log1p(BigFloat(x)))
+    end
+end
+
+
+@testset "denormal/nonnumber ldexp" begin
+    for i = -10000:10000
+        a = Sleef.ldexp(T(1.0), i)
+        b = Base.ldexp(BigFloat(1.0), i)
+        @test (isfinite(b) && a == b || cmpdenorm(a,b))
+    end
+end
+
+
+@testset "denormal/nonnumber ilog2" begin
+    @test Sleef.ilog2(+T(Inf)) == typemax(Int)
+    @test Sleef.ilog2(-T(Inf)) == typemax(Int)
+    @test Sleef.ilog2(+T(0.0)) == typemin(Int)
+    @test Sleef.ilog2(-T(0.0)) == typemin(Int)
+    @test Sleef.ilog2( T(NaN)) == typemax(Int)
+end
+
+
+end #denormal/nonnumber
diff --git a/test/runtests.jl b/test/runtests.jl
new file mode 100644
index 0000000..166f6c9
--- /dev/null
+++ b/test/runtests.jl
@@ -0,0 +1,115 @@
+using Sleef
+using Base.Test
+
+isnzero{T<:AbstractFloat}(x::T) = signbit(x)
+ispzero{T<:AbstractFloat}(x::T) = !signbit(x)
+
+function cmpdenorm{Tx<:AbstractFloat, Ty<:AbstractFloat}(x::Tx, y::Ty)
+    sizeof(Tx) < sizeof(Ty) ? y = Tx(y) : x = Ty(x) # cast larger type to smaller type
+    (isnan(x) && isnan(y)) && return true
+    (isnan(x) || isnan(y)) && return false
+    (isinf(x) != isinf(y)) && return false
+    (x == Tx(Inf)  && y == Ty(Inf))  && return true
+    (x == Tx(-Inf) && y == Ty(-Inf)) && return true
+    if y == 0
+        (ispzero(x) && ispzero(y)) && return true
+        (isnzero(x) && isnzero(y)) && return true
+        return false
+    end
+    (!isnan(x) && !isnan(y) && !isinf(x) && !isinf(y)) && return sign(x) == sign(y)
+    return false
+end
+
+# the following compares the ulp between x and y.
+# First it promotes them to the larger of the two types x,y
+infh(::Type{Float64}) = 1e+300
+infh(::Type{Float32}) = 1e+37
+function countulp(T, x::AbstractFloat, y::AbstractFloat)
+    X,Y = promote(x,y)
+    x, y = T(X), T(Y) # Cast to smaller type
+    (isnan(x) && isnan(y)) && return 0
+    (isnan(x) || isnan(y)) && return 10000
+    if isinf(x)
+        (sign(x) == sign(y) && abs(y) > infh(T)) && return 0 # Relaxed infinity handling
+        return 10001
+    end
+    (x ==  Inf && y ==  Inf) && return 0
+    (x == -Inf && y == -Inf) && return 0
+    if y == 0
+        (x == 0) && return 0
+        return 10002
+    end
+    if isfinite(x) && isfinite(y)
+        return T(abs(X - Y)/eps(y))
+    end
+    return 10003
+end
+countulp{T<:AbstractFloat}(x::T, y::T) = countulp(T,x,y)
+
+# get rid off annoying warnings from overwritten function
+macro nowarn(expr)
+    quote
+        stderr = STDERR
+        stream = open("null", "w")
+        redirect_stderr(stream)
+        result = $(esc(expr))
+        redirect_stderr(stderr)
+        close(stream)
+        result
+    end
+end
+
+# overide domain checking that base adheres to
+using Base.MPFR.ROUNDING_MODE
+for f in (:sin, :cos, :tan, :asin, :acos, :atan, :asinh, :acosh, :atanh, :log, :log10, :log2, :log1p)
+    @eval begin
+        import Base.$f
+        @nowarn function ($f)(x::BigFloat)
+            z = BigFloat()
+            ccall($(string(:mpfr_,f), :libmpfr), Int32, (Ptr{BigFloat}, Ptr{BigFloat}, Int32), &z, &x, ROUNDING_MODE[])
+            return z
+        end
+    end
+end
+
+strip_module_name(f::Function) = last(split(string(f), '.')) # strip module name from function f
+
+# test the accuracy of a function where fun_table is a Dict mapping the function you want
+# to test to a reference function
+# xx is an array of values (which may be tuples for multiple arugment functions)
+# tol is the acceptable tolerance to test against
+function test_acc(T, fun_table, xx, tol; debug=false, tol_debug=5)
+    @testset "accuracy $(strip_module_name(xfun))" for (xfun, fun) in fun_table
+        rmax = 0.0
+        rmean = 0.0
+        xmax = map(zero, first(xx))
+        for x in xx
+            q = xfun(x...)
+            c = fun(map(BigFloat,x)...)
+            u = countulp(T,q,c)
+            rmax = max(rmax, u)
+            xmax = rmax == u ? x : xmax
+            rmean += u
+            if debug && u > tol_debug
+                @printf("%s = %.20g\n%s  = %.20g\nx = %.20g\nulp = %g\n", strip_module_name(xfun), q, strip_module_name(fun), T(c), x, ulp(T(c)))
+            end
+        end
+        rmean = rmean/length(xx)
+
+        t = @test trunc(rmax,1) <= tol
+
+        fmtxloc = isa(xmax, Tuple) ? string('(', join((@sprintf("%.5f", x) for x in xmax), ", "), ')') : @sprintf("%.5f", xmax)
+        println(rpad(strip_module_name(xfun), 18, " "), ": max ", @sprintf("%f", rmax),
+            rpad(" at x = "*fmtxloc, 40, " "),
+            ": mean ", @sprintf("%f", rmean))
+    end
+end
+
+function runtests()
+    @testset "Sleef" begin
+        include(joinpath(@__DIR__,"dnml_nan.jl"))
+        include(joinpath(@__DIR__,"accuracy.jl"))
+    end
+end
+
+runtests()