Merge pull request #67 from milankl/mk/subnormals

Float16 stochastic round of subnormals
milankl · Jul 10, 2023 · 8033db4 · 8033db4
2 parents c801a4f + f8aef51
commit 8033db4
Show file tree

Hide file tree

Showing 4 changed files with 134 additions and 27 deletions.
diff --git a/src/float16sr.jl b/src/float16sr.jl
@@ -55,26 +55,55 @@ BFloat16(x::Float16sr) = BFloat16(Float16(x))
 Float16sr(x::Integer) = Float16sr(Float32(x))
 (::Type{T})(x::Float16sr) where {T<:Integer} = T(Float32(x))
 
-const eps_F16 = prevfloat(Float32(nextfloat(zero(Float16))))
+"""
+    rand_subnormal(rbits::UInt32) -> Float32
+
+Create a random perturbation for the Float16 subnormals for
+stochastic rounding of Float32 -> Float16.
+This function samples uniformly from [-2.980232f-8,2.9802319f-8].
+This function is algorithmically similar to randfloat from RandomNumbers.jl"""
+function rand_subnormal(rbits::UInt32)
+    lz = leading_zeros(rbits)   # count leading zeros for correct probabilities of exponent
+    e = ((101 - lz) % UInt32) << 23
+    e |= (rbits << 31)          # use last bit for sign
+
+    # combine exponent with random mantissa
+    return reinterpret(Float32,e | (rbits & 0x007f_ffff))
+end
+
+const eps_F16 = prevfloat(Float32(nextfloat(zero(Float16))),1)
 const floatmin_F16 = Float32(floatmin(Float16))
 const oneF32 = reinterpret(Int32,one(Float32))
 
-"""Stochastically round x::Float32 to Float16 with distance-proportional probabilities."""
+# # old version 
+# function rand_subnormal(rbits::UInt32)
+#     return eps_F16*(reinterpret(Float32,oneF32 | (rbits >> 9))-1.5f0)
+# end
+
+"""
+    Float16_stochastic_round(x::Float32) -> Float16sr
+
+Stochastically round `x` to Float16 with distance-proportional probabilities."""
 function Float16_stochastic_round(x::Float32)
     rbits = rand(Xor128[],UInt32)   # create random bits
 
     # subnormals are rounded with float-arithmetic for uniform stoch perturbation
-    abs(x) < floatmin_F16 && return Float16sr(x+eps_F16*(reinterpret(Float32,oneF32 | (rand(Xor128[],UInt32) >> 9))-1.5f0))
-
+    abs(x) < floatmin_F16 && return Float16sr(x+rand_subnormal(rbits))
+
+    # normals are stochastically rounded with integer arithmetic
     ui = reinterpret(UInt32,x)
-    ui += (rbits & 0x0000_1fff)     # add perturbation in [0,u)
-    ui &= 0xffff_e000               # round to zero
+    mask = 0x0000_1fff          # only mantissa bit 11-23 (the non-Float16 ones)
+    ui += (rbits & mask)        # add perturbation in [0,u)
+    ui &= ~mask                 # round to zero
 
-    # convert to Float16 to adjust exponent bits
-    return reinterpret(Float16sr,Float16(reinterpret(Float32,ui)))  
+    # via conversion to Float16 to adjust exponent bits
+    return Float16sr(reinterpret(Float32,ui))
 end
 
-"""Chance that x::Float32 is round up when converted to Float16sr."""
+"""
+    Float16_chance_roundup(x::Float32)
+
+Chance that x::Float32 is round up when converted to Float16sr."""
 function Float16_chance_roundup(x::Float32)
     xround = Float32(Float16(x))
     xround == x && return zero(Float32)

diff --git a/src/float32sr.jl b/src/float32sr.jl
@@ -58,24 +58,53 @@ Float32sr(x::Integer) = Float32sr(Float32(x))
 Float32sr(x::BFloat16) = Float32sr(Float64(x))
 BFloat16(x::Float32sr) = BFloat16(Float32(x))
 
+"""
+    rand_subnormal(rbits::UInt64) -> Float64
+
+Create a random perturbation for the Float16 subnormals for
+stochastic rounding of Float32 -> Float16.
+This function samples uniformly from [-7.0064923216240846e-46,7.006492321624084e-46].
+This function is algorithmically similar to randfloat from RandomNumbers.jl"""
+function rand_subnormal(rbits::UInt64)
+    lz = leading_zeros(rbits)   # count leading zeros for probabilities of exponent
+    e = ((872 - lz) % UInt64) << 52
+    e |= (rbits << 63)          # use last bit for sign
+
+    # combine exponent with random mantissa
+    return reinterpret(Float64,e | (rbits & Base.significand_mask(Float64)))
+end
+
 const eps_F32 = prevfloat(Float64(nextfloat(zero(Float32))))
 const floatmin_F32 = Float64(floatmin(Float32))
 const oneF64 = reinterpret(Int64,one(Float64))
 
-"""Stochastically round x::Float64 to Float32 with distance-proportional probabilities."""
+# old version
+# function rand_subnormal(rbits::UInt64)
+#     return eps_F32*(reinterpret(Float64,oneF64 | (rbits >> 12))-1.5)
+# end
+
+"""
+    Float32_stochastic_round(x::Float64) -> Float32sr
+
+Stochastically round x::Float64 to Float32 with distance-proportional probabilities."""
 function Float32_stochastic_round(x::Float64)
     rbits = rand(Xor128[],UInt64)               # create random bits
 
     # subnormals are rounded with float-arithmetic for uniform stoch perturbation
-    abs(x) < floatmin_F32 && return Float32sr(x+eps_F32*(reinterpret(Float64,oneF64 | (rbits >> 12))-1.5))
+    abs(x) < floatmin_F32 && return Float32sr(x+rand_subnormal(rbits))
 
+    # normals with integer arithmetic
     ui = reinterpret(UInt64,x)
-    ui += (rbits & 0x0000_0000_1fff_ffff)       # add stochastic perturbation in [0,u)
-    ui &= 0xffff_ffff_e000_0000                 # round to zero 
-    return reinterpret(Float32sr,Float32(reinterpret(Float64,ui)))
+    mask = 0x0000_0000_1fff_ffff
+    ui += (rbits & mask)        # add stochastic perturbation in [0,u)
+    ui &= ~mask                 # round to zero 
+    return Float32sr(reinterpret(Float64,ui))
 end
 
-"""Chance that x::Float64 is round up when converted to Float32sr."""
+"""
+    Float32_chance_roundup(x::Float64)
+
+Chance that `x` is round up when converted to Float32sr."""
 function Float32_chance_roundup(x::Float64)
     xround = Float64(Float32(x))
     xround == x && return zero(Float64)

diff --git a/test/float16sr.jl b/test/float16sr.jl
@@ -47,7 +47,7 @@ end
     @test o == prevfloat(nextfloat(o))
 end
 
-@testset "Odd floats are never round away" begin
+@testset "Odd floats are never rounded away" begin
     # chances are 2^-29 = 2e-9
     # so technically one should use N = 100_000_000
     N = 100_000 
@@ -77,7 +77,11 @@ function test_chances_round(f32::Float32;N::Int=100_000)
     p = Float16_chance_roundup(f32)
 
     f16_round = Float16sr(f32)
-    if Float32(f16_round) <= f32
+
+    if Float32(f16_round) == f32
+        f16_roundup=f16_round
+        f16_rounddown=f16_round
+    elseif Float32(f16_round) < f32
         f16_rounddown = f16_round
         f16_roundup = nextfloat(f16_round)
     else
@@ -101,19 +105,27 @@ end
 
 @testset "Test for N(0,1)" begin
     for x in randn(Float32,10_000)
-        Ndown,Nup,N,p = test_chances_round(x)
-        @test Ndown + Nup == N
-        @test isapprox(Ndown/N,1-p,atol=2e-2)
-        @test isapprox(Nup/N,p,atol=2e-2)
+        # exclude subnormals from testing, they are tested further down
+        if ~issubnormal(Float16(x))
+            Ndown,Nup,N,p = test_chances_round(x)
+            @test Ndown + Nup == N
+            Ndown + Nup != N && @info "Test failed for $x, $(bitstring(Float16(x)))"
+            @test isapprox(Ndown/N,1-p,atol=2e-2)
+            @test isapprox(Nup/N,p,atol=2e-2)
+        end
     end
 end
 
 @testset "Test for U(0,1)" begin
     for x in rand(Float32,10_000)
-        Ndown,Nup,N,p = test_chances_round(x)
-        @test Ndown + Nup == N
-        @test isapprox(Ndown/N,1-p,atol=2e-2)
-        @test isapprox(Nup/N,p,atol=2e-2)
+        # exclude subnormals from testing, they are tested further down
+        if ~issubnormal(Float16(x))
+            Ndown,Nup,N,p = test_chances_round(x)
+            @test Ndown + Nup == N
+            Ndown + Nup != N && @info "Test failed for $x, $(bitstring(Float16(x)))"
+            @test isapprox(Ndown/N,1-p,atol=2e-2)
+            @test isapprox(Nup/N,p,atol=2e-2)
+        end
     end
 end
 
@@ -157,15 +169,49 @@ end
     for x in subnormals
         Ndown,Nup,N,p = test_chances_round(x)
         @test Ndown + Nup == N
+        Ndown + Nup != N && @info "Test failed for $x, $(bitstring(Float16(x)))"
         @test isapprox(Ndown/N,1-p,atol=2e-2)
         @test isapprox(Nup/N,p,atol=2e-2)
     end
 
     for x in -subnormals
         Ndown,Nup,N,p = test_chances_round(x)
         @test Ndown + Nup == N
+        Ndown + Nup != N && @info "Test failed for $x, $(bitstring(Float16(x)))"
         @test isapprox(Ndown/N,1-p,atol=2e-2)
         @test isapprox(Nup/N,p,atol=2e-2)
     end
 end
 
+# USED TO TEST SOME SUBNORMALS MANUALLY
+# const ϵ = prevfloat(Float32(nextfloat(zero(Float16))),1)
+
+# function g(x::Float32,r::Float32)
+#     # x + ϵ*(r-1.5f0)
+#     x + randfloat(reinterpret(UInt32,r))
+# end
+
+# function randfloat(ui::UInt32)
+
+#     lz = leading_zeros(ui)
+#     e = ((101 - lz) % UInt32) << 23
+#     e |= (ui << 31)
+
+#     # combine exponent and significand
+#     return reinterpret(Float32,e | (ui & 0x007f_ffff))
+# end
+
+
+# function h(x::Float32)
+#     # deterministically rounded correct value in uint16
+#     c = Int(reinterpret(UInt16,Float16(x)))
+
+#     v = zeros(Int16,2^23)
+#     r = 1f0
+
+#     for i in 0:2^23-1
+#         v[i+1] = Int(reinterpret(UInt16,Float16(g(x,r)))) - c
+#         r = nextfloat(r)
+#     end
+#     return v
+# end
diff --git a/test/float32sr.jl b/test/float32sr.jl
@@ -112,7 +112,10 @@ function test_chances_round(f64::Float64;N::Int=100_000)
     p = Float32_chance_roundup(f64)
 
     f32_round = Float32sr(f64)
-    if Float64(f32_round) <= f64
+    if Float64(f32_round) == f64
+        f32_roundup = f32_round
+        f32_rounddown = f32_round
+    elseif Float64(f32_round) < f64
         f32_rounddown = f32_round
         f32_roundup = nextfloat(f32_round)
     else
@@ -173,7 +176,7 @@ end
     floatminF32 = reinterpret(UInt64,Float64(floatmin(Float32)))
     minposF32 = reinterpret(UInt64,Float64(nextfloat(zero(Float32))))
 
-    N = 100
+    N = 5000
     subnormals = reinterpret.(Float64,rand(minposF32:floatminF32,N))
     for x in subnormals
         @test test_chances_round(x)