diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index 9095b056ae782..b622d226a6569 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -422,6 +422,24 @@ set(arm_or_thumb2_base_SOURCES
   ${GENERIC_SOURCES}
 )
 
+# Switch between the optimized Arm FP routines and the existing sources.
+option(COMPILER_RT_ARM_OPTIMIZED_FP
+  "On 32-bit Arm, use optimized assembly implementations of FP arithmetic. Likely to increase code size, but be faster." ON)
+
+if(COMPILER_RT_ARM_OPTIMIZED_FP)
+  # Assembly sources that are assembled with -mimplicit-it=always.
+  set(arm_optimized_fp_asm_SOURCES arm/mulsf3.S arm/divsf3.S)
+  set_property(SOURCE ${arm_optimized_fp_asm_SOURCES}
+    PROPERTY COMPILE_OPTIONS "-Wa,-mimplicit-it=always")
+  # Prepend the optimized routines and their C helpers to the existing list.
+  set(arm_or_thumb2_base_SOURCES
+    ${arm_optimized_fp_asm_SOURCES}
+    arm/fnan2.c
+    arm/fnorm2.c
+    arm/funder.c
+    ${arm_or_thumb2_base_SOURCES})
+endif()
+
 set(arm_sync_SOURCES
   arm/sync_fetch_and_add_4.S
   arm/sync_fetch_and_add_8.S
@@ -455,6 +473,16 @@ set(thumb1_base_SOURCES
   ${GENERIC_SOURCES}
 )
 
+if(COMPILER_RT_ARM_OPTIMIZED_FP)
+  # Prepend the Thumb1 mulsf3.S and the fnan2/fnorm2/funder C helpers to the
+  # existing Thumb1 source list.
+  set(thumb1_base_SOURCES
+    arm/thumb1/mulsf3.S
+    arm/fnan2.c arm/fnorm2.c arm/funder.c
+    ${thumb1_base_SOURCES}
+  )
+endif()
+
 set(arm_EABI_RT_SOURCES
   arm/aeabi_cdcmp.S
   arm/aeabi_cdcmpeq_check_nan.c
diff --git a/compiler-rt/lib/builtins/arm/divsf3.S b/compiler-rt/lib/builtins/arm/divsf3.S
new file mode 100644
index 0000000000000..2f37234457b7b
--- /dev/null
+++ b/compiler-rt/lib/builtins/arm/divsf3.S
@@ -0,0 +1,608 @@
+//===-- divsf3.S - single-precision floating point division ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements single-precision soft-float division with the IEEE-754
+// default rounding (to nearest, ties to even), in optimized AArch32 assembly
+// language suitable to be built as either Arm or Thumb2.
+//
+//===----------------------------------------------------------------------===//
+
+#include "../assembly.h"
+
+
+  .syntax unified
+  .text
+  .p2align 2
+
+DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_fdiv, __divsf3)
+
+DEFINE_COMPILERRT_FUNCTION(__divsf3)
+  // Extract the exponents of the inputs into r2 and r3, occupying bits 16-23
+  // of each register so that there will be space lower down to store extra
+  // data without exponent arithmetic carrying into it. In the process, check
+  // both exponents for 00 or FF and branch out of line to handle all the
+  // uncommon types of value (infinity, NaN, zero, denormals).
+  //
+  // Chaining conditional instructions like this means that the second
+  // instruction (setting up r3) might not be executed at all, so fdiv_uncommon
+  // will have to redo it just in case. That saves an instruction here,
+  // executed for _all_ inputs, and moves it to the uncommon path run for only
+  // some inputs.
+  mov     r12, #0xFF0000
+  ands    r2, r12, r0, lsr #7   // r2 has exponent of numerator. (Is it 0?)
+  andsne  r3, r12, r1, lsr #7   // r3 has exponent of denominator. (Is it 0?)
+  teqne   r2, r12               // if neither was 0, is one FF?
+  teqne   r3, r12               // or the other?
+  beq     LOCAL_LABEL(uncommon)         // branch out of line if any answer was yes
+
+  // Calculate the output sign, which is always just the XOR of the input
+  // signs. Store it in bit 8 of r2, below the numerator exponent.
+  teq     r0, r1                // is the output sign bit 1?
+  orrmi   r2, r2, #0x100        // if so, set bit 8 of r2
+
+  // Isolate the mantissas of both values, by setting bit 23 of each one and
+  // clearing the 8 bits above that.
+  //
+  // In the process, swap the register allocations (which doesn't cost extra
+  // instructions if we do it as part of this manipulation). We want the
+  // numerator not to be in r0, because r0 is where we'll build up the quotient
+  // while subtracting things from the numerator.
+  orr     r12, r0, #1 << 23
+  orr     r0, r1, #1 << 23
+  bic     r1, r12, #0xFF000000
+  bic     r0, r0, #0xFF000000
+
+LOCAL_LABEL(div):
+  // Start of the main division. We get here knowing that:
+  //
+  //   r0 = mantissa of denominator, with the leading 1 at bit 23
+  //   r1 = mantissa of numerator, similarly
+  //   r2 = (exponent of numerator << 16) + (result sign << 8)
+  //   r3 = (exponent of denominator << 16)
+
+  push    {r14}                 // we'll need an extra register
+
+  // Calculate the initial result exponent by just subtracting the two input
+  // exponents. This doesn't affect the sign bit lower down in r2.
+  sub     r2, r2, r3
+
+  // That initial exponent might need to be adjusted by 1, depending on whether
+  // dividing the mantissas gives a value >=1 or <1. We don't need to wait
+  // until the division is finished to work that out: we can tell immediately
+  // by just comparing the mantissas.
+  //
+  // The basic idea is to do the comparison in a way that sets the C flag if
+  // numerator >= denominator. Then we recombine the sign and exponent by doing
+  // "ADC r2, r2, r2, asr #16": the exponent in the top half of r2 is shifted
+  // down to the low 8 bits, just below the sign bit, and using ADC rather than
+  // ADD folds in the conditional increment from the mantissa comparison.
+  //
+  // If we're not incrementing the output exponent, we instead shift the
+  // numerator mantissa left by 1, so that it _is_ greater than the denominator
+  // mantissa. Otherwise we'd generate only a 22-bit quotient, instead of 23.
+  //
+  // The exponent also needs to be rebiased, so that dividing two numbers the
+  // same gives an output exponent of 0x7F. If the two inputs have the same
+  // exponent then we'll have computed an exponent of 0 via the SUB instruction
+  // above; if the mantissas are the same as well then the ADC will increment
+  // it; also, the leading bit of the quotient will increment the exponent
+  // again when we recombine it with the output mantissa later. So we need to
+  // add (0x7F - 2) to the exponent now, to make an exponent of 0 from the SUB
+  // come to 0x7F after both of those increments.
+  //
+  // Putting all of that together, what we _want_ to do is this:
+  //
+  // [#1]   CMP     r1, r0                // set C if num >= den
+  // [#2]   MOVLO   r1, r1, lsl #1        // if num < den, shift num left
+  // [#3]   ADD     r2, r2, #0x7D0000     // rebias exponent
+  // [#4]   ADC     r2, r2, r2, asr #16   // combine sign + exp + adjustment
+  //
+  // However, we only do the first of those four instructions right here. The
+  // other three are distributed through the code below, after unrelated load
+  // or multiply instructions which will have a result delay slot on simple
+  // CPUs. Each is labelled "exponent setup [#n]" in a comment.
+  //
+  // (Since instructions #2 and #4 depend on the flags set up by #1, we must
+  // avoid clobbering the flags in _any_ instruction interleaved with these!)
+  cmp     r1, r0                // exponent setup [#1]
+
+  // Start the mantissa division by making an approximation to the reciprocal
+  // of the denominator. We first obtain an 8-bit approximation using a table
+  // lookup indexed by the top 7 denominator bits (counting the leading 1, so
+  // really there are only 6 bits in the table index).
+  //
+  // (r0 >> 17) is the table index, and its top bit is always set, so it ranges
+  // from 64 to 127 inclusive. So we point the base register 64 bytes before
+  // the actual table.
+  adr     r12, LOCAL_LABEL(tab) - 64
+#if __thumb__
+  // Thumb can't do this particular shift+add+load in one instruction - it only
+  // supports left shifts of 0 to 3 bits, not right shifts of 17. So we must
+  // calculate the load offset separately.
+  add     r14, r12, r0, lsr #17
+  ldrb    r14, [r14]
+#else
+  ldrb    r14, [r12, r0, lsr #17]
+#endif
+
+  // Now do an iteration of Newton-Raphson to improve that 8-bit approximation
+  // to have 15-16 accurate bits.
+  //
+  // Basics of Newton-Raphson for finding a reciprocal: if you want to find 1/d
+  // and you have some approximation x, your next approximation is X = x(2-dx).
+  // Looked at one way, this is the result of applying the N-R formula
+  // X=x-f(x)/f'(x) to the function f(x) = 1/x - d. Another way to look at it
+  // is to suppose that dx = 1 - e, for some e which is small (because dx is
+  // already reasonably close to 1). Then you want to double the number of
+  // correct bits in the next approximation, i.e. square the error. So you want
+  // dX = 1-e^2 = (1-e)(1+e) = dx(2-dx). Cancelling d gives X = x(2-dx) again.
+  //
+  // In this situation, we're working in fixed-point integers rather than real
+  // numbers, and all the scales are different:
+  //  * our input denominator d is in the range [2^23,2^24)
+  //  * our input approximation x is in the range [2^7,2^8)
+  //  * we want the output approximation to be in the range [2^15,2^16)
+  // Those factors combine to mean that we want
+  //   x(2^32-dx) / 2^23
+  // = (2^9 x) - (dx^2 / 2^23)
+  //
+  // But we also want to compute this using ordinary MUL, not a long multiply
+  // instruction (those are slower). So we need to worry about the product
+  // overflowing. dx fits in 32 bits, because it's the product of something
+  // <2^24 with something <2^8; but we must shift it right before multiplying
+  // by x again.
+
+  mul     r12, r0, r14          // r12  = dx
+  movlo   r1, r1, lsl #1        //   exponent setup [#2] in the MUL delay slot
+  mvn     r12, r12, lsr #8      // r12 ~= -dx/2^8
+  mul     r3, r12, r14          // r3  ~= -dx^2/2^8
+  mov     r14, r14, lsl #9      // r14  = 2^9 x
+  add     r14, r14, r3, asr #15 // r14 ~= 2^9 x - dx^2 / 2^23
+
+  // Now r14 is a 16-bit approximation to the reciprocal of the input mantissa,
+  // scaled by 2^39 (so that the min mantissa 2^23 would have reciprocal 2^16
+  // in principle, and the max mantissa 2^24-1 would have reciprocal just over
+  // 2^15). The error is always negative (r14 is an underestimate of the true
+  // value), and the maximum error is 6 and a bit ULP (that is, the true
+  // reciprocal is strictly less than (r14+7)). Also, r14 is always strictly
+  // less than 0x10000 (even in the case of the min mantissa, where the true
+  // value would be _exactly_ 0x10000), which eliminates a case of integer
+  // overflow.
+  //
+  // All of these properties of the reciprocal approximation are checked by
+  // exhaustively iterating over all 2^23 possible input mantissas. (The nice
+  // thing about doing this in single rather than double precision!)
+  //
+  // Now we extract most of the quotient by two steps of long division, using
+  // the reciprocal estimate to identify a multiple of the denominator to
+  // subtract from the numerator. To avoid integer overflow, the numerator
+  // mantissa is shifted down 8 bits so that it's less than 0x10000. After we
+  // calculate an approximate quotient, we shift the numerator left and
+  // subtract that multiple of the denominator, moving the next portion of the
+  // numerator into range for the next iteration.
+
+  // First iteration of long division. We shift the numerator left 11 bits, and
+  // since the quotient approximation is scaled by 2^31, we must shift that
+  // right by 20 to make the right product to subtract from the numerator.
+  mov     r12, r1, lsr #8       // shift the numerator down
+  mul     r12, r14, r12         // make the quotient approximation
+  mov     r1, r1, lsl #11       // shift numerator left, ready for subtraction
+  mov     r3, r12, lsr #20      // make first 12-bit block of quotient bits
+  mls     r1, r0, r3, r1        // subtract that multiple of den from num
+
+  add     r2, r2, #0x7D0000     //   exponent setup [#3] in the MLS delay slot
+
+  // Second iteration of long division. Differences from the first step: this
+  // time we shift the numerator 12 bits instead of 11, so that the total of
+  // both steps is 23 bits, i.e. we've shifted up by exactly the full width of
+  // the output mantissa. Also, the block of output quotient bits is left in a
+  // different register: it was in r3 the first time, and this time it's in
+  // r12, so that we still have both available at the end of the process.
+  mov     r12, r1, lsr #8       // shift the numerator down
+  mul     r12, r14, r12         // make the quotient approximation
+  mov     r1, r1, lsl #12       // shift numerator left, ready for subtraction
+  mov     r12, r12, lsr #19     // make second 12-bit block of quotient
+  mls     r1, r0, r12, r1       // subtract that multiple of den from num
+
+  adc     r2, r2, r2, asr #16   //   exponent setup [#4] in the MLS delay slot
+
+  // Now r1 contains the original numerator, shifted left 23, minus _some_
+  // multiple of the original denominator (which is still in r0). The bounds on
+  // the error in the above steps should make the error at most 1: that is, we
+  // may have to subtract the denominator one more time to make r1 < r0, and
+  // increment the quotient by one more.
+  //
+  // Our quotient is still in two pieces, computed separately in the above long
+  // division steps. We fold the final increment into the same instruction that
+  // recombines them, by doing the comparison in such a way that it sets the
+  // carry flag if the increment is needed.
+
+  cmp     r1, r0                // Set carry flag if num >= den
+  subhs   r1, r1, r0            // If so, subtract den from num
+  adc     r3, r12, r3, lsl #12  // Recombine quotient halves, plus optional +1
+
+  // We've finished with r14 as a temporary register, so we can unstack it now.
+  pop     {r14}
+
+  // Now r3 contains the _rounded-down_ output quotient, and r1 contains the
+  // remainder. That is, (denominator * r3 + r1) = (numerator << 23), and
+  // 0 <= r1 < denominator.
+  //
+  // Next we must round to nearest, by checking if r1 is greater than half the
+  // denominator. In division, it's not possible to hit an exact round-to-even
+  // halfway case, so we don't need to spend any time checking for it.
+  //
+  // Proof of no round-to-even: define the 'width' of a dyadic rational to be
+  // the distance between the lowest and highest 1 bits in its binary
+  // representation, or equivalently, the index of its high bit if you scale it
+  // by a power of 2 to make it an odd integer. E.g. any actual power of 2 has
+  // width 0, and all of 0b11110, 0b1111, 0b11.11 and 0b0.01111 have width 3.
+  // Then for any dyadic rationals a,b, width(ab) >= width(a)+width(b). Let w
+  // be the maximum width that the input precision supports (so that for single
+  // precision, w=23). Then if some division n/d were a round-to-even case, the
+  // true quotient q=n/d would have width exactly w+1. But we have qd=n, so
+  // width(n) >= width(q)+width(d) > w, which can't happen, because n is in the
+  // input precision, hence had width <= w.
+  //
+  // So we don't need to check for an exact _halfway_ case and clear the low
+  // bit of the quotient after rounding up, as addition and multiplication both
+  // need to do. But we do need to remember if the quotient itself was exact,
+  // that is, if there was no remainder at all. That's needed in underflow
+  // handling.
+
+  // The rounding check wants to compare remainder with denominator/2. But of
+  // course in integers it's easier to compare 2*remainder with denominator. So
+  // we start by shifting the remainder left by 1, and in the process, set Z if
+  // it's exactly 0 (i.e. the result needs no rounding at all).
+  lsls    r1, r1, #1
+  // Now trial-subtract the denominator. We don't do this at all if the result
+  // was exact. If we do do it, r1 stays non-negative precisely if we need to
+  // round up, i.e. no borrow, which sets the C flag. (The previous instruction
+  // will have left C clear, since r1 had its top 8 bits all clear. So now C is
+  // set _only_ if we're rounding up.)
+  subsne  r1, r1, r0
+  // Recombine the quotient with the sign + exponent, and use the C flag from
+  // the previous instruction to increment the quotient if we're rounding up.
+  adc     r0, r3, r2, lsl #23
+
+  // If we haven't either overflowed or underflowed, we're done. We can
+  // identify most of the safe cases by doing an unsigned comparison of the
+  // initial output exponent (in the top half of r2) with 0xFC: if 0 <= r2 <
+  // 0xFC0000 then we have neither underflow nor overflow.
+  //
+  // Rationale: the value in the top half of r2 had three chances to be
+  // incremented before becoming the exponent field of the actual output float.
+  // It was incremented if we found the numerator mantissa was >= the
+  // denominator (producing the value in the _bottom_ half of r2, which we just
+  // ADCed into the output). Then it gets unconditionally incremented again
+  // when the ADC combines it with the leading mantissa bit. And finally,
+  // round-up might increment it a third time. So 0xFC is the smallest value
+  // that can possibly turn into the overflowed value 0xFF after all those
+  // increments.
+  //
+  // On the underflow side, (top half of r2) = 0 corresponds to a value of 1 in
+  // the final result's exponent field (and then rounding might increase it
+  // further); if the exponent was less than that then r2 wraps round and looks
+  // like a very large positive integer from the point of view of this unsigned
+  // comparison.
+  cmp     r2, #0xFC0000
+  bxlo    lr
+
+  // The same comparison will have set the N and V flags to reflect the result
+  // of comparing r2 with 0xFC0000 as a _signed_ integer. That reliably
+  // distinguishes potential underflow (r2 is negative) from potential overflow
+  // (r2 is positive and at least 0xFC0000)
+  bge     LOCAL_LABEL(overflow)
+
+  // Here we might or might not have underflow (but we know we don't have
+  // overflow). To check more carefully, we look at the _bottom_ half of r2,
+  // which contains the exponent after the first adjustment (for num >= denom).
+  // That is, it's still off by 1 (compensating for the leading quotient bit),
+  // and is also before rounding.
+  //
+  // We neglect the effect of rounding: division results that are tiny (less
+  // than the smallest normalised number) before rounding, but then round up to
+  // the smallest normal number, are an acceptable edge case to handle slowly.
+  // We pass those to funder without worrying about them.
+  //
+  // So we want to check whether the bottom half of r2 was negative. It would
+  // be nice to check bits 8-15 of it, but unfortunately, it's already been
+  // combined with the sign (at bit 8), so those bits don't tell us anything
+  // useful. Instead we look at the top 4 bits of the exponent field, i.e. the
+  // 0xF0 bits. The largest _non_-overflowing exponent that might reach here is
+  // less than 3, so it doesn't reach those bits; the smallest possible
+  // underflow, obtained by dividing the smallest denormal by the largest
+  // finite number, is -151 (before the leading bit increments it), which will
+  // set the low 8 bits of r2 to 0x69. That is, the 0xF0 nibble of r2 will be
+  // 0x60 or greater for a (pre-rounding) underflow, and zero for a
+  // non-underflow.
+
+  tst     r2, #0xF0
+  bxeq    lr                    // no underflow after all; return
+
+  // Rebias the exponent for funder, which also corrects the sign bit.
+  add     r0, r0, #192 << 23
+  // Tell funder whether the true value is greater or less than the number in
+  // r0. This is obtained from the sign of the remainder (still in r1), with
+  // the only problem being that it's currently reversed. So negate r1 (leaving
+  // 0 at 0 to indicate exactness).
+  rsbs    r1, r1, #0
+  b     SYMBOL_NAME(__compiler_rt_funder)
+
+LOCAL_LABEL(overflow):
+  // Here we might or might not have overflow (but we know we don't have
+  // underflow). We must check whether we really have overflowed.
+  //
+  // For this it's easiest to check the exponent field in the actual output
+  // value in r0, after _all_ the adjustments have been completed. The largest
+  // overflowed exponent is 0x193, and the smallest exponent that can reach
+  // this is 0xFD (we checked against 0xFC above, but then the leading quotient
+  // bit incremented it). So it's enough to shift the output left by one
+  // (moving the exponent field to the top), increment it once more (so that
+  // the smallest overflowed exponent 0xFF wraps round to 0), and then compare
+  // against 0xFE000000 as an unsigned integer.
+  mov     r12, r0, lsl #1
+  add     r12, r12, #1 << 24
+  cmp     r12, #0xFE << 24      // Check for exp = 253 or 254
+  bxhs    lr
+  // We have actual overflow. Rebias r0 to bring the exponent back into range,
+  // which ensures its sign is correct. Then make an infinity of that sign to
+  // return.
+  subs    r0, r0, #0xC0 << 23
+  movs    r12, #0xFF            // exponent of infinity
+  orrs    r12, r12, r0, lsr #23 // exponent and sign at bottom of r12
+  movs    r0, r12, lsl #23      // shift it up to the top of r0 to return
+  bx      lr
+
+LOCAL_LABEL(uncommon):
+  // We come here from the start of the function if either input is an uncommon
+  // value: zero, denormal, infinity or NaN.
+  //
+  // We arrive here with r12 = 0xFF0000, and r2 containing the exponent of x
+  // in bits 16..23. But r3 doesn't necessarily contain the exponent of y,
+  // because the instruction that set it up was conditional. So first we
+  // unconditionally repeat it.
+  and     r3, r12, r1, lsr #7
+
+  // In all cases not involving a NaN as output, the sign of the output is made
+  // in the same way as for finite numbers, as the XOR of the input signs. So
+  // repeat the sign setup from the main branch.
+  teq     r0, r1                // is the output sign bit 1?
+  orrmi   r2, r2, #0x100        // if so, set bit 8 of r2
+
+  // Detect infinities and NaNs, by checking if either of r2 or r3 is at least
+  // 0xFF0000.
+  cmp     r2, #0xFF0000
+  cmplo   r3, #0xFF0000
+  bhs     LOCAL_LABEL(inf_NaN)
+
+  // Now we know there are no infinities or NaNs, but there's at least one zero
+  // or denormal.
+  movs    r12, r1, lsl #1       // is y zero?
+  beq     LOCAL_LABEL(divbyzero)        // if so, go and handle division by zero
+  movs    r12, r0, lsl #1       // is x zero? (now we know that y is not)
+  moveq   r0, r2, lsl #23       // if so, 0/nonzero is just 0 (of right sign)
+  bxeq    lr
+
+  // Now we've eliminated zeroes as well, leaving only denormals: either x or
+  // y, or both, is a denormal. Call fnorm2 to convert both into a normalised
+  // mantissa and a (potentially small) exponent.
+  and     r12, r2, #0x100       // save the result sign from r2
+  lsr     r2, #16               // shift extracted exponents down to bit 0
+  lsr     r3, #16               // where fnorm2 will expect them
+  push    {r0, r1, r2, r3, r12, lr}
+  mov     r0, sp                // tell fnorm2 where to find its data
+  bl      SYMBOL_NAME(__compiler_rt_fnorm2)
+  pop     {r0, r1, r2, r3, r12, lr}
+  lsl     r3, #16               // shift exponents back up to bit 16
+  orr     r2, r12, r2, lsl #16  // and put the result sign back in r2
+
+  // Now rejoin the main code path, having finished the setup it will expect:
+  // swap x and y, and shift the fractions back down to the low 24 bits.
+  mov     r12, r0, lsr #8
+  mov     r0, r1, lsr #8
+  mov     r1, r12
+  b       LOCAL_LABEL(div)
+
+LOCAL_LABEL(inf_NaN):
+  // We come here if at least one input is a NaN or infinity. If either or both
+  // inputs are NaN then we hand off to fnan2 to propagate a NaN from the
+  // input.
+  mov     r12, #0xFF000000
+  cmp     r12, r0, lsl #1       // if (r0 << 1) > 0xFF000000, r0 is a NaN
+  blo     SYMBOL_NAME(__compiler_rt_fnan2)
+  cmp     r12, r1, lsl #1
+  blo     SYMBOL_NAME(__compiler_rt_fnan2)
+
+  // No NaNs, so we have three options: inf/inf = NaN, inf/finite = inf, and
+  // finite/inf = 0.
+
+  // If both operands are infinity, we return a NaN. Since we know at
+  // least _one_ is infinity, we can test this by checking if they're
+  // equal apart from the sign bits.
+  eor     r3, r0, r1
+  lsls    r3, #1                // were all bits of XOR zero other than top?
+  beq     LOCAL_LABEL(invalid)          // if so, both operands are infinity
+
+  // See if x is infinite
+  cmp     r12, r0, lsl #1       // (r0 << 1) == 0xFF000000?
+  beq     LOCAL_LABEL(infret)           // if so, infinity/finite = infinity
+
+  // y is infinite and x is not, so we return a zero of the
+  // combined sign.
+  eor     r0, r0, r1            // calculate the right sign
+  and     r0, r0, #0x80000000   // throw away everything else
+  bx      lr
+
+LOCAL_LABEL(divbyzero):
+  // Here, we know y is zero. But we don't know if x is zero or nonzero. So we
+  // might be calculating 0/0 (invalid operation, generating a NaN), or
+  // nonzero/0 (the IEEE "division by zero" exception, generating infinity).
+  movs    r12, r0, lsl #1       // is x zero too?
+  beq     LOCAL_LABEL(invalid)          // if so, go and return a NaN
+
+LOCAL_LABEL(infret):
+  // Here, we're either dividing infinity by a finite number, or dividing a
+  // nonzero number by 0. (Or both, if we're dividing infinity by 0.) In all
+  // these cases we return infinity with the sign from r2.
+  //
+  // If we were implementing IEEE exceptions, we'd have to separate these
+  // cases: infinity / finite is not an _exception_, it just returns infinity,
+  // whereas (finite and nonzero) / 0 is a division-by-zero exception. But here
+  // we're not implementing exceptions, so we can treat all three cases the
+  // same.
+  //
+  // r2 contains the output sign in bit 8, which is a convenient place to find
+  // it when making an infinity, because we can fill in the 8 exponent bits
+  // below that and then shift it left.
+  orr     r2, r2, #0xff         // sign + maximum exponent
+  lsl     r0, r2, #23           // shift up to the top
+  bx      lr
+
+LOCAL_LABEL(invalid):
+  // Return the default NaN, from an invalid operation (either dividing
+  // infinity by infinity, or 0 by 0).
+  ldr     r0, =0x7FC00000
+  bx      lr
+
+// Finally, the lookup table for the initial reciprocal approximation.
+//
+// The table index is made from the top 7 bits of the denominator mantissa. But
+// the topmost bit is always 1, so only the other 6 bits vary. So it only has
+// 64 entries, not 128.
+//
+// Each table entry is a single byte, with its top bit set. So the table
+// entries correspond to the reciprocal of a 7-bit mantissa prefix scaled up by
+// 2^14, or the reciprocal of a whole 24-bit mantissa scaled up by 2^31.
+//
+// Each of these 64 entries corresponds to a large interval of possible
+// mantissas. For example, if the top 7 bits are 1000001 then the overall
+// mantissa could be anything from 0x820000 to 0x83FFFF. And because the output
+// of this table provides more bits than the input, there are several choices
+// of 8-bit reciprocal approximation for a number in that interval. The
+// reciprocal of 0x820000 starts with 0xFC plus a fraction, and the reciprocal
+// of 0x83FFFF starts with 0xF9 minus a fraction, so there are four reasonable
+// choices for that table entry: F9, FA, FB or FC. Which do we pick?
+//
+// The table below is generated by choosing whichever value minimises the
+// maximum possible error _after_ the approximation is improved by the
+// Newton-Raphson step. In the example above, we end up with FA.
+//
+// The Python code below will regenerate the table, complete with the per-entry
+// comments.
+
+/*
+
+for prefix in range(64, 128):
+    best = None
+
+    # Min and max 24-bit mantissas (leading 1 included) with this 7-bit prefix
+    mmin, mmax = prefix * 2**17, (prefix + 1) * 2**17 - 1
+
+    # Max and min table entry corresponding to the reciprocal of something in
+    # that range of mantissas: round up the reciprocal of mmax, and round down
+    # the reciprocal of mmin. Also clamp to the range [0x80,0xff], because
+    # 0x100 can't be used as a table entry due to not fitting in a byte, even
+    # though it's the exact reciprocal of the overall-smallest mantissa
+    # 0x800000.
+    gmin = max(128, (2**31 + mmin - 1) // mmax)
+    gmax = min(255, 2**31 // mmin)
+
+    # For each of those table entries, compute the result of starting from that
+    # value and doing a Newton-Raphson iteration, with the mantissa at each end
+    # of the mantissa interval. One of these will be the worst possible error.
+    # Choose the table entry whose worst error is as small as possible.
+    #
+    # (To find the extreme values of a more general function on an interval,
+    # you must consider its values not only at the interval endpoints but also
+    # any turning points within the interval. Here, the function has only one
+    # turning point, and by construction it takes value 0 there, so we needn't
+    # worry.)
+    g = max(
+        range(gmin, gmax + 1),
+        key=lambda g: min(
+            (g * (2**32 - d * g) / 2**23 - 2**39 / d) for d in [mmin, mmax]
+        ),
+    )
+
+    print(f"  .byte 0x{g:02x}  // input [0x{mmin:06x},0x{mmax:06x}]"
+          f", candidate outputs [0x{gmin:02x},0x{gmax:02x}]"
+    )
+
+*/
+
+  .p2align 2  // make sure we start on a 4-byte boundary, even in Thumb
+LOCAL_LABEL(tab):
+  .byte 0xfe  // input [0x800000,0x81ffff], candidate outputs [0xfd,0xff]
+  .byte 0xfa  // input [0x820000,0x83ffff], candidate outputs [0xf9,0xfc]
+  .byte 0xf6  // input [0x840000,0x85ffff], candidate outputs [0xf5,0xf8]
+  .byte 0xf3  // input [0x860000,0x87ffff], candidate outputs [0xf1,0xf4]
+  .byte 0xef  // input [0x880000,0x89ffff], candidate outputs [0xee,0xf0]
+  .byte 0xec  // input [0x8a0000,0x8bffff], candidate outputs [0xeb,0xed]
+  .byte 0xe8  // input [0x8c0000,0x8dffff], candidate outputs [0xe7,0xea]
+  .byte 0xe5  // input [0x8e0000,0x8fffff], candidate outputs [0xe4,0xe6]
+  .byte 0xe2  // input [0x900000,0x91ffff], candidate outputs [0xe1,0xe3]
+  .byte 0xdf  // input [0x920000,0x93ffff], candidate outputs [0xde,0xe0]
+  .byte 0xdc  // input [0x940000,0x95ffff], candidate outputs [0xdb,0xdd]
+  .byte 0xd9  // input [0x960000,0x97ffff], candidate outputs [0xd8,0xda]
+  .byte 0xd6  // input [0x980000,0x99ffff], candidate outputs [0xd5,0xd7]
+  .byte 0xd3  // input [0x9a0000,0x9bffff], candidate outputs [0xd3,0xd4]
+  .byte 0xd1  // input [0x9c0000,0x9dffff], candidate outputs [0xd0,0xd2]
+  .byte 0xce  // input [0x9e0000,0x9fffff], candidate outputs [0xcd,0xcf]
+  .byte 0xcc  // input [0xa00000,0xa1ffff], candidate outputs [0xcb,0xcc]
+  .byte 0xc9  // input [0xa20000,0xa3ffff], candidate outputs [0xc8,0xca]
+  .byte 0xc7  // input [0xa40000,0xa5ffff], candidate outputs [0xc6,0xc7]
+  .byte 0xc4  // input [0xa60000,0xa7ffff], candidate outputs [0xc4,0xc5]
+  .byte 0xc2  // input [0xa80000,0xa9ffff], candidate outputs [0xc1,0xc3]
+  .byte 0xc0  // input [0xaa0000,0xabffff], candidate outputs [0xbf,0xc0]
+  .byte 0xbd  // input [0xac0000,0xadffff], candidate outputs [0xbd,0xbe]
+  .byte 0xbb  // input [0xae0000,0xafffff], candidate outputs [0xbb,0xbc]
+  .byte 0xb9  // input [0xb00000,0xb1ffff], candidate outputs [0xb9,0xba]
+  .byte 0xb7  // input [0xb20000,0xb3ffff], candidate outputs [0xb7,0xb8]
+  .byte 0xb5  // input [0xb40000,0xb5ffff], candidate outputs [0xb5,0xb6]
+  .byte 0xb3  // input [0xb60000,0xb7ffff], candidate outputs [0xb3,0xb4]
+  .byte 0xb1  // input [0xb80000,0xb9ffff], candidate outputs [0xb1,0xb2]
+  .byte 0xaf  // input [0xba0000,0xbbffff], candidate outputs [0xaf,0xb0]
+  .byte 0xad  // input [0xbc0000,0xbdffff], candidate outputs [0xad,0xae]
+  .byte 0xac  // input [0xbe0000,0xbfffff], candidate outputs [0xab,0xac]
+  .byte 0xaa  // input [0xc00000,0xc1ffff], candidate outputs [0xa9,0xaa]
+  .byte 0xa8  // input [0xc20000,0xc3ffff], candidate outputs [0xa8,0xa8]
+  .byte 0xa6  // input [0xc40000,0xc5ffff], candidate outputs [0xa6,0xa7]
+  .byte 0xa5  // input [0xc60000,0xc7ffff], candidate outputs [0xa4,0xa5]
+  .byte 0xa3  // input [0xc80000,0xc9ffff], candidate outputs [0xa3,0xa3]
+  .byte 0xa1  // input [0xca0000,0xcbffff], candidate outputs [0xa1,0xa2]
+  .byte 0xa0  // input [0xcc0000,0xcdffff], candidate outputs [0xa0,0xa0]
+  .byte 0x9e  // input [0xce0000,0xcfffff], candidate outputs [0x9e,0x9f]
+  .byte 0x9d  // input [0xd00000,0xd1ffff], candidate outputs [0x9d,0x9d]
+  .byte 0x9b  // input [0xd20000,0xd3ffff], candidate outputs [0x9b,0x9c]
+  .byte 0x9a  // input [0xd40000,0xd5ffff], candidate outputs [0x9a,0x9a]
+  .byte 0x98  // input [0xd60000,0xd7ffff], candidate outputs [0x98,0x99]
+  .byte 0x97  // input [0xd80000,0xd9ffff], candidate outputs [0x97,0x97]
+  .byte 0x96  // input [0xda0000,0xdbffff], candidate outputs [0x95,0x96]
+  .byte 0x94  // input [0xdc0000,0xddffff], candidate outputs [0x94,0x94]
+  .byte 0x93  // input [0xde0000,0xdfffff], candidate outputs [0x93,0x93]
+  .byte 0x92  // input [0xe00000,0xe1ffff], candidate outputs [0x91,0x92]
+  .byte 0x90  // input [0xe20000,0xe3ffff], candidate outputs [0x90,0x90]
+  .byte 0x8f  // input [0xe40000,0xe5ffff], candidate outputs [0x8f,0x8f]
+  .byte 0x8e  // input [0xe60000,0xe7ffff], candidate outputs [0x8e,0x8e]
+  .byte 0x8d  // input [0xe80000,0xe9ffff], candidate outputs [0x8d,0x8d]
+  .byte 0x8b  // input [0xea0000,0xebffff], candidate outputs [0x8b,0x8c]
+  .byte 0x8a  // input [0xec0000,0xedffff], candidate outputs [0x8a,0x8a]
+  .byte 0x89  // input [0xee0000,0xefffff], candidate outputs [0x89,0x89]
+  .byte 0x88  // input [0xf00000,0xf1ffff], candidate outputs [0x88,0x88]
+  .byte 0x87  // input [0xf20000,0xf3ffff], candidate outputs [0x87,0x87]
+  .byte 0x86  // input [0xf40000,0xf5ffff], candidate outputs [0x86,0x86]
+  .byte 0x85  // input [0xf60000,0xf7ffff], candidate outputs [0x85,0x85]
+  .byte 0x84  // input [0xf80000,0xf9ffff], candidate outputs [0x84,0x84]
+  .byte 0x83  // input [0xfa0000,0xfbffff], candidate outputs [0x83,0x83]
+  .byte 0x82  // input [0xfc0000,0xfdffff], candidate outputs [0x82,0x82]
+  .byte 0x81  // input [0xfe0000,0xffffff], candidate outputs [0x80,0x81]
+
+END_COMPILERRT_FUNCTION(__divsf3)
+
+NO_EXEC_STACK_DIRECTIVE
diff --git a/compiler-rt/lib/builtins/arm/fnan2.c b/compiler-rt/lib/builtins/arm/fnan2.c
new file mode 100644
index 0000000000000..06bbd4339f171
--- /dev/null
+++ b/compiler-rt/lib/builtins/arm/fnan2.c
@@ -0,0 +1,42 @@
+//===-- fnan2.c - Handle single-precision NaN inputs to binary operation --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This helper function is available for use by single-precision float
+// arithmetic implementations to handle propagating NaNs from the input
+// operands to the output, in a way that matches Arm hardware FP.
+//
+// On input, a and b are floating-point numbers in IEEE 754 encoding, and at
+// least one of them must be a NaN. The return value is the correct output NaN.
+//
+// A signalling NaN in the input (with bit 22 clear) takes priority over any
+// quiet NaN, and is adjusted on return by setting bit 22 to make it quiet. If
+// both inputs are the same type of NaN then the first input takes priority:
+// the input a is used instead of b.
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdint.h>
+
+uint32_t __compiler_rt_fnan2(uint32_t a, uint32_t b) {
+  // Classify both inputs with one addition each. Shifting left discards the
+  // sign, leaving the quiet bit at bit 23; adding 1 there carries every quiet
+  // NaN round past zero into [0, 0x007fffff] and lifts every signalling NaN
+  // above 0xff800000, which is exactly where an infinity ends up.
+  const uint32_t quiet_bit = 0x00400000;
+  const uint32_t key_a = (a << 1) + (quiet_bit << 1);
+  const uint32_t key_b = (b << 1) + (quiet_bit << 1);
+  const int a_signals = key_a > 0xff800000;
+  const int b_signals = key_b > 0xff800000;
+  const int a_is_quiet = key_a < 0x00800000;
+
+  // A signalling NaN beats a quiet one and is returned with its quiet bit
+  // set; between two NaNs of the same kind, a beats b.
+  if (a_signals || b_signals)
+    return (a_signals ? a : b) | quiet_bit;
+  return a_is_quiet ? a : b; // if a isn't a quiet NaN, b is expected to be
+}
diff --git a/compiler-rt/lib/builtins/arm/fnorm2.c b/compiler-rt/lib/builtins/arm/fnorm2.c
new file mode 100644
index 0000000000000..29eba1cbde59d
--- /dev/null
+++ b/compiler-rt/lib/builtins/arm/fnorm2.c
@@ -0,0 +1,62 @@
+//===-- fnorm2.c - Handle single-precision denormal inputs to binary op ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This helper function is available for use by single-precision float
+// arithmetic implementations, to handle denormal inputs on entry by
+// renormalizing the mantissa and modifying the exponent to match.
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdint.h>
+
+// Structure containing the function's inputs and outputs.
+//
+// On entry: a, b are two input floating-point numbers, still in IEEE 754
+// encoding. expa and expb are the 8-bit exponents of those numbers, extracted
+// and shifted down to the low 8 bits of the word, with no other change.
+// Neither value should be zero, or have the maximum exponent (indicating an
+// infinity or NaN).
+//
+// On exit: each of a and b contains the mantissa of the input value, with the
+// leading 1 bit made explicit, and shifted up to the top of the word. If expa
+// was zero (indicating that a was denormal) then it is now represented as a
+// normalized number with an out-of-range exponent (zero or negative). The same
+// applies to expb and b.
+struct fnorm2 {
+  uint32_t a, b;       // in: IEEE 754 encodings; out: normalized mantissas
+  uint32_t expa, expb; // exponents of a and b, rewritten for denormals
+};
+
+void __compiler_rt_fnorm2(struct fnorm2 *values) {
+  // Shift out the sign and the top seven exponent bits, so that each 23-bit
+  // fraction sits in bits 30..8, immediately below the leading-1 position.
+  uint32_t mant_a = values->a << 8;
+  uint32_t mant_b = values->b << 8;
+
+  if (values->expa != 0) {
+    // Normalized input: supply the leading 1 explicitly in the top bit.
+    mant_a |= 0x80000000;
+  } else {
+    // Denormal input: move its highest set bit up to the top bit, and set
+    // the exponent to 1 minus the distance moved (so it is zero or negative).
+    uint32_t lead = __builtin_clz(mant_a);
+    mant_a <<= lead;
+    values->expa = 1 - lead;
+  }
+
+  // Give b exactly the same treatment.
+  if (values->expb != 0) {
+    mant_b |= 0x80000000;
+  } else {
+    uint32_t lead = __builtin_clz(mant_b);
+    mant_b <<= lead;
+    values->expb = 1 - lead;
+  }
+  values->a = mant_a;
+  values->b = mant_b;
+}
diff --git a/compiler-rt/lib/builtins/arm/funder.c b/compiler-rt/lib/builtins/arm/funder.c
new file mode 100644
index 0000000000000..fd29e157328a3
--- /dev/null
+++ b/compiler-rt/lib/builtins/arm/funder.c
@@ -0,0 +1,78 @@
+//===-- funder.c - Handle single-precision floating-point underflow -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This helper function is available for use by single-precision float
+// arithmetic implementations to handle underflowed output values, if they were
+// computed in the form of a normalized mantissa and an out-of-range exponent.
+//
+// On input: x should be a complete IEEE 754 floating-point value representing
+// the desired output scaled up by 2^192 (the same value that would have been
+// passed to an underflow trap handler in IEEE 754:1985).
+//
+// This isn't enough information to re-round to the correct output denormal
+// without also knowing whether x itself has already been rounded, and which
+// way. 'errsign' gives this information, by indicating the sign of the value
+// (true result - x). That is, if errsign > 0 it means the true value was
+// larger (x was rounded down); if errsign < 0 then x was rounded up; if
+// errsign == 0 then x represents the _exact_ desired output value.
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdint.h>
+
+#define SIGNBIT 0x80000000
+#define MANTSIZE 23
+#define BIAS 0xc0
+
+uint32_t __compiler_rt_funder(uint32_t x, uint32_t errsign) {
+  const uint32_t sign = x & SIGNBIT;
+  const uint32_t exponent = (x >> MANTSIZE) & 0xff;
+
+  // An exponent above the bias belongs to a result that is still normalized
+  // once the bias is removed, so it shouldn't really have been sent here.
+  // Strip the bias and hand it straight back.
+  if (exponent > BIAS)
+    return x - (BIAS << MANTSIZE);
+
+  // Count how many low-order mantissa bits the denormal encoding has no room
+  // for. Beyond MANTSIZE + 1, even the leading 1 lies at least two places
+  // below the lowest denormal bit, so the result is a correctly signed zero.
+  const uint32_t bits_lost = BIAS + 1 - exponent;
+  if (bits_lost > MANTSIZE + 1)
+    return sign;
+
+  // Place the 24-bit mantissa, leading 1 included, at the top of the word,
+  // then nudge it by 1 towards the true result, so that the re-rounding
+  // below can tell which side of x the exact value lies on.
+  const uint32_t rounded_up = errsign >> 31;         // true result < x
+  const uint32_t rounded_down = (0 - errsign) >> 31; // true result > x
+  const uint32_t mantissa =
+      (0x80000000 | (x << 8)) + rounded_down - rounded_up;
+
+  // Split the mantissa into the bits that survive in the denormal and the
+  // bits that drop off the bottom, the latter left-aligned in their own word.
+  // When bits_lost is MANTSIZE + 1 the whole mantissa drops off, and the
+  // shift counts below would be out of range, so the defaults cover that.
+  uint32_t outmant = 0;
+  uint32_t shifted_off = mantissa;
+  if (bits_lost <= MANTSIZE) {
+    outmant = mantissa >> (8 + bits_lost);
+    shifted_off = mantissa << (MANTSIZE + 1 - bits_lost);
+  }
+
+  // Round to nearest: the top discarded bit decides whether to round up, and
+  // if nothing at all lies below it we are exactly half way, so pick even.
+  const uint32_t round_bit = shifted_off >> 31;
+  const uint32_t sticky_bits = shifted_off << 1;
+  outmant += round_bit;
+  if (round_bit && sticky_bits == 0)
+    outmant &= ~1;
+
+  // outmant may have carried into bit 23, which encodes the smallest normal.
+  return sign | outmant;
+}
diff --git a/compiler-rt/lib/builtins/arm/mulsf3.S b/compiler-rt/lib/builtins/arm/mulsf3.S
new file mode 100644
index 0000000000000..b4f4c5e958c52
--- /dev/null
+++ b/compiler-rt/lib/builtins/arm/mulsf3.S
@@ -0,0 +1,309 @@
+//===-- mulsf3.S - single-precision floating point multiplication ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements single-precision soft-float multiplication with the
+// IEEE-754 default rounding (to nearest, ties to even), in optimized AArch32
+// assembly language suitable to be built as either Arm or Thumb2.
+//
+//===----------------------------------------------------------------------===//
+
+#include "../assembly.h"
+
+
+  .syntax unified
+  .text
+  .p2align 2
+
+DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_fmul, __mulsf3)
+// In: r0 = x, r1 = y, as IEEE 754 single-precision bit patterns. Out: r0.
+DEFINE_COMPILERRT_FUNCTION(__mulsf3)
+
+  // Check if either input exponent is 00 or FF (i.e. not a normalized number),
+  // and if so, branch out of line. If we don't branch out of line, then we've
+  // also extracted the exponents of the input values r0/r1 into bits 16..23 of
+  // r2/r3. But if we do, then that hasn't necessarily been done (because the
+  // second AND might have been skipped).
+  mov     r12, #0xFF0000
+  ands    r2, r12, r0, lsr #7  // sets Z if exponent of x is 0
+  andsne  r3, r12, r1, lsr #7  // otherwise, sets Z if exponent of y is 0
+  teqne   r2, r12              // otherwise, sets Z if exponent of x is FF
+  teqne   r3, r12              // otherwise, sets Z if exponent of y is FF
+  beq     LOCAL_LABEL(uncommon)        // branch out of line to handle inf/NaN/0/denorm
+
+  // Calculate the sign of the result, and put it in an unused bit of r2.
+  teq     r0, r1               // sets N to the XOR of x and y's sign bits
+  orrmi   r2, r2, #0x100       // if N set, set bit 8 of r2
+
+  // Move the input mantissas to the high end of r0/r1, each with its leading
+  // bit set explicitly, so that they're in the right form to be multiplied.
+  mov     r12, #0x80000000
+  orr     r0, r12, r0, lsl #8
+  orr     r1, r12, r1, lsl #8
+
+  // Now we're ready to multiply mantissas. This is also the place we'll come
+  // back to after decoding denormal inputs. The denormal decoding will also
+  // have to set up the same register contents:
+  //  - decoded fractions at the top of r0 and r1
+  //  - exponents in r2 and r3, starting at bit 16
+  //  - output sign in r2 bit 8
+LOCAL_LABEL(mul):
+
+  // Here we multiply the mantissas, and compute the output exponent by adding
+  // the input exponents and rebiasing. These operations are interleaved to
+  // use a delay slot.
+  //
+  // The exponent is rebiased by subtracting 0x80, rather than the 0x7F you'd
+  // expect. That compensates for the leading bit of the mantissa overlapping
+  // it, when we recombine the exponent and mantissa by addition.
+  add     r2, r2, r3           // r2 has sum of exponents, freeing up r3
+  umull   r1, r3, r0, r1       // r3:r1 has the double-width product
+  sub     r2, r2, #(0x80 << 16) // rebias the summed exponent
+
+  // Compress the double-word product into just the high-order word r3, by
+  // setting its bit 0 if any bit of the low-order word is nonzero. This
+  // changes the represented value, but not by nearly enough to affect
+  // rounding, because rounding only depends on the bit below the last output
+  // bit, and the general question of whether _any_ nonzero bit exists below
+  // that.
+  cmp     r1, #0                // if low word of full product is nonzero
+  orrne   r3, r3, #1            //   then set LSB of high word
+
+  // The two inputs to UMULL had their high bits set, that is, were at least
+  // 0x80000000. So the 64-bit product was at least 0x4000000000000000, i.e.
+  // the high bit of the product could be at the top of the word or one bit
+  // below. Check which, by experimentally shifting left, and then undoing it
+  // via RRX if we turned out to have shifted off a 1 bit.
+  lsls    r3, r3, #1            // shift left, setting C to the bit shifted off
+  rrxcs   r3, r3                // if that bit was 1, put it back again
+
+  // That ensured the leading 1 bit of the product is now the top of r3, but
+  // also, set C if the leading 1 was _already_ in the top bit. So now we know
+  // whether to increment the exponent. The following instruction does the
+  // conditional increment (because it's ADC), but also, copies the exponent
+  // field from bit 16 of r2 into bit 0, so as to place it just below the
+  // output sign bit.
+  //
+  // So, if the number hasn't overflowed or underflowed, the low 9 bits of r2
+  // are exactly what we need to combine with the rounded mantissa. But the
+  // full output exponent (with extra bits) is still available in the high half
+  // of r2, so that we can check _whether_ we overflowed or underflowed.
+  adc     r2, r2, r2, asr #16
+
+  // Recombine the exponent and mantissa, doing most of the rounding as a side
+  // effect: we shift the mantissa right so as to put the round bit into C, and
+  // then we recombine with the exponent using ADC, to increment the mantissa
+  // if C was set.
+  movs    r12, r3, lsr #8
+  adc     r0, r12, r2, lsl #23
+
+  // To complete the rounding, we must check for the round-to-even tiebreaking
+  // case, by checking if we're in the exact halfway case, which occurs if and
+  // only if we _did_ round up (we can tell this because C is still set from
+  // the MOVS), and also, no bit of r3 is set _below_ the round bit.
+  //
+  // We combine this with an overflow check, so that C ends up set if anything
+  // weird happened, and clear if we're completely finished and can return.
+  //
+  // The best instruction sequence for this part varies between Arm and Thumb.
+#if !__thumb__
+  // Arm state: if C was set then we check the low bits of r3, so that Z ends
+  // up set if we need to round to even.
+  //
+  // (We rely here on Z reliably being clear to begin with, because shifting
+  // down the output mantissa definitely gave a nonzero output. Also, the TST
+  // doesn't change C, so if Z does end up set, then C was also set.)
+  //
+  // Then, if we're not rounding to even, we do a CMP which sets C if there's
+  // been an overflow or an underflow. An overflow could occur for an output
+  // exponent as low as 0xFC, because we might increment the exponent by 1 when
+  // renormalizing, by another when recombining with the mantissa, and by one
+  // more if rounding up causes a carry off the top of the mantissa. An
+  // underflow occurs only if the output exponent is negative (because it's
+  // offset by 1, so an exponent of 0 will be incremented to 1), in which case
+  // the top 8 bits of r2 will all be set. Therefore, an unsigned comparison to
+  // see if r2 >= 0xFC0000 will catch all overflow and underflow cases. It also
+  // catches a few very large cases that _don't_ quite overflow (exponents of
+  // 0xFC and above that don't get maximally unlucky); those will also be
+  // handled by the slow path.
+  tstcs   r3, #0x7F
+  cmpne   r2, #0xFC0000
+#else
+  // In Thumb, switching between different conditions has a higher cost due to
+  // the (implicit in this code) IT instructions, so we prefer a strategy that
+  // uses CC and CS conditions throughout, at the cost of requiring some extra
+  // cleanup instructions on the slow path.
+  //
+  // If C is set (and hence round-to-even is a possibility), the basic idea is
+  // to shift the full result word (r3) left by 25, leaving only its bottom 7
+  // bits, which are now the top 7 bits; then we want to set C iff these are 0.
+  //
+  // The "CMP x,y" instruction sets C if x >= y (as unsigned integers). So this
+  // could be done in one instruction if only we had a register to use as x,
+  // which has 0 in the top 7 bits and at least one nonzero bit below. Then we
+  // could compare that against the shifted-up value of r3 (as y), setting C
+  // precisely if the top 7 bits of y are all 0. And happily, we _do_ have such
+  // a register! r12 contains the shifted-down mantissa, which is guaranteed to
+  // have a 1 in bit 23, and 0 above that.
+  //
+  // The shift of r3 happens only in the second operand of the compare, so we
+  // don't lose the original value of r3 in this process.
+  //
+  // The check for over/underflow is exactly as in the Arm branch above, except
+  // based on a different condition.
+  cmpcs   r12, r3, lsl #25  // now C is set iff we're rounding to even
+  cmpcc   r2, #0xFC0000     // and now it's also set if we've over/underflowed
+#endif
+
+  // That's all the checks for difficult cases done. If C is clear, we can
+  // return.
+  bxcc    lr
+
+  // Now the slower path begins. We have to recover enough information to
+  // handle all of round-to-even, overflow and underflow.
+  //
+  // Round to even is the most likely of these, so we detect it first and
+  // handle it as fast as possible.
+
+#if __thumb__
+  // First, Thumb-specific compensation code. The Arm branch of the #if above
+  // will have set Z=1 to indicate round to even, but the Thumb branch didn't
+  // leave any unambiguous indicator of RTE, so we must retest by checking all
+  // the bits shifted off the bottom of the mantissa to see if they're exactly
+  // the half-way value.
+  lsl     r12, r3, #24           // r12 = round bit and everything below
+  cmp     r12, #0x80000000       // set Z if that is exactly 0x80000000
+#endif
+
+  // Now Z is set iff we have already rounded up and now must replace that
+  // with rounding to even, which is done by just clearing the low bit of the
+  // mantissa.
+  biceq   r0, r0, #1
+
+  // Redo the over/underflow check (the same way as in both branches above),
+  // and if it doesn't report a danger, we can return the rounded-to-even
+  // answer.
+  cmp     r2, #0xFC0000         // check for over/underflow
+  bxcc    lr                    // and return if none.
+
+  // Now we only have overflow and underflow left to handle. First, find out
+  // which we're looking at. This is easy by testing the top bit of r2, but
+  // even easier by using the fact that the possible positive and negative
+  // values of r2 are widely enough separated that the 0xFC0000 subtracted by
+  // the CMP above won't have made any difference. So the N flag output from
+  // that comparison _already_ tells us which condition we have: if N is set we
+  // have underflow, and if N is clear, overflow.
+  bpl     LOCAL_LABEL(overflow)
+
+  // Here we're handling underflow.
+
+  // Add the IEEE 754:1985 exponent bias which funder will expect. This also
+  // brings the exponent back into a range where it can't possibly have carried
+  // into the sign bit, so the output sign will now be right.
+  add     r0, r0, #(0xC0 << 23)
+
+  // Determine whether we rounded up, down or not at all.
+  lsls    r2, r3, #1              // input mantissa, without its leading 1
+  subs    r1, r2, r0, lsl #9      // subtract the output mantissa (likewise)
+
+  // And let funder handle the rest.
+  b     SYMBOL_NAME(__compiler_rt_funder)
+
+LOCAL_LABEL(overflow):
+  // We come here to handle overflow, but it's not guaranteed that an overflow
+  // has actually happened: our check on the fast path erred on the side of
+  // caution, by catching any output exponent that _could_ cause an overflow.
+  // So first check whether this really is an overflow, by extracting the
+  // output exponent. Exponent 0xFF, or anything that wrapped round to having
+  // the high bit clear, are overflows; 0xFE down to 0xFC are not overflows.
+  //
+  // The value in r0 is correct to return, if there's no overflow.
+  add     r12, r0, #(1 << 23)     // add 1 to the exponent so 0xFF wraps to 0
+  movs    r12, r12, lsl #1        // test the top bit of the modified value
+  bxmi    lr                      // if top bit is still 1, not an overflow
+
+  // This is an overflow, so we need to replace it with an appropriately signed
+  // infinity. First we correct the sign by applying a downward bias to the
+  // exponent (the one suggested in IEEE 754:1985, which was chosen to bring
+  // all possible overflowed results back into range).
+  subs    r0, r0, #(0xC0 << 23)
+
+  // Now the sign bit of r0 is correct. Replace everything else with the
+  // encoding of an infinity.
+  mov     r1, #0xFF
+  and     r0, r0, #0x80000000
+  orr     r0, r0, r1, lsl #23
+  bx      lr
+
+LOCAL_LABEL(uncommon):
+  // Handle zeros, denorms, infinities and NaNs. We arrive here knowing that
+  // we've at least done the first _two_ instructions from the entry point,
+  // even if all the rest were skipped. So r2 contains the exponent of x in
+  // bits 16..23 (the sign is not in r2 yet), and r12 = 0xFF << 16.
+  //
+  // So, first repeat some instructions from the prologue, which were either
+  // conditionally skipped in the sequence leading to the branch, or skipped
+  // because they happened after the branch.
+  and     r3, r12, r1, lsr #7  // get exponent of y in r3 bits 16..23
+  teq     r0, r1               // calculate the sign of the result
+  orrmi   r2, r2, #0x100       // and put it in bit 8 of r2 as before
+
+  // Check for infinities and NaNs, by testing each of r2,r3 to see if it's at
+  // least 0xFF0000 (hence the exponent field is equal to 0xFF).
+  cmp     r2, r12
+  cmplo   r3, r12
+  bhs     LOCAL_LABEL(inf_NaN)
+
+  // If we didn't take that branch, then we have only finite numbers, but at
+  // least one is denormal or zero. A zero makes the result easy (and also is a
+  // more likely input than a denormal), so check those first, as fast as
+  // possible.
+  movs    r12, r0, lsl #1          // Z set if x == 0
+  movsne  r12, r1, lsl #1          // now Z set if either input is 0
+  moveq   r0, r2, lsl #23          // in either case, make 0 of the output sign
+  bxeq    lr                       // and return it
+
+  // Now we know we only have denormals to deal with. Call fnorm2 to sort
+  // them out, and rejoin the main code path above.
+  and     r12, r2, #0x100          // save the result sign from r2
+  lsr     r2, #16                  // shift extracted exponents down to bit 0
+  lsr     r3, #16                  // where fnorm2 will expect them
+  push    {r0, r1, r2, r3, r12, lr}
+  mov     r0, sp                   // tell fnorm2 where to find its data
+  bl      SYMBOL_NAME(__compiler_rt_fnorm2)
+  pop     {r0, r1, r2, r3, r12, lr}
+  lsl     r3, #16                  // shift exponents back up to bit 16
+  orr     r2, r12, r2, lsl #16     // and put the result sign back in r2
+  b       LOCAL_LABEL(mul)
+
+LOCAL_LABEL(inf_NaN):
+  // We come here if at least one input is a NaN or infinity. If either or both
+  // inputs are NaN then we hand off to fnan2 which will propagate a NaN from
+  // the input; otherwise any multiplication involving infinity returns
+  // infinity, unless it's infinity * 0 which is an invalid operation and
+  // returns NaN again.
+  mov     r12, #0xFF000000
+  cmp     r12, r0, lsl #1          // if (r0 << 1) > 0xFF000000, r0 is a NaN
+  blo     SYMBOL_NAME(__compiler_rt_fnan2)
+  cmp     r12, r1, lsl #1
+  blo     SYMBOL_NAME(__compiler_rt_fnan2)
+
+  // NaNs are dealt with, so now we have at least one infinity. Check if the
+  // other operand is 0. This is conveniently done by XORing the two: because
+  // we know that the low 31 bits of one operand are exactly 0x7F800000, we can
+  // test if the low 31 bits of the other one are all 0 by checking whether the
+  // low 31 bits of (x XOR y) equal 0x7F800000.
+  eor     r3, r0, r1
+  cmp     r12, r3, lsl #1          // if inf * 0, this sets Z
+  lsr     r0, r12, #1              // set up return value of +infinity
+  orrne   r0, r0, r2, lsl #23      // if not inf * 0, put on the output sign
+  orreq   r0, r0, #0x400000        // otherwise, set the 'quiet NaN' bit
+  bx      lr                       // and return
+
+END_COMPILERRT_FUNCTION(__mulsf3)
+
+NO_EXEC_STACK_DIRECTIVE
diff --git a/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S b/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S
new file mode 100644
index 0000000000000..f2ede1013a9e6
--- /dev/null
+++ b/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S
@@ -0,0 +1,251 @@
+//===-- mulsf3.S - single-precision floating point multiplication ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements single-precision soft-float multiplication with the
+// IEEE-754 default rounding (to nearest, ties to even), in optimized Thumb1
+// assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "../../assembly.h"
+
+  .syntax unified
+  .text
+  .thumb
+  .p2align 2
+
+DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_fmul, __mulsf3)
+
+DEFINE_COMPILERRT_THUMB_FUNCTION(__mulsf3)
+  push {r4,r5,r6,lr}
+
+  // Get exponents of the inputs, and check for uncommon values. In the process
+  // of this we also compute the sign, because it's marginally quicker that
+  // way.
+  lsls    r2, r0, #1
+  adcs    r4, r4, r4    // set r4[0] to sign bit of x
+  lsls    r3, r1, #1
+  adcs    r4, r4, r3    // set r4[0] to the output sign
+  lsrs    r2, r2, #24
+  beq     LOCAL_LABEL(zerodenorm0)   // still do the next LSRS
+  lsrs    r3, r3, #24
+  beq     LOCAL_LABEL(zerodenorm)
+  cmp     r2, #255
+  beq     LOCAL_LABEL(naninf)
+  cmp     r3, #255
+  beq     LOCAL_LABEL(naninf)
+  // Compute the output exponent. We'll be generating our product _without_ the
+  // leading bit, so we subtract 0x7f rather than 0x80.
+  adds    r2, r2, r3
+  subs    r2, r2, #0x7f
+  // Blank off everything above the mantissas.
+  lsls    r0, r0, #9
+  lsls    r1, r1, #9
+LOCAL_LABEL(normalised): // we may come back here from zerodenorm
+  lsrs    r0, r0, #9
+  lsrs    r1, r1, #9
+  // Multiply. r0 and r1 are the mantissas of the inputs but without their
+  // leading bits, so the product we want in principle is P=(r0+2^23)(r1+2^23).
+  // P is at most (2^24-1)^2 < 2^48, so it fits in a word and a half.
+  //
+  // The technique below will actually compute P - 2^46, by not adding on the
+  // term where the two 2^23 are multiplied. The 48-bit result will be
+  // delivered in two output registers, one containing its bottom 32 bits and
+  // the other containing the top 32, so they overlap in the middle 16 bits.
+  // This is done using only two multiply instructions and some bookkeeping.
+  //
+  // In the comments I'll write X and Y for the original input mantissas (again
+  // without their leading bits). I'll also decompose them as X = xh + xl and
+  // Y = yh + yl, where xl and yl are in the range 0..2^8-1 and xh,yh are
+  // multiples of 2^8.
+  adds    r5, r0, r1
+  lsls    r5, r5, #7    // r5 = (X+Y) << 7
+  movs    r6, r0
+  muls    r6, r1, r6    // r6 is congruent mod 2^32 to X*Y
+  lsrs    r0, r0, #8
+  lsrs    r1, r1, #8
+  muls    r0, r1, r0
+  lsls    r1, r0, #16   // r1 is congruent mod 2^32 to xh*yh
+  subs    r3, r6, r1    // now r3 is congruent mod 2^32 to
+                        //   (X*Y) - (xh*yh) = xh*yl + xl*yh + xl*yl
+                        //   and hence, since that is at most 0xfeff0001,
+                        //   is _exactly_ equal to that
+  adds    r0, r0, r5    // r0 is now (xh*yh + (X+Y)<<23) >> 16
+  lsrs    r1, r3, #16   // r1 is the top 16 bits of r3, i.e.
+                        //   (xh*yl + xl*yh + xl*yl) >> 16
+  adds    r3, r0, r1    // now r3 equals
+                        //   (xh*yh + xh*yl + xl*yh + xl*yl + (X+Y)<<23) >> 16
+                        //   i.e. (X*Y + (X+Y)<<23) >> 16,
+                        //   i.e. (the right answer) >> 16.
+                        // Meanwhile, r6 is exactly the bottom 32 bits of the
+                        // right answer.
+  // Renormalise if necessary.
+  lsrs    r1, r3, #30
+  beq     LOCAL_LABEL(norenorm)
+  // Here we have to do something fiddly. Renormalisation would be a trivial
+  // job if we had the leading mantissa bit - just note that it's one bit
+  // position above where it should be, and shift right by one. But without
+  // that bit, we currently have (2x - 2^30), and we want (x - 2^30); just
+  // shifting right would of course give us (x - 2^29), so we must subtract an
+  // extra 2^29 to fix this up.
+  lsrs    r3, r3, #1
+  movs    r1, #1
+  lsls    r1, r1, #29
+  subs    r3, r3, r1
+  adds    r2, r2, #1
+LOCAL_LABEL(norenorm):
+  // Round and shift down to the right bit position.
+  lsrs    r0, r3, #7    // round bit goes into the carry flag
+  bcc     LOCAL_LABEL(rounded)
+  adds    r0, r0, #1
+  // In the round-up branch, we must also check if we have to round to even, by
+  // testing all the bits below the round bit. We will normally not expect to,
+  // so we do RTE by branching out of line and back again to avoid spending a
+  // branch in the common case.
+  lsls    r5, r3, #32-7+1  // check the bits shifted out of r3 above
+  bne     LOCAL_LABEL(rounded)          // if any is nonzero, we're not rounding to even
+  lsls    r5, r6, #15      // check the bottom 17 bits of the low-order 32
+                           //   (enough to overlap r3 even if we renormalised)
+  beq     LOCAL_LABEL(rte)              // if any is nonzero, fall through, else RTE
+LOCAL_LABEL(rounded):
+  // Put on the sign and exponent, check for underflow and overflow, and
+  // return.
+  //
+  // Underflow occurs iff r2 (the output exponent) <= 0. Overflow occurs if
+  // it's >= 0xFF. (Also if it's 0xFE and we rounded up to overflow, but since
+  // this code doesn't report exceptions, we can ignore this case because it'll
+  // happen to return the right answer regardless). So we handle most of this
+  // via an unsigned comparison against 0xFF, which leaves the one case of a
+  // zero exponent that we have to filter separately by testing the Z flag
+  // after we shift the exponent back up into place.
+  cmp     r2, #0xFF    // check for most over/underflows
+  bhs     LOCAL_LABEL(outflow)      // ... and branch out of line for them
+  lsls    r5, r2, #23  // shift the exponent into its output location
+  beq     LOCAL_LABEL(outflow)      // ... and branch again if it was 0
+  lsls    r4, r4, #31  // shift the output sign into place
+  orrs    r0, r0, r4   // and OR it in to the output
+  adds    r0, r0, r5   // OR in the mantissa
+  pop     {r4,r5,r6,pc} // and return
+
+LOCAL_LABEL(rte):
+  // Out-of-line handler for the round-to-even case. Clear the low mantissa bit
+  // and go back to the post-rounding code.
+  movs    r5, #1
+  bics    r0, r0, r5
+  b       LOCAL_LABEL(rounded)
+
+LOCAL_LABEL(outflow):
+  cmp     r2, #0
+  bgt     LOCAL_LABEL(overflow)
+  // To handle underflow, we construct an intermediate value in the IEEE 754
+  // style (using our existing full-length mantissa, and bias the exponent by
+  // +0xC0), and indicate whether that intermediate was rounded up, down or not
+  // at all. Then call the helper function funder, which will denormalise and
+  // re-round correctly.
+  lsls    r1, r0, #7    // shift up the post-rounding mantissa
+  subs    r1, r3, r1    //   and subtract it from the pre-rounding version
+  lsls    r6, r6, #15
+  cmp     r6, #1        // if the rest of the low bits are nonzero
+  adcs    r1, r1, r1    //   then set an extra bit at the bottom
+
+  lsls    r4, r4, #31
+  orrs    r0, r0, r4    // put on the sign
+  adds    r2, r2, #192  // bias the exponent
+  lsls    r3, r2, #23
+  adds    r0, r0, r3    // put on the biased exponent
+
+  bl      SYMBOL_NAME(__compiler_rt_funder)
+  pop     {r4,r5,r6,pc}
+
+LOCAL_LABEL(overflow):
+  // Handle overflow by returning an infinity of the correct sign.
+  lsls    r4, r4, #8    // move the sign up to bit 8
+  movs    r0, #0xff
+  orrs    r0, r0, r4    // fill in an exponent just below it
+  lsls    r0, r0, #23   // and shift those 9 bits up to the top of the word
+  pop     {r4,r5,r6,pc}
+
+  // We come here if there's at least one zero or denormal. On the fast path
+  // above, it was convenient to check these before checking NaNs and
+  // infinities, but NaNs take precedence, so now we're off the fast path, we
+  // must still check for those.
+  //
+  // At the main entry point 'zerodenorm' we want r2 and r3 to be the two input
+  // exponents. So if we branched after shifting-and-checking r2, we come to
+  // this earlier entry point 'zerodenorm0' so that we still shift r3.
+LOCAL_LABEL(zerodenorm0):
+  lsrs    r3, r3, #24
+LOCAL_LABEL(zerodenorm):
+  cmp     r2, #255
+  beq     LOCAL_LABEL(naninf)
+  cmp     r3, #255
+  beq     LOCAL_LABEL(naninf)
+  // Now we know we have at least one zero or denormal, and no NaN or infinity.
+  // Check if either input is actually zero. We've ruled out 0 * infinity by
+  // this point, so any zero input means we return zero of the correct sign.
+  lsls    r6, r0, #1        // is one input zero?
+  beq     LOCAL_LABEL(zero)              // yes, go and return zero
+  lsls    r6, r1, #1        // is the other one zero?
+  bne     LOCAL_LABEL(denorm)            // if not, one must have been a denormal
+LOCAL_LABEL(zero):
+  lsls    r0, r4, #31    // shift up the output sign to make the return value
+  pop     {r4,r5,r6,pc}
+
+  // Handle denormals via the helper function fnorm2, which will break both
+  // inputs up into mantissa and exponent, renormalising and generating a
+  // negative exponent if necessary.
+LOCAL_LABEL(denorm):
+  push    {r0,r1,r2,r3}
+  mov     r0, sp
+  bl      SYMBOL_NAME(__compiler_rt_fnorm2)
+  pop     {r0,r1,r2,r3}
+  // Convert fnorm2's return values into the right form to rejoin the main
+  // code path.
+  lsls    r0, r0, #1
+  lsls    r1, r1, #1
+  adds    r2, r2, r3
+  subs    r2, r2, #0x7f
+  b       LOCAL_LABEL(normalised)
+
+  // We come here if at least one input is a NaN or infinity. There may still
+  // be zeroes (or denormals, though they make no difference at this stage).
+LOCAL_LABEL(naninf):
+  movs    r6, #0xff
+  lsls    r6, r6, #24
+  lsls    r5, r0, #1
+  cmp     r5, r6
+  bhi     LOCAL_LABEL(nan)              // first operand is a NaN
+  lsls    r5, r1, #1
+  cmp     r5, r6
+  bhi     LOCAL_LABEL(nan)              // second operand is a NaN
+
+  // We know we have at least one infinity, and no NaNs. We might also have a
+  // zero, in which case we return the default quiet NaN.
+  lsls    r6, r0, #1
+  beq     LOCAL_LABEL(infzero)          // if r0 is a zero, r1 must be inf
+  lsls    r6, r1, #1
+  beq     LOCAL_LABEL(infzero)          // if r1 is a zero, r0 must be inf
+  // Otherwise we have infinity * infinity, or infinity * finite. Just return
+  // an appropriately signed infinity.
+  b       LOCAL_LABEL(overflow)         // reuse the code there
+
+  // We come here if at least one input is a NaN. Hand off to fnan2, which
+  // propagates an appropriate NaN to the output, dealing with the special
+  // cases of signalling/quiet NaNs.
+LOCAL_LABEL(nan):
+  bl      SYMBOL_NAME(__compiler_rt_fnan2)
+  pop     {r4,r5,r6,pc}
+
+  // Return a quiet NaN as the result of infinity * zero.
+LOCAL_LABEL(infzero):
+  ldr     r0, =0x7fc00000
+  pop     {r4,r5,r6,pc}
+
+END_COMPILERRT_FUNCTION(__mulsf3)
+
+NO_EXEC_STACK_DIRECTIVE
diff --git a/compiler-rt/test/builtins/CMakeLists.txt b/compiler-rt/test/builtins/CMakeLists.txt
index 63f4c94605c90..8e3cb35183ba7 100644
--- a/compiler-rt/test/builtins/CMakeLists.txt
+++ b/compiler-rt/test/builtins/CMakeLists.txt
@@ -35,6 +35,10 @@ if(APPLE)
   darwin_filter_host_archs(BUILTIN_SUPPORTED_ARCH BUILTIN_TEST_ARCH)
 endif()
 
+if(COMPILER_RT_ARM_OPTIMIZED_FP)  # let the unit tests see the option as a preprocessor define
+  list(APPEND BUILTINS_TEST_TARGET_CFLAGS -DCOMPILER_RT_ARM_OPTIMIZED_FP)  # NOTE(review): confirm the foreach below does not reset this list per arch
+endif()
+
 foreach(arch ${BUILTIN_TEST_ARCH})
   set(BUILTINS_TEST_TARGET_ARCH ${arch})
   string(TOLOWER "-${arch}-${OS_NAME}" BUILTINS_TEST_CONFIG_SUFFIX)
diff --git a/compiler-rt/test/builtins/Unit/divsf3_test.c b/compiler-rt/test/builtins/Unit/divsf3_test.c
index f8cb6169ac283..12c5df5fdaae1 100644
--- a/compiler-rt/test/builtins/Unit/divsf3_test.c
+++ b/compiler-rt/test/builtins/Unit/divsf3_test.c
@@ -1,115 +1,428 @@
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
 // RUN: %clang_builtins %s %librt -o %t && %run %t
 // REQUIRES: librt_has_divsf3
 
 #include "int_lib.h"
+#include <inttypes.h>
 #include <stdio.h>
 
 #include "fp_test.h"
 
+// By default this test uses compareResultF to check the returned floats, which
+// accepts any returned NaN if the expected result is the canonical NaN value
+// 0x7fc00000. For the Arm optimized FP implementation, which commits to a more
+// detailed handling of NaNs, we tighten up the check and include some extra
+// test cases specific to that NaN policy.
+#if (__arm__ && !(__thumb__ && !__thumb2__)) && COMPILER_RT_ARM_OPTIMIZED_FP
+#  define EXPECT_EXACT_RESULTS
+#  define ARM_NAN_HANDLING
+#endif
+
 // Returns: a / b
 COMPILER_RT_ABI float __divsf3(float a, float b);
 
-int test__divsf3(float a, float b, uint32_t expected)
-{
-    float x = __divsf3(a, b);
-    int ret = compareResultF(x, expected);
+int test__divsf3(uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) {
+  float a = fromRep32(a_rep), b = fromRep32(b_rep);
+  float x = __divsf3(a, b);
+#ifdef EXPECT_EXACT_RESULTS
+  int ret = toRep32(x) != expected_rep; // exact bit match required; nonzero = failure
+#else
+  int ret = compareResultF(x, expected_rep);
+#endif
 
-    if (ret){
-        printf("error in test__divsf3(%.20e, %.20e) = %.20e, "
-               "expected %.20e\n", a, b, x,
-               fromRep32(expected));
-    }
-    return ret;
+  if (ret) { // nonzero ret marks a failing case: print inputs, actual, expected
+    printf("error in test__divsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32
+           ", expected %08" PRIx32 "\n",
+           a_rep, b_rep, toRep32(x), expected_rep);
+  }
+  return ret; // 0 on pass, nonzero on failure (OR-ed into main's status)
 }
 
-int main()
-{
-    // Returned NaNs are assumed to be qNaN by default
-
-    // qNaN / any = qNaN
-    if (test__divsf3(makeQNaN32(), 3.F, UINT32_C(0x7fc00000)))
-      return 1;
-    // NaN / any = NaN
-    if (test__divsf3(makeNaN32(UINT32_C(0x123)), 3.F, UINT32_C(0x7fc00000)))
-      return 1;
-    // any / qNaN = qNaN
-    if (test__divsf3(3.F, makeQNaN32(), UINT32_C(0x7fc00000)))
-      return 1;
-    // any / NaN = NaN
-    if (test__divsf3(3.F, makeNaN32(UINT32_C(0x123)), UINT32_C(0x7fc00000)))
-      return 1;
-
-    // +Inf / positive = +Inf
-    if (test__divsf3(makeInf32(), 3.F, UINT32_C(0x7f800000)))
-      return 1;
-    // +Inf / negative = -Inf
-    if (test__divsf3(makeInf32(), -3.F, UINT32_C(0xff800000)))
-      return 1;
-    // -Inf / positive = -Inf
-    if (test__divsf3(makeNegativeInf32(), 3.F, UINT32_C(0xff800000)))
-      return 1;
-    // -Inf / negative = +Inf
-    if (test__divsf3(makeNegativeInf32(), -3.F, UINT32_C(0x7f800000)))
-      return 1;
-
-    // Inf / Inf = NaN
-    if (test__divsf3(makeInf32(), makeInf32(), UINT32_C(0x7fc00000)))
-      return 1;
-    // 0.0 / 0.0 = NaN
-    if (test__divsf3(+0x0.0p+0F, +0x0.0p+0F, UINT32_C(0x7fc00000)))
-      return 1;
-    // +0.0 / +Inf = +0.0
-    if (test__divsf3(+0x0.0p+0F, makeInf32(), UINT32_C(0x0)))
-      return 1;
-    // +Inf / +0.0 = +Inf
-    if (test__divsf3(makeInf32(), +0x0.0p+0F, UINT32_C(0x7f800000)))
-      return 1;
-
-    // positive / +0.0 = +Inf
-    if (test__divsf3(+1.F, +0x0.0p+0F, UINT32_C(0x7f800000)))
-      return 1;
-    // positive / -0.0 = -Inf
-    if (test__divsf3(+1.F, -0x0.0p+0F, UINT32_C(0xff800000)))
-      return 1;
-    // negative / +0.0 = -Inf
-    if (test__divsf3(-1.F, +0x0.0p+0F, UINT32_C(0xff800000)))
-      return 1;
-    // negative / -0.0 = +Inf
-    if (test__divsf3(-1.F, -0x0.0p+0F, UINT32_C(0x7f800000)))
-      return 1;
-
-    // 1/3
-    if (test__divsf3(1.F, 3.F, UINT32_C(0x3eaaaaab)))
-      return 1;
-    // smallest normal result
-    if (test__divsf3(0x1.0p-125F, 2.F, UINT32_C(0x00800000)))
-      return 1;
+int main(void) {
+  int status = 0;
 
-    // divisor is exactly 1.0
-    if (test__divsf3(0x1.0p+0F, 0x1.0p+0F, UINT32_C(0x3f800000)))
-      return 1;
-    // divisor is truncated to exactly 1.0 in UQ1.15
-    if (test__divsf3(0x1.0p+0F, 0x1.0001p+0F, UINT32_C(0x3f7fff00)))
-      return 1;
+  status |= test__divsf3(0x00000000, 0x00000001, 0x00000000);
+  status |= test__divsf3(0x00000000, 0x007fffff, 0x00000000);
+  status |= test__divsf3(0x00000000, 0x00800000, 0x00000000);
+  status |= test__divsf3(0x00000000, 0x00ffffff, 0x00000000);
+  status |= test__divsf3(0x00000000, 0x3f800000, 0x00000000);
+  status |= test__divsf3(0x00000000, 0x40a00000, 0x00000000);
+  status |= test__divsf3(0x00000000, 0x7effffff, 0x00000000);
+  status |= test__divsf3(0x00000000, 0x7f000000, 0x00000000);
+  status |= test__divsf3(0x00000000, 0x7f800000, 0x00000000);
+  status |= test__divsf3(0x00000000, 0x80000002, 0x80000000);
+  status |= test__divsf3(0x00000000, 0x807fffff, 0x80000000);
+  status |= test__divsf3(0x00000000, 0x80800001, 0x80000000);
+  status |= test__divsf3(0x00000000, 0x81000000, 0x80000000);
+  status |= test__divsf3(0x00000000, 0xc0400000, 0x80000000);
+  status |= test__divsf3(0x00000000, 0xc0e00000, 0x80000000);
+  status |= test__divsf3(0x00000000, 0xfe7fffff, 0x80000000);
+  status |= test__divsf3(0x00000000, 0xff000000, 0x80000000);
+  status |= test__divsf3(0x00000000, 0xff800000, 0x80000000);
+  status |= test__divsf3(0x00000001, 0x00000000, 0x7f800000);
+  status |= test__divsf3(0x00000001, 0x3e000000, 0x00000008);
+  status |= test__divsf3(0x00000001, 0x3f000000, 0x00000002);
+  status |= test__divsf3(0x00000001, 0x40000000, 0x00000000);
+  status |= test__divsf3(0x00000001, 0x7f7fffff, 0x00000000);
+  status |= test__divsf3(0x00000001, 0x7f800000, 0x00000000);
+  status |= test__divsf3(0x00000001, 0xc0000000, 0x80000000);
+  status |= test__divsf3(0x00000001, 0xff7fffff, 0x80000000);
+  status |= test__divsf3(0x00000002, 0x80000000, 0xff800000);
+  status |= test__divsf3(0x00000002, 0xff800000, 0x80000000);
+  status |= test__divsf3(0x00000009, 0x41100000, 0x00000001);
+  status |= test__divsf3(0x00000009, 0xc1100000, 0x80000001);
+  status |= test__divsf3(0x007ffff7, 0x3f7ffffe, 0x007ffff8);
+  status |= test__divsf3(0x007ffffe, 0x3f7ffffe, 0x007fffff);
+  status |= test__divsf3(0x007fffff, 0x00000000, 0x7f800000);
+  status |= test__divsf3(0x007fffff, 0x3b000000, 0x04fffffe);
+  status |= test__divsf3(0x007fffff, 0x3f000000, 0x00fffffe);
+  status |= test__divsf3(0x007fffff, 0x3f800000, 0x007fffff);
+  status |= test__divsf3(0x007fffff, 0x3f800002, 0x007ffffd);
+  status |= test__divsf3(0x007fffff, 0x7f800000, 0x00000000);
+  status |= test__divsf3(0x007fffff, 0x80000000, 0xff800000);
+  status |= test__divsf3(0x007fffff, 0xbf800000, 0x807fffff);
+  status |= test__divsf3(0x007fffff, 0xff800000, 0x80000000);
+  status |= test__divsf3(0x00800000, 0x00000000, 0x7f800000);
+  status |= test__divsf3(0x00800000, 0x3f800001, 0x007fffff);
+  status |= test__divsf3(0x00800000, 0x7f800000, 0x00000000);
+  status |= test__divsf3(0x00800001, 0x3f800002, 0x007fffff);
+  status |= test__divsf3(0x00800001, 0x80000000, 0xff800000);
+  status |= test__divsf3(0x00800001, 0xff800000, 0x80000000);
+  status |= test__divsf3(0x00800002, 0x3f800006, 0x007ffffc);
+  status |= test__divsf3(0x00fffffe, 0x40000000, 0x007fffff);
+  status |= test__divsf3(0x00ffffff, 0x00000000, 0x7f800000);
+  status |= test__divsf3(0x00ffffff, 0x40000000, 0x00800000);
+  status |= test__divsf3(0x00ffffff, 0x7f800000, 0x00000000);
+  status |= test__divsf3(0x01000000, 0x00800000, 0x40000000);
+  status |= test__divsf3(0x01000000, 0x80000000, 0xff800000);
+  status |= test__divsf3(0x01000000, 0xc0000000, 0x80800000);
+  status |= test__divsf3(0x01000000, 0xff800000, 0x80000000);
+  status |= test__divsf3(0x01000001, 0x00800001, 0x40000000);
+  status |= test__divsf3(0x01000001, 0xc0000000, 0x80800001);
+  status |= test__divsf3(0x01000003, 0x80800003, 0xc0000000);
+  status |= test__divsf3(0x01000003, 0xc0000000, 0x80800003);
+  status |= test__divsf3(0x3f7ffff7, 0x3f7ffffb, 0x3f7ffffc);
+  status |= test__divsf3(0x3f7ffff7, 0x3f7ffffe, 0x3f7ffff9);
+  status |= test__divsf3(0x3f7ffff8, 0x3f7ffffc, 0x3f7ffffc);
+  status |= test__divsf3(0x3f7ffff8, 0x3f7ffffd, 0x3f7ffffb);
+  status |= test__divsf3(0x3f7ffffa, 0x3f7ffff9, 0x3f800001);
+  status |= test__divsf3(0x3f7ffffb, 0x3f7ffff9, 0x3f800001);
+  status |= test__divsf3(0x3f7ffffc, 0x3f7ffff9, 0x3f800002);
+  status |= test__divsf3(0x3f7ffffc, 0x3f7ffffd, 0x3f7fffff);
+  status |= test__divsf3(0x3f7ffffc, 0x3f7ffffe, 0x3f7ffffe);
+  status |= test__divsf3(0x3f7ffffc, 0x3f7fffff, 0x3f7ffffd);
+  status |= test__divsf3(0x3f7ffffc, 0x3f800001, 0x3f7ffffa);
+  status |= test__divsf3(0x3f7ffffd, 0x3f7ffff9, 0x3f800002);
+  status |= test__divsf3(0x3f7ffffd, 0x3f7ffffc, 0x3f800001);
+  status |= test__divsf3(0x3f7ffffd, 0x3f7ffffe, 0x3f7fffff);
+  status |= test__divsf3(0x3f7ffffd, 0x3f7fffff, 0x3f7ffffe);
+  status |= test__divsf3(0x3f7ffffd, 0x3f800001, 0x3f7ffffb);
+  status |= test__divsf3(0x3f7ffffd, 0x3f800002, 0x3f7ffff9);
+  status |= test__divsf3(0x3f7ffffe, 0x3f7ffff9, 0x3f800003);
+  status |= test__divsf3(0x3f7ffffe, 0x3f7ffffc, 0x3f800001);
+  status |= test__divsf3(0x3f7ffffe, 0x3f7ffffd, 0x3f800001);
+  status |= test__divsf3(0x3f7ffffe, 0x3f7fffff, 0x3f7fffff);
+  status |= test__divsf3(0x3f7ffffe, 0x3f800001, 0x3f7ffffc);
+  status |= test__divsf3(0x3f7ffffe, 0x3f800002, 0x3f7ffffa);
+  status |= test__divsf3(0x3f7ffffe, 0x3f800003, 0x3f7ffff8);
+  status |= test__divsf3(0x3f7fffff, 0x3f7ffff9, 0x3f800003);
+  status |= test__divsf3(0x3f7fffff, 0x3f7ffffc, 0x3f800002);
+  status |= test__divsf3(0x3f7fffff, 0x3f7ffffd, 0x3f800001);
+  status |= test__divsf3(0x3f7fffff, 0x3f7ffffe, 0x3f800001);
+  status |= test__divsf3(0x3f7fffff, 0x3f800001, 0x3f7ffffd);
+  status |= test__divsf3(0x3f7fffff, 0x3f800002, 0x3f7ffffb);
+  status |= test__divsf3(0x3f7fffff, 0x3f800003, 0x3f7ffff9);
+  status |= test__divsf3(0x3f7fffff, 0x3f800004, 0x3f7ffff7);
+  status |= test__divsf3(0x3f800000, 0x00000000, 0x7f800000);
+  status |= test__divsf3(0x3f800000, 0x3f7ffff7, 0x3f800005);
+  status |= test__divsf3(0x3f800000, 0x3f7ffff8, 0x3f800004);
+  status |= test__divsf3(0x3f800000, 0x3f7ffffb, 0x3f800003);
+  status |= test__divsf3(0x3f800000, 0x3f7ffffc, 0x3f800002);
+  status |= test__divsf3(0x3f800000, 0x3f7ffffd, 0x3f800002);
+  status |= test__divsf3(0x3f800000, 0x3f7ffffe, 0x3f800001);
+  status |= test__divsf3(0x3f800000, 0x3f7fffff, 0x3f800001);
+  status |= test__divsf3(0x3f800000, 0x3f800000, 0x3f800000);
+  status |= test__divsf3(0x3f800000, 0x3f800001, 0x3f7ffffe);
+  status |= test__divsf3(0x3f800000, 0x3f800002, 0x3f7ffffc);
+  status |= test__divsf3(0x3f800000, 0x3f800003, 0x3f7ffffa);
+  status |= test__divsf3(0x3f800000, 0x3f800004, 0x3f7ffff8);
+  status |= test__divsf3(0x3f800000, 0x7f800000, 0x00000000);
+  status |= test__divsf3(0x3f800001, 0x3f7ffffb, 0x3f800004);
+  status |= test__divsf3(0x3f800001, 0x3f7ffffd, 0x3f800003);
+  status |= test__divsf3(0x3f800001, 0x3f7ffffe, 0x3f800002);
+  status |= test__divsf3(0x3f800001, 0x3f7fffff, 0x3f800002);
+  status |= test__divsf3(0x3f800001, 0x3f800002, 0x3f7ffffe);
+  status |= test__divsf3(0x3f800001, 0x3f800003, 0x3f7ffffc);
+  status |= test__divsf3(0x3f800002, 0x3f7ffffc, 0x3f800004);
+  status |= test__divsf3(0x3f800002, 0x3f7ffffd, 0x3f800004);
+  status |= test__divsf3(0x3f800002, 0x3f7ffffe, 0x3f800003);
+  status |= test__divsf3(0x3f800002, 0x3f7fffff, 0x3f800003);
+  status |= test__divsf3(0x3f800002, 0x3f800001, 0x3f800001);
+  status |= test__divsf3(0x3f800002, 0x3f800003, 0x3f7ffffe);
+  status |= test__divsf3(0x3f800003, 0x3f7ffffd, 0x3f800005);
+  status |= test__divsf3(0x3f800003, 0x3f7ffffe, 0x3f800004);
+  status |= test__divsf3(0x3f800003, 0x3f7fffff, 0x3f800004);
+  status |= test__divsf3(0x3f800003, 0x3f800001, 0x3f800002);
+  status |= test__divsf3(0x3f800004, 0x3f7ffffe, 0x3f800005);
+  status |= test__divsf3(0x3f800004, 0x3f800001, 0x3f800003);
+  status |= test__divsf3(0x3f800004, 0x3f800007, 0x3f7ffffa);
+  status |= test__divsf3(0x3f800005, 0x3f7fffff, 0x3f800006);
+  status |= test__divsf3(0x3f800006, 0x3f800008, 0x3f7ffffc);
+  status |= test__divsf3(0x3f800007, 0x3f800002, 0x3f800005);
+  status |= test__divsf3(0x3f800009, 0x3f800008, 0x3f800001);
+  status |= test__divsf3(0x40000000, 0x3f800000, 0x40000000);
+  status |= test__divsf3(0x40000000, 0xbf800000, 0xc0000000);
+  status |= test__divsf3(0x40400000, 0x80000000, 0xff800000);
+  status |= test__divsf3(0x40400000, 0xc0400000, 0xbf800000);
+  status |= test__divsf3(0x40400000, 0xff800000, 0x80000000);
+  status |= test__divsf3(0x40a00000, 0x00000000, 0x7f800000);
+  status |= test__divsf3(0x40a00000, 0x40a00000, 0x3f800000);
+  status |= test__divsf3(0x40a00000, 0x7f800000, 0x00000000);
+  status |= test__divsf3(0x40e00000, 0x80000000, 0xff800000);
+  status |= test__divsf3(0x40e00000, 0xff800000, 0x80000000);
+  status |= test__divsf3(0x41000000, 0x40000000, 0x40800000);
+  status |= test__divsf3(0x41100000, 0x40400000, 0x40400000);
+  status |= test__divsf3(0x7b000000, 0x05000000, 0x7f800000);
+  status |= test__divsf3(0x7e7fffff, 0x80000000, 0xff800000);
+  status |= test__divsf3(0x7efffffd, 0xc0000000, 0xfe7ffffd);
+  status |= test__divsf3(0x7effffff, 0x00000000, 0x7f800000);
+  status |= test__divsf3(0x7effffff, 0x7f800000, 0x00000000);
+  status |= test__divsf3(0x7f000000, 0x00000000, 0x7f800000);
+  status |= test__divsf3(0x7f000000, 0x007fffff, 0x7f800000);
+  status |= test__divsf3(0x7f000000, 0x3f000000, 0x7f800000);
+  status |= test__divsf3(0x7f000000, 0x40000000, 0x7e800000);
+  status |= test__divsf3(0x7f000000, 0x7f800000, 0x00000000);
+  status |= test__divsf3(0x7f000000, 0x80000000, 0xff800000);
+  status |= test__divsf3(0x7f000000, 0xbf000000, 0xff800000);
+  status |= test__divsf3(0x7f000000, 0xc0000000, 0xfe800000);
+  status |= test__divsf3(0x7f000000, 0xff800000, 0x80000000);
+  status |= test__divsf3(0x7f000003, 0xfe800003, 0xc0000000);
+  status |= test__divsf3(0x7f7ffffd, 0x40800000, 0x7e7ffffd);
+  status |= test__divsf3(0x7f7ffffd, 0xc0800000, 0xfe7ffffd);
+  status |= test__divsf3(0x7f7fffff, 0x00000001, 0x7f800000);
+  status |= test__divsf3(0x7f7fffff, 0x3f7fffff, 0x7f800000);
+  status |= test__divsf3(0x7f7fffff, 0x7e7fffff, 0x40800000);
+  status |= test__divsf3(0x7f7fffff, 0x7effffff, 0x40000000);
+  status |= test__divsf3(0x7f7fffff, 0xc0000000, 0xfeffffff);
+  status |= test__divsf3(0x7f7fffff, 0xfe7fffff, 0xc0800000);
+  status |= test__divsf3(0x7f7fffff, 0xff800000, 0x80000000);
+  status |= test__divsf3(0x7f800000, 0x00000000, 0x7f800000);
+  status |= test__divsf3(0x7f800000, 0x00000001, 0x7f800000);
+  status |= test__divsf3(0x7f800000, 0x007fffff, 0x7f800000);
+  status |= test__divsf3(0x7f800000, 0x00800000, 0x7f800000);
+  status |= test__divsf3(0x7f800000, 0x00ffffff, 0x7f800000);
+  status |= test__divsf3(0x7f800000, 0x3f800000, 0x7f800000);
+  status |= test__divsf3(0x7f800000, 0x40a00000, 0x7f800000);
+  status |= test__divsf3(0x7f800000, 0x7effffff, 0x7f800000);
+  status |= test__divsf3(0x7f800000, 0x7f000000, 0x7f800000);
+  status |= test__divsf3(0x7f800000, 0x80000000, 0xff800000);
+  status |= test__divsf3(0x7f800000, 0x80000002, 0xff800000);
+  status |= test__divsf3(0x7f800000, 0x807fffff, 0xff800000);
+  status |= test__divsf3(0x7f800000, 0x80800001, 0xff800000);
+  status |= test__divsf3(0x7f800000, 0x81000000, 0xff800000);
+  status |= test__divsf3(0x7f800000, 0xc0400000, 0xff800000);
+  status |= test__divsf3(0x7f800000, 0xc0e00000, 0xff800000);
+  status |= test__divsf3(0x7f800000, 0xfe7fffff, 0xff800000);
+  status |= test__divsf3(0x7f800000, 0xff000000, 0xff800000);
+  status |= test__divsf3(0x7f800000, 0xff7fffff, 0xff800000);
+  status |= test__divsf3(0x80000000, 0x00000003, 0x80000000);
+  status |= test__divsf3(0x80000000, 0x007fffff, 0x80000000);
+  status |= test__divsf3(0x80000000, 0x00800001, 0x80000000);
+  status |= test__divsf3(0x80000000, 0x01000000, 0x80000000);
+  status |= test__divsf3(0x80000000, 0x40000000, 0x80000000);
+  status |= test__divsf3(0x80000000, 0x40c00000, 0x80000000);
+  status |= test__divsf3(0x80000000, 0x7e7fffff, 0x80000000);
+  status |= test__divsf3(0x80000000, 0x7e800000, 0x80000000);
+  status |= test__divsf3(0x80000000, 0x7f800000, 0x80000000);
+  status |= test__divsf3(0x80000000, 0x80000004, 0x00000000);
+  status |= test__divsf3(0x80000000, 0x807fffff, 0x00000000);
+  status |= test__divsf3(0x80000000, 0x80800000, 0x00000000);
+  status |= test__divsf3(0x80000000, 0x80ffffff, 0x00000000);
+  status |= test__divsf3(0x80000000, 0xc0800000, 0x00000000);
+  status |= test__divsf3(0x80000000, 0xc1000000, 0x00000000);
+  status |= test__divsf3(0x80000000, 0xfe800000, 0x00000000);
+  status |= test__divsf3(0x80000000, 0xfeffffff, 0x00000000);
+  status |= test__divsf3(0x80000000, 0xff800000, 0x00000000);
+  status |= test__divsf3(0x80000001, 0x3f000000, 0x80000002);
+  status |= test__divsf3(0x80000001, 0x40000000, 0x80000000);
+  status |= test__divsf3(0x80000001, 0x7f7fffff, 0x80000000);
+  status |= test__divsf3(0x80000001, 0xc0000000, 0x00000000);
+  status |= test__divsf3(0x80000001, 0xff7fffff, 0x00000000);
+  status |= test__divsf3(0x80000003, 0x00000000, 0xff800000);
+  status |= test__divsf3(0x80000003, 0x7f800000, 0x80000000);
+  status |= test__divsf3(0x80000004, 0x80000000, 0x7f800000);
+  status |= test__divsf3(0x80000004, 0xff800000, 0x00000000);
+  status |= test__divsf3(0x807ffff8, 0x3f7ffffe, 0x807ffff9);
+  status |= test__divsf3(0x807fffff, 0x00000000, 0xff800000);
+  status |= test__divsf3(0x807fffff, 0x7f800000, 0x80000000);
+  status |= test__divsf3(0x807fffff, 0x80000000, 0x7f800000);
+  status |= test__divsf3(0x807fffff, 0xff800000, 0x00000000);
+  status |= test__divsf3(0x80800000, 0x3f800001, 0x807fffff);
+  status |= test__divsf3(0x80800000, 0x80000000, 0x7f800000);
+  status |= test__divsf3(0x80800000, 0xff800000, 0x00000000);
+  status |= test__divsf3(0x80800001, 0x00000000, 0xff800000);
+  status |= test__divsf3(0x80800001, 0x7f800000, 0x80000000);
+  status |= test__divsf3(0x80ffffff, 0x80000000, 0x7f800000);
+  status |= test__divsf3(0x80ffffff, 0xff800000, 0x00000000);
+  status |= test__divsf3(0x81000000, 0x00000000, 0xff800000);
+  status |= test__divsf3(0x81000000, 0x7f800000, 0x80000000);
+  status |= test__divsf3(0x81000001, 0x00800001, 0xc0000000);
+  status |= test__divsf3(0x81000005, 0x00800005, 0xc0000000);
+  status |= test__divsf3(0xbf800000, 0x3f800000, 0xbf800000);
+  status |= test__divsf3(0xbf800000, 0xbf800000, 0x3f800000);
+  status |= test__divsf3(0xc0000000, 0x00000000, 0xff800000);
+  status |= test__divsf3(0xc0000000, 0x3f800000, 0xc0000000);
+  status |= test__divsf3(0xc0000000, 0x7f800000, 0x80000000);
+  status |= test__divsf3(0xc0000000, 0xbf800000, 0x40000000);
+  status |= test__divsf3(0xc0800000, 0x80000000, 0x7f800000);
+  status |= test__divsf3(0xc0800000, 0xff800000, 0x00000000);
+  status |= test__divsf3(0xc0c00000, 0x00000000, 0xff800000);
+  status |= test__divsf3(0xc0c00000, 0x7f800000, 0x80000000);
+  status |= test__divsf3(0xc0c00000, 0xc0400000, 0x40000000);
+  status |= test__divsf3(0xc0e00000, 0x40e00000, 0xbf800000);
+  status |= test__divsf3(0xc1000000, 0x40000000, 0xc0800000);
+  status |= test__divsf3(0xc1000000, 0x80000000, 0x7f800000);
+  status |= test__divsf3(0xc1000000, 0xff800000, 0x00000000);
+  status |= test__divsf3(0xc1100000, 0xc0400000, 0x40400000);
+  status |= test__divsf3(0xfe7fffff, 0x00000000, 0xff800000);
+  status |= test__divsf3(0xfe7fffff, 0x7f800000, 0x80000000);
+  status |= test__divsf3(0xfe800000, 0x00000000, 0xff800000);
+  status |= test__divsf3(0xfe800000, 0x7f800000, 0x80000000);
+  status |= test__divsf3(0xfe800000, 0x80000000, 0x7f800000);
+  status |= test__divsf3(0xfe800000, 0xff800000, 0x00000000);
+  status |= test__divsf3(0xfeffffff, 0x40000000, 0xfe7fffff);
+  status |= test__divsf3(0xfeffffff, 0x80000000, 0x7f800000);
+  status |= test__divsf3(0xff000000, 0x3f000000, 0xff800000);
+  status |= test__divsf3(0xff000000, 0xbf000000, 0x7f800000);
+  status |= test__divsf3(0xff000001, 0x7e800001, 0xc0000000);
+  status |= test__divsf3(0xff7ffffd, 0x40800000, 0xfe7ffffd);
+  status |= test__divsf3(0xff7ffffd, 0xc0800000, 0x7e7ffffd);
+  status |= test__divsf3(0xff7fffff, 0x7e7fffff, 0xc0800000);
+  status |= test__divsf3(0xff7fffff, 0xfe7fffff, 0x40800000);
+  status |= test__divsf3(0xff7fffff, 0xff800000, 0x00000000);
+  status |= test__divsf3(0xff800000, 0x00000000, 0xff800000);
+  status |= test__divsf3(0xff800000, 0x00000003, 0xff800000);
+  status |= test__divsf3(0xff800000, 0x007fffff, 0xff800000);
+  status |= test__divsf3(0xff800000, 0x00800001, 0xff800000);
+  status |= test__divsf3(0xff800000, 0x01000000, 0xff800000);
+  status |= test__divsf3(0xff800000, 0x40000000, 0xff800000);
+  status |= test__divsf3(0xff800000, 0x40c00000, 0xff800000);
+  status |= test__divsf3(0xff800000, 0x7e800000, 0xff800000);
+  status |= test__divsf3(0xff800000, 0x80000000, 0x7f800000);
+  status |= test__divsf3(0xff800000, 0x80000004, 0x7f800000);
+  status |= test__divsf3(0xff800000, 0x807fffff, 0x7f800000);
+  status |= test__divsf3(0xff800000, 0x80800000, 0x7f800000);
+  status |= test__divsf3(0xff800000, 0x80ffffff, 0x7f800000);
+  status |= test__divsf3(0xff800000, 0xc0800000, 0x7f800000);
+  status |= test__divsf3(0xff800000, 0xc1000000, 0x7f800000);
+  status |= test__divsf3(0xff800000, 0xfe800000, 0x7f800000);
+  status |= test__divsf3(0xff800000, 0xff7fffff, 0x7f800000);
+  status |= test__divsf3(0x2cbed883, 0x333f6113, 0x38ff4953);
+  status |= test__divsf3(0x3f87ffff, 0x7f001000, 0x0043f781);
 
-    // smallest normal value divided by 2.0
-    if (test__divsf3(0x1.0p-126F, 2.0F, UINT32_C(0x00400000)))
-      return 1;
-    // smallest subnormal result
-    if (test__divsf3(0x1.0p-126F, 0x1p+23F, UINT32_C(0x00000001)))
-      return 1;
+  // Test that the result of an operation is a NaN at all when it should be.
+  //
+  // In most configurations these tests' results are checked using
+  // compareResultF, so we set all the answers to the canonical NaN 0x7fc00000,
+  // which causes compareResultF to accept any NaN encoding. We also use the
+  // same value as the input NaN in tests that have one, so that even in
+  // EXPECT_EXACT_RESULTS mode these tests should pass, because 0x7fc00000 is
+  // still the exact expected NaN.
+  status |= test__divsf3(0x00000000, 0x00000000, 0x7fc00000);
+  status |= test__divsf3(0x00000000, 0x80000000, 0x7fc00000);
+  status |= test__divsf3(0x7f800000, 0x7f800000, 0x7fc00000);
+  status |= test__divsf3(0x7f800000, 0xff800000, 0x7fc00000);
+  status |= test__divsf3(0x80000000, 0x00000000, 0x7fc00000);
+  status |= test__divsf3(0x80000000, 0x80000000, 0x7fc00000);
+  status |= test__divsf3(0xff800000, 0x7f800000, 0x7fc00000);
+  status |= test__divsf3(0xff800000, 0xff800000, 0x7fc00000);
+  status |= test__divsf3(0x3f800000, 0x7fc00000, 0x7fc00000);
+  status |= test__divsf3(0x7fc00000, 0x3f800000, 0x7fc00000);
+  status |= test__divsf3(0x7fc00000, 0x7fc00000, 0x7fc00000);
 
-    // some misc test cases obtained by fuzzing against h/w implementation
-    if (test__divsf3(-0x1.3e75e6p-108F, -0x1.cf372p+38F, UINT32_C(0x00000006)))
-      return 1;
-    if (test__divsf3(0x1.e77c54p+81F, -0x1.e77c52p-47F, UINT32_C(0xff800000)))
-      return 1;
-    if (test__divsf3(0x1.fffffep-126F, 2.F, UINT32_C(0x00800000)))
-      return 1;
+#ifdef ARM_NAN_HANDLING
+  // Tests specific to the NaN handling of Arm hardware, mimicked by
+  // arm/divsf3.S:
+  //
+  //  - a quiet NaN is distinguished by the top mantissa bit being 1
+  //
+  //  - if a signalling NaN appears in the input, the output quiet NaN is
+  //    obtained by setting its top mantissa bit and leaving everything else
+  //    unchanged
+  //
+  //  - if both operands are signalling NaNs then the output NaN is derived
+  //    from the first operand
+  //
+  //  - if both operands are quiet NaNs then the output NaN is the first
+  //    operand
+  //
+  //  - invalid operations not involving an input NaN return the quiet
+  //    NaN with fewest bits set, 0x7fc00000.
 
-    // test 1 / (1 - eps(0.5)) = 1 + eps(1)
-    if (test__divsf3(1.0F, 0x1.fffffep-1F, UINT32_C(0x3f800001)))
-      return 1;
+  status |= test__divsf3(0x00000000, 0x00000000, 0x7fc00000);
+  status |= test__divsf3(0x00000000, 0x7fad4be3, 0x7fed4be3);
+  status |= test__divsf3(0x00000000, 0x7fdf48c7, 0x7fdf48c7);
+  status |= test__divsf3(0x00000000, 0x80000000, 0x7fc00000);
+  status |= test__divsf3(0x00000001, 0x7f970eba, 0x7fd70eba);
+  status |= test__divsf3(0x00000001, 0x7fc35716, 0x7fc35716);
+  status |= test__divsf3(0x007fffff, 0x7fbf52d6, 0x7fff52d6);
+  status |= test__divsf3(0x007fffff, 0x7fc7a2df, 0x7fc7a2df);
+  status |= test__divsf3(0x3f800000, 0x7f987a85, 0x7fd87a85);
+  status |= test__divsf3(0x3f800000, 0x7fc50124, 0x7fc50124);
+  status |= test__divsf3(0x7f7fffff, 0x7f95fd6f, 0x7fd5fd6f);
+  status |= test__divsf3(0x7f7fffff, 0x7ffc28dc, 0x7ffc28dc);
+  status |= test__divsf3(0x7f800000, 0x7f800000, 0x7fc00000);
+  status |= test__divsf3(0x7f800000, 0x7f8dd790, 0x7fcdd790);
+  status |= test__divsf3(0x7f800000, 0x7fd2ef2b, 0x7fd2ef2b);
+  status |= test__divsf3(0x7f800000, 0xff800000, 0x7fc00000);
+  status |= test__divsf3(0x7f99b09d, 0x00000000, 0x7fd9b09d);
+  status |= test__divsf3(0x7f93541e, 0x00000001, 0x7fd3541e);
+  status |= test__divsf3(0x7f9fc002, 0x007fffff, 0x7fdfc002);
+  status |= test__divsf3(0x7fb5db77, 0x3f800000, 0x7ff5db77);
+  status |= test__divsf3(0x7f9f5d92, 0x7f7fffff, 0x7fdf5d92);
+  status |= test__divsf3(0x7fac7a36, 0x7f800000, 0x7fec7a36);
+  status |= test__divsf3(0x7fb42008, 0x7fb0ee07, 0x7ff42008);
+  status |= test__divsf3(0x7f8bd740, 0x7fc7aaf1, 0x7fcbd740);
+  status |= test__divsf3(0x7f9bb57b, 0x80000000, 0x7fdbb57b);
+  status |= test__divsf3(0x7f951a78, 0x80000001, 0x7fd51a78);
+  status |= test__divsf3(0x7f9ba63b, 0x807fffff, 0x7fdba63b);
+  status |= test__divsf3(0x7f89463c, 0xbf800000, 0x7fc9463c);
+  status |= test__divsf3(0x7fb63563, 0xff7fffff, 0x7ff63563);
+  status |= test__divsf3(0x7f90886e, 0xff800000, 0x7fd0886e);
+  status |= test__divsf3(0x7fe8c15e, 0x00000000, 0x7fe8c15e);
+  status |= test__divsf3(0x7fe915ae, 0x00000001, 0x7fe915ae);
+  status |= test__divsf3(0x7ffa9b42, 0x007fffff, 0x7ffa9b42);
+  status |= test__divsf3(0x7fdad0f5, 0x3f800000, 0x7fdad0f5);
+  status |= test__divsf3(0x7fd10dcb, 0x7f7fffff, 0x7fd10dcb);
+  status |= test__divsf3(0x7fd08e8a, 0x7f800000, 0x7fd08e8a);
+  status |= test__divsf3(0x7fc3a9e6, 0x7f91a816, 0x7fd1a816);
+  status |= test__divsf3(0x7fdb229c, 0x7fc26c68, 0x7fdb229c);
+  status |= test__divsf3(0x7fc9f6bb, 0x80000000, 0x7fc9f6bb);
+  status |= test__divsf3(0x7ffa178b, 0x80000001, 0x7ffa178b);
+  status |= test__divsf3(0x7fef2a0b, 0x807fffff, 0x7fef2a0b);
+  status |= test__divsf3(0x7ffc885b, 0xbf800000, 0x7ffc885b);
+  status |= test__divsf3(0x7fd26e8c, 0xff7fffff, 0x7fd26e8c);
+  status |= test__divsf3(0x7fc55329, 0xff800000, 0x7fc55329);
+  status |= test__divsf3(0x80000000, 0x00000000, 0x7fc00000);
+  status |= test__divsf3(0x80000000, 0x7fa833ae, 0x7fe833ae);
+  status |= test__divsf3(0x80000000, 0x7fc4df63, 0x7fc4df63);
+  status |= test__divsf3(0x80000000, 0x80000000, 0x7fc00000);
+  status |= test__divsf3(0x80000001, 0x7f98827d, 0x7fd8827d);
+  status |= test__divsf3(0x80000001, 0x7fd7acc5, 0x7fd7acc5);
+  status |= test__divsf3(0x807fffff, 0x7fad19c0, 0x7fed19c0);
+  status |= test__divsf3(0x807fffff, 0x7ffe1907, 0x7ffe1907);
+  status |= test__divsf3(0xbf800000, 0x7fa95487, 0x7fe95487);
+  status |= test__divsf3(0xbf800000, 0x7fd2bbee, 0x7fd2bbee);
+  status |= test__divsf3(0xff7fffff, 0x7f86ba21, 0x7fc6ba21);
+  status |= test__divsf3(0xff7fffff, 0x7feb00d7, 0x7feb00d7);
+  status |= test__divsf3(0xff800000, 0x7f800000, 0x7fc00000);
+  status |= test__divsf3(0xff800000, 0x7f857fdc, 0x7fc57fdc);
+  status |= test__divsf3(0xff800000, 0x7fde0397, 0x7fde0397);
+  status |= test__divsf3(0xff800000, 0xff800000, 0x7fc00000);
+#endif // ARM_NAN_HANDLING
 
-    return 0;
+  return status;
 }
diff --git a/compiler-rt/test/builtins/Unit/mulsf3_test.c b/compiler-rt/test/builtins/Unit/mulsf3_test.c
new file mode 100644
index 0000000000000..7dc7c8ad39c32
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/mulsf3_test.c
@@ -0,0 +1,616 @@
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_mulsf3
+
+#include "int_lib.h"
+#include <inttypes.h>
+#include <stdio.h>
+
+#include "fp_test.h"
+
+// By default this test uses compareResultF to check the returned floats, which
+// accepts any returned NaN if the expected result is the canonical NaN value
+// 0x7fc00000. For the Arm optimized FP implementation, which commits to a more
+// detailed handling of NaNs, we tighten up the check and include some extra
+// test cases specific to that NaN policy.
+#if (__arm__ && !(__thumb__ && !__thumb2__)) && COMPILER_RT_ARM_OPTIMIZED_FP
+#  define EXPECT_EXACT_RESULTS
+#  define ARM_NAN_HANDLING
+#endif
+
+// Returns: a * b
+COMPILER_RT_ABI float __mulsf3(float a, float b);
+
+// Runs a single __mulsf3 test case.
+//
+// a_rep and b_rep are the raw 32-bit encodings of the two operands, and
+// expected_rep is the raw 32-bit encoding of the expected product.
+//
+// Returns 0 when the result is accepted. Otherwise prints a diagnostic
+// showing the inputs, the actual output and the expected output, and returns
+// nonzero, so that main() can OR the return values into its exit status.
+int test__mulsf3(uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) {
+  float a = fromRep32(a_rep), b = fromRep32(b_rep);
+  float x = __mulsf3(a, b);
+#ifdef EXPECT_EXACT_RESULTS
+  // Bit-exact comparison. 'ret' is a failure flag (see the check below), so
+  // it must be nonzero when the encodings DIFFER, not when they match.
+  int ret = toRep32(x) != expected_rep;
+#else
+  // Looser comparison: per the note at the top of this file, any NaN is
+  // accepted when expected_rep is the canonical NaN 0x7fc00000.
+  int ret = compareResultF(x, expected_rep);
+#endif
+
+  if (ret) {
+    printf("error in test__mulsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32
+           ", expected %08" PRIx32 "\n",
+           a_rep, b_rep, toRep32(x), expected_rep);
+  }
+  return ret;
+}
+
+int main(void) {
+  int status = 0;
+
+  status |= test__mulsf3(0x00000000, 0x00000000, 0x00000000);
+  status |= test__mulsf3(0x00000000, 0x007fffff, 0x00000000);
+  status |= test__mulsf3(0x00000000, 0x00ffffff, 0x00000000);
+  status |= test__mulsf3(0x00000000, 0x3f800000, 0x00000000);
+  status |= test__mulsf3(0x00000000, 0x7effffff, 0x00000000);
+  status |= test__mulsf3(0x00000000, 0x80000000, 0x80000000);
+  status |= test__mulsf3(0x00000000, 0x80000002, 0x80000000);
+  status |= test__mulsf3(0x00000000, 0x807fffff, 0x80000000);
+  status |= test__mulsf3(0x00000000, 0x80800001, 0x80000000);
+  status |= test__mulsf3(0x00000000, 0x81000000, 0x80000000);
+  status |= test__mulsf3(0x00000000, 0xc0400000, 0x80000000);
+  status |= test__mulsf3(0x00000000, 0xfe7fffff, 0x80000000);
+  status |= test__mulsf3(0x00000000, 0xff000000, 0x80000000);
+  status |= test__mulsf3(0x00000000, 0xff7fffff, 0x80000000);
+  status |= test__mulsf3(0x00000001, 0x00000000, 0x00000000);
+  status |= test__mulsf3(0x00000001, 0x00000001, 0x00000000);
+  status |= test__mulsf3(0x00000001, 0x3f000000, 0x00000000);
+  status |= test__mulsf3(0x00000001, 0x3f7fffff, 0x00000001);
+  status |= test__mulsf3(0x00000001, 0x3f800000, 0x00000001);
+  status |= test__mulsf3(0x00000001, 0x40000000, 0x00000002);
+  status |= test__mulsf3(0x00000001, 0x7f800000, 0x7f800000);
+  status |= test__mulsf3(0x00000001, 0xbf7fffff, 0x80000001);
+  status |= test__mulsf3(0x00000006, 0x3f000000, 0x00000003);
+  status |= test__mulsf3(0x00000006, 0xbf000000, 0x80000003);
+  status |= test__mulsf3(0x00000008, 0x3e000000, 0x00000001);
+  status |= test__mulsf3(0x007ffff7, 0x81000003, 0x80000000);
+  status |= test__mulsf3(0x007ffff8, 0x3f800001, 0x007ffff9);
+  status |= test__mulsf3(0x007ffff8, 0x3f800008, 0x00800000);
+  status |= test__mulsf3(0x007ffff8, 0xbf800001, 0x807ffff9);
+  status |= test__mulsf3(0x007ffff8, 0xbf800008, 0x80800000);
+  status |= test__mulsf3(0x007ffffc, 0x40000000, 0x00fffff8);
+  status |= test__mulsf3(0x007ffffe, 0x3f7ffffc, 0x007ffffc);
+  status |= test__mulsf3(0x007ffffe, 0x3f800001, 0x007fffff);
+  status |= test__mulsf3(0x007ffffe, 0xbf800001, 0x807fffff);
+  status |= test__mulsf3(0x007fffff, 0x007ffffe, 0x00000000);
+  status |= test__mulsf3(0x007fffff, 0x3f800001, 0x00800000);
+  status |= test__mulsf3(0x007fffff, 0x40000000, 0x00fffffe);
+  status |= test__mulsf3(0x00800000, 0x00000000, 0x00000000);
+  status |= test__mulsf3(0x00800000, 0x00800000, 0x00000000);
+  status |= test__mulsf3(0x00800000, 0x3f7ffffe, 0x007fffff);
+  status |= test__mulsf3(0x00800000, 0x7f800000, 0x7f800000);
+  status |= test__mulsf3(0x00800000, 0x80800000, 0x80000000);
+  status |= test__mulsf3(0x00800000, 0xc0000000, 0x81000000);
+  status |= test__mulsf3(0x00800001, 0x3f7ffffa, 0x007ffffe);
+  status |= test__mulsf3(0x00800001, 0x3f7ffffe, 0x00800000);
+  status |= test__mulsf3(0x00800001, 0xc0000000, 0x81000001);
+  status |= test__mulsf3(0x00800002, 0x3f7ffffc, 0x00800000);
+  status |= test__mulsf3(0x00fffff8, 0x3f000000, 0x007ffffc);
+  status |= test__mulsf3(0x00fffffe, 0x3f000000, 0x007fffff);
+  status |= test__mulsf3(0x00fffffe, 0xbf000000, 0x807fffff);
+  status |= test__mulsf3(0x00ffffff, 0x3f000000, 0x00800000);
+  status |= test__mulsf3(0x00ffffff, 0xbf000000, 0x80800000);
+  status |= test__mulsf3(0x3f000000, 0x80000001, 0x80000000);
+  status |= test__mulsf3(0x3f800000, 0x007ffffd, 0x007ffffd);
+  status |= test__mulsf3(0x3f800000, 0x01000003, 0x01000003);
+  status |= test__mulsf3(0x3f800000, 0x3f800000, 0x3f800000);
+  status |= test__mulsf3(0x3f800000, 0x40000000, 0x40000000);
+  status |= test__mulsf3(0x3f800000, 0x80000001, 0x80000001);
+  status |= test__mulsf3(0x3f800000, 0x80000009, 0x80000009);
+  status |= test__mulsf3(0x3f800001, 0x3f800001, 0x3f800002);
+  status |= test__mulsf3(0x3f800001, 0xbf800001, 0xbf800002);
+  status |= test__mulsf3(0x3f800001, 0xbf800002, 0xbf800003);
+  status |= test__mulsf3(0x3f800002, 0x3f800001, 0x3f800003);
+  status |= test__mulsf3(0x3f800002, 0x7f7ffffe, 0x7f800000);
+  status |= test__mulsf3(0x3f800001, 0x7f7ffffe, 0x7f800000);
+  status |= test__mulsf3(0x40000000, 0x00800000, 0x01000000);
+  status |= test__mulsf3(0x40000000, 0x00800001, 0x01000001);
+  status |= test__mulsf3(0x40000000, 0x3f800000, 0x40000000);
+  status |= test__mulsf3(0x40000000, 0x40400000, 0x40c00000);
+  status |= test__mulsf3(0x40000000, 0x7e800000, 0x7f000000);
+  status |= test__mulsf3(0x40000000, 0x7effffff, 0x7f7fffff);
+  status |= test__mulsf3(0x40000000, 0x807ffffd, 0x80fffffa);
+  status |= test__mulsf3(0x40000000, 0x80800003, 0x81000003);
+  status |= test__mulsf3(0x40000000, 0x80800005, 0x81000005);
+  status |= test__mulsf3(0x40000000, 0xbf800000, 0xc0000000);
+  status |= test__mulsf3(0x40000000, 0xfe7ffffd, 0xfefffffd);
+  status |= test__mulsf3(0x40000000, 0xfe800003, 0xff000003);
+  status |= test__mulsf3(0x403fffff, 0x3f7ffffd, 0x403ffffd);
+  status |= test__mulsf3(0x403fffff, 0x3f7ffffe, 0x403ffffe);
+  status |= test__mulsf3(0x403fffff, 0x3f7fffff, 0x403ffffe);
+  status |= test__mulsf3(0x403fffff, 0xbf7ffffd, 0xc03ffffd);
+  status |= test__mulsf3(0x40400000, 0x00000002, 0x00000006);
+  status |= test__mulsf3(0x40400000, 0x40000000, 0x40c00000);
+  status |= test__mulsf3(0x40400000, 0x40400000, 0x41100000);
+  status |= test__mulsf3(0x40400000, 0xc0000000, 0xc0c00000);
+  status |= test__mulsf3(0x40400001, 0x3f800001, 0x40400003);
+  status |= test__mulsf3(0x40400001, 0x3f800003, 0x40400006);
+  status |= test__mulsf3(0x40400001, 0xbf800003, 0xc0400006);
+  status |= test__mulsf3(0x40800000, 0x00000002, 0x00000008);
+  status |= test__mulsf3(0x40800000, 0x7e7fffff, 0x7f7fffff);
+  status |= test__mulsf3(0x40800000, 0xfe7fffff, 0xff7fffff);
+  status |= test__mulsf3(0x409fffff, 0x3f7fffff, 0x409ffffe);
+  status |= test__mulsf3(0x40a00000, 0x00000000, 0x00000000);
+  status |= test__mulsf3(0x40a00000, 0x7f800000, 0x7f800000);
+  status |= test__mulsf3(0x40a00001, 0x3f800001, 0x40a00002);
+  status |= test__mulsf3(0x40dfffff, 0x3f7ffffc, 0x40dffffc);
+  status |= test__mulsf3(0x40dfffff, 0x3f7fffff, 0x40dffffe);
+  status |= test__mulsf3(0x40e00000, 0x80000000, 0x80000000);
+  status |= test__mulsf3(0x40e00000, 0xff800000, 0xff800000);
+  status |= test__mulsf3(0x40e00001, 0x3f800001, 0x40e00003);
+  status |= test__mulsf3(0x7e7ffffd, 0x40800000, 0x7f7ffffd);
+  status |= test__mulsf3(0x7e7ffffd, 0xc0800000, 0xff7ffffd);
+  status |= test__mulsf3(0x7e800000, 0xc0000000, 0xff000000);
+  status |= test__mulsf3(0x7efffffd, 0xc0000008, 0xff800000);
+  status |= test__mulsf3(0x7effffff, 0xc0000000, 0xff7fffff);
+  status |= test__mulsf3(0x7f000000, 0x00000000, 0x00000000);
+  status |= test__mulsf3(0x7f000000, 0x40000000, 0x7f800000);
+  status |= test__mulsf3(0x7f000000, 0x7f000000, 0x7f800000);
+  status |= test__mulsf3(0x7f000000, 0x7f7ffffe, 0x7f800000);
+  status |= test__mulsf3(0x7f000000, 0x7f800000, 0x7f800000);
+  status |= test__mulsf3(0x7f000000, 0xfe800000, 0xff800000);
+  status |= test__mulsf3(0x7f000000, 0xfe800004, 0xff800000);
+  status |= test__mulsf3(0x7f000000, 0xff000000, 0xff800000);
+  status |= test__mulsf3(0x7f000009, 0x7f7ffffa, 0x7f800000);
+  status |= test__mulsf3(0x7f000009, 0xc0c00002, 0xff800000);
+  status |= test__mulsf3(0x7f7fffff, 0x00000000, 0x00000000);
+  status |= test__mulsf3(0x7f800000, 0x007fffff, 0x7f800000);
+  status |= test__mulsf3(0x7f800000, 0x00ffffff, 0x7f800000);
+  status |= test__mulsf3(0x7f800000, 0x3f800000, 0x7f800000);
+  status |= test__mulsf3(0x7f800000, 0x7effffff, 0x7f800000);
+  status |= test__mulsf3(0x7f800000, 0x7f800000, 0x7f800000);
+  status |= test__mulsf3(0x7f800000, 0x80000002, 0xff800000);
+  status |= test__mulsf3(0x7f800000, 0x807fffff, 0xff800000);
+  status |= test__mulsf3(0x7f800000, 0x80800001, 0xff800000);
+  status |= test__mulsf3(0x7f800000, 0x81000000, 0xff800000);
+  status |= test__mulsf3(0x7f800000, 0xc0400000, 0xff800000);
+  status |= test__mulsf3(0x7f800000, 0xff000000, 0xff800000);
+  status |= test__mulsf3(0x7f800000, 0xff7fffff, 0xff800000);
+  status |= test__mulsf3(0x7f800000, 0xff800000, 0xff800000);
+  status |= test__mulsf3(0x80000000, 0x00000000, 0x80000000);
+  status |= test__mulsf3(0x80000000, 0x40c00000, 0x80000000);
+  status |= test__mulsf3(0x80000000, 0x7f7fffff, 0x80000000);
+  status |= test__mulsf3(0x80000000, 0x80000000, 0x00000000);
+  status |= test__mulsf3(0x80000000, 0x80000004, 0x00000000);
+  status |= test__mulsf3(0x80000000, 0x80800000, 0x00000000);
+  status |= test__mulsf3(0x80000000, 0xc1000000, 0x00000000);
+  status |= test__mulsf3(0x80000000, 0xfe800000, 0x00000000);
+  status |= test__mulsf3(0x80000001, 0x00000001, 0x80000000);
+  status |= test__mulsf3(0x80000001, 0x40a00000, 0x80000005);
+  status |= test__mulsf3(0x80000002, 0x3f800000, 0x80000002);
+  status |= test__mulsf3(0x80000003, 0x00000000, 0x80000000);
+  status |= test__mulsf3(0x80000003, 0x7f800000, 0xff800000);
+  status |= test__mulsf3(0x80000004, 0xbf800000, 0x00000004);
+  status |= test__mulsf3(0x80000008, 0x3e000000, 0x80000001);
+  status |= test__mulsf3(0x807ffff7, 0x01000003, 0x80000000);
+  status |= test__mulsf3(0x807ffff7, 0x3f800001, 0x807ffff8);
+  status |= test__mulsf3(0x807ffffd, 0xc0000000, 0x00fffffa);
+  status |= test__mulsf3(0x807fffff, 0x00000000, 0x80000000);
+  status |= test__mulsf3(0x807fffff, 0x3f800001, 0x80800000);
+  status |= test__mulsf3(0x807fffff, 0x7f800000, 0xff800000);
+  status |= test__mulsf3(0x807fffff, 0x80000000, 0x00000000);
+  status |= test__mulsf3(0x807fffff, 0x807ffffe, 0x00000000);
+  status |= test__mulsf3(0x807fffff, 0xbf800000, 0x007fffff);
+  status |= test__mulsf3(0x807fffff, 0xff800000, 0x7f800000);
+  status |= test__mulsf3(0x80800000, 0x00800000, 0x80000000);
+  status |= test__mulsf3(0x80800000, 0x80800000, 0x00000000);
+  status |= test__mulsf3(0x80800001, 0x00000000, 0x80000000);
+  status |= test__mulsf3(0x80800001, 0x7f800000, 0xff800000);
+  status |= test__mulsf3(0x80800001, 0xbf800000, 0x00800001);
+  status |= test__mulsf3(0x80fffffc, 0x3f000000, 0x807ffffe);
+  status |= test__mulsf3(0x80fffffc, 0xbf000000, 0x007ffffe);
+  status |= test__mulsf3(0x80fffffe, 0x3f800000, 0x80fffffe);
+  status |= test__mulsf3(0x80ffffff, 0x80000000, 0x00000000);
+  status |= test__mulsf3(0x80ffffff, 0xff800000, 0x7f800000);
+  status |= test__mulsf3(0x81000000, 0x00000000, 0x80000000);
+  status |= test__mulsf3(0x81000000, 0x7f800000, 0xff800000);
+  status |= test__mulsf3(0xbf7fffff, 0xff7fffff, 0x7f7ffffe);
+  status |= test__mulsf3(0xbf800000, 0x00000009, 0x80000009);
+  status |= test__mulsf3(0xbf800000, 0x00800009, 0x80800009);
+  status |= test__mulsf3(0xbf800000, 0x3f800000, 0xbf800000);
+  status |= test__mulsf3(0xbf800000, 0x40000000, 0xc0000000);
+  status |= test__mulsf3(0xbf800000, 0xbf800000, 0x3f800000);
+  status |= test__mulsf3(0xbf800000, 0xc0000000, 0x40000000);
+  status |= test__mulsf3(0xbf800001, 0x3f800001, 0xbf800002);
+  status |= test__mulsf3(0xbf800001, 0xbf800001, 0x3f800002);
+  status |= test__mulsf3(0xbf800001, 0xbf800002, 0x3f800003);
+  status |= test__mulsf3(0xbf800002, 0x3f800001, 0xbf800003);
+  status |= test__mulsf3(0xbf800002, 0xbf800001, 0x3f800003);
+  status |= test__mulsf3(0xc0000000, 0x00000000, 0x80000000);
+  status |= test__mulsf3(0xc0000000, 0x007ffffd, 0x80fffffa);
+  status |= test__mulsf3(0xc0000000, 0x00800001, 0x81000001);
+  status |= test__mulsf3(0xc0000000, 0x00800005, 0x81000005);
+  status |= test__mulsf3(0xc0000000, 0x00800009, 0x81000009);
+  status |= test__mulsf3(0xc0000000, 0x40400000, 0xc0c00000);
+  status |= test__mulsf3(0xc0000000, 0x7e7fffff, 0xfeffffff);
+  status |= test__mulsf3(0xc0000000, 0x7e800001, 0xff000001);
+  status |= test__mulsf3(0xc0000000, 0x7f800000, 0xff800000);
+  status |= test__mulsf3(0xc0000000, 0xbf800000, 0x40000000);
+  status |= test__mulsf3(0xc0000000, 0xc0400000, 0x40c00000);
+  status |= test__mulsf3(0xc03ffffe, 0x7f000000, 0xff800000);
+  status |= test__mulsf3(0xc03fffff, 0x3f7fffff, 0xc03ffffe);
+  status |= test__mulsf3(0xc0400000, 0x40400000, 0xc1100000);
+  status |= test__mulsf3(0xc0400000, 0xc0000000, 0x40c00000);
+  status |= test__mulsf3(0xc0400000, 0xc0400000, 0x41100000);
+  status |= test__mulsf3(0xc0400000, 0xff000000, 0x7f800000);
+  status |= test__mulsf3(0xc0400001, 0x3f800001, 0xc0400003);
+  status |= test__mulsf3(0xc0800000, 0x7e7fffff, 0xff7fffff);
+  status |= test__mulsf3(0xc0800000, 0x80000000, 0x00000000);
+  status |= test__mulsf3(0xc0800000, 0xfe7fffff, 0x7f7fffff);
+  status |= test__mulsf3(0xc0800000, 0xff800000, 0x7f800000);
+  status |= test__mulsf3(0xc09ffffe, 0xff000000, 0x7f800000);
+  status |= test__mulsf3(0xc09fffff, 0xbf7fffff, 0x409ffffe);
+  status |= test__mulsf3(0xc0a00001, 0xbf800001, 0x40a00002);
+  status |= test__mulsf3(0xc0dffff9, 0x7f000000, 0xff800000);
+  status |= test__mulsf3(0xc1100000, 0x7f000000, 0xff800000);
+  status |= test__mulsf3(0xc1100001, 0xff000000, 0x7f800000);
+  status |= test__mulsf3(0xfe7ffff9, 0x7f000000, 0xff800000);
+  status |= test__mulsf3(0xfe7ffff9, 0xc07fffff, 0x7f7ffff8);
+  status |= test__mulsf3(0xfe7ffffd, 0x40800000, 0xff7ffffd);
+  status |= test__mulsf3(0xfe7ffffd, 0xc0800000, 0x7f7ffffd);
+  status |= test__mulsf3(0xfe7fffff, 0x00000000, 0x80000000);
+  status |= test__mulsf3(0xfe7fffff, 0x40000001, 0xff000000);
+  status |= test__mulsf3(0xfe7fffff, 0x7f800000, 0xff800000);
+  status |= test__mulsf3(0xfe800000, 0x00000000, 0x80000000);
+  status |= test__mulsf3(0xfe800000, 0x7f800000, 0xff800000);
+  status |= test__mulsf3(0xfefffff7, 0x7e800001, 0xff800000);
+  status |= test__mulsf3(0xfeffffff, 0x3f800001, 0xff000000);
+  status |= test__mulsf3(0xfeffffff, 0x80000000, 0x00000000);
+  status |= test__mulsf3(0xff000005, 0xff000001, 0x7f800000);
+  status |= test__mulsf3(0xff7ffffd, 0x7f000000, 0xff800000);
+  status |= test__mulsf3(0xff7ffffd, 0xc0400001, 0x7f800000);
+  status |= test__mulsf3(0xff7ffffd, 0xff000001, 0x7f800000);
+  status |= test__mulsf3(0xff7fffff, 0x80000000, 0x00000000);
+  status |= test__mulsf3(0xff7fffff, 0xff7fffff, 0x7f800000);
+  status |= test__mulsf3(0xff7fffff, 0xff800000, 0x7f800000);
+  status |= test__mulsf3(0xff800000, 0x40c00000, 0xff800000);
+  status |= test__mulsf3(0xff800000, 0x7f800000, 0xff800000);
+  status |= test__mulsf3(0xff800000, 0x80000004, 0x7f800000);
+  status |= test__mulsf3(0xff800000, 0x80800000, 0x7f800000);
+  status |= test__mulsf3(0xff800000, 0xc1000000, 0x7f800000);
+  status |= test__mulsf3(0xff800000, 0xfe800000, 0x7f800000);
+  status |= test__mulsf3(0xff800000, 0xff800000, 0x7f800000);
+  status |= test__mulsf3(0x3089705f, 0x0ef36390, 0x0041558f);
+  status |= test__mulsf3(0x3089705f, 0x0e936390, 0x0027907d);
+  status |= test__mulsf3(0x3109705f, 0x0ef36390, 0x0082ab1e);
+  status |= test__mulsf3(0x3109705f, 0x0e936390, 0x004f20fa);
+  status |= test__mulsf3(0x3189705f, 0x0ef36390, 0x0102ab1e);
+  status |= test__mulsf3(0x3189705f, 0x0e936390, 0x009e41f5);
+  status |= test__mulsf3(0xb089705f, 0x0ef36390, 0x8041558f);
+  status |= test__mulsf3(0xb089705f, 0x0e936390, 0x8027907d);
+  status |= test__mulsf3(0xb109705f, 0x0ef36390, 0x8082ab1e);
+  status |= test__mulsf3(0xb109705f, 0x0e936390, 0x804f20fa);
+  status |= test__mulsf3(0xb189705f, 0x0ef36390, 0x8102ab1e);
+  status |= test__mulsf3(0xb189705f, 0x0e936390, 0x809e41f5);
+  status |= test__mulsf3(0x3089705f, 0x8ef36390, 0x8041558f);
+  status |= test__mulsf3(0x3089705f, 0x8e936390, 0x8027907d);
+  status |= test__mulsf3(0x3109705f, 0x8ef36390, 0x8082ab1e);
+  status |= test__mulsf3(0x3109705f, 0x8e936390, 0x804f20fa);
+  status |= test__mulsf3(0x3189705f, 0x8ef36390, 0x8102ab1e);
+  status |= test__mulsf3(0x3189705f, 0x8e936390, 0x809e41f5);
+  status |= test__mulsf3(0xb089705f, 0x8ef36390, 0x0041558f);
+  status |= test__mulsf3(0xb089705f, 0x8e936390, 0x0027907d);
+  status |= test__mulsf3(0xb109705f, 0x8ef36390, 0x0082ab1e);
+  status |= test__mulsf3(0xb109705f, 0x8e936390, 0x004f20fa);
+  status |= test__mulsf3(0xb189705f, 0x8ef36390, 0x0102ab1e);
+  status |= test__mulsf3(0xb189705f, 0x8e936390, 0x009e41f5);
+  status |= test__mulsf3(0x1f800001, 0x1fc00000, 0x00300000);
+  status |= test__mulsf3(0x1f800003, 0x1fc00000, 0x00300001);
+  status |= test__mulsf3(0x1f800001, 0x1fc00800, 0x00300200);
+  status |= test__mulsf3(0x1f800003, 0x1fc00800, 0x00300201);
+  status |= test__mulsf3(0x36e4588a, 0x29b47cbd, 0x2120fd85);
+  status |= test__mulsf3(0x3fea3b26, 0x3f400000, 0x3fafac5c);
+  status |= test__mulsf3(0x6fea3b26, 0x4f400000, 0x7f800000);
+  status |= test__mulsf3(0x20ea3b26, 0x1ec00000, 0x0057d62e);
+  status |= test__mulsf3(0x3f8f11bb, 0x3fc00000, 0x3fd69a98);
+  status |= test__mulsf3(0x6f8f11bb, 0x4fc00000, 0x7f800000);
+  status |= test__mulsf3(0x208f11bb, 0x1f400000, 0x006b4d4c);
+  status |= test__mulsf3(0x3f8f11bb, 0x3f800000, 0x3f8f11bb);
+  status |= test__mulsf3(0x6f8f11bb, 0x4f800000, 0x7f800000);
+  status |= test__mulsf3(0x208f11bb, 0x1f000000, 0x004788de);
+  status |= test__mulsf3(0x3f8f11bb, 0x3fd7f48d, 0x3ff1611f);
+  status |= test__mulsf3(0x6f8f11bb, 0x4fd7f48d, 0x7f800000);
+  status |= test__mulsf3(0x208f11bb, 0x1f57f48d, 0x0078b090);
+  status |= test__mulsf3(0x3f8f11bb, 0x3fa80b73, 0x3fbbd412);
+  status |= test__mulsf3(0x6f8f11bb, 0x4fa80b73, 0x7f800000);
+  status |= test__mulsf3(0x208f11bb, 0x1f280b73, 0x005dea09);
+  status |= test__mulsf3(0x3f8f11bb, 0x3f97f48d, 0x3fa9d842);
+  status |= test__mulsf3(0x6f8f11bb, 0x4f97f48d, 0x7f800000);
+  status |= test__mulsf3(0x208f11bb, 0x1f17f48d, 0x0054ec21);
+  status |= test__mulsf3(0x3f8f11bb, 0x3f680b73, 0x3f81ae78);
+  status |= test__mulsf3(0x6f8f11bb, 0x4f680b73, 0x7f800000);
+  status |= test__mulsf3(0x208f11bb, 0x1ee80b73, 0x0040d73c);
+  status |= test__mulsf3(0x3fff5dd8, 0x3f600000, 0x3fdf721d);
+  status |= test__mulsf3(0x6fff5dd8, 0x4f600000, 0x7f800000);
+  status |= test__mulsf3(0x20ff5dd8, 0x1ee00000, 0x006fb90e);
+  status |= test__mulsf3(0x3fff5dd8, 0x3f100000, 0x3f8fa4ca);
+  status |= test__mulsf3(0x6fff5dd8, 0x4f100000, 0x7f800000);
+  status |= test__mulsf3(0x20ff5dd8, 0x1e900000, 0x0047d265);
+  status |= test__mulsf3(0x3fffe96b, 0x3f7efb43, 0x3ffee4c5);
+  status |= test__mulsf3(0x6fffe96b, 0x4f7efb43, 0x7f800000);
+  status |= test__mulsf3(0x20ffe96b, 0x1efefb43, 0x007f7263);
+  status |= test__mulsf3(0x3fffe96b, 0x3f0104bd, 0x3f80f95b);
+  status |= test__mulsf3(0x6fffe96b, 0x4f0104bd, 0x7f800000);
+  status |= test__mulsf3(0x20ffe96b, 0x1e8104bd, 0x00407cae);
+  status |= test__mulsf3(0x3f8fbbb7, 0x3fa6edf9, 0x3fbb72aa);
+  status |= test__mulsf3(0x6f8fbbb7, 0x4fa6edf9, 0x7f800000);
+  status |= test__mulsf3(0x208fbbb7, 0x1f26edf9, 0x005db955);
+  status |= test__mulsf3(0x3f8fbbb7, 0x3fd91207, 0x3ff3c07b);
+  status |= test__mulsf3(0x6f8fbbb7, 0x4fd91207, 0x7f800000);
+  status |= test__mulsf3(0x208fbbb7, 0x1f591207, 0x0079e03d);
+  status |= test__mulsf3(0x3f8fbbb7, 0x3f991207, 0x3fabe29f);
+  status |= test__mulsf3(0x6f8fbbb7, 0x4f991207, 0x7f800000);
+  status |= test__mulsf3(0x208fbbb7, 0x1f191207, 0x0055f150);
+  status |= test__mulsf3(0x3f8fbbb7, 0x3f66edf9, 0x3f81a843);
+  status |= test__mulsf3(0x6f8fbbb7, 0x4f66edf9, 0x7f800000);
+  status |= test__mulsf3(0x208fbbb7, 0x1ee6edf9, 0x0040d421);
+  status |= test__mulsf3(0x3fdb62f3, 0x3f7879c5, 0x3fd4f036);
+  status |= test__mulsf3(0x6fdb62f3, 0x4f7879c5, 0x7f800000);
+  status |= test__mulsf3(0x20db62f3, 0x1ef879c5, 0x006a781b);
+  status |= test__mulsf3(0x3faaea45, 0x3f8b6773, 0x3fba2489);
+  status |= test__mulsf3(0x6faaea45, 0x4f8b6773, 0x7f800000);
+  status |= test__mulsf3(0x20aaea45, 0x1f0b6773, 0x005d1244);
+  status |= test__mulsf3(0x3fafa7ec, 0x3f900000, 0x3fc59cea);
+  status |= test__mulsf3(0x6fafa7ec, 0x4f900000, 0x7f800000);
+  status |= test__mulsf3(0x20afa7ec, 0x1f100000, 0x0062ce75);
+  status |= test__mulsf3(0x3fcf8c8d, 0x3f271645, 0x3f8776be);
+  status |= test__mulsf3(0x6fcf8c8d, 0x4f271645, 0x7f800000);
+  status |= test__mulsf3(0x20cf8c8d, 0x1ea71645, 0x0043bb5f);
+  status |= test__mulsf3(0x3fc173ef, 0x3f901b0f, 0x3fd9cb52);
+  status |= test__mulsf3(0x6fc173ef, 0x4f901b0f, 0x7f800000);
+  status |= test__mulsf3(0x20c173ef, 0x1f101b0f, 0x006ce5a9);
+  status |= test__mulsf3(0x3fb48d33, 0x3f4a35fb, 0x3f8e9d7d);
+  status |= test__mulsf3(0x6fb48d33, 0x4f4a35fb, 0x7f800000);
+  status |= test__mulsf3(0x20b48d33, 0x1eca35fb, 0x00474ebe);
+  status |= test__mulsf3(0x3fc6f87b, 0x3f65d94d, 0x3fb2a52a);
+  status |= test__mulsf3(0x6fc6f87b, 0x4f65d94d, 0x7f800000);
+  status |= test__mulsf3(0x20c6f87b, 0x1ee5d94d, 0x00595295);
+  status |= test__mulsf3(0x3f860ae7, 0x3f969729, 0x3f9db312);
+  status |= test__mulsf3(0x6f860ae7, 0x4f969729, 0x7f800000);
+  status |= test__mulsf3(0x20860ae7, 0x1f169729, 0x004ed989);
+  status |= test__mulsf3(0x3f860ae7, 0x3fc00000, 0x3fc9105a);
+  status |= test__mulsf3(0x6f860ae7, 0x4fc00000, 0x7f800000);
+  status |= test__mulsf3(0x20860ae7, 0x1f400000, 0x0064882d);
+  status |= test__mulsf3(0x3f860ae7, 0x3fe968d7, 0x3ff46da3);
+  status |= test__mulsf3(0x6f860ae7, 0x4fe968d7, 0x7f800000);
+  status |= test__mulsf3(0x20860ae7, 0x1f6968d7, 0x007a36d1);
+  status |= test__mulsf3(0x3f860ae7, 0x3f800000, 0x3f860ae7);
+  status |= test__mulsf3(0x6f860ae7, 0x4f800000, 0x7f800000);
+  status |= test__mulsf3(0x20860ae7, 0x1f000000, 0x00430574);
+  status |= test__mulsf3(0x3f860ae7, 0x3fa968d7, 0x3fb1682f);
+  status |= test__mulsf3(0x6f860ae7, 0x4fa968d7, 0x7f800000);
+  status |= test__mulsf3(0x20860ae7, 0x1f2968d7, 0x0058b418);
+  status |= test__mulsf3(0x3f860ae7, 0x3fd69729, 0x3fe0b886);
+  status |= test__mulsf3(0x6f860ae7, 0x4fd69729, 0x7f800000);
+  status |= test__mulsf3(0x20860ae7, 0x1f569729, 0x00705c43);
+  status |= test__mulsf3(0x3f9aecdd, 0x3fb14b75, 0x3fd696de);
+  status |= test__mulsf3(0x6f9aecdd, 0x4fb14b75, 0x7f800000);
+  status |= test__mulsf3(0x209aecdd, 0x1f314b75, 0x006b4b6f);
+  status |= test__mulsf3(0x3f9aecdd, 0x3fceb48b, 0x3ffa2fb9);
+  status |= test__mulsf3(0x6f9aecdd, 0x4fceb48b, 0x7f800000);
+  status |= test__mulsf3(0x209aecdd, 0x1f4eb48b, 0x007d17dc);
+  status |= test__mulsf3(0x3f9aecdd, 0x3fc00000, 0x3fe8634c);
+  status |= test__mulsf3(0x6f9aecdd, 0x4fc00000, 0x7f800000);
+  status |= test__mulsf3(0x209aecdd, 0x1f400000, 0x007431a6);
+  status |= test__mulsf3(0x3fd65dc6, 0x3f400000, 0x3fa0c654);
+  status |= test__mulsf3(0x6fd65dc6, 0x4f400000, 0x7f800000);
+  status |= test__mulsf3(0x20d65dc6, 0x1ec00000, 0x0050632a);
+  status |= test__mulsf3(0x3feecf03, 0x3f5f93ab, 0x3fd09014);
+  status |= test__mulsf3(0x6feecf03, 0x4f5f93ab, 0x7f800000);
+  status |= test__mulsf3(0x20eecf03, 0x1edf93ab, 0x0068480a);
+  status |= test__mulsf3(0x3feecf03, 0x3f206c55, 0x3f95a670);
+  status |= test__mulsf3(0x6feecf03, 0x4f206c55, 0x7f800000);
+  status |= test__mulsf3(0x20eecf03, 0x1ea06c55, 0x004ad338);
+  status |= test__mulsf3(0x3f98feed, 0x3f60f11b, 0x3f866f27);
+  status |= test__mulsf3(0x6f98feed, 0x4f60f11b, 0x7f800000);
+  status |= test__mulsf3(0x2098feed, 0x1ee0f11b, 0x00433794);
+  status |= test__mulsf3(0x3f9a1b9d, 0x3f9c42b5, 0x3fbc21f8);
+  status |= test__mulsf3(0x6f9a1b9d, 0x4f9c42b5, 0x7f800000);
+  status |= test__mulsf3(0x209a1b9d, 0x1f1c42b5, 0x005e10fc);
+  status |= test__mulsf3(0x3f9a1b9d, 0x3f5c42b5, 0x3f8497e3);
+  status |= test__mulsf3(0x6f9a1b9d, 0x4f5c42b5, 0x7f800000);
+  status |= test__mulsf3(0x209a1b9d, 0x1edc42b5, 0x00424bf2);
+  status |= test__mulsf3(0x3f947044, 0x3f600000, 0x3f81e23c);
+  status |= test__mulsf3(0x6f947044, 0x4f600000, 0x7f800000);
+  status |= test__mulsf3(0x20947044, 0x1ee00000, 0x0040f11e);
+  status |= test__mulsf3(0x3fa3fb77, 0x3f6eb1b9, 0x3f98e5a0);
+  status |= test__mulsf3(0x6fa3fb77, 0x4f6eb1b9, 0x7f800000);
+  status |= test__mulsf3(0x20a3fb77, 0x1eeeb1b9, 0x004c72d0);
+  status |= test__mulsf3(0x3fb291df, 0x3f466a1f, 0x3f8a66d9);
+  status |= test__mulsf3(0x6fb291df, 0x4f466a1f, 0x7f800000);
+  status |= test__mulsf3(0x20b291df, 0x1ec66a1f, 0x0045336c);
+  status |= test__mulsf3(0x3fde13d5, 0x3f6b7283, 0x3fcc3f8b);
+  status |= test__mulsf3(0x6fde13d5, 0x4f6b7283, 0x7f800000);
+  status |= test__mulsf3(0x20de13d5, 0x1eeb7283, 0x00661fc5);
+  status |= test__mulsf3(0x3fd5b211, 0x3f80810f, 0x3fd68987);
+  status |= test__mulsf3(0x6fd5b211, 0x4f80810f, 0x7f800000);
+  status |= test__mulsf3(0x20d5b211, 0x1f00810f, 0x006b44c4);
+  status |= test__mulsf3(0x3fd5b211, 0x3f3f7ef1, 0x3f9fd9d2);
+  status |= test__mulsf3(0x6fd5b211, 0x4f3f7ef1, 0x7f800000);
+  status |= test__mulsf3(0x20d5b211, 0x1ebf7ef1, 0x004fece9);
+  status |= test__mulsf3(0x3fadfbc4, 0x3f400000, 0x3f827cd3);
+  status |= test__mulsf3(0x6fadfbc4, 0x4f400000, 0x7f800000);
+  status |= test__mulsf3(0x20adfbc4, 0x1ec00000, 0x00413e6a);
+  status |= test__mulsf3(0x3fd0ef03, 0x3f800000, 0x3fd0ef03);
+  status |= test__mulsf3(0x6fd0ef03, 0x4f800000, 0x7f800000);
+  status |= test__mulsf3(0x20d0ef03, 0x1f000000, 0x00687782);
+  status |= test__mulsf3(0x3fd0ef03, 0x3f8673ab, 0x3fdb7705);
+  status |= test__mulsf3(0x6fd0ef03, 0x4f8673ab, 0x7f800000);
+  status |= test__mulsf3(0x20d0ef03, 0x1f0673ab, 0x006dbb83);
+  status |= test__mulsf3(0x3fd0ef03, 0x3f798c55, 0x3fcbab02);
+  status |= test__mulsf3(0x6fd0ef03, 0x4f798c55, 0x7f800000);
+  status |= test__mulsf3(0x20d0ef03, 0x1ef98c55, 0x0065d581);
+  status |= test__mulsf3(0x3fdd1181, 0x3f8ad17f, 0x3fefc0b1);
+  status |= test__mulsf3(0x6fdd1181, 0x4f8ad17f, 0x7f800000);
+  status |= test__mulsf3(0x20dd1181, 0x1f0ad17f, 0x0077e058);
+  status |= test__mulsf3(0x3fdd1181, 0x3f752e81, 0x3fd3b9e9);
+  status |= test__mulsf3(0x6fdd1181, 0x4f752e81, 0x7f800000);
+  status |= test__mulsf3(0x20dd1181, 0x1ef52e81, 0x0069dcf5);
+  status |= test__mulsf3(0x3f92efc6, 0x3fa00000, 0x3fb7abb8);
+  status |= test__mulsf3(0x6f92efc6, 0x4fa00000, 0x7f800000);
+  status |= test__mulsf3(0x2092efc6, 0x1f200000, 0x005bd5dc);
+  status |= test__mulsf3(0x3fdcefe6, 0x3f400000, 0x3fa5b3ec);
+  status |= test__mulsf3(0x6fdcefe6, 0x4f400000, 0x7f800000);
+  status |= test__mulsf3(0x20dcefe6, 0x1ec00000, 0x0052d9f6);
+  status |= test__mulsf3(0x3fad6507, 0x3fa2f8b7, 0x3fdcc4c9);
+  status |= test__mulsf3(0x6fad6507, 0x4fa2f8b7, 0x7f800000);
+  status |= test__mulsf3(0x20ad6507, 0x1f22f8b7, 0x006e6264);
+  status |= test__mulsf3(0x3fad6507, 0x3f62f8b7, 0x3f99bba6);
+  status |= test__mulsf3(0x6fad6507, 0x4f62f8b7, 0x7f800000);
+  status |= test__mulsf3(0x20ad6507, 0x1ee2f8b7, 0x004cddd3);
+  status |= test__mulsf3(0x3fbfde6b, 0x3f8721bd, 0x3fca8f27);
+  status |= test__mulsf3(0x6fbfde6b, 0x4f8721bd, 0x7f800000);
+  status |= test__mulsf3(0x20bfde6b, 0x1f0721bd, 0x00654794);
+  status |= test__mulsf3(0x3fbfde6b, 0x3f4721bd, 0x3f953f2e);
+  status |= test__mulsf3(0x6fbfde6b, 0x4f4721bd, 0x7f800000);
+  status |= test__mulsf3(0x20bfde6b, 0x1ec721bd, 0x004a9f97);
+  status |= test__mulsf3(0x3ff40db4, 0x3f400000, 0x3fb70a47);
+  status |= test__mulsf3(0x6ff40db4, 0x4f400000, 0x7f800000);
+  status |= test__mulsf3(0x20f40db4, 0x1ec00000, 0x005b8524);
+  status |= test__mulsf3(0x3ff40db4, 0x3f600000, 0x3fd58bfe);
+  status |= test__mulsf3(0x6ff40db4, 0x4f600000, 0x7f800000);
+  status |= test__mulsf3(0x20f40db4, 0x1ee00000, 0x006ac5ff);
+  status |= test__mulsf3(0x3f9e20d3, 0x3f90c8a5, 0x3fb2dccc);
+  status |= test__mulsf3(0x6f9e20d3, 0x4f90c8a5, 0x7f800000);
+  status |= test__mulsf3(0x209e20d3, 0x1f10c8a5, 0x00596e66);
+  status |= test__mulsf3(0x3f9e20d3, 0x3fc00000, 0x3fed313c);
+  status |= test__mulsf3(0x6f9e20d3, 0x4fc00000, 0x7f800000);
+  status |= test__mulsf3(0x209e20d3, 0x1f400000, 0x0076989e);
+  status |= test__mulsf3(0x3f9e20d3, 0x3f50c8a5, 0x3f80f69b);
+  status |= test__mulsf3(0x6f9e20d3, 0x4f50c8a5, 0x7f800000);
+  status |= test__mulsf3(0x209e20d3, 0x1ed0c8a5, 0x00407b4d);
+  status |= test__mulsf3(0x3f82e641, 0x3f8fd63f, 0x3f931856);
+  status |= test__mulsf3(0x6f82e641, 0x4f8fd63f, 0x7f800000);
+  status |= test__mulsf3(0x2082e641, 0x1f0fd63f, 0x00498c2b);
+  status |= test__mulsf3(0x3f9a1901, 0x3f96e701, 0x3fb5ab68);
+  status |= test__mulsf3(0x6f9a1901, 0x4f96e701, 0x7f800000);
+  status |= test__mulsf3(0x209a1901, 0x1f16e701, 0x005ad5b4);
+  status |= test__mulsf3(0x3fa21aa1, 0x3f7c4961, 0x3f9fc0ae);
+  status |= test__mulsf3(0x6fa21aa1, 0x4f7c4961, 0x7f800000);
+  status |= test__mulsf3(0x20a21aa1, 0x1efc4961, 0x004fe057);
+  status |= test__mulsf3(0x3fcd0767, 0x3f782457, 0x3fc6bc47);
+  status |= test__mulsf3(0x6fcd0767, 0x4f782457, 0x7f800000);
+  status |= test__mulsf3(0x20cd0767, 0x1ef82457, 0x00635e23);
+  status |= test__mulsf3(0x3fb875e1, 0x3f968e21, 0x3fd8f6f6);
+  status |= test__mulsf3(0x6fb875e1, 0x4f968e21, 0x7f800000);
+  status |= test__mulsf3(0x20b875e1, 0x1f168e21, 0x006c7b7b);
+  status |= test__mulsf3(0x3fc2f0d7, 0x3f5efd19, 0x3fa9cd95);
+  status |= test__mulsf3(0x6fc2f0d7, 0x4f5efd19, 0x7f800000);
+  status |= test__mulsf3(0x20c2f0d7, 0x1edefd19, 0x0054e6cb);
+  status |= test__mulsf3(0x7f7ffffe, 0x3f800001, 0x7f800000);
+  status |= test__mulsf3(0x00000003, 0xc00fffff, 0x80000007);
+  status |= test__mulsf3(0x00000003, 0x400fffff, 0x00000007);
+  status |= test__mulsf3(0x80000003, 0xc00fffff, 0x00000007);
+  status |= test__mulsf3(0x80000003, 0x400fffff, 0x80000007);
+  status |= test__mulsf3(0x00000003, 0xc00ffffd, 0x80000007);
+  status |= test__mulsf3(0x00000003, 0x400ffffd, 0x00000007);
+  status |= test__mulsf3(0x80000003, 0xc00ffffd, 0x00000007);
+  status |= test__mulsf3(0x80000003, 0x400ffffd, 0x80000007);
+  status |= test__mulsf3(0x3e00007f, 0x017c0000, 0x003f003f);
+  status |= test__mulsf3(0xcf7fff00, 0xc0ffff00, 0x50fffe00);
+  status |= test__mulsf3(0x3fdf7f00, 0x3fffff00, 0x405f7e21);
+  status |= test__mulsf3(0x19b92144, 0x1a310000, 0x00000001);
+  status |= test__mulsf3(0x19ffc008, 0x1a002004, 0x00000001);
+  status |= test__mulsf3(0x7f7ffff0, 0xc0000008, 0xff800000);
+
+  // Test that the result of an operation is a NaN at all when it should be.
+  //
+  // In most configurations these tests' results are checked using
+  // compareResultF, so we set all the answers to the canonical NaN 0x7fc00000,
+  // which causes compareResultF to accept any NaN encoding. We also use the
+  // same value as the input NaN in tests that have one, so that even in
+  // EXPECT_EXACT_RESULTS mode these tests should pass, because 0x7fc00000 is
+  // still the exact expected NaN.
+  status |= test__mulsf3(0x7f800000, 0x00000000, 0x7fc00000);
+  status |= test__mulsf3(0x7f800000, 0x80000000, 0x7fc00000);
+  status |= test__mulsf3(0x80000000, 0x7f800000, 0x7fc00000);
+  status |= test__mulsf3(0x80000000, 0xff800000, 0x7fc00000);
+  status |= test__mulsf3(0x3f800000, 0x7fc00000, 0x7fc00000);
+  status |= test__mulsf3(0x7fc00000, 0x3f800000, 0x7fc00000);
+  status |= test__mulsf3(0x7fc00000, 0x7fc00000, 0x7fc00000);
+
+#ifdef ARM_NAN_HANDLING
+  // Tests specific to the NaN handling of Arm hardware, mimicked by
+  // arm/mulsf3.S:
+  //
+  //  - a quiet NaN is distinguished by the top mantissa bit being 1
+  //
+  //  - if a signalling NaN appears in the input, the output quiet NaN is
+  //    obtained by setting its top mantissa bit and leaving everything else
+  //    unchanged
+  //
+  //  - if both operands are signalling NaNs then the output NaN is derived
+  //    from the first operand
+  //
+  //  - if both operands are quiet NaNs then the output NaN is the first
+  //    operand
+  //
+  //  - invalid operations not involving an input NaN return the quiet
+  //    NaN with fewest bits set, 0x7fc00000.
+
+  status |= test__mulsf3(0x00000000, 0x7fad4be3, 0x7fed4be3);
+  status |= test__mulsf3(0x00000000, 0x7fdf48c7, 0x7fdf48c7);
+  status |= test__mulsf3(0x00000001, 0x7f970eba, 0x7fd70eba);
+  status |= test__mulsf3(0x00000001, 0x7fc35716, 0x7fc35716);
+  status |= test__mulsf3(0x007fffff, 0x7fbf52d6, 0x7fff52d6);
+  status |= test__mulsf3(0x007fffff, 0x7fc7a2df, 0x7fc7a2df);
+  status |= test__mulsf3(0x3f800000, 0x7f987a85, 0x7fd87a85);
+  status |= test__mulsf3(0x3f800000, 0x7fc50124, 0x7fc50124);
+  status |= test__mulsf3(0x7f7fffff, 0x7f95fd6f, 0x7fd5fd6f);
+  status |= test__mulsf3(0x7f7fffff, 0x7ffc28dc, 0x7ffc28dc);
+  status |= test__mulsf3(0x7f800000, 0x00000000, 0x7fc00000);
+  status |= test__mulsf3(0x7f800000, 0x7f8dd790, 0x7fcdd790);
+  status |= test__mulsf3(0x7f800000, 0x7fd2ef2b, 0x7fd2ef2b);
+  status |= test__mulsf3(0x7f800000, 0x80000000, 0x7fc00000);
+  status |= test__mulsf3(0x7f99b09d, 0x00000000, 0x7fd9b09d);
+  status |= test__mulsf3(0x7f93541e, 0x00000001, 0x7fd3541e);
+  status |= test__mulsf3(0x7f9fc002, 0x007fffff, 0x7fdfc002);
+  status |= test__mulsf3(0x7fb5db77, 0x3f800000, 0x7ff5db77);
+  status |= test__mulsf3(0x7f9f5d92, 0x7f7fffff, 0x7fdf5d92);
+  status |= test__mulsf3(0x7fac7a36, 0x7f800000, 0x7fec7a36);
+  status |= test__mulsf3(0x7fb42008, 0x7fb0ee07, 0x7ff42008);
+  status |= test__mulsf3(0x7f8bd740, 0x7fc7aaf1, 0x7fcbd740);
+  status |= test__mulsf3(0x7f9bb57b, 0x80000000, 0x7fdbb57b);
+  status |= test__mulsf3(0x7f951a78, 0x80000001, 0x7fd51a78);
+  status |= test__mulsf3(0x7f9ba63b, 0x807fffff, 0x7fdba63b);
+  status |= test__mulsf3(0x7f89463c, 0xbf800000, 0x7fc9463c);
+  status |= test__mulsf3(0x7fb63563, 0xff7fffff, 0x7ff63563);
+  status |= test__mulsf3(0x7f90886e, 0xff800000, 0x7fd0886e);
+  status |= test__mulsf3(0x7fe8c15e, 0x00000000, 0x7fe8c15e);
+  status |= test__mulsf3(0x7fe915ae, 0x00000001, 0x7fe915ae);
+  status |= test__mulsf3(0x7ffa9b42, 0x007fffff, 0x7ffa9b42);
+  status |= test__mulsf3(0x7fdad0f5, 0x3f800000, 0x7fdad0f5);
+  status |= test__mulsf3(0x7fd10dcb, 0x7f7fffff, 0x7fd10dcb);
+  status |= test__mulsf3(0x7fd08e8a, 0x7f800000, 0x7fd08e8a);
+  status |= test__mulsf3(0x7fc3a9e6, 0x7f91a816, 0x7fd1a816);
+  status |= test__mulsf3(0x7fdb229c, 0x7fc26c68, 0x7fdb229c);
+  status |= test__mulsf3(0x7fc9f6bb, 0x80000000, 0x7fc9f6bb);
+  status |= test__mulsf3(0x7ffa178b, 0x80000001, 0x7ffa178b);
+  status |= test__mulsf3(0x7fef2a0b, 0x807fffff, 0x7fef2a0b);
+  status |= test__mulsf3(0x7ffc885b, 0xbf800000, 0x7ffc885b);
+  status |= test__mulsf3(0x7fd26e8c, 0xff7fffff, 0x7fd26e8c);
+  status |= test__mulsf3(0x7fc55329, 0xff800000, 0x7fc55329);
+  status |= test__mulsf3(0x80000000, 0x7f800000, 0x7fc00000);
+  status |= test__mulsf3(0x80000000, 0x7fa833ae, 0x7fe833ae);
+  status |= test__mulsf3(0x80000000, 0x7fc4df63, 0x7fc4df63);
+  status |= test__mulsf3(0x80000000, 0xff800000, 0x7fc00000);
+  status |= test__mulsf3(0x80000001, 0x7f98827d, 0x7fd8827d);
+  status |= test__mulsf3(0x80000001, 0x7fd7acc5, 0x7fd7acc5);
+  status |= test__mulsf3(0x807fffff, 0x7fad19c0, 0x7fed19c0);
+  status |= test__mulsf3(0x807fffff, 0x7ffe1907, 0x7ffe1907);
+  status |= test__mulsf3(0xbf800000, 0x7fa95487, 0x7fe95487);
+  status |= test__mulsf3(0xbf800000, 0x7fd2bbee, 0x7fd2bbee);
+  status |= test__mulsf3(0xff7fffff, 0x7f86ba21, 0x7fc6ba21);
+  status |= test__mulsf3(0xff7fffff, 0x7feb00d7, 0x7feb00d7);
+  status |= test__mulsf3(0xff800000, 0x7f857fdc, 0x7fc57fdc);
+  status |= test__mulsf3(0xff800000, 0x7fde0397, 0x7fde0397);
+#endif // ARM_NAN_HANDLING
+
+  return status;
+}