From 6e3b7ab47c61fdbac42c23322772a11795c49269 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Wed, 1 Oct 2025 17:06:31 +0100 Subject: [PATCH 1/8] [compiler-rt][ARM] Optimized mulsf3 and divsf3 This commit adds optimized assembly versions of single-precision float multiplication and division. Both functions are implemented in a style that can be assembled as either of Arm and Thumb2; for multiplication, a separate implementation is provided for Thumb1. Also, extensive new tests are added for multiplication and division. These implementations can be removed from the build by defining the cmake variable COMPILER_RT_ARM_OPTIMIZED_FP=OFF. Outlying parts of the functionality which are not on the fast path, such as NaN handling and underflow, are handled in helper functions written in C. These can be shared between the Arm/Thumb2 and Thumb1 implementations, and also reused by other optimized assembly functions we hope to add in future. --- compiler-rt/lib/builtins/CMakeLists.txt | 23 + compiler-rt/lib/builtins/arm/divsf3.S | 608 +++++++++++++++++++ compiler-rt/lib/builtins/arm/fnan2.c | 38 ++ compiler-rt/lib/builtins/arm/fnorm2.c | 62 ++ compiler-rt/lib/builtins/arm/funder.c | 78 +++ compiler-rt/lib/builtins/arm/mulsf3.S | 309 ++++++++++ compiler-rt/lib/builtins/arm/thumb1/mulsf3.S | 251 ++++++++ compiler-rt/test/builtins/Unit/divsf3_test.c | 469 +++++++++++--- compiler-rt/test/builtins/Unit/mulsf3_test.c | 584 ++++++++++++++++++ 9 files changed, 2326 insertions(+), 96 deletions(-) create mode 100644 compiler-rt/lib/builtins/arm/divsf3.S create mode 100644 compiler-rt/lib/builtins/arm/fnan2.c create mode 100644 compiler-rt/lib/builtins/arm/fnorm2.c create mode 100644 compiler-rt/lib/builtins/arm/funder.c create mode 100644 compiler-rt/lib/builtins/arm/mulsf3.S create mode 100644 compiler-rt/lib/builtins/arm/thumb1/mulsf3.S create mode 100644 compiler-rt/test/builtins/Unit/mulsf3_test.c diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 9095b056ae782..d7a0a139c5ad6 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -422,6 +422,22 @@ set(arm_or_thumb2_base_SOURCES ${GENERIC_SOURCES} ) +option(COMPILER_RT_ARM_OPTIMIZED_FP + "On 32-bit Arm, use optimized assembly implementations of FP arithmetic" ON) + +if(COMPILER_RT_ARM_OPTIMIZED_FP) + set(arm_or_thumb2_base_SOURCES + arm/mulsf3.S + arm/divsf3.S + arm/fnan2.c + arm/fnorm2.c + arm/funder.c + ${arm_or_thumb2_base_SOURCES} + ) +endif() +set_source_files_properties(arm/mulsf3.S arm/divsf3.S + PROPERTIES COMPILE_OPTIONS "-Wa,-mimplicit-it=always") + set(arm_sync_SOURCES arm/sync_fetch_and_add_4.S arm/sync_fetch_and_add_8.S @@ -455,6 +471,13 @@ set(thumb1_base_SOURCES ${GENERIC_SOURCES} ) +if(COMPILER_RT_ARM_OPTIMIZED_FP) + set(thumb1_base_SOURCES + arm/thumb1/mulsf3.S + ${thumb1_base_SOURCES} + ) +endif() + set(arm_EABI_RT_SOURCES arm/aeabi_cdcmp.S arm/aeabi_cdcmpeq_check_nan.c diff --git a/compiler-rt/lib/builtins/arm/divsf3.S b/compiler-rt/lib/builtins/arm/divsf3.S new file mode 100644 index 0000000000000..bf07217cba907 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/divsf3.S @@ -0,0 +1,608 @@ +//===-- divsf3.S - single-precision floating point division ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements single-precision soft-float division with the IEEE-754 +// default rounding (to nearest, ties to even), in optimized AArch32 assembly +// language suitable to be built as either Arm or Thumb2. +// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" + + + .syntax unified + .text + .p2align 2 + +DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_fdiv, __divsf3) + +DEFINE_COMPILERRT_FUNCTION(__divsf3) + // Extract the exponents of the inputs into r2 and r3, occupying bits 16-23 + // of each register so that there will be space lower down to store extra + // data without exponent arithmetic carrying into it. In the process, check + // both exponents for 00 or FF and branch out of line to handle all the + // uncommon types of value (infinity, NaN, zero, denormals). + // + // Chaining conditional instructions like this means that the second + // instruction (setting up r3) might not be executed at all, so fdiv_uncommon + // will have to redo it just in case. That saves an instruction here, + // executed for _all_ inputs, and moves it to the uncommon path run for only + // some inputs. + MOV r12, #0xFF0000 + ANDS r2, r12, r0, LSR #7 // r2 has exponent of numerator. (Is it 0?) + ANDSNE r3, r12, r1, LSR #7 // r3 has exponent of denominator. (Is it 0?) + TEQNE r2, r12 // if neither was 0, is one FF? + TEQNE r3, r12 // or the other? + BEQ LOCAL_LABEL(uncommon) // branch out of line if any answer was yes + + // Calculate the output sign, which is always just the XOR of the input + // signs. Store it in bit 8 of r2, below the numerator exponent. + TEQ r0, r1 // is the output sign bit 1? + ORRMI r2, r2, #0x100 // if so, set bit 8 of r2 + + // Isolate the mantissas of both values, by setting bit 23 of each one and + // clearing the 8 bits above that. + // + // In the process, swap the register allocations (which doesn't cost extra + // instructions if we do it as part of this manipulation). We want the + // numerator not to be in r0, because r0 is where we'll build up the quotient + // while subtracting things from the numerator. + ORR r12, r0, #1 << 23 + ORR r0, r1, #1 << 23 + BIC r1, r12, #0xFF000000 + BIC r0, r0, #0xFF000000 + +LOCAL_LABEL(div): + // Start of the main division. We get here knowing that: + // + // r0 = mantissa of denominator, with the leading 1 at bit 23 + // r1 = mantissa of numerator, similarly + // r2 = (exponent of numerator << 16) + (result sign << 8) + // r3 = (exponent of denominator << 16) + + PUSH {r14} // we'll need an extra register + + // Calculate the initial result exponent by just subtracting the two input + // exponents. This doesn't affect the sign bit lower down in r2. + SUB r2, r2, r3 + + // That initial exponent might need to be adjusted by 1, depending on whether + // dividing the mantissas gives a value >=1 or <1. We don't need to wait + // until the division is finished to work that out: we can tell immediately + // by just comparing the mantissas. + // + // The basic idea is to do the comparison in a way that sets the C flag if + // numerator >= denominator. Then we recombine the sign and exponent by doing + // "ADC r2, r2, r2, ASR #16": the exponent in the top half of r2 is shifted + // down to the low 8 bits, just below the sign bit, and using ADC rather than + // ADD folds in the conditional increment from the mantissa comparison. 
+  //
+  // If we're not incrementing the output exponent, we instead shift the
+  // numerator mantissa left by 1, so that it _is_ greater than the denominator
+  // mantissa. Otherwise we'd generate only a 22-bit quotient, instead of 23.
+  //
+  // The exponent also needs to be rebiased, so that dividing two numbers the
+  // same gives an output exponent of 0x7F. If the two inputs have the same
+  // exponent then we'll have computed an exponent of 0 via the SUB instruction
+  // above; if the mantissas are the same as well then the ADC will increment
+  // it; also, the leading bit of the quotient will increment the exponent
+  // again when we recombine it with the output mantissa later. So we need to
+  // add (0x7F - 2) to the exponent now, to make an exponent of 0 from the SUB
+  // come to 0x7F after both of those increments.
+  //
+  // Putting all of that together, what we _want_ to do is this:
+  //
+  //   [#1] CMP r1, r0              // set C if num >= den
+  //   [#2] MOVLO r1, r1, LSL #1    // if num < den, shift num left
+  //   [#3] ADD r2, r2, #0x7D0000   // rebias exponent
+  //   [#4] ADC r2, r2, r2, ASR #16 // combine sign + exp + adjustment
+  //
+  // However, we only do the first of those four instructions right here. The
+  // other three are distributed through the code below, after unrelated load
+  // or multiply instructions which will have a result delay slot on simple
+  // CPUs. Each is labelled "exponent setup [#n]" in a comment.
+  //
+  // (Since instruction #4 depends on the flags set up by #2, we must avoid
+  // clobbering the flags in _any_ of the instructions interleaved with this!)
+  CMP r1, r0                       // exponent setup [#1]
+
+  // Start the mantissa division by making an approximation to the reciprocal
+  // of the denominator. We first obtain an 8-bit approximation using a table
+  // lookup indexed by the top 7 denominator bits (counting the leading 1, so
+  // really there are only 6 bits in the table index).
+  //
+  // (r0 >> 17) is the table index, and its top bit is always set, so it ranges
+  // from 64 to 127 inclusive. So we point the base register 64 bytes before
+  // the actual table.
+  ADR r12, LOCAL_LABEL(tab) - 64
+#if __thumb__
+  // Thumb can't do this particular shift+add+load in one instruction - it only
+  // supports left shifts of 0 to 3 bits, not right shifts of 17. So we must
+  // calculate the load offset separately.
+  ADD r14, r12, r0, LSR #17
+  LDRB r14, [r14]
+#else
+  LDRB r14, [r12, r0, LSR #17]
+#endif
+
+  // Now do an iteration of Newton-Raphson to improve that 8-bit approximation
+  // to have 15-16 accurate bits.
+  //
+  // Basics of Newton-Raphson for finding a reciprocal: if you want to find 1/d
+  // and you have some approximation x, your next approximation is X = x(2-dx).
+  // Looked at one way, this is the result of applying the N-R formula
+  // X=x-f(x)/f'(x) to the function f(x) = 1/x - d. Another way to look at it
+  // is to suppose that dx = 1 - e, for some e which is small (because dx is
+  // already reasonably close to 1). Then you want to double the number of
+  // correct bits in the next approximation, i.e. square the error. So you want
+  // dX = 1-e^2 = (1-e)(1+e) = dx(2-dx). Cancelling d gives X = x(2-dx) again.
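+  //
+  // (Illustrative aside, not part of the build: the same iteration in plain C.
+  // The relative error e is squared by every step, so the number of correct
+  // bits roughly doubles each time round the loop:
+  //
+  //     double d = 0.8271, x = 1.2; // arbitrary value to invert, rough guess at 1/d
+  //     for (int i = 0; i < 4; i++)
+  //       x = x * (2.0 - d * x);    // Newton-Raphson step: X = x(2-dx)
+  //
+  // after which x equals 1/d to full double precision.)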
+ // + // In this situation, we're working in fixed-point integers rather than real + // numbers, and all the scales are different: + // * our input denominator d is in the range [2^23,2^24) + // * our input approximation x is in the range [2^7,2^8) + // * we want the output approximation to be in the range [2^15,2^16) + // Those factors combine to mean that we want + // x(2^32-dx) / 2^23 + // = (2^9 x) - (dx^2 / 2^23) + // + // But we also want to compute this using ordinary MUL, not a long multiply + // instruction (those are slower). So we need to worry about the product + // overflowing. dx fits in 32 bits, because it's the product of something + // <2^24 with something <2^8; but we must shift it right before multiplying + // by x again. + + MUL r12, r0, r14 // r12 = dx + MOVLO r1, r1, LSL #1 // exponent setup [#2] in the MUL delay slot + MVN r12, r12, LSR #8 // r12 ~= -dx/2^8 + MUL r3, r12, r14 // r3 ~= -dx^2/2^8 + MOV r14, r14, LSL #9 // r14 = 2^9 x + ADD r14, r14, r3, ASR #15 // r14 ~= 2^9 x - dx^2 / 2^23 + + // Now r14 is a 16-bit approximation to the reciprocal of the input mantissa, + // scaled by 2^39 (so that the min mantissa 2^23 would have reciprocal 2^16 + // in principle, and the max mantissa 2^24-1 would have reciprocal just over + // 2^15). The error is always negative (r14 is an underestimate of the true + // value), and the maximum error is 6 and a bit ULP (that is, the true + // reciprocal is strictly less than (r14+7)). Also, r14 is always strictly + // less than 0x10000 (even in the case of the min mantissa, where the true + // value would be _exactly_ 0x10000), which eliminates a case of integer + // overflow. + // + // All of these properties of the reciprocal approximation are checked by + // exhaustively iterating over all 2^23 possible input mantissas. (The nice + // thing about doing this in single rather than double precision!) + // + // Now we extract most of the quotient by two steps of long division, using + // the reciprocal estimate to identify a multiple of the denominator to + // subtract from the numerator. To avoid integer overflow, the numerator + // mantissa is shifted down 8 bits so that it's less than 0x10000. After we + // calculate an approximate quotient, we shift the numerator left and + // subtract that multiple of the denominator, moving the next portion of the + // numerator into range for the next iteration. + + // First iteration of long division. We shift the numerator left 11 bits, and + // since the quotient approximation is scaled by 2^31, we must shift that + // right by 20 to make the right product to subtract from the numerator. + MOV r12, r1, LSR #8 // shift the numerator down + MUL r12, r14, r12 // make the quotient approximation + MOV r1, r1, LSL #11 // shift numerator left, ready for subtraction + MOV r3, r12, LSR #20 // make first 12-bit block of quotient bits + MLS r1, r0, r3, r1 // subtract that multiple of den from num + + ADD r2, r2, #0x7D0000 // exponent setup [#3] in the MLS delay slot + + // Second iteration of long division. Differences from the first step: this + // time we shift the numerator 12 bits instead of 11, so that the total of + // both steps is 23 bits, i.e. we've shifted up by exactly the full width of + // the output mantissa. Also, the block of output quotient bits is left in a + // different register: it was in r3 the first time, and this time it's in + // r12, so that we still have both available at the end of the process. 
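+  // (Illustrative aside: one of these long-division steps, the first one
+  // above, re-expressed in C. Here num, den and recip stand for the values in
+  // r1, r0 and r14, and uint32_t arithmetic wraps in just the way the 32-bit
+  // registers do:
+  //
+  //     uint32_t q = recip * (num >> 8); // quotient approximation, scaled by 2^31
+  //     num <<= 11;                      // shift numerator up, ready to subtract
+  //     uint32_t block = q >> 20;        // first 12-bit block of quotient bits
+  //     num -= den * block;              // the MLS: subtract block*den from num
+  //
+  // The second step below has the same shape, with different shift counts and
+  // a different register keeping its block of quotient bits.)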
+  MOV r12, r1, LSR #8              // shift the numerator down
+  MUL r12, r14, r12                // make the quotient approximation
+  MOV r1, r1, LSL #12              // shift numerator left, ready for subtraction
+  MOV r12, r12, LSR #19            // make second 11-bit block of quotient
+  MLS r1, r0, r12, r1              // subtract that multiple of den from num
+
+  ADC r2, r2, r2, ASR #16          // exponent setup [#4] in the MLS delay slot
+
+  // Now r1 contains the original numerator, shifted left 23, minus _some_
+  // multiple of the original denominator (which is still in r0). The bounds on
+  // the error in the above steps should make the error at most 1: that is, we
+  // may have to subtract the denominator one more time to make r1 < r0, and
+  // increment the quotient by one more.
+  //
+  // Our quotient is still in two pieces, computed separately in the above long
+  // division steps. We fold the final increment into the same instruction that
+  // recombines them, by doing the comparison in such a way that it sets the
+  // carry flag if the increment is needed.
+
+  CMP r1, r0                       // Set carry flag if num >= den
+  SUBHS r1, r1, r0                 // If so, subtract den from num
+  ADC r3, r12, r3, LSL #12         // Recombine quotient halves, plus optional +1
+
+  // We've finished with r14 as a temporary register, so we can unstack it now.
+  POP {r14}
+
+  // Now r3 contains the _rounded-down_ output quotient, and r1 contains the
+  // remainder. That is, (denominator * r3 + r1) = (numerator << 23), and
+  // 0 <= r1 < denominator.
+  //
+  // Next we must round to nearest, by checking if r1 is greater than half the
+  // denominator. In division, it's not possible to hit an exact round-to-even
+  // halfway case, so we don't need to spend any time checking for it.
+  //
+  // Proof of no round-to-even: define the 'width' of a dyadic rational to be
+  // the distance between the lowest and highest 1 bits in its binary
+  // representation, or equivalently, the index of its high bit if you scale it
+  // by a power of 2 to make it an odd integer. E.g. any actual power of 2 has
+  // width 0, and all of 0b11110, 0b1111, 0b11.11 and 0b0.01111 have width 3.
+  // Then for any dyadic rationals a,b, width(ab) >= width(a)+width(b). Let w
+  // be the maximum width that the input precision supports (so that for single
+  // precision, w=23). Then if some division n/d were a round-to-even case, the
+  // true quotient q=n/d would have width exactly w+1. But we have qd=n, so
+  // width(n) >= width(q)+width(d) > w, which can't happen, because n is in the
+  // input precision, hence had width <= w.
+  //
+  // So we don't need to check for an exact _halfway_ case and clear the low
+  // bit of the quotient after rounding up, as addition and multiplication both
+  // need to do. But we do need to remember if the quotient itself was exact,
+  // that is, if there was no remainder at all. That's needed in underflow
+  // handling.
+
+  // The rounding check wants to compare remainder with denominator/2. But of
+  // course in integers it's easier to compare 2*remainder with denominator. So
+  // we start by shifting the remainder left by 1, and in the process, set Z if
+  // it's exactly 0 (i.e. the result needs no rounding at all).
+  LSLS r1, r1, #1
+  // Now trial-subtract the denominator. We don't do this at all if the result
+  // was exact. If we do do it, the subtraction doesn't borrow precisely if we
+  // need to round up, which leaves the C flag set. (The previous instruction
+  // will have left C clear, since r1 had its top 8 bits all clear. So now C is
+  // set _only_ if we're rounding up.)
+ SUBSNE r1, r1, r0 + // Recombine the quotient with the sign + exponent, and use the C flag from + // the previous instruction to increment the quotient if we're rounding up. + ADC r0, r3, r2, LSL #23 + + // If we haven't either overflowed or underflowed, we're done. We can + // identify most of the safe cases by doing an unsigned comparison of the + // initial output exponent (in the top half of r2) with 0xFC: if 0 <= r2 < + // 0xFC0000 then we have neither underflow nor overflow. + // + // Rationale: the value in the top half of r2 had three chances to be + // incremented before becoming the exponent field of the actual output float. + // It was incremented if we found the numerator mantissa was >= the + // denominator (producing the value in the _bottom_ half of r2, which we just + // ADCed into the output). Then it gets unconditionally incremented again + // when the ADC combines it with the leading mantissa bit. And finally, + // round-up might increment it a third time. So 0xFC is the smallest value + // that can possibly turn into the overflowed value 0xFF after all those + // increments. + // + // On the underflow side, (top half of r2) = 0 corresponds to a value of 1 in + // the final result's exponent field (and then rounding might increase it + // further); if the exponent was less than that then r2 wraps round and looks + // like a very large positive integer from the point of view of this unsigned + // comparison. + CMP r2, #0xFC0000 + BXLO lr + + // The same comparison will have set the N and V flags to reflect the result + // of comparing r2 with 0xFC0000 as a _signed_ integer. That reliably + // distinguishes potential underflow (r2 is negative) from potential overflow + // (r2 is positive and at least 0xFC0000) + BGE LOCAL_LABEL(overflow) + + // Here we might or might not have underflow (but we know we don't have + // overflow). To check more carefully, we look at the _bottom_ half of r2, + // which contains the exponent after the first adjustment (for num >= denom), + // That is, it's still off by 1 (compensating for the leading quotient bit), + // and is also before rounding. + // + // We neglect the effect of rounding: division results that are tiny (less + // than the smallest normalised number) before rounding, but then round up to + // the smallest normal number, are an acceptable edge case to handle slowly. + // We pass those to funder without worrying about them. + // + // So we want to check whether the bottom half of r2 was negative. It would + // be nice to check bits 8-15 of it, but unfortunately, it's already been + // combined with the sign (at bit 8), so those bits don't tell us anything + // useful. Instead we look at the top 4 bits of the exponent field, i.e. the + // 0xF0 bits. The largest _non_-overflowing exponent that might reach here is + // less than 3, so it doesn't reach those bits; the smallest possible + // underflow, obtained by dividing the smallest denormal by the largest + // finite number, is -151 (before the leading bit increments it), which will + // set the low 8 bits of r2 to 0x69. That is, the 0xF0 nibble of r2 will be + // 0x60 or greater for a (pre-rounding) underflow, and zero for a + // non-underflow. + + TST r2, #0xF0 + BXEQ lr // no underflow after all; return + + // Rebias the exponent for funder, which also corrects the sign bit. + ADD r0, r0, #192 << 23 + // Tell funder whether the true value is greater or less than the number in + // r0. 
This is obtained from the sign of the remainder (still in r1), with + // the only problem being that it's currently reversed. So negate r1 (leaving + // 0 at 0 to indicate exactness). + RSBS r1, r1, #0 + B SYMBOL_NAME(__compiler_rt_funder) + +LOCAL_LABEL(overflow): + // Here we might or might not have overflow (but we know we don't have + // underflow). We must check whether we really have overflowed. + // + // For this it's easiest to check the exponent field in the actual output + // value in r0, after _all_ the adjustments have been completed. The largest + // overflowed exponent is 0x193, and the smallest exponent that can reach + // this is 0xFD (we checked against 0xFC above, but then the leading quotient + // bit incremented it). So it's enough to shift the output left by one + // (moving the exponent field to the top), increment it once more (so that + // the smallest overflowed exponent 0xFF wraps round to 0), and then compare + // against 0xFE000000 as an unsigned integer. + MOV r12, r0, LSL #1 + ADD r12, r12, #1 << 24 + CMP r12, #0xFE << 24 // Check for exp = 253 or 254 + BXHS lr + // We have actual overflow. Rebias r0 to bring the exponent back into range, + // which ensures its sign is correct. Then make an infinity of that sign to + // return. + SUBS r0, r0, #0xC0 << 23 + MOVS r12, #0xFF // exponent of infinity + ORRS r12, r12, r0, LSR #23 // exponent and sign at bottom of r12 + MOVS r0, r12, LSL #23 // shift it up to the top of r0 to return + BX lr + +LOCAL_LABEL(uncommon): + // We come here from the start of the function if either input is an uncommon + // value: zero, denormal, infinity or NaN. + // + // We arrive here with r12 = 0xFF000000, and r2 containing the exponent of x + // in bits 16..23. But r3 doesn't necessarily contain the exponent of y, + // because the instruction that set it up was conditional. So first we + // unconditionally repeat it. + AND r3, r12, r1, LSR #7 + + // In all cases not involving a NaN as output, the sign of the output is made + // in the same way as for finite numbers, as the XOR of the input signs. So + // repeat the sign setup from the main branch. + TEQ r0, r1 // is the output sign bit 1? + ORRMI r2, r2, #0x100 // if so, set bit 8 of r2 + + // Detect infinities and NaNs, by checking if either of r2 or r3 is at least + // 0xFF0000. + CMP r2, #0xFF0000 + CMPLO r3, #0xFF0000 + BHS LOCAL_LABEL(inf_NaN) + + // Now we know there are no infinities or NaNs, but there's at least one zero + // or denormal. + MOVS r12, r1, LSL #1 // is y zero? + BEQ LOCAL_LABEL(divbyzero) // if so, go and handle division by zero + MOVS r12, r0, LSL #1 // is x zero? (now we know that y is not) + MOVEQ r0, r2, LSL #23 // if so, 0/nonzero is just 0 (of right sign) + BXEQ lr + + // Now we've eliminated zeroes as well, leaving only denormals: either x or + // y, or both, is a denormal. Call fnorm2 to convert both into a normalised + // mantissa and a (potentially small) exponent. + AND r12, r2, #0x100 // save the result sign from r2 + LSR r2, #16 // shift extracted exponents down to bit 0 + LSR r3, #16 // where fnorm2 will expect them + PUSH {r0, r1, r2, r3, r12, lr} + MOV r0, sp // tell fnorm2 where to find its data + BL SYMBOL_NAME(__compiler_rt_fnorm2) + POP {r0, r1, r2, r3, r12, lr} + LSL r3, #16 // shift exponents back up to bit 16 + ORR r2, r12, r2, LSL #16 // and put the result sign back in r2 + + // Now rejoin the main code path, having finished the setup it will expect: + // swap x and y, and shift the fractions back down to the low 24 bits. 
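+  // (Looking back at the fnorm2 call just above, in C terms: the four
+  // low-numbered registers pushed (r0-r3) double as a struct fnorm2 on the
+  // stack, with r12 and lr simply preserved above it, so the sequence amounts
+  // to the following, where x, y, exp_x and exp_y are illustrative names for
+  // the values held in r0-r3 at that point:
+  //
+  //     struct fnorm2 v = { .a = x, .b = y, .expa = exp_x, .expb = exp_y };
+  //     __compiler_rt_fnorm2(&v);
+  //
+  // On return, v.a and v.b hold normalised mantissas at the top of the word,
+  // and v.expa and v.expb the matching exponents, zero or negative for an
+  // input that was denormal.)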
+ MOV r12, r0, LSR #8 + MOV r0, r1, LSR #8 + MOV r1, r12 + B LOCAL_LABEL(div) + +LOCAL_LABEL(inf_NaN): + // We come here if at least one input is a NaN or infinity. If either or both + // inputs are NaN then we hand off to fnan2 to propagate a NaN from the + // input. + MOV r12, #0xFF000000 + CMP r12, r0, LSL #1 // if (r0 << 1) > 0xFF000000, r0 is a NaN + BLO SYMBOL_NAME(__compiler_rt_fnan2) + CMP r12, r1, LSL #1 + BLO SYMBOL_NAME(__compiler_rt_fnan2) + + // No NaNs, so we have three options: inf/inf = NaN, inf/finite = inf, and + // finite/inf = 0. + + // If both operands are infinity, we return a NaN. Since we know at + // least _one_ is infinity, we can test this by checking if they're + // equal apart from the sign bits. + EOR r3, r0, r1 + LSLS r3, #1 // were all bits of XOR zero other than top? + BEQ LOCAL_LABEL(invalid) // if so, both operands are infinity + + // See if x is infinite + CMP r12, r0, LSL #1 // (r0 << 1) == 0xFF000000? + BEQ LOCAL_LABEL(infret) // if so, infinity/finite = infinity + + // y is infinite and x is not, so we return a zero of the + // combined sign. + EOR r0, r0, r1 // calculate the right sign + AND r0, r0, #0x80000000 // throw away everything else + BX lr + +LOCAL_LABEL(divbyzero): + // Here, we know y is zero. But we don't know if x is zero or nonzero. So we + // might be calculating 0/0 (invalid operation, generating a NaN), or + // nonzero/0 (the IEEE "division by zero" exception, generating infinity). + MOVS r12, r0, LSL #1 // is x zero too? + BEQ LOCAL_LABEL(invalid) // if so, go and return a NaN + +LOCAL_LABEL(infret): + // Here, we're either dividing infinity by a finite number, or dividing a + // nonzero number by 0. (Or both, if we're dividing infinity by 0.) In all + // these cases we return infinity with the sign from r2. + // + // If we were implementing IEEE exceptions, we'd have to separate these + // cases: infinity / finite is not an _exception_, it just returns infinity, + // whereas (finite and nonzero) / 0 is a division-by-zero exception. But here + // we're not implementing exceptions, so we can treat all three cases the + // same. + // + // r2 contains the output sign in bit 8, which is a convenient place to find + // it when making an infinity, because we can fill in the 8 exponent bits + // below that and then shift it left. + ORR r2, r2, #0xff // sign + maximum exponent + LSL r0, r2, #23 // shift up to the top + BX lr + +LOCAL_LABEL(invalid): + // Return the default NaN, from an invalid operation (either dividing + // infinity by infinity, or 0 by 0). + LDR r0, =0x7FC00000 + BX lr + +// Finally, the lookup table for the initial reciprocal approximation. +// +// The table index is made from the top 7 bits of the denominator mantissa. But +// the topmost bit is always 1, so only the other 6 bits vary. So it only has +// 64 entries, not 128. +// +// Each table entry is a single byte, with its top bit set. So the table +// entries correspond to the reciprocal of a 7-bit mantissa prefix scaled up by +// 2^14, or the reciprocal of a whole 24-bit mantissa scaled up by 2^31. +// +// Each of these 64 entries corresponds to a large interval of possible +// mantissas. For example, if the top 7 bits are 1000001 then the overall +// mantissa could be anything from 0x820000 to 0x83FFFF. And because the output +// of this table provides more bits than the input, there are several choices +// of 8-bit reciprocal approximation for a number in that interval. 
The +// reciprocal of 0x820000 starts with 0xFC plus a fraction, and the reciprocal +// of 0x83FFFF starts with 0xF9 minus a fraction, so there are four reasonable +// choices for that table entry: F9, FA, FB or FC. Which do we pick? +// +// The table below is generated by choosing whichever value minimises the +// maximum possible error _after_ the approximation is improved by the +// Newton-Raphson step. In the example above, we end up with FA. +// +// The Python code below will regenerate the table, complete with the per-entry +// comments. + +/* + +for prefix in range(64, 128): + best = None + + # Max and min 23-bit mantissas with this 7-bit prefix + mmin, mmax = prefix * 2**17, (prefix + 1) * 2**17 - 1 + + # Max and min table entry corresponding to the reciprocal of something in + # that range of mantissas: round up the reciprocal of mmax, and round down + # the reciprocal of mmin. Also clamp to the range [0x80,0xff], because + # 0x100 can't be used as a table entry due to not fitting in a byte, even + # though it's the exact reciprocal of the overall-smallest mantissa + # 0x800000. + gmin = max(128, (2**31 + mmin - 1) // mmax) + gmax = min(255, 2**31 // mmin) + + # For each of those table entries, compute the result of starting from that + # value and doing a Newton-Raphson iteration, with the mantissa at each end + # of the mantissa interval. One of these will be the worst possible error. + # Choose the table entry whose worst error is as small as possible. + # + # (To find the extreme values of a more general function on an interval, + # you must consider its values not only at the interval endpoints but also + # any turning points within the interval. Here, the function has only one + # turning point, and by construction it takes value 0 there, so we needn't + # worry.) 
+ g = max( + range(gmin, gmax + 1), + key=lambda g: min( + (g * (2**32 - d * g) / 2**23 - 2**39 / d) for d in [mmin, mmax] + ), + ) + + print(f" .byte 0x{g:02x} // input [0x{mmin:06x},0x{mmax:06x}]" + f", candidate outputs [0x{gmin:02x},0x{gmax:02x}]" + ) + +*/ + + .p2align 2 // make sure we start on a 32-bit boundary, even in Thumb +LOCAL_LABEL(tab): + .byte 0xfe // input [0x800000,0x81ffff], candidate outputs [0xfd,0xff] + .byte 0xfa // input [0x820000,0x83ffff], candidate outputs [0xf9,0xfc] + .byte 0xf6 // input [0x840000,0x85ffff], candidate outputs [0xf5,0xf8] + .byte 0xf3 // input [0x860000,0x87ffff], candidate outputs [0xf1,0xf4] + .byte 0xef // input [0x880000,0x89ffff], candidate outputs [0xee,0xf0] + .byte 0xec // input [0x8a0000,0x8bffff], candidate outputs [0xeb,0xed] + .byte 0xe8 // input [0x8c0000,0x8dffff], candidate outputs [0xe7,0xea] + .byte 0xe5 // input [0x8e0000,0x8fffff], candidate outputs [0xe4,0xe6] + .byte 0xe2 // input [0x900000,0x91ffff], candidate outputs [0xe1,0xe3] + .byte 0xdf // input [0x920000,0x93ffff], candidate outputs [0xde,0xe0] + .byte 0xdc // input [0x940000,0x95ffff], candidate outputs [0xdb,0xdd] + .byte 0xd9 // input [0x960000,0x97ffff], candidate outputs [0xd8,0xda] + .byte 0xd6 // input [0x980000,0x99ffff], candidate outputs [0xd5,0xd7] + .byte 0xd3 // input [0x9a0000,0x9bffff], candidate outputs [0xd3,0xd4] + .byte 0xd1 // input [0x9c0000,0x9dffff], candidate outputs [0xd0,0xd2] + .byte 0xce // input [0x9e0000,0x9fffff], candidate outputs [0xcd,0xcf] + .byte 0xcc // input [0xa00000,0xa1ffff], candidate outputs [0xcb,0xcc] + .byte 0xc9 // input [0xa20000,0xa3ffff], candidate outputs [0xc8,0xca] + .byte 0xc7 // input [0xa40000,0xa5ffff], candidate outputs [0xc6,0xc7] + .byte 0xc4 // input [0xa60000,0xa7ffff], candidate outputs [0xc4,0xc5] + .byte 0xc2 // input [0xa80000,0xa9ffff], candidate outputs [0xc1,0xc3] + .byte 0xc0 // input [0xaa0000,0xabffff], candidate outputs [0xbf,0xc0] + .byte 0xbd // input [0xac0000,0xadffff], candidate outputs [0xbd,0xbe] + .byte 0xbb // input [0xae0000,0xafffff], candidate outputs [0xbb,0xbc] + .byte 0xb9 // input [0xb00000,0xb1ffff], candidate outputs [0xb9,0xba] + .byte 0xb7 // input [0xb20000,0xb3ffff], candidate outputs [0xb7,0xb8] + .byte 0xb5 // input [0xb40000,0xb5ffff], candidate outputs [0xb5,0xb6] + .byte 0xb3 // input [0xb60000,0xb7ffff], candidate outputs [0xb3,0xb4] + .byte 0xb1 // input [0xb80000,0xb9ffff], candidate outputs [0xb1,0xb2] + .byte 0xaf // input [0xba0000,0xbbffff], candidate outputs [0xaf,0xb0] + .byte 0xad // input [0xbc0000,0xbdffff], candidate outputs [0xad,0xae] + .byte 0xac // input [0xbe0000,0xbfffff], candidate outputs [0xab,0xac] + .byte 0xaa // input [0xc00000,0xc1ffff], candidate outputs [0xa9,0xaa] + .byte 0xa8 // input [0xc20000,0xc3ffff], candidate outputs [0xa8,0xa8] + .byte 0xa6 // input [0xc40000,0xc5ffff], candidate outputs [0xa6,0xa7] + .byte 0xa5 // input [0xc60000,0xc7ffff], candidate outputs [0xa4,0xa5] + .byte 0xa3 // input [0xc80000,0xc9ffff], candidate outputs [0xa3,0xa3] + .byte 0xa1 // input [0xca0000,0xcbffff], candidate outputs [0xa1,0xa2] + .byte 0xa0 // input [0xcc0000,0xcdffff], candidate outputs [0xa0,0xa0] + .byte 0x9e // input [0xce0000,0xcfffff], candidate outputs [0x9e,0x9f] + .byte 0x9d // input [0xd00000,0xd1ffff], candidate outputs [0x9d,0x9d] + .byte 0x9b // input [0xd20000,0xd3ffff], candidate outputs [0x9b,0x9c] + .byte 0x9a // input [0xd40000,0xd5ffff], candidate outputs [0x9a,0x9a] + .byte 0x98 // input [0xd60000,0xd7ffff], candidate outputs 
[0x98,0x99] + .byte 0x97 // input [0xd80000,0xd9ffff], candidate outputs [0x97,0x97] + .byte 0x96 // input [0xda0000,0xdbffff], candidate outputs [0x95,0x96] + .byte 0x94 // input [0xdc0000,0xddffff], candidate outputs [0x94,0x94] + .byte 0x93 // input [0xde0000,0xdfffff], candidate outputs [0x93,0x93] + .byte 0x92 // input [0xe00000,0xe1ffff], candidate outputs [0x91,0x92] + .byte 0x90 // input [0xe20000,0xe3ffff], candidate outputs [0x90,0x90] + .byte 0x8f // input [0xe40000,0xe5ffff], candidate outputs [0x8f,0x8f] + .byte 0x8e // input [0xe60000,0xe7ffff], candidate outputs [0x8e,0x8e] + .byte 0x8d // input [0xe80000,0xe9ffff], candidate outputs [0x8d,0x8d] + .byte 0x8b // input [0xea0000,0xebffff], candidate outputs [0x8b,0x8c] + .byte 0x8a // input [0xec0000,0xedffff], candidate outputs [0x8a,0x8a] + .byte 0x89 // input [0xee0000,0xefffff], candidate outputs [0x89,0x89] + .byte 0x88 // input [0xf00000,0xf1ffff], candidate outputs [0x88,0x88] + .byte 0x87 // input [0xf20000,0xf3ffff], candidate outputs [0x87,0x87] + .byte 0x86 // input [0xf40000,0xf5ffff], candidate outputs [0x86,0x86] + .byte 0x85 // input [0xf60000,0xf7ffff], candidate outputs [0x85,0x85] + .byte 0x84 // input [0xf80000,0xf9ffff], candidate outputs [0x84,0x84] + .byte 0x83 // input [0xfa0000,0xfbffff], candidate outputs [0x83,0x83] + .byte 0x82 // input [0xfc0000,0xfdffff], candidate outputs [0x82,0x82] + .byte 0x81 // input [0xfe0000,0xffffff], candidate outputs [0x80,0x81] + +END_COMPILERRT_FUNCTION(__divsf3) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/fnan2.c b/compiler-rt/lib/builtins/arm/fnan2.c new file mode 100644 index 0000000000000..0563fe1fde38f --- /dev/null +++ b/compiler-rt/lib/builtins/arm/fnan2.c @@ -0,0 +1,38 @@ +//===-- fnan2.c - Handle single-precision NaN inputs to binary operation --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This helper function is available for use by single-precision float +// arithmetic implementations to handle propagating NaNs from the input +// operands to the output, in a way that matches Arm hardware FP. +// +// On input, a and b are floating-point numbers in IEEE 754 encoding, and at +// least one of them must be a NaN. The return value is the correct output NaN. +// +//===----------------------------------------------------------------------===// + +#include + +uint32_t __compiler_rt_fnan2(uint32_t a, uint32_t b) { + // Make shifted-left copies of a and b to discard the sign bit. Then add 1 at + // the bit position where the quiet vs signalling bit ended up. This squashes + // all the signalling NaNs to the top of the range of 32-bit values, from + // 0xff800001 to 0xffffffff inclusive; meanwhile, all the quiet NaN values + // wrap round to the bottom, from 0 to 0x007fffff inclusive. So we can detect + // a signalling NaN by asking if it's greater than 0xff800000, and a quiet + // one by asking if it's less than 0x00800000. + uint32_t aadj = (a << 1) + 0x00800000; + uint32_t badj = (b << 1) + 0x00800000; + if (aadj > 0xff800000) // a is a signalling NaN? + return a | 0x00400000; // if so, return it with the quiet bit set + if (badj > 0xff800000) // b is a signalling NaN? + return b | 0x00400000; // if so, return it with the quiet bit set + if (aadj < 0x00800000) // a is a quiet NaN? 
+ return a; // if so, return it + else // expect (badj < 0x00800000) + return b; // in that case b must be a quiet NaN +} diff --git a/compiler-rt/lib/builtins/arm/fnorm2.c b/compiler-rt/lib/builtins/arm/fnorm2.c new file mode 100644 index 0000000000000..29eba1cbde59d --- /dev/null +++ b/compiler-rt/lib/builtins/arm/fnorm2.c @@ -0,0 +1,62 @@ +//===-- fnorm2.c - Handle single-precision denormal inputs to binary op ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This helper function is available for use by single-precision float +// arithmetic implementations, to handle denormal inputs on entry by +// renormalizing the mantissa and modifying the exponent to match. +// +//===----------------------------------------------------------------------===// + +#include + +// Structure containing the function's inputs and outputs. +// +// On entry: a, b are two input floating-point numbers, still in IEEE 754 +// encoding. expa and expb are the 8-bit exponents of those numbers, extracted +// and shifted down to the low 8 bits of the word, with no other change. +// Neither value should be zero, or have the maximum exponent (indicating an +// infinity or NaN). +// +// On exit: each of a and b contains the mantissa of the input value, with the +// leading 1 bit made explicit, and shifted up to the top of the word. If expa +// was zero (indicating that a was denormal) then it is now represented as a +// normalized number with an out-of-range exponent (zero or negative). The same +// applies to expb and b. +struct fnorm2 { + uint32_t a, b, expa, expb; +}; + +void __compiler_rt_fnorm2(struct fnorm2 *values) { + // Shift the mantissas of a and b to the right place to follow a leading 1 in + // the top bit, if there is one. + values->a <<= 8; + values->b <<= 8; + + // Test if a is denormal. + if (values->expa == 0) { + // If so, decide how much further up to shift its mantissa, and adjust its + // exponent to match. This brings the leading 1 of the denormal mantissa to + // the top of values->a. + uint32_t shift = __builtin_clz(values->a); + values->a <<= shift; + values->expa = 1 - shift; + } else { + // Otherwise, leave the mantissa of a in its current position, and OR in + // the explicit leading 1. + values->a |= 0x80000000; + } + + // Do the same operation on b. + if (values->expb == 0) { + uint32_t shift = __builtin_clz(values->b); + values->b <<= shift; + values->expb = 1 - shift; + } else { + values->b |= 0x80000000; + } +} diff --git a/compiler-rt/lib/builtins/arm/funder.c b/compiler-rt/lib/builtins/arm/funder.c new file mode 100644 index 0000000000000..fd29e157328a3 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/funder.c @@ -0,0 +1,78 @@ +//===-- funder.c - Handle single-precision floating-point underflow -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This helper function is available for use by single-precision float +// arithmetic implementations to handle underflowed output values, if they were +// computed in the form of a normalized mantissa and an out-of-range exponent. 
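+//
+// As a concrete example of the job being done here (illustrative values): the
+// true result 1.5 * 2^-130 is denormal in single precision. It arrives at this
+// function already scaled up by 2^192, i.e. as x = 0x5EC00000 (which encodes
+// 1.5 * 2^62), and, since that value is exact, with errsign = 0. The call
+//
+//   __compiler_rt_funder(0x5EC00000, 0)
+//
+// then returns 0x000C0000, the denormal encoding of 0xC0000 * 2^-149, which is
+// exactly 1.5 * 2^-130 again.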
+// +// On input: x should be a complete IEEE 754 floating-point value representing +// the desired output scaled up by 2^192 (the same value that would have been +// passed to an underflow trap handler in IEEE 754:1985). +// +// This isn't enough information to re-round to the correct output denormal +// without also knowing whether x itself has already been rounded, and which +// way. 'errsign' gives this information, by indicating the sign of the value +// (true result - x). That is, if errsign > 0 it means the true value was +// larger (x was rounded down); if errsign < 0 then x was rounded up; if +// errsign == 0 then x represents the _exact_ desired output value. +// +//===----------------------------------------------------------------------===// + +#include + +#define SIGNBIT 0x80000000 +#define MANTSIZE 23 +#define BIAS 0xc0 + +uint32_t __compiler_rt_funder(uint32_t x, uint32_t errsign) { + uint32_t sign = x & SIGNBIT; + uint32_t exponent = (x << 1) >> 24; + + // Rule out exponents so small (or large!) that no denormalisation + // is needed. + if (exponent > BIAS) { + // Exponent 0xc1 or above means a normalised number got here by + // mistake, so we just remove the 0xc0 exponent bias and go + // straight home. + return x - (BIAS << MANTSIZE); + } + uint32_t bits_lost = BIAS + 1 - exponent; + if (bits_lost > MANTSIZE + 1) { + // The implicit leading 1 of the intermediate value's mantissa is + // below the lowest mantissa bit of a denormal by at least 2 bits. + // Round down to 0 unconditionally. + return sign; + } + + // Make the full mantissa (with leading bit) at the top of the word. + uint32_t mantissa = 0x80000000 | (x << 8); + // Adjust by 1 depending on the sign of the error. + mantissa -= errsign >> 31; + mantissa += (-errsign) >> 31; + + // Shift down to the output position, keeping the bits shifted off. + uint32_t outmant, shifted_off; + if (bits_lost == MANTSIZE + 1) { + // Special case for the exponent where we have to shift the whole + // of 'mantissa' off the bottom of the word. + outmant = 0; + shifted_off = mantissa; + } else { + outmant = mantissa >> (8 + bits_lost); + shifted_off = mantissa << (32 - (8 + bits_lost)); + } + + // Re-round. + if (shifted_off >> 31) { + outmant++; + if (!(shifted_off << 1)) + outmant &= ~1; // halfway case: round to even + } + + return sign | outmant; +} diff --git a/compiler-rt/lib/builtins/arm/mulsf3.S b/compiler-rt/lib/builtins/arm/mulsf3.S new file mode 100644 index 0000000000000..dc1843615313a --- /dev/null +++ b/compiler-rt/lib/builtins/arm/mulsf3.S @@ -0,0 +1,309 @@ +//===-- mulsf3.S - single-precision floating point multiplication ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements single-precision soft-float multiplication with the +// IEEE-754 default rounding (to nearest, ties to even), in optimized AArch32 +// assembly language suitable to be built as either Arm or Thumb2. +// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" + + + .syntax unified + .text + .p2align 2 + +DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_fmul, __mulsf3) + +DEFINE_COMPILERRT_FUNCTION(__mulsf3) + + // Check if either input exponent is 00 or FF (i.e. not a normalized number), + // and if so, branch out of line. 
If we don't branch out of line, then we've + // also extracted the exponents of the input values r0/r1 into bits 16..23 of + // r2/r3. But if we do, then that hasn't necessarily been done (because the + // second AND might have been skipped). + MOV r12, #0xFF0000 + ANDS r2, r12, r0, LSR #7 // sets Z if exponent of x is 0 + ANDSNE r3, r12, r1, LSR #7 // otherwise, sets Z if exponent of y is 0 + TEQNE r2, r12 // otherwise, sets Z if exponent of x is FF + TEQNE r3, r12 // otherwise, sets Z if exponent of y is FF + BEQ LOCAL_LABEL(uncommon) // branch out of line to handle inf/NaN/0/denorm + + // Calculate the sign of the result, and put it in an unused bit of r2. + TEQ r0, r1 // sets N to the XOR of x and y's sign bits + ORRMI r2, r2, #0x100 // if N set, set bit 8 of r2 + + // Move the input mantissas to the high end of r0/r1, each with its leading + // bit set explicitly, so that they're in the right form to be multiplied. + MOV r12, #0x80000000 + ORR r0, r12, r0, LSL #8 + ORR r1, r12, r1, LSL #8 + + // Now we're ready to multiply mantissas. This is also the place we'll come + // back to after decoding denormal inputs. The denormal decoding will also + // have to set up the same register contents: + // - decoded fractions at the top of r0 and r1 + // - exponents in r2 and r3, starting at bit 16 + // - output sign in r2 bit 8 +LOCAL_LABEL(mul): + + // Here we multiply the mantissas, and compute the output exponent by adding + // the input exponents and rebiasing. These operations are interleaved to + // use a delay slot. + // + // The exponent is rebiased by subtracting 0x80, rather than the 0x7F you'd + // expect. That compensates for the leading bit of the mantissa overlapping + // it, when we recombine the exponent and mantissa by addition. + ADD r2, r2, r3 // r2 has sum of exponents, freeing up r3 + UMULL r1, r3, r0, r1 // r3:r1 has the double-width product + SUB r2, r2, #(0x80 << 16) // rebias the summed exponent + + // Compress the double-word product into just the high-order word r3, by + // setting its bit 0 if any bit of the low-order word is nonzero. This + // changes the represented value, but not by nearly enough to affect + // rounding, because rounding only depends on the bit below the last output + // bit, and the general question of whether _any_ nonzero bit exists below + // that. + CMP r1, #0 // if low word of full product is nonzero + ORRNE r3, r3, #1 // then set LSB of high word + + // The two inputs to UMULL had their high bits set, that is, were at least + // 0x80000000. So the 64-bit product was at least 0x4000000000000000, i.e. + // the high bit of the product could be at the top of the word or one bit + // below. Check which, by experimentally shifting left, and then undoing it + // via RRX if we turned out to have shifted off a 1 bit. + LSLS r3, r3, #1 // shift left, setting C to the bit shifted off + RRXCS r3, r3 // if that bit was 1, put it back again + + // That ensured the leading 1 bit of the product is now the top of r3, but + // also, set C if the leading 1 was _already_ in the top bit. So now we know + // whether to increment the exponent. The following instruction does the + // conditional increment (because it's ADC), but also, copies the exponent + // field from bit 16 of r2 into bit 0, so as to place it just below the + // output sign bit. + // + // So, if the number hasn't overflowed or underflowed, the low 9 bits of r2 + // are exactly what we need to combine with the rounded mantissa. 
But the
+  // full output exponent (with extra bits) is still available in the high half
+  // of r2, so that we can check _whether_ we overflowed or underflowed.
+  ADC r2, r2, r2, ASR #16
+
+  // Recombine the exponent and mantissa, doing most of the rounding as a side
+  // effect: we shift the mantissa right so as to put the round bit into C, and
+  // then we recombine with the exponent using ADC, to increment the mantissa
+  // if C was set.
+  MOVS r12, r3, LSR #8
+  ADC r0, r12, r2, LSL #23
+
+  // To complete the rounding, we must check for the round-to-even tiebreaking
+  // case, by checking if we're in the exact halfway case, which occurs if and
+  // only if we _did_ round up (we can tell this because C is still set from
+  // the MOVS), and also, no bit of r3 is set _below_ the round bit.
+  //
+  // We combine this with an overflow check, so that C ends up set if anything
+  // weird happened, and clear if we're completely finished and can return.
+  //
+  // The best instruction sequence for this part varies between Arm and Thumb.
+#if !__thumb__
+  // Arm state: if C was set then we check the low bits of r3, so that Z ends
+  // up set if we need to round to even.
+  //
+  // (We rely here on Z reliably being clear to begin with, because shifting
+  // down the output mantissa definitely gave a nonzero output. Also, the TST
+  // doesn't change C, so if Z does end up set, then C was also set.)
+  //
+  // Then, if we're not rounding to even, we do a CMP which sets C if there's
+  // been an overflow or an underflow. An overflow could occur for an output
+  // exponent as low as 0xFC, because we might increment the exponent by 1 when
+  // renormalizing, by another when recombining with the mantissa, and by one
+  // more if rounding up causes a carry off the top of the mantissa. An
+  // underflow occurs only if the output exponent is negative (because it's
+  // offset by 1, so an exponent of 0 will be incremented to 1), in which case
+  // the top 8 bits of r2 will all be set. Therefore, an unsigned comparison to
+  // see if r2 > 0xFC0000 will catch all overflow and underflow cases. It also
+  // catches a few very large cases that _don't_ quite overflow (exponents of
+  // 0xFC and above that don't get maximally unlucky); those will also be
+  // handled by the slow path.
+  TSTCS r3, #0x7F
+  CMPNE r2, #0xFC0000
+#else
+  // In Thumb, switching between different conditions has a higher cost due to
+  // the (implicit in this code) IT instructions, so we prefer a strategy that
+  // uses CC and CS conditions throughout, at the cost of requiring some extra
+  // cleanup instructions on the slow path.
+  //
+  // If C is set (and hence round-to-even is a possibility), the basic idea is
+  // to shift the full result word (r3) left by 25, leaving only its bottom 7
+  // bits, which are now the top 7 bits; then we want to set C iff these are 0.
+  //
+  // The "CMP x,y" instruction sets C if x >= y (as unsigned integers). So this
+  // could be done in one instruction if only we had a register to use as x
+  // which has 0 in its top 7 bits. Then we could compare that against the
+  // shifted-up value of r3, setting C precisely if the top 7 bits of y are all
+  // zero: if any of them is nonzero, y is bigger than x and C is cleared. And
+  // happily, we _do_ have such a register! r12 contains the shifted-down
+  // mantissa, which is guaranteed to have a 1 in bit 23, and 0 above that.
+  //
+  // The shift of r3 happens only in the second operand of the compare, so we
+  // don't lose the original value of r3 in this process.
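+  //
+  // (In C terms, the CMPCS below computes, illustratively,
+  //
+  //     C = (r12 >= (uint32_t)(r3 << 25));
+  //
+  // which is true exactly when bits 0-6 of r3 are all zero, because r12 is
+  // below 2^24 while a nonzero r3 << 25 is at least 2^25.)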
+ // + // The check for over/underflow is exactly as in the Arm branch above, except + // based on a different condition. + CMPCS r12, r3, LSL #25 // now C is set iff we're rounding to even + CMPCC r2, #0xFC0000 // and now it's also set if we've over/underflowed +#endif + + // That's all the checks for difficult cases done. If C is clear, we can + // return. + BXCC lr + + // Now the slower path begins. We have to recover enough information to + // handle all of round-to-even, overflow and underflow. + // + // Round to even is the most likely of these, so we detect it first and + // handle it as fast as possible. + +#if __thumb__ + // First, Thumb-specific compensation code. The Arm branch of the #if above + // will have set Z=0 to indicate round to even, but the Thumb branch didn't + // leave any unambiguous indicator of RTE, so we must retest by checking all + // the bits shifted off the bottom of the mantissa to see if they're exactly + // the half-way value. + LSL r12, r3, #24 // r12 = round bit and everything below + CMP r12, #0x80000000 // set Z if that is exactly 0x80000000 +#endif + + // Now Z is clear iff we have already rounded up and now must replace that + // with rounding to even, which is done by just clearing the low bit of the + // mantissa. + BICEQ r0, r0, #1 + + // Redo the over/underflow check (the same way as in both branches above), + // and if it doesn't report a danger, we can return the rounded-to-even + // answer. + CMP r2, #0xFC0000 // check for over/underflow + BXCC lr // and return if none. + + // Now we only have overflow and underflow left to handle. First, find out + // which we're looking at. This is easy by testing the top bit of r2, but + // even easier by using the fact that the possible positive and negative + // values of r2 are widely enough separated that the 0xFC0000 subtracted by + // the CMP above won't have made any difference. So the N flag output from + // that comparison _already_ tells us which condition we have: if N is set we + // have underflow, and if N is clear, overflow. + BPL LOCAL_LABEL(overflow) + + // Here we're handling underflow. + + // Add the IEEE 754:1985 exponent bias which funder will expect. This also + // brings the exponent back into a range where it can't possibly have carried + // into the sign bit, so the output sign will now be right. + ADD r0, r0, #(0xC0 << 23) + + // Determine whether we rounded up, down or not at all. + LSLS r2, r3, #1 // input mantissa, without its leading 1 + SUBS r1, r2, r0, LSL #9 // subtract the output mantissa (likewise) + + // And let funder handle the rest. + B SYMBOL_NAME(__compiler_rt_funder) + +LOCAL_LABEL(overflow): + // We come here to handle overflow, but it's not guaranteed that an overflow + // has actually happened: our check on the fast path erred on the side of + // caution, by catching any output exponent that _could_ cause an overflow. + // So first check whether this really is an overflow, by extracting the + // output exponent. Exponent 0xFF, or anything that wrapped round to having + // the high bit clear, are overflows; 0xFE down to 0xFC are not overflows. + // + // The value in r0 is correct to return, if there's no overflow. + ADD r12, r0, #(1 << 23) // add 1 to the exponent so 0xFF wraps to 0 + MOVS r12, r12, LSL #1 // test the top bit of the modified value + BXMI lr // if top bit is still 1, not an overflow + + // This is an overflow, so we need to replace it with an appropriately signed + // infinity. 
First we correct the sign by applying a downward bias to the + // exponent (the one suggested in IEEE 754:1985, which was chosen to bring + // all possible overflowed results back into range). + SUBS r0, r0, #(0xC0 << 23) + + // Now the sign bit of r0 is correct. Replace everything else with the + // encoding of an infinity. + MOV r1, #0xFF + AND r0, r0, #0x80000000 + ORR r0, r0, r1, LSL #23 + BX lr + +LOCAL_LABEL(uncommon): + // Handle zeros, denorms, infinities and NaNs. We arrive here knowing that + // we've at least done the first _two_ instructions from the entry point, + // even if all the rest were skipped. So r2 contains the sign and exponent of + // x in bits 16..23, and r12 = 0xFF << 16. + // + // So, first repeat some instructions from the prologue, which were either + // conditionally skipped in the sequence leading to the branch, or skipped + // because they happened after the branch. + AND r3, r12, r1, LSR #7 // get exponent of y in r3 bits 16..23 + TEQ r0, r1 // calculate the sign of the result + ORRMI r2, r2, #0x100 // and put it in bit 8 of r2 as before + + // Check for infinities and NaNs, by testing each of r2,r3 to see if it's at + // least 0xFF0000 (hence the exponent field is equal to 0xFF). + CMP r2, r12 + CMPLO r3, r12 + BHS LOCAL_LABEL(inf_NaN) + + // If we didn't take that branch, then we have only finite numbers, but at + // least one is denormal or zero. A zero makes the result easy (and also is a + // more likely input than a denormal), so check those first, as fast as + // possible. + MOVS r12, r0, LSL #1 // Z set if x == 0 + MOVSNE r12, r1, LSL #1 // now Z set if either input is 0 + MOVEQ r0, r2, LSL #23 // in either case, make 0 of the output sign + BXEQ lr // and return it + + // Now we know we only have denormals to deal with. Call fnorm2 to sort + // them out, and rejoin the main code path above. + AND r12, r2, #0x100 // save the result sign from r2 + LSR r2, #16 // shift extracted exponents down to bit 0 + LSR r3, #16 // where fnorm2 will expect them + PUSH {r0, r1, r2, r3, r12, lr} + MOV r0, sp // tell fnorm2 where to find its data + BL SYMBOL_NAME(__compiler_rt_fnorm2) + POP {r0, r1, r2, r3, r12, lr} + LSL r3, #16 // shift exponents back up to bit 16 + ORR r2, r12, r2, LSL #16 // and put the result sign back in r2 + B LOCAL_LABEL(mul) + +LOCAL_LABEL(inf_NaN): + // We come here if at least one input is a NaN or infinity. If either or both + // inputs are NaN then we hand off to fnan2 which will propagate a NaN from + // the input; otherwise any multiplication involving infinity returns + // infinity, unless it's infinity * 0 which is an invalid operation and + // returns NaN again. + MOV r12, #0xFF000000 + CMP r12, r0, LSL #1 // if (r0 << 1) > 0xFF000000, r0 is a NaN + BLO SYMBOL_NAME(__compiler_rt_fnan2) + CMP r12, r1, LSL #1 + BLO SYMBOL_NAME(__compiler_rt_fnan2) + + // NaNs are dealt with, so now we have at least one infinity. Check if the + // other operand is 0. This is conveniently done by XORing the two: because + // we know that the low 31 bits of one operand are exactly 0x7F800000, we can + // test if the low 31 bits of the other one are all 0 by checking whether the + // low 31 bits of (x XOR y) equal 0x7F800000. 
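+  // (Equivalently, in C: given that one operand's low 31 bits are exactly
+  // 0x7F800000, the test below amounts to
+  //
+  //     int inf_times_zero = (((x ^ y) << 1) == 0xFF000000);
+  //
+  // which is nonzero precisely when the other operand is +0 or -0.)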
+ EOR r3, r0, r1 + CMP r12, r3, LSL #1 // if inf * 0, this sets Z + LSR r0, r12, #1 // set up return value of +infinity + ORRNE r0, r0, r2, LSL #23 // if not inf * 0, put on the output sign + ORREQ r0, r0, #0x400000 // otherwise, set the 'quiet NaN' bit + BX lr // and return + +END_COMPILERRT_FUNCTION(__mulsf3) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S b/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S new file mode 100644 index 0000000000000..62a6d71011003 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S @@ -0,0 +1,251 @@ +//===-- mulsf3.S - single-precision floating point multiplication ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements single-precision soft-float multiplication with the +// IEEE-754 default rounding (to nearest, ties to even), in optimized Thumb1 +// assembly language. +// +//===----------------------------------------------------------------------===// + +#include "../../assembly.h" + + .syntax unified + .text + .thumb + .p2align 2 + +DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_fmul, __mulsf3) + +DEFINE_COMPILERRT_FUNCTION(__mulsf3) + PUSH {r4,r5,r6,lr} + + // Get exponents of the inputs, and check for uncommon values. In the process + // of this we also compute the sign, because it's marginally quicker that + // way. + LSLS r2, r0, #1 + ADCS r4, r4, r4 // set r4[0] to sign bit of x + LSLS r3, r1, #1 + ADCS r4, r4, r3 // set r4[0] to the output sign + LSRS r2, r2, #24 + BEQ LOCAL_LABEL(zerodenorm0) // still do the next LSRS + LSRS r3, r3, #24 + BEQ LOCAL_LABEL(zerodenorm) + CMP r2, #255 + BEQ LOCAL_LABEL(naninf) + CMP r3, #255 + BEQ LOCAL_LABEL(naninf) + // Compute the output exponent. We'll be generating our product _without_ the + // leading bit, so we subtract 0x7f rather than 0x80. + ADDS r2, r2, r3 + SUBS r2, r2, #0x7f + // Blank off everything above the mantissas. + LSLS r0, r0, #9 + LSLS r1, r1, #9 +LOCAL_LABEL(normalised): // we may come back here from zerodenorm + LSRS r0, r0, #9 + LSRS r1, r1, #9 + // Multiply. r0 and r1 are the mantissas of the inputs but without their + // leading bits, so the product we want in principle is P=(r0+2^23)(r1+2^23). + // P is at most (2^24-1)^2 < 2^48, so it fits in a word and a half. + // + // The technique below will actually compute P - 2^46, by not adding on the + // term where the two 2^23 are multiplied. The 48-bit result will be + // delivered in two output registers, one containing its bottom 32 bits and + // the other containing the top 32, so they overlap in the middle 16 bits. + // This is done using only two multiply instructions and some bookkeeping. + // + // In the comments I'll write X and Y for the original input mantissas (again + // without their leading bits). I'll also decompose them as X = xh + xl and + // Y = yh + yl, where xl and yl are in the range 0..2^8-1 and xh,yh are + // multiples of 2^8. 
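For reference, the widening trick just described can be written in C as below. This is an illustration only, with hypothetical names, and it covers just the two-multiply decomposition for 23-bit inputs; the real code additionally folds in the (X+Y)<<23 term that accounts for the implicit leading bits.

#include <assert.h>
#include <stdint.h>

// Illustration: the full product of two 23-bit values using nothing wider
// than a 32x32->32 multiply.  The two outputs overlap in their middle 16
// bits, just as r3 and r6 do in the code below.
static void mul23x23(uint32_t x, uint32_t y, uint32_t *hi, uint32_t *lo) {
  uint32_t hh = (x >> 8) * (y >> 8);    // (xh*yh) >> 16
  uint32_t low = x * y;                 // product mod 2^32
  uint32_t cross = low - (hh << 16);    // xh*yl + xl*yh + xl*yl, exact (< 2^32)
  *hi = hh + (cross >> 16);             // product >> 16
  *lo = low;                            // product mod 2^32
}

int main(void) {
  uint32_t x = 0x7abcde, y = 0x654321, hi, lo;  // both < 1 << 23
  mul23x23(x, y, &hi, &lo);
  assert(hi == (uint32_t)(((uint64_t)x * y) >> 16));
  assert(lo == (uint32_t)((uint64_t)x * y));
  return 0;
}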
+ ADDS r5, r0, r1 + LSLS r5, r5, #7 // r5 = (X+Y) << 7 + MOVS r6, r0 + MULS r6, r1, r6 // r6 is congruent mod 2^32 to X*Y + LSRS r0, r0, #8 + LSRS r1, r1, #8 + MULS r0, r1, r0 + LSLS r1, r0, #16 // r1 is congruent mod 2^32 to xh*yh + SUBS r3, r6, r1 // now r3 is congruent mod 2^32 to + // (X*Y) - (xh*yh) = xh*yl + xl*yh + xl*yl + // and hence, since that is at most 0xfeff0001, + // is _exactly_ equal to that + ADDS r0, r0, r5 // r0 is now (xh*yh + (X+Y)<<23) >> 16 + LSRS r1, r3, #16 // r1 is the top 16 bits of r3, i.e. + // (xh*yl + xl*yh + xl*yl) >> 16 + ADDS r3, r0, r1 // now r3 equals + // (xh*yh + xh*yl + xl*yh + xl*yl + (X+Y)<<23) >> 16 + // i.e. (X*Y + (X+Y)<<23) >> 16, + // i.e. (the right answer) >> 16. + // Meanwhile, r6 is exactly the bottom 32 bits of the + // right answer. + // Renormalise if necessary. + LSRS r1, r3, #30 + BEQ LOCAL_LABEL(norenorm) + // Here we have to do something fiddly. Renormalisation would be a trivial + // job if we had the leading mantissa bit - just note that it's one bit + // position above where it should be, and shift right by one. But without + // that bit, we currently have (2x - 2^30), and we want (x - 2^30); just + // shifting right would of course give us (x - 2^29), so we must subtract an + // extra 2^29 to fix this up. + LSRS r3, r3, #1 + MOVS r1, #1 + LSLS r1, r1, #29 + SUBS r3, r3, r1 + ADDS r2, r2, #1 +LOCAL_LABEL(norenorm): + // Round and shift down to the right bit position. + LSRS r0, r3, #7 // round bit goes into the carry flag + BCC LOCAL_LABEL(rounded) + ADDS r0, r0, #1 + // In the round-up branch, we must also check if we have to round to even, by + // testing all the bits below the round bit. We will normally not expect to, + // so we do RTE by branching out of line and back again to avoid spending a + // branch in the common case. + LSLS r5, r3, #32-7+1 // check the bits shifted out of r3 above + BNE LOCAL_LABEL(rounded) // if any is nonzero, we're not rounding to even + LSLS r5, r6, #15 // check the bottom 17 bits of the low-order 32 + // (enough to overlap r3 even if we renormalised) + BEQ LOCAL_LABEL(rte) // if any is nonzero, fall through, else RTE +LOCAL_LABEL(rounded): + // Put on the sign and exponent, check for underflow and overflow, and + // return. + // + // Underflow occurs iff r2 (the output exponent) <= 0. Overflow occurs if + // it's >= 0xFF. (Also if it's 0xFE and we rounded up to overflow, but since + // this code doesn't report exceptions, we can ignore this case because it'll + // happen to return the right answer regardless). So we handle most of this + // via an unsigned comparison against 0xFF, which leaves the one case of a + // zero exponent that we have to filter separately by testing the Z flag + // after we shift the exponent back up into place. + CMP r2, #0xFF // check for most over/underflows + BHS LOCAL_LABEL(outflow) // ... and branch out of line for them + LSLS r5, r2, #23 // shift the exponent into its output location + BEQ LOCAL_LABEL(outflow) // ... and branch again if it was 0 + LSLS r4, r4, #31 // shift the output sign into place + ORRS r0, r0, r4 // and OR it in to the output + ADDS r0, r0, r5 // OR in the mantissa + POP {r4,r5,r6,pc} // and return + +LOCAL_LABEL(rte): + // Out-of-line handler for the round-to-even case. Clear the low mantissa bit + // and go back to the post-rounding code. 
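In C terms, the complete rounding decision above amounts to the following. Illustration only, with hypothetical names: hi stands in for r3 (the product shifted down by 16) and lo for r6 (the low-order word).

#include <stdint.h>

// Illustration of the rounding policy: round up on the round bit (bit 6 of
// hi), then undo the low result bit if every bit below the round bit,
// including the relevant bits of the low-order word, was zero -- i.e. the
// discarded part was an exact half.
static uint32_t round_product(uint32_t hi, uint32_t lo) {
  uint32_t result = hi >> 7;
  if (hi & (1u << 6)) {                          // round bit set
    result += 1;                                 // provisionally round up
    if ((hi & 0x3f) == 0 && (lo & 0x1ffff) == 0) // nothing else below it
      result &= ~1u;                             // exact tie: round to even
  }
  return result;
}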
+ MOVS r5, #1 + BICS r0, r0, r5 + B LOCAL_LABEL(rounded) + +LOCAL_LABEL(outflow): + CMP r2, #0 + BGT LOCAL_LABEL(overflow) + // To handle underflow, we construct an intermediate value in the IEEE 754 + // style (using our existing full-length mantissa, and bias the exponent by + // +0xC0), and indicate whether that intermediate was rounded up, down or not + // at all. Then call the helper function __funder, which will denormalise and + // re-round correctly. + LSLS r1, r0, #7 // shift up the post-rounding mantissa + SUBS r1, r3, r1 // and subtract it from the pre-rounding version + LSLS r6, r6, #15 + CMP r6, #1 // if the rest of the low bits are nonzero + ADCS r1, r1, r1 // then set an extra bit at the bottom + + LSLS r4, r4, #31 + ORRS r0, r0, r4 // put on the sign + ADDS r2, r2, #192 // bias the exponent + LSLS r3, r2, #23 + ADDS r0, r0, r3 // put on the biased exponent + + BL __funder + POP {r4,r5,r6,pc} + +LOCAL_LABEL(overflow): + // Handle overflow by returning an infinity of the correct sign. + LSLS r4, r4, #8 // move the sign up to bit 8 + MOVS r0, #0xff + ORRS r0, r0, r4 // fill in an exponent just below it + LSLS r0, r0, #23 // and shift those 9 bits up to the top of the word + POP {r4,r5,r6,pc} + + // We come here if there's at least one zero or denormal. On the fast path + // above, it was convenient to check these before checking NaNs and + // infinities, but NaNs take precedence, so now we're off the fast path, we + // must still check for those. + // + // At the main entry point 'zerodenorm' we want r2 and r3 to be the two input + // exponents. So if we branched after shifting-and-checking r2, we come to + // this earlier entry point 'zerodenorm0' so that we still shift r3. +LOCAL_LABEL(zerodenorm0): + LSRS r3, r3, #24 +LOCAL_LABEL(zerodenorm): + CMP r2, #255 + BEQ LOCAL_LABEL(naninf) + CMP r3, #255 + BEQ LOCAL_LABEL(naninf) + // Now we know we have at least one zero or denormal, and no NaN or infinity. + // Check if either input is actually zero. We've ruled out 0 * infinity by + // this point, so any zero input means we return zero of the correct sign. + LSLS r6, r0, #1 // is one input zero? + BEQ LOCAL_LABEL(zero) // yes, go and return zero + LSLS r6, r1, #1 // is the other one zero? + BNE LOCAL_LABEL(denorm) // if not, one must have been a denormal +LOCAL_LABEL(zero): + LSLS r0, r4, #31 // shift up the output sign to make the return value + POP {r4,r5,r6,pc} + + // Handle denormals via the helper function __fnorm2, which will break both + // inputs up into mantissa and exponent, renormalising and generating a + // negative exponent if necessary. +LOCAL_LABEL(denorm): + PUSH {r0,r1,r2,r3} + MOV r0, sp + BL __fnorm2 + POP {r0,r1,r2,r3} + // Convert __fnorm2's return values into the right form to rejoin the main + // code path. + LSLS r0, r0, #1 + LSLS r1, r1, #1 + ADDS r2, r2, r3 + SUBS r2, r2, #0x7f + B LOCAL_LABEL(normalised) + + // We come here if at least one input is a NaN or infinity. There may still + // be zeroes (or denormals, though they make no difference at this stage). +LOCAL_LABEL(naninf): + MOVS r6, #0xff + LSLS r6, r6, #24 + LSLS r5, r0, #1 + CMP r5, r6 + BHI LOCAL_LABEL(nan) // first operand is a NaN + LSLS r5, r1, #1 + CMP r5, r6 + BHI LOCAL_LABEL(nan) // second operand is a NaN + + // We know we have at least one infinity, and no NaNs. We might also have a + // zero, in which case we return the default quiet NaN. 
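The two encodings that these slow paths eventually hand back are simple to state in C; the helpers here are hypothetical and purely illustrative.

#include <stdint.h>

// Illustration: overflow and inf * finite return the sign bit with an
// all-ones exponent and a zero mantissa; the invalid case inf * 0 returns
// the default quiet NaN.
static uint32_t signed_infinity(uint32_t sign /* 0 or 1 */) {
  return (sign << 31) | (UINT32_C(0xff) << 23);  // 0x7f800000 or 0xff800000
}

static uint32_t default_quiet_nan(void) {
  return UINT32_C(0x7fc00000);
}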
+ LSLS r6, r0, #1 + BEQ LOCAL_LABEL(infzero) // if r0 is a zero, r1 must be inf + LSLS r6, r1, #1 + BEQ LOCAL_LABEL(infzero) // if r1 is a zero, r0 must be inf + // Otherwise we have infinity * infinity, or infinity * finite. Just return + // an appropriately signed infinity. + B LOCAL_LABEL(overflow) // reuse the code there + + // We come here if at least one input is a NaN. Hand off to __fnan2, which + // propagates an appropriate NaN to the output, dealing with the special + // cases of signalling/quiet NaNs. +LOCAL_LABEL(nan): + BL __fnan2 + POP {r4,r5,r6,pc} + + // Return a quiet NaN as the result of infinity * zero. +LOCAL_LABEL(infzero): + LDR r0, =0x7fc00000 + POP {r4,r5,r6,pc} + +END_COMPILERRT_FUNCTION(__mulsf3) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/test/builtins/Unit/divsf3_test.c b/compiler-rt/test/builtins/Unit/divsf3_test.c index f8cb6169ac283..b5ebb2d9b5093 100644 --- a/compiler-rt/test/builtins/Unit/divsf3_test.c +++ b/compiler-rt/test/builtins/Unit/divsf3_test.c @@ -1,7 +1,12 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + // RUN: %clang_builtins %s %librt -o %t && %run %t // REQUIRES: librt_has_divsf3 #include "int_lib.h" +#include #include #include "fp_test.h" @@ -9,107 +14,379 @@ // Returns: a / b COMPILER_RT_ABI float __divsf3(float a, float b); -int test__divsf3(float a, float b, uint32_t expected) -{ - float x = __divsf3(a, b); - int ret = compareResultF(x, expected); +int test__divsf3(uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) { + float a = fromRep32(a_rep), b = fromRep32(b_rep); + float x = __divsf3(a, b); + int ret = compareResultF(x, expected_rep); - if (ret){ - printf("error in test__divsf3(%.20e, %.20e) = %.20e, " - "expected %.20e\n", a, b, x, - fromRep32(expected)); - } - return ret; + if (ret) { + printf("error in test__divsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32 + ", expected %08" PRIx32 "\n", + a_rep, b_rep, toRep32(x), expected_rep); + } + return ret; } -int main() -{ - // Returned NaNs are assumed to be qNaN by default - - // qNaN / any = qNaN - if (test__divsf3(makeQNaN32(), 3.F, UINT32_C(0x7fc00000))) - return 1; - // NaN / any = NaN - if (test__divsf3(makeNaN32(UINT32_C(0x123)), 3.F, UINT32_C(0x7fc00000))) - return 1; - // any / qNaN = qNaN - if (test__divsf3(3.F, makeQNaN32(), UINT32_C(0x7fc00000))) - return 1; - // any / NaN = NaN - if (test__divsf3(3.F, makeNaN32(UINT32_C(0x123)), UINT32_C(0x7fc00000))) - return 1; - - // +Inf / positive = +Inf - if (test__divsf3(makeInf32(), 3.F, UINT32_C(0x7f800000))) - return 1; - // +Inf / negative = -Inf - if (test__divsf3(makeInf32(), -3.F, UINT32_C(0xff800000))) - return 1; - // -Inf / positive = -Inf - if (test__divsf3(makeNegativeInf32(), 3.F, UINT32_C(0xff800000))) - return 1; - // -Inf / negative = +Inf - if (test__divsf3(makeNegativeInf32(), -3.F, UINT32_C(0x7f800000))) - return 1; - - // Inf / Inf = NaN - if (test__divsf3(makeInf32(), makeInf32(), UINT32_C(0x7fc00000))) - return 1; - // 0.0 / 0.0 = NaN - if (test__divsf3(+0x0.0p+0F, +0x0.0p+0F, UINT32_C(0x7fc00000))) - return 1; - // +0.0 / +Inf = +0.0 - if (test__divsf3(+0x0.0p+0F, makeInf32(), UINT32_C(0x0))) - return 1; - // +Inf / +0.0 = +Inf - if (test__divsf3(makeInf32(), +0x0.0p+0F, UINT32_C(0x7f800000))) - return 1; - - // positive / +0.0 = +Inf - if (test__divsf3(+1.F, +0x0.0p+0F, UINT32_C(0x7f800000))) - return 1; - // positive / -0.0 = -Inf - if 
(test__divsf3(+1.F, -0x0.0p+0F, UINT32_C(0xff800000))) - return 1; - // negative / +0.0 = -Inf - if (test__divsf3(-1.F, +0x0.0p+0F, UINT32_C(0xff800000))) - return 1; - // negative / -0.0 = +Inf - if (test__divsf3(-1.F, -0x0.0p+0F, UINT32_C(0x7f800000))) - return 1; - - // 1/3 - if (test__divsf3(1.F, 3.F, UINT32_C(0x3eaaaaab))) - return 1; - // smallest normal result - if (test__divsf3(0x1.0p-125F, 2.F, UINT32_C(0x00800000))) - return 1; - - // divisor is exactly 1.0 - if (test__divsf3(0x1.0p+0F, 0x1.0p+0F, UINT32_C(0x3f800000))) - return 1; - // divisor is truncated to exactly 1.0 in UQ1.15 - if (test__divsf3(0x1.0p+0F, 0x1.0001p+0F, UINT32_C(0x3f7fff00))) - return 1; +int main(void) { + int status = 0; - // smallest normal value divided by 2.0 - if (test__divsf3(0x1.0p-126F, 2.0F, UINT32_C(0x00400000))) - return 1; - // smallest subnormal result - if (test__divsf3(0x1.0p-126F, 0x1p+23F, UINT32_C(0x00000001))) - return 1; + status |= test__divsf3(0x00000000, 0x00000001, 0x00000000); + status |= test__divsf3(0x00000000, 0x007fffff, 0x00000000); + status |= test__divsf3(0x00000000, 0x00800000, 0x00000000); + status |= test__divsf3(0x00000000, 0x00ffffff, 0x00000000); + status |= test__divsf3(0x00000000, 0x3f800000, 0x00000000); + status |= test__divsf3(0x00000000, 0x40a00000, 0x00000000); + status |= test__divsf3(0x00000000, 0x7effffff, 0x00000000); + status |= test__divsf3(0x00000000, 0x7f000000, 0x00000000); + status |= test__divsf3(0x00000000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x00000000, 0x80000002, 0x80000000); + status |= test__divsf3(0x00000000, 0x807fffff, 0x80000000); + status |= test__divsf3(0x00000000, 0x80800001, 0x80000000); + status |= test__divsf3(0x00000000, 0x81000000, 0x80000000); + status |= test__divsf3(0x00000000, 0xc0400000, 0x80000000); + status |= test__divsf3(0x00000000, 0xc0e00000, 0x80000000); + status |= test__divsf3(0x00000000, 0xfe7fffff, 0x80000000); + status |= test__divsf3(0x00000000, 0xff000000, 0x80000000); + status |= test__divsf3(0x00000000, 0xff800000, 0x80000000); + status |= test__divsf3(0x00000001, 0x00000000, 0x7f800000); + status |= test__divsf3(0x00000001, 0x3e000000, 0x00000008); + status |= test__divsf3(0x00000001, 0x3f000000, 0x00000002); + status |= test__divsf3(0x00000001, 0x40000000, 0x00000000); + status |= test__divsf3(0x00000001, 0x7f7fffff, 0x00000000); + status |= test__divsf3(0x00000001, 0x7f800000, 0x00000000); + status |= test__divsf3(0x00000001, 0xc0000000, 0x80000000); + status |= test__divsf3(0x00000001, 0xff7fffff, 0x80000000); + status |= test__divsf3(0x00000002, 0x80000000, 0xff800000); + status |= test__divsf3(0x00000002, 0xff800000, 0x80000000); + status |= test__divsf3(0x00000009, 0x41100000, 0x00000001); + status |= test__divsf3(0x00000009, 0xc1100000, 0x80000001); + status |= test__divsf3(0x007ffff7, 0x3f7ffffe, 0x007ffff8); + status |= test__divsf3(0x007ffffe, 0x3f7ffffe, 0x007fffff); + status |= test__divsf3(0x007fffff, 0x00000000, 0x7f800000); + status |= test__divsf3(0x007fffff, 0x3b000000, 0x04fffffe); + status |= test__divsf3(0x007fffff, 0x3f000000, 0x00fffffe); + status |= test__divsf3(0x007fffff, 0x3f800000, 0x007fffff); + status |= test__divsf3(0x007fffff, 0x3f800002, 0x007ffffd); + status |= test__divsf3(0x007fffff, 0x7f800000, 0x00000000); + status |= test__divsf3(0x007fffff, 0x80000000, 0xff800000); + status |= test__divsf3(0x007fffff, 0xbf800000, 0x807fffff); + status |= test__divsf3(0x007fffff, 0xff800000, 0x80000000); + status |= test__divsf3(0x00800000, 0x00000000, 0x7f800000); + status |= 
test__divsf3(0x00800000, 0x3f800001, 0x007fffff); + status |= test__divsf3(0x00800000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x00800001, 0x3f800002, 0x007fffff); + status |= test__divsf3(0x00800001, 0x80000000, 0xff800000); + status |= test__divsf3(0x00800001, 0xff800000, 0x80000000); + status |= test__divsf3(0x00800002, 0x3f800006, 0x007ffffc); + status |= test__divsf3(0x00fffffe, 0x40000000, 0x007fffff); + status |= test__divsf3(0x00ffffff, 0x00000000, 0x7f800000); + status |= test__divsf3(0x00ffffff, 0x40000000, 0x00800000); + status |= test__divsf3(0x00ffffff, 0x7f800000, 0x00000000); + status |= test__divsf3(0x01000000, 0x00800000, 0x40000000); + status |= test__divsf3(0x01000000, 0x80000000, 0xff800000); + status |= test__divsf3(0x01000000, 0xc0000000, 0x80800000); + status |= test__divsf3(0x01000000, 0xff800000, 0x80000000); + status |= test__divsf3(0x01000001, 0x00800001, 0x40000000); + status |= test__divsf3(0x01000001, 0xc0000000, 0x80800001); + status |= test__divsf3(0x01000003, 0x80800003, 0xc0000000); + status |= test__divsf3(0x01000003, 0xc0000000, 0x80800003); + status |= test__divsf3(0x3f7ffff7, 0x3f7ffffb, 0x3f7ffffc); + status |= test__divsf3(0x3f7ffff7, 0x3f7ffffe, 0x3f7ffff9); + status |= test__divsf3(0x3f7ffff8, 0x3f7ffffc, 0x3f7ffffc); + status |= test__divsf3(0x3f7ffff8, 0x3f7ffffd, 0x3f7ffffb); + status |= test__divsf3(0x3f7ffffa, 0x3f7ffff9, 0x3f800001); + status |= test__divsf3(0x3f7ffffb, 0x3f7ffff9, 0x3f800001); + status |= test__divsf3(0x3f7ffffc, 0x3f7ffff9, 0x3f800002); + status |= test__divsf3(0x3f7ffffc, 0x3f7ffffd, 0x3f7fffff); + status |= test__divsf3(0x3f7ffffc, 0x3f7ffffe, 0x3f7ffffe); + status |= test__divsf3(0x3f7ffffc, 0x3f7fffff, 0x3f7ffffd); + status |= test__divsf3(0x3f7ffffc, 0x3f800001, 0x3f7ffffa); + status |= test__divsf3(0x3f7ffffd, 0x3f7ffff9, 0x3f800002); + status |= test__divsf3(0x3f7ffffd, 0x3f7ffffc, 0x3f800001); + status |= test__divsf3(0x3f7ffffd, 0x3f7ffffe, 0x3f7fffff); + status |= test__divsf3(0x3f7ffffd, 0x3f7fffff, 0x3f7ffffe); + status |= test__divsf3(0x3f7ffffd, 0x3f800001, 0x3f7ffffb); + status |= test__divsf3(0x3f7ffffd, 0x3f800002, 0x3f7ffff9); + status |= test__divsf3(0x3f7ffffe, 0x3f7ffff9, 0x3f800003); + status |= test__divsf3(0x3f7ffffe, 0x3f7ffffc, 0x3f800001); + status |= test__divsf3(0x3f7ffffe, 0x3f7ffffd, 0x3f800001); + status |= test__divsf3(0x3f7ffffe, 0x3f7fffff, 0x3f7fffff); + status |= test__divsf3(0x3f7ffffe, 0x3f800001, 0x3f7ffffc); + status |= test__divsf3(0x3f7ffffe, 0x3f800002, 0x3f7ffffa); + status |= test__divsf3(0x3f7ffffe, 0x3f800003, 0x3f7ffff8); + status |= test__divsf3(0x3f7fffff, 0x3f7ffff9, 0x3f800003); + status |= test__divsf3(0x3f7fffff, 0x3f7ffffc, 0x3f800002); + status |= test__divsf3(0x3f7fffff, 0x3f7ffffd, 0x3f800001); + status |= test__divsf3(0x3f7fffff, 0x3f7ffffe, 0x3f800001); + status |= test__divsf3(0x3f7fffff, 0x3f800001, 0x3f7ffffd); + status |= test__divsf3(0x3f7fffff, 0x3f800002, 0x3f7ffffb); + status |= test__divsf3(0x3f7fffff, 0x3f800003, 0x3f7ffff9); + status |= test__divsf3(0x3f7fffff, 0x3f800004, 0x3f7ffff7); + status |= test__divsf3(0x3f800000, 0x00000000, 0x7f800000); + status |= test__divsf3(0x3f800000, 0x3f7ffff7, 0x3f800005); + status |= test__divsf3(0x3f800000, 0x3f7ffff8, 0x3f800004); + status |= test__divsf3(0x3f800000, 0x3f7ffffb, 0x3f800003); + status |= test__divsf3(0x3f800000, 0x3f7ffffc, 0x3f800002); + status |= test__divsf3(0x3f800000, 0x3f7ffffd, 0x3f800002); + status |= test__divsf3(0x3f800000, 0x3f7ffffe, 0x3f800001); + status |= 
test__divsf3(0x3f800000, 0x3f7fffff, 0x3f800001); + status |= test__divsf3(0x3f800000, 0x3f800000, 0x3f800000); + status |= test__divsf3(0x3f800000, 0x3f800001, 0x3f7ffffe); + status |= test__divsf3(0x3f800000, 0x3f800002, 0x3f7ffffc); + status |= test__divsf3(0x3f800000, 0x3f800003, 0x3f7ffffa); + status |= test__divsf3(0x3f800000, 0x3f800004, 0x3f7ffff8); + status |= test__divsf3(0x3f800000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x3f800001, 0x3f7ffffb, 0x3f800004); + status |= test__divsf3(0x3f800001, 0x3f7ffffd, 0x3f800003); + status |= test__divsf3(0x3f800001, 0x3f7ffffe, 0x3f800002); + status |= test__divsf3(0x3f800001, 0x3f7fffff, 0x3f800002); + status |= test__divsf3(0x3f800001, 0x3f800002, 0x3f7ffffe); + status |= test__divsf3(0x3f800001, 0x3f800003, 0x3f7ffffc); + status |= test__divsf3(0x3f800002, 0x3f7ffffc, 0x3f800004); + status |= test__divsf3(0x3f800002, 0x3f7ffffd, 0x3f800004); + status |= test__divsf3(0x3f800002, 0x3f7ffffe, 0x3f800003); + status |= test__divsf3(0x3f800002, 0x3f7fffff, 0x3f800003); + status |= test__divsf3(0x3f800002, 0x3f800001, 0x3f800001); + status |= test__divsf3(0x3f800002, 0x3f800003, 0x3f7ffffe); + status |= test__divsf3(0x3f800003, 0x3f7ffffd, 0x3f800005); + status |= test__divsf3(0x3f800003, 0x3f7ffffe, 0x3f800004); + status |= test__divsf3(0x3f800003, 0x3f7fffff, 0x3f800004); + status |= test__divsf3(0x3f800003, 0x3f800001, 0x3f800002); + status |= test__divsf3(0x3f800004, 0x3f7ffffe, 0x3f800005); + status |= test__divsf3(0x3f800004, 0x3f800001, 0x3f800003); + status |= test__divsf3(0x3f800004, 0x3f800007, 0x3f7ffffa); + status |= test__divsf3(0x3f800005, 0x3f7fffff, 0x3f800006); + status |= test__divsf3(0x3f800006, 0x3f800008, 0x3f7ffffc); + status |= test__divsf3(0x3f800007, 0x3f800002, 0x3f800005); + status |= test__divsf3(0x3f800009, 0x3f800008, 0x3f800001); + status |= test__divsf3(0x40000000, 0x3f800000, 0x40000000); + status |= test__divsf3(0x40000000, 0xbf800000, 0xc0000000); + status |= test__divsf3(0x40400000, 0x80000000, 0xff800000); + status |= test__divsf3(0x40400000, 0xc0400000, 0xbf800000); + status |= test__divsf3(0x40400000, 0xff800000, 0x80000000); + status |= test__divsf3(0x40a00000, 0x00000000, 0x7f800000); + status |= test__divsf3(0x40a00000, 0x40a00000, 0x3f800000); + status |= test__divsf3(0x40a00000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x40e00000, 0x80000000, 0xff800000); + status |= test__divsf3(0x40e00000, 0xff800000, 0x80000000); + status |= test__divsf3(0x41000000, 0x40000000, 0x40800000); + status |= test__divsf3(0x41100000, 0x40400000, 0x40400000); + status |= test__divsf3(0x7b000000, 0x05000000, 0x7f800000); + status |= test__divsf3(0x7e7fffff, 0x80000000, 0xff800000); + status |= test__divsf3(0x7efffffd, 0xc0000000, 0xfe7ffffd); + status |= test__divsf3(0x7effffff, 0x00000000, 0x7f800000); + status |= test__divsf3(0x7effffff, 0x7f800000, 0x00000000); + status |= test__divsf3(0x7f000000, 0x00000000, 0x7f800000); + status |= test__divsf3(0x7f000000, 0x007fffff, 0x7f800000); + status |= test__divsf3(0x7f000000, 0x3f000000, 0x7f800000); + status |= test__divsf3(0x7f000000, 0x40000000, 0x7e800000); + status |= test__divsf3(0x7f000000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x7f000000, 0x80000000, 0xff800000); + status |= test__divsf3(0x7f000000, 0xbf000000, 0xff800000); + status |= test__divsf3(0x7f000000, 0xc0000000, 0xfe800000); + status |= test__divsf3(0x7f000000, 0xff800000, 0x80000000); + status |= test__divsf3(0x7f000003, 0xfe800003, 0xc0000000); + status |= 
test__divsf3(0x7f7ffffd, 0x40800000, 0x7e7ffffd); + status |= test__divsf3(0x7f7ffffd, 0xc0800000, 0xfe7ffffd); + status |= test__divsf3(0x7f7fffff, 0x00000001, 0x7f800000); + status |= test__divsf3(0x7f7fffff, 0x3f7fffff, 0x7f800000); + status |= test__divsf3(0x7f7fffff, 0x7e7fffff, 0x40800000); + status |= test__divsf3(0x7f7fffff, 0x7effffff, 0x40000000); + status |= test__divsf3(0x7f7fffff, 0xc0000000, 0xfeffffff); + status |= test__divsf3(0x7f7fffff, 0xfe7fffff, 0xc0800000); + status |= test__divsf3(0x7f7fffff, 0xff800000, 0x80000000); + status |= test__divsf3(0x7f800000, 0x00000000, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x00000001, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x007fffff, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x00800000, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x00ffffff, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x3f800000, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x40a00000, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x7effffff, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x7f000000, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x80000000, 0xff800000); + status |= test__divsf3(0x7f800000, 0x80000002, 0xff800000); + status |= test__divsf3(0x7f800000, 0x807fffff, 0xff800000); + status |= test__divsf3(0x7f800000, 0x80800001, 0xff800000); + status |= test__divsf3(0x7f800000, 0x81000000, 0xff800000); + status |= test__divsf3(0x7f800000, 0xc0400000, 0xff800000); + status |= test__divsf3(0x7f800000, 0xc0e00000, 0xff800000); + status |= test__divsf3(0x7f800000, 0xfe7fffff, 0xff800000); + status |= test__divsf3(0x7f800000, 0xff000000, 0xff800000); + status |= test__divsf3(0x7f800000, 0xff7fffff, 0xff800000); + status |= test__divsf3(0x80000000, 0x00000003, 0x80000000); + status |= test__divsf3(0x80000000, 0x007fffff, 0x80000000); + status |= test__divsf3(0x80000000, 0x00800001, 0x80000000); + status |= test__divsf3(0x80000000, 0x01000000, 0x80000000); + status |= test__divsf3(0x80000000, 0x40000000, 0x80000000); + status |= test__divsf3(0x80000000, 0x40c00000, 0x80000000); + status |= test__divsf3(0x80000000, 0x7e7fffff, 0x80000000); + status |= test__divsf3(0x80000000, 0x7e800000, 0x80000000); + status |= test__divsf3(0x80000000, 0x7f800000, 0x80000000); + status |= test__divsf3(0x80000000, 0x80000004, 0x00000000); + status |= test__divsf3(0x80000000, 0x807fffff, 0x00000000); + status |= test__divsf3(0x80000000, 0x80800000, 0x00000000); + status |= test__divsf3(0x80000000, 0x80ffffff, 0x00000000); + status |= test__divsf3(0x80000000, 0xc0800000, 0x00000000); + status |= test__divsf3(0x80000000, 0xc1000000, 0x00000000); + status |= test__divsf3(0x80000000, 0xfe800000, 0x00000000); + status |= test__divsf3(0x80000000, 0xfeffffff, 0x00000000); + status |= test__divsf3(0x80000000, 0xff800000, 0x00000000); + status |= test__divsf3(0x80000001, 0x3f000000, 0x80000002); + status |= test__divsf3(0x80000001, 0x40000000, 0x80000000); + status |= test__divsf3(0x80000001, 0x7f7fffff, 0x80000000); + status |= test__divsf3(0x80000001, 0xc0000000, 0x00000000); + status |= test__divsf3(0x80000001, 0xff7fffff, 0x00000000); + status |= test__divsf3(0x80000003, 0x00000000, 0xff800000); + status |= test__divsf3(0x80000003, 0x7f800000, 0x80000000); + status |= test__divsf3(0x80000004, 0x80000000, 0x7f800000); + status |= test__divsf3(0x80000004, 0xff800000, 0x00000000); + status |= test__divsf3(0x807ffff8, 0x3f7ffffe, 0x807ffff9); + status |= test__divsf3(0x807fffff, 0x00000000, 0xff800000); + status |= 
test__divsf3(0x807fffff, 0x7f800000, 0x80000000); + status |= test__divsf3(0x807fffff, 0x80000000, 0x7f800000); + status |= test__divsf3(0x807fffff, 0xff800000, 0x00000000); + status |= test__divsf3(0x80800000, 0x3f800001, 0x807fffff); + status |= test__divsf3(0x80800000, 0x80000000, 0x7f800000); + status |= test__divsf3(0x80800000, 0xff800000, 0x00000000); + status |= test__divsf3(0x80800001, 0x00000000, 0xff800000); + status |= test__divsf3(0x80800001, 0x7f800000, 0x80000000); + status |= test__divsf3(0x80ffffff, 0x80000000, 0x7f800000); + status |= test__divsf3(0x80ffffff, 0xff800000, 0x00000000); + status |= test__divsf3(0x81000000, 0x00000000, 0xff800000); + status |= test__divsf3(0x81000000, 0x7f800000, 0x80000000); + status |= test__divsf3(0x81000001, 0x00800001, 0xc0000000); + status |= test__divsf3(0x81000005, 0x00800005, 0xc0000000); + status |= test__divsf3(0xbf800000, 0x3f800000, 0xbf800000); + status |= test__divsf3(0xbf800000, 0xbf800000, 0x3f800000); + status |= test__divsf3(0xc0000000, 0x00000000, 0xff800000); + status |= test__divsf3(0xc0000000, 0x3f800000, 0xc0000000); + status |= test__divsf3(0xc0000000, 0x7f800000, 0x80000000); + status |= test__divsf3(0xc0000000, 0xbf800000, 0x40000000); + status |= test__divsf3(0xc0800000, 0x80000000, 0x7f800000); + status |= test__divsf3(0xc0800000, 0xff800000, 0x00000000); + status |= test__divsf3(0xc0c00000, 0x00000000, 0xff800000); + status |= test__divsf3(0xc0c00000, 0x7f800000, 0x80000000); + status |= test__divsf3(0xc0c00000, 0xc0400000, 0x40000000); + status |= test__divsf3(0xc0e00000, 0x40e00000, 0xbf800000); + status |= test__divsf3(0xc1000000, 0x40000000, 0xc0800000); + status |= test__divsf3(0xc1000000, 0x80000000, 0x7f800000); + status |= test__divsf3(0xc1000000, 0xff800000, 0x00000000); + status |= test__divsf3(0xc1100000, 0xc0400000, 0x40400000); + status |= test__divsf3(0xfe7fffff, 0x00000000, 0xff800000); + status |= test__divsf3(0xfe7fffff, 0x7f800000, 0x80000000); + status |= test__divsf3(0xfe800000, 0x00000000, 0xff800000); + status |= test__divsf3(0xfe800000, 0x7f800000, 0x80000000); + status |= test__divsf3(0xfe800000, 0x80000000, 0x7f800000); + status |= test__divsf3(0xfe800000, 0xff800000, 0x00000000); + status |= test__divsf3(0xfeffffff, 0x40000000, 0xfe7fffff); + status |= test__divsf3(0xfeffffff, 0x80000000, 0x7f800000); + status |= test__divsf3(0xff000000, 0x3f000000, 0xff800000); + status |= test__divsf3(0xff000000, 0xbf000000, 0x7f800000); + status |= test__divsf3(0xff000001, 0x7e800001, 0xc0000000); + status |= test__divsf3(0xff7ffffd, 0x40800000, 0xfe7ffffd); + status |= test__divsf3(0xff7ffffd, 0xc0800000, 0x7e7ffffd); + status |= test__divsf3(0xff7fffff, 0x7e7fffff, 0xc0800000); + status |= test__divsf3(0xff7fffff, 0xfe7fffff, 0x40800000); + status |= test__divsf3(0xff7fffff, 0xff800000, 0x00000000); + status |= test__divsf3(0xff800000, 0x00000000, 0xff800000); + status |= test__divsf3(0xff800000, 0x00000003, 0xff800000); + status |= test__divsf3(0xff800000, 0x007fffff, 0xff800000); + status |= test__divsf3(0xff800000, 0x00800001, 0xff800000); + status |= test__divsf3(0xff800000, 0x01000000, 0xff800000); + status |= test__divsf3(0xff800000, 0x40000000, 0xff800000); + status |= test__divsf3(0xff800000, 0x40c00000, 0xff800000); + status |= test__divsf3(0xff800000, 0x7e800000, 0xff800000); + status |= test__divsf3(0xff800000, 0x80000000, 0x7f800000); + status |= test__divsf3(0xff800000, 0x80000004, 0x7f800000); + status |= test__divsf3(0xff800000, 0x807fffff, 0x7f800000); + status |= 
test__divsf3(0xff800000, 0x80800000, 0x7f800000); + status |= test__divsf3(0xff800000, 0x80ffffff, 0x7f800000); + status |= test__divsf3(0xff800000, 0xc0800000, 0x7f800000); + status |= test__divsf3(0xff800000, 0xc1000000, 0x7f800000); + status |= test__divsf3(0xff800000, 0xfe800000, 0x7f800000); + status |= test__divsf3(0xff800000, 0xff7fffff, 0x7f800000); + status |= test__divsf3(0x2cbed883, 0x333f6113, 0x38ff4953); + status |= test__divsf3(0x3f87ffff, 0x7f001000, 0x0043f781); - // some misc test cases obtained by fuzzing against h/w implementation - if (test__divsf3(-0x1.3e75e6p-108F, -0x1.cf372p+38F, UINT32_C(0x00000006))) - return 1; - if (test__divsf3(0x1.e77c54p+81F, -0x1.e77c52p-47F, UINT32_C(0xff800000))) - return 1; - if (test__divsf3(0x1.fffffep-126F, 2.F, UINT32_C(0x00800000))) - return 1; +#if __thumb__ && !__thumb2__ + // These tests depend on Arm-specific IEEE 754 implementation choices + // regarding NaNs, which are satisfied by arm/mulsf3.S but not guaranteed by + // other implementations: + // + // - a quiet NaN is distinguished by the top mantissa bit being 1 + // + // - if a signalling NaN appears in the input, the output quiet NaN is + // obtained by setting its top mantissa bit and leaving everything else + // unchanged + // + // - if both operands are signalling NaNs then the output NaN is derived + // from the first operand + // + // - if both operands are quiet NaNs then the output NaN is the first + // operand. - // test 1 / (1 - eps(0.5)) = 1 + eps(1) - if (test__divsf3(1.0F, 0x1.fffffep-1F, UINT32_C(0x3f800001))) - return 1; + status |= test__divsf3(0x00000000, 0x00000000, 0x7fc00000); + status |= test__divsf3(0x00000000, 0x7fad4be3, 0x7fed4be3); + status |= test__divsf3(0x00000000, 0x7fdf48c7, 0x7fdf48c7); + status |= test__divsf3(0x00000000, 0x80000000, 0x7fc00000); + status |= test__divsf3(0x00000001, 0x7f970eba, 0x7fd70eba); + status |= test__divsf3(0x00000001, 0x7fc35716, 0x7fc35716); + status |= test__divsf3(0x007fffff, 0x7fbf52d6, 0x7fff52d6); + status |= test__divsf3(0x007fffff, 0x7fc7a2df, 0x7fc7a2df); + status |= test__divsf3(0x3f800000, 0x7f987a85, 0x7fd87a85); + status |= test__divsf3(0x3f800000, 0x7fc50124, 0x7fc50124); + status |= test__divsf3(0x7f7fffff, 0x7f95fd6f, 0x7fd5fd6f); + status |= test__divsf3(0x7f7fffff, 0x7ffc28dc, 0x7ffc28dc); + status |= test__divsf3(0x7f800000, 0x7f800000, 0x7fc00000); + status |= test__divsf3(0x7f800000, 0x7f8dd790, 0x7fcdd790); + status |= test__divsf3(0x7f800000, 0x7fd2ef2b, 0x7fd2ef2b); + status |= test__divsf3(0x7f800000, 0xff800000, 0x7fc00000); + status |= test__divsf3(0x7f99b09d, 0x00000000, 0x7fd9b09d); + status |= test__divsf3(0x7f93541e, 0x00000001, 0x7fd3541e); + status |= test__divsf3(0x7f9fc002, 0x007fffff, 0x7fdfc002); + status |= test__divsf3(0x7fb5db77, 0x3f800000, 0x7ff5db77); + status |= test__divsf3(0x7f9f5d92, 0x7f7fffff, 0x7fdf5d92); + status |= test__divsf3(0x7fac7a36, 0x7f800000, 0x7fec7a36); + status |= test__divsf3(0x7fb42008, 0x7fb0ee07, 0x7ff42008); + status |= test__divsf3(0x7f8bd740, 0x7fc7aaf1, 0x7fcbd740); + status |= test__divsf3(0x7f9bb57b, 0x80000000, 0x7fdbb57b); + status |= test__divsf3(0x7f951a78, 0x80000001, 0x7fd51a78); + status |= test__divsf3(0x7f9ba63b, 0x807fffff, 0x7fdba63b); + status |= test__divsf3(0x7f89463c, 0xbf800000, 0x7fc9463c); + status |= test__divsf3(0x7fb63563, 0xff7fffff, 0x7ff63563); + status |= test__divsf3(0x7f90886e, 0xff800000, 0x7fd0886e); + status |= test__divsf3(0x7fe8c15e, 0x00000000, 0x7fe8c15e); + status |= test__divsf3(0x7fe915ae, 0x00000001, 
0x7fe915ae); + status |= test__divsf3(0x7ffa9b42, 0x007fffff, 0x7ffa9b42); + status |= test__divsf3(0x7fdad0f5, 0x3f800000, 0x7fdad0f5); + status |= test__divsf3(0x7fd10dcb, 0x7f7fffff, 0x7fd10dcb); + status |= test__divsf3(0x7fd08e8a, 0x7f800000, 0x7fd08e8a); + status |= test__divsf3(0x7fc3a9e6, 0x7f91a816, 0x7fd1a816); + status |= test__divsf3(0x7fdb229c, 0x7fc26c68, 0x7fdb229c); + status |= test__divsf3(0x7fc9f6bb, 0x80000000, 0x7fc9f6bb); + status |= test__divsf3(0x7ffa178b, 0x80000001, 0x7ffa178b); + status |= test__divsf3(0x7fef2a0b, 0x807fffff, 0x7fef2a0b); + status |= test__divsf3(0x7ffc885b, 0xbf800000, 0x7ffc885b); + status |= test__divsf3(0x7fd26e8c, 0xff7fffff, 0x7fd26e8c); + status |= test__divsf3(0x7fc55329, 0xff800000, 0x7fc55329); + status |= test__divsf3(0x80000000, 0x00000000, 0x7fc00000); + status |= test__divsf3(0x80000000, 0x7fa833ae, 0x7fe833ae); + status |= test__divsf3(0x80000000, 0x7fc4df63, 0x7fc4df63); + status |= test__divsf3(0x80000000, 0x80000000, 0x7fc00000); + status |= test__divsf3(0x80000001, 0x7f98827d, 0x7fd8827d); + status |= test__divsf3(0x80000001, 0x7fd7acc5, 0x7fd7acc5); + status |= test__divsf3(0x807fffff, 0x7fad19c0, 0x7fed19c0); + status |= test__divsf3(0x807fffff, 0x7ffe1907, 0x7ffe1907); + status |= test__divsf3(0xbf800000, 0x7fa95487, 0x7fe95487); + status |= test__divsf3(0xbf800000, 0x7fd2bbee, 0x7fd2bbee); + status |= test__divsf3(0xff7fffff, 0x7f86ba21, 0x7fc6ba21); + status |= test__divsf3(0xff7fffff, 0x7feb00d7, 0x7feb00d7); + status |= test__divsf3(0xff800000, 0x7f800000, 0x7fc00000); + status |= test__divsf3(0xff800000, 0x7f857fdc, 0x7fc57fdc); + status |= test__divsf3(0xff800000, 0x7fde0397, 0x7fde0397); + status |= test__divsf3(0xff800000, 0xff800000, 0x7fc00000); +#endif // __arm__ - return 0; + return status; } diff --git a/compiler-rt/test/builtins/Unit/mulsf3_test.c b/compiler-rt/test/builtins/Unit/mulsf3_test.c new file mode 100644 index 0000000000000..d18674e974149 --- /dev/null +++ b/compiler-rt/test/builtins/Unit/mulsf3_test.c @@ -0,0 +1,584 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// RUN: %clang_builtins %s %librt -o %t && %run %t +// REQUIRES: librt_has_mulsf3 + +#include "int_lib.h" +#include +#include + +#include "fp_test.h" + +// Returns: a + b +COMPILER_RT_ABI float __mulsf3(float a, float b); + +int test__mulsf3(uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) { + float a = fromRep32(a_rep), b = fromRep32(b_rep); + float x = __mulsf3(a, b); + int ret = compareResultF(x, expected_rep); + + if (ret) { + printf("error in test__mulsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32 + ", expected %08" PRIx32 "\n", + a_rep, b_rep, toRep32(x), expected_rep); + } + return ret; +} + +int main() { + int status = 0; + + status |= test__mulsf3(0x00000000, 0x00000000, 0x00000000); + status |= test__mulsf3(0x00000000, 0x007fffff, 0x00000000); + status |= test__mulsf3(0x00000000, 0x00ffffff, 0x00000000); + status |= test__mulsf3(0x00000000, 0x3f800000, 0x00000000); + status |= test__mulsf3(0x00000000, 0x7effffff, 0x00000000); + status |= test__mulsf3(0x00000000, 0x80000000, 0x80000000); + status |= test__mulsf3(0x00000000, 0x80000002, 0x80000000); + status |= test__mulsf3(0x00000000, 0x807fffff, 0x80000000); + status |= test__mulsf3(0x00000000, 0x80800001, 0x80000000); + status |= test__mulsf3(0x00000000, 0x81000000, 0x80000000); + status |= test__mulsf3(0x00000000, 0xc0400000, 0x80000000); + status |= test__mulsf3(0x00000000, 0xfe7fffff, 0x80000000); + status |= test__mulsf3(0x00000000, 0xff000000, 0x80000000); + status |= test__mulsf3(0x00000000, 0xff7fffff, 0x80000000); + status |= test__mulsf3(0x00000001, 0x00000000, 0x00000000); + status |= test__mulsf3(0x00000001, 0x00000001, 0x00000000); + status |= test__mulsf3(0x00000001, 0x3f000000, 0x00000000); + status |= test__mulsf3(0x00000001, 0x3f7fffff, 0x00000001); + status |= test__mulsf3(0x00000001, 0x3f800000, 0x00000001); + status |= test__mulsf3(0x00000001, 0x40000000, 0x00000002); + status |= test__mulsf3(0x00000001, 0x7f800000, 0x7f800000); + status |= test__mulsf3(0x00000001, 0xbf7fffff, 0x80000001); + status |= test__mulsf3(0x00000006, 0x3f000000, 0x00000003); + status |= test__mulsf3(0x00000006, 0xbf000000, 0x80000003); + status |= test__mulsf3(0x00000008, 0x3e000000, 0x00000001); + status |= test__mulsf3(0x007ffff7, 0x81000003, 0x80000000); + status |= test__mulsf3(0x007ffff8, 0x3f800001, 0x007ffff9); + status |= test__mulsf3(0x007ffff8, 0x3f800008, 0x00800000); + status |= test__mulsf3(0x007ffff8, 0xbf800001, 0x807ffff9); + status |= test__mulsf3(0x007ffff8, 0xbf800008, 0x80800000); + status |= test__mulsf3(0x007ffffc, 0x40000000, 0x00fffff8); + status |= test__mulsf3(0x007ffffe, 0x3f7ffffc, 0x007ffffc); + status |= test__mulsf3(0x007ffffe, 0x3f800001, 0x007fffff); + status |= test__mulsf3(0x007ffffe, 0xbf800001, 0x807fffff); + status |= test__mulsf3(0x007fffff, 0x007ffffe, 0x00000000); + status |= test__mulsf3(0x007fffff, 0x3f800001, 0x00800000); + status |= test__mulsf3(0x007fffff, 0x40000000, 0x00fffffe); + status |= test__mulsf3(0x00800000, 0x00000000, 0x00000000); + status |= test__mulsf3(0x00800000, 0x00800000, 0x00000000); + status |= test__mulsf3(0x00800000, 0x3f7ffffe, 0x007fffff); + status |= test__mulsf3(0x00800000, 0x7f800000, 0x7f800000); + status |= test__mulsf3(0x00800000, 0x80800000, 0x80000000); + status |= test__mulsf3(0x00800000, 0xc0000000, 0x81000000); + status |= test__mulsf3(0x00800001, 0x3f7ffffa, 0x007ffffe); + status |= test__mulsf3(0x00800001, 0x3f7ffffe, 0x00800000); + status |= test__mulsf3(0x00800001, 0xc0000000, 
0x81000001); + status |= test__mulsf3(0x00800002, 0x3f7ffffc, 0x00800000); + status |= test__mulsf3(0x00fffff8, 0x3f000000, 0x007ffffc); + status |= test__mulsf3(0x00fffffe, 0x3f000000, 0x007fffff); + status |= test__mulsf3(0x00fffffe, 0xbf000000, 0x807fffff); + status |= test__mulsf3(0x00ffffff, 0x3f000000, 0x00800000); + status |= test__mulsf3(0x00ffffff, 0xbf000000, 0x80800000); + status |= test__mulsf3(0x3f000000, 0x80000001, 0x80000000); + status |= test__mulsf3(0x3f800000, 0x007ffffd, 0x007ffffd); + status |= test__mulsf3(0x3f800000, 0x01000003, 0x01000003); + status |= test__mulsf3(0x3f800000, 0x3f800000, 0x3f800000); + status |= test__mulsf3(0x3f800000, 0x40000000, 0x40000000); + status |= test__mulsf3(0x3f800000, 0x80000001, 0x80000001); + status |= test__mulsf3(0x3f800000, 0x80000009, 0x80000009); + status |= test__mulsf3(0x3f800001, 0x3f800001, 0x3f800002); + status |= test__mulsf3(0x3f800001, 0xbf800001, 0xbf800002); + status |= test__mulsf3(0x3f800001, 0xbf800002, 0xbf800003); + status |= test__mulsf3(0x3f800002, 0x3f800001, 0x3f800003); + status |= test__mulsf3(0x3f800002, 0x7f7ffffe, 0x7f800000); + status |= test__mulsf3(0x3f800001, 0x7f7ffffe, 0x7f800000); + status |= test__mulsf3(0x40000000, 0x00800000, 0x01000000); + status |= test__mulsf3(0x40000000, 0x00800001, 0x01000001); + status |= test__mulsf3(0x40000000, 0x3f800000, 0x40000000); + status |= test__mulsf3(0x40000000, 0x40400000, 0x40c00000); + status |= test__mulsf3(0x40000000, 0x7e800000, 0x7f000000); + status |= test__mulsf3(0x40000000, 0x7effffff, 0x7f7fffff); + status |= test__mulsf3(0x40000000, 0x807ffffd, 0x80fffffa); + status |= test__mulsf3(0x40000000, 0x80800003, 0x81000003); + status |= test__mulsf3(0x40000000, 0x80800005, 0x81000005); + status |= test__mulsf3(0x40000000, 0xbf800000, 0xc0000000); + status |= test__mulsf3(0x40000000, 0xfe7ffffd, 0xfefffffd); + status |= test__mulsf3(0x40000000, 0xfe800003, 0xff000003); + status |= test__mulsf3(0x403fffff, 0x3f7ffffd, 0x403ffffd); + status |= test__mulsf3(0x403fffff, 0x3f7ffffe, 0x403ffffe); + status |= test__mulsf3(0x403fffff, 0x3f7fffff, 0x403ffffe); + status |= test__mulsf3(0x403fffff, 0xbf7ffffd, 0xc03ffffd); + status |= test__mulsf3(0x40400000, 0x00000002, 0x00000006); + status |= test__mulsf3(0x40400000, 0x40000000, 0x40c00000); + status |= test__mulsf3(0x40400000, 0x40400000, 0x41100000); + status |= test__mulsf3(0x40400000, 0xc0000000, 0xc0c00000); + status |= test__mulsf3(0x40400001, 0x3f800001, 0x40400003); + status |= test__mulsf3(0x40400001, 0x3f800003, 0x40400006); + status |= test__mulsf3(0x40400001, 0xbf800003, 0xc0400006); + status |= test__mulsf3(0x40800000, 0x00000002, 0x00000008); + status |= test__mulsf3(0x40800000, 0x7e7fffff, 0x7f7fffff); + status |= test__mulsf3(0x40800000, 0xfe7fffff, 0xff7fffff); + status |= test__mulsf3(0x409fffff, 0x3f7fffff, 0x409ffffe); + status |= test__mulsf3(0x40a00000, 0x00000000, 0x00000000); + status |= test__mulsf3(0x40a00000, 0x7f800000, 0x7f800000); + status |= test__mulsf3(0x40a00001, 0x3f800001, 0x40a00002); + status |= test__mulsf3(0x40dfffff, 0x3f7ffffc, 0x40dffffc); + status |= test__mulsf3(0x40dfffff, 0x3f7fffff, 0x40dffffe); + status |= test__mulsf3(0x40e00000, 0x80000000, 0x80000000); + status |= test__mulsf3(0x40e00000, 0xff800000, 0xff800000); + status |= test__mulsf3(0x40e00001, 0x3f800001, 0x40e00003); + status |= test__mulsf3(0x7e7ffffd, 0x40800000, 0x7f7ffffd); + status |= test__mulsf3(0x7e7ffffd, 0xc0800000, 0xff7ffffd); + status |= test__mulsf3(0x7e800000, 0xc0000000, 0xff000000); + 
status |= test__mulsf3(0x7efffffd, 0xc0000008, 0xff800000); + status |= test__mulsf3(0x7effffff, 0xc0000000, 0xff7fffff); + status |= test__mulsf3(0x7f000000, 0x00000000, 0x00000000); + status |= test__mulsf3(0x7f000000, 0x40000000, 0x7f800000); + status |= test__mulsf3(0x7f000000, 0x7f000000, 0x7f800000); + status |= test__mulsf3(0x7f000000, 0x7f7ffffe, 0x7f800000); + status |= test__mulsf3(0x7f000000, 0x7f800000, 0x7f800000); + status |= test__mulsf3(0x7f000000, 0xfe800000, 0xff800000); + status |= test__mulsf3(0x7f000000, 0xfe800004, 0xff800000); + status |= test__mulsf3(0x7f000000, 0xff000000, 0xff800000); + status |= test__mulsf3(0x7f000009, 0x7f7ffffa, 0x7f800000); + status |= test__mulsf3(0x7f000009, 0xc0c00002, 0xff800000); + status |= test__mulsf3(0x7f7fffff, 0x00000000, 0x00000000); + status |= test__mulsf3(0x7f800000, 0x007fffff, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x00ffffff, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x3f800000, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x7effffff, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x7f800000, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x80000002, 0xff800000); + status |= test__mulsf3(0x7f800000, 0x807fffff, 0xff800000); + status |= test__mulsf3(0x7f800000, 0x80800001, 0xff800000); + status |= test__mulsf3(0x7f800000, 0x81000000, 0xff800000); + status |= test__mulsf3(0x7f800000, 0xc0400000, 0xff800000); + status |= test__mulsf3(0x7f800000, 0xff000000, 0xff800000); + status |= test__mulsf3(0x7f800000, 0xff7fffff, 0xff800000); + status |= test__mulsf3(0x7f800000, 0xff800000, 0xff800000); + status |= test__mulsf3(0x80000000, 0x00000000, 0x80000000); + status |= test__mulsf3(0x80000000, 0x40c00000, 0x80000000); + status |= test__mulsf3(0x80000000, 0x7f7fffff, 0x80000000); + status |= test__mulsf3(0x80000000, 0x80000000, 0x00000000); + status |= test__mulsf3(0x80000000, 0x80000004, 0x00000000); + status |= test__mulsf3(0x80000000, 0x80800000, 0x00000000); + status |= test__mulsf3(0x80000000, 0xc1000000, 0x00000000); + status |= test__mulsf3(0x80000000, 0xfe800000, 0x00000000); + status |= test__mulsf3(0x80000001, 0x00000001, 0x80000000); + status |= test__mulsf3(0x80000001, 0x40a00000, 0x80000005); + status |= test__mulsf3(0x80000002, 0x3f800000, 0x80000002); + status |= test__mulsf3(0x80000003, 0x00000000, 0x80000000); + status |= test__mulsf3(0x80000003, 0x7f800000, 0xff800000); + status |= test__mulsf3(0x80000004, 0xbf800000, 0x00000004); + status |= test__mulsf3(0x80000008, 0x3e000000, 0x80000001); + status |= test__mulsf3(0x807ffff7, 0x01000003, 0x80000000); + status |= test__mulsf3(0x807ffff7, 0x3f800001, 0x807ffff8); + status |= test__mulsf3(0x807ffffd, 0xc0000000, 0x00fffffa); + status |= test__mulsf3(0x807fffff, 0x00000000, 0x80000000); + status |= test__mulsf3(0x807fffff, 0x3f800001, 0x80800000); + status |= test__mulsf3(0x807fffff, 0x7f800000, 0xff800000); + status |= test__mulsf3(0x807fffff, 0x80000000, 0x00000000); + status |= test__mulsf3(0x807fffff, 0x807ffffe, 0x00000000); + status |= test__mulsf3(0x807fffff, 0xbf800000, 0x007fffff); + status |= test__mulsf3(0x807fffff, 0xff800000, 0x7f800000); + status |= test__mulsf3(0x80800000, 0x00800000, 0x80000000); + status |= test__mulsf3(0x80800000, 0x80800000, 0x00000000); + status |= test__mulsf3(0x80800001, 0x00000000, 0x80000000); + status |= test__mulsf3(0x80800001, 0x7f800000, 0xff800000); + status |= test__mulsf3(0x80800001, 0xbf800000, 0x00800001); + status |= test__mulsf3(0x80fffffc, 0x3f000000, 0x807ffffe); + status |= 
test__mulsf3(0x80fffffc, 0xbf000000, 0x007ffffe); + status |= test__mulsf3(0x80fffffe, 0x3f800000, 0x80fffffe); + status |= test__mulsf3(0x80ffffff, 0x80000000, 0x00000000); + status |= test__mulsf3(0x80ffffff, 0xff800000, 0x7f800000); + status |= test__mulsf3(0x81000000, 0x00000000, 0x80000000); + status |= test__mulsf3(0x81000000, 0x7f800000, 0xff800000); + status |= test__mulsf3(0xbf7fffff, 0xff7fffff, 0x7f7ffffe); + status |= test__mulsf3(0xbf800000, 0x00000009, 0x80000009); + status |= test__mulsf3(0xbf800000, 0x00800009, 0x80800009); + status |= test__mulsf3(0xbf800000, 0x3f800000, 0xbf800000); + status |= test__mulsf3(0xbf800000, 0x40000000, 0xc0000000); + status |= test__mulsf3(0xbf800000, 0xbf800000, 0x3f800000); + status |= test__mulsf3(0xbf800000, 0xc0000000, 0x40000000); + status |= test__mulsf3(0xbf800001, 0x3f800001, 0xbf800002); + status |= test__mulsf3(0xbf800001, 0xbf800001, 0x3f800002); + status |= test__mulsf3(0xbf800001, 0xbf800002, 0x3f800003); + status |= test__mulsf3(0xbf800002, 0x3f800001, 0xbf800003); + status |= test__mulsf3(0xbf800002, 0xbf800001, 0x3f800003); + status |= test__mulsf3(0xc0000000, 0x00000000, 0x80000000); + status |= test__mulsf3(0xc0000000, 0x007ffffd, 0x80fffffa); + status |= test__mulsf3(0xc0000000, 0x00800001, 0x81000001); + status |= test__mulsf3(0xc0000000, 0x00800005, 0x81000005); + status |= test__mulsf3(0xc0000000, 0x00800009, 0x81000009); + status |= test__mulsf3(0xc0000000, 0x40400000, 0xc0c00000); + status |= test__mulsf3(0xc0000000, 0x7e7fffff, 0xfeffffff); + status |= test__mulsf3(0xc0000000, 0x7e800001, 0xff000001); + status |= test__mulsf3(0xc0000000, 0x7f800000, 0xff800000); + status |= test__mulsf3(0xc0000000, 0xbf800000, 0x40000000); + status |= test__mulsf3(0xc0000000, 0xc0400000, 0x40c00000); + status |= test__mulsf3(0xc03ffffe, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xc03fffff, 0x3f7fffff, 0xc03ffffe); + status |= test__mulsf3(0xc0400000, 0x40400000, 0xc1100000); + status |= test__mulsf3(0xc0400000, 0xc0000000, 0x40c00000); + status |= test__mulsf3(0xc0400000, 0xc0400000, 0x41100000); + status |= test__mulsf3(0xc0400000, 0xff000000, 0x7f800000); + status |= test__mulsf3(0xc0400001, 0x3f800001, 0xc0400003); + status |= test__mulsf3(0xc0800000, 0x7e7fffff, 0xff7fffff); + status |= test__mulsf3(0xc0800000, 0x80000000, 0x00000000); + status |= test__mulsf3(0xc0800000, 0xfe7fffff, 0x7f7fffff); + status |= test__mulsf3(0xc0800000, 0xff800000, 0x7f800000); + status |= test__mulsf3(0xc09ffffe, 0xff000000, 0x7f800000); + status |= test__mulsf3(0xc09fffff, 0xbf7fffff, 0x409ffffe); + status |= test__mulsf3(0xc0a00001, 0xbf800001, 0x40a00002); + status |= test__mulsf3(0xc0dffff9, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xc1100000, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xc1100001, 0xff000000, 0x7f800000); + status |= test__mulsf3(0xfe7ffff9, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xfe7ffff9, 0xc07fffff, 0x7f7ffff8); + status |= test__mulsf3(0xfe7ffffd, 0x40800000, 0xff7ffffd); + status |= test__mulsf3(0xfe7ffffd, 0xc0800000, 0x7f7ffffd); + status |= test__mulsf3(0xfe7fffff, 0x00000000, 0x80000000); + status |= test__mulsf3(0xfe7fffff, 0x40000001, 0xff000000); + status |= test__mulsf3(0xfe7fffff, 0x7f800000, 0xff800000); + status |= test__mulsf3(0xfe800000, 0x00000000, 0x80000000); + status |= test__mulsf3(0xfe800000, 0x7f800000, 0xff800000); + status |= test__mulsf3(0xfefffff7, 0x7e800001, 0xff800000); + status |= test__mulsf3(0xfeffffff, 0x3f800001, 0xff000000); + status |= 
test__mulsf3(0xfeffffff, 0x80000000, 0x00000000); + status |= test__mulsf3(0xff000005, 0xff000001, 0x7f800000); + status |= test__mulsf3(0xff7ffffd, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xff7ffffd, 0xc0400001, 0x7f800000); + status |= test__mulsf3(0xff7ffffd, 0xff000001, 0x7f800000); + status |= test__mulsf3(0xff7fffff, 0x80000000, 0x00000000); + status |= test__mulsf3(0xff7fffff, 0xff7fffff, 0x7f800000); + status |= test__mulsf3(0xff7fffff, 0xff800000, 0x7f800000); + status |= test__mulsf3(0xff800000, 0x40c00000, 0xff800000); + status |= test__mulsf3(0xff800000, 0x7f800000, 0xff800000); + status |= test__mulsf3(0xff800000, 0x80000004, 0x7f800000); + status |= test__mulsf3(0xff800000, 0x80800000, 0x7f800000); + status |= test__mulsf3(0xff800000, 0xc1000000, 0x7f800000); + status |= test__mulsf3(0xff800000, 0xfe800000, 0x7f800000); + status |= test__mulsf3(0xff800000, 0xff800000, 0x7f800000); + status |= test__mulsf3(0x3089705f, 0x0ef36390, 0x0041558f); + status |= test__mulsf3(0x3089705f, 0x0e936390, 0x0027907d); + status |= test__mulsf3(0x3109705f, 0x0ef36390, 0x0082ab1e); + status |= test__mulsf3(0x3109705f, 0x0e936390, 0x004f20fa); + status |= test__mulsf3(0x3189705f, 0x0ef36390, 0x0102ab1e); + status |= test__mulsf3(0x3189705f, 0x0e936390, 0x009e41f5); + status |= test__mulsf3(0xb089705f, 0x0ef36390, 0x8041558f); + status |= test__mulsf3(0xb089705f, 0x0e936390, 0x8027907d); + status |= test__mulsf3(0xb109705f, 0x0ef36390, 0x8082ab1e); + status |= test__mulsf3(0xb109705f, 0x0e936390, 0x804f20fa); + status |= test__mulsf3(0xb189705f, 0x0ef36390, 0x8102ab1e); + status |= test__mulsf3(0xb189705f, 0x0e936390, 0x809e41f5); + status |= test__mulsf3(0x3089705f, 0x8ef36390, 0x8041558f); + status |= test__mulsf3(0x3089705f, 0x8e936390, 0x8027907d); + status |= test__mulsf3(0x3109705f, 0x8ef36390, 0x8082ab1e); + status |= test__mulsf3(0x3109705f, 0x8e936390, 0x804f20fa); + status |= test__mulsf3(0x3189705f, 0x8ef36390, 0x8102ab1e); + status |= test__mulsf3(0x3189705f, 0x8e936390, 0x809e41f5); + status |= test__mulsf3(0xb089705f, 0x8ef36390, 0x0041558f); + status |= test__mulsf3(0xb089705f, 0x8e936390, 0x0027907d); + status |= test__mulsf3(0xb109705f, 0x8ef36390, 0x0082ab1e); + status |= test__mulsf3(0xb109705f, 0x8e936390, 0x004f20fa); + status |= test__mulsf3(0xb189705f, 0x8ef36390, 0x0102ab1e); + status |= test__mulsf3(0xb189705f, 0x8e936390, 0x009e41f5); + status |= test__mulsf3(0x1f800001, 0x1fc00000, 0x00300000); + status |= test__mulsf3(0x1f800003, 0x1fc00000, 0x00300001); + status |= test__mulsf3(0x1f800001, 0x1fc00800, 0x00300200); + status |= test__mulsf3(0x1f800003, 0x1fc00800, 0x00300201); + status |= test__mulsf3(0x36e4588a, 0x29b47cbd, 0x2120fd85); + status |= test__mulsf3(0x3fea3b26, 0x3f400000, 0x3fafac5c); + status |= test__mulsf3(0x6fea3b26, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20ea3b26, 0x1ec00000, 0x0057d62e); + status |= test__mulsf3(0x3f8f11bb, 0x3fc00000, 0x3fd69a98); + status |= test__mulsf3(0x6f8f11bb, 0x4fc00000, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f400000, 0x006b4d4c); + status |= test__mulsf3(0x3f8f11bb, 0x3f800000, 0x3f8f11bb); + status |= test__mulsf3(0x6f8f11bb, 0x4f800000, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f000000, 0x004788de); + status |= test__mulsf3(0x3f8f11bb, 0x3fd7f48d, 0x3ff1611f); + status |= test__mulsf3(0x6f8f11bb, 0x4fd7f48d, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f57f48d, 0x0078b090); + status |= test__mulsf3(0x3f8f11bb, 0x3fa80b73, 0x3fbbd412); + status |= 
test__mulsf3(0x6f8f11bb, 0x4fa80b73, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f280b73, 0x005dea09); + status |= test__mulsf3(0x3f8f11bb, 0x3f97f48d, 0x3fa9d842); + status |= test__mulsf3(0x6f8f11bb, 0x4f97f48d, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f17f48d, 0x0054ec21); + status |= test__mulsf3(0x3f8f11bb, 0x3f680b73, 0x3f81ae78); + status |= test__mulsf3(0x6f8f11bb, 0x4f680b73, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1ee80b73, 0x0040d73c); + status |= test__mulsf3(0x3fff5dd8, 0x3f600000, 0x3fdf721d); + status |= test__mulsf3(0x6fff5dd8, 0x4f600000, 0x7f800000); + status |= test__mulsf3(0x20ff5dd8, 0x1ee00000, 0x006fb90e); + status |= test__mulsf3(0x3fff5dd8, 0x3f100000, 0x3f8fa4ca); + status |= test__mulsf3(0x6fff5dd8, 0x4f100000, 0x7f800000); + status |= test__mulsf3(0x20ff5dd8, 0x1e900000, 0x0047d265); + status |= test__mulsf3(0x3fffe96b, 0x3f7efb43, 0x3ffee4c5); + status |= test__mulsf3(0x6fffe96b, 0x4f7efb43, 0x7f800000); + status |= test__mulsf3(0x20ffe96b, 0x1efefb43, 0x007f7263); + status |= test__mulsf3(0x3fffe96b, 0x3f0104bd, 0x3f80f95b); + status |= test__mulsf3(0x6fffe96b, 0x4f0104bd, 0x7f800000); + status |= test__mulsf3(0x20ffe96b, 0x1e8104bd, 0x00407cae); + status |= test__mulsf3(0x3f8fbbb7, 0x3fa6edf9, 0x3fbb72aa); + status |= test__mulsf3(0x6f8fbbb7, 0x4fa6edf9, 0x7f800000); + status |= test__mulsf3(0x208fbbb7, 0x1f26edf9, 0x005db955); + status |= test__mulsf3(0x3f8fbbb7, 0x3fd91207, 0x3ff3c07b); + status |= test__mulsf3(0x6f8fbbb7, 0x4fd91207, 0x7f800000); + status |= test__mulsf3(0x208fbbb7, 0x1f591207, 0x0079e03d); + status |= test__mulsf3(0x3f8fbbb7, 0x3f991207, 0x3fabe29f); + status |= test__mulsf3(0x6f8fbbb7, 0x4f991207, 0x7f800000); + status |= test__mulsf3(0x208fbbb7, 0x1f191207, 0x0055f150); + status |= test__mulsf3(0x3f8fbbb7, 0x3f66edf9, 0x3f81a843); + status |= test__mulsf3(0x6f8fbbb7, 0x4f66edf9, 0x7f800000); + status |= test__mulsf3(0x208fbbb7, 0x1ee6edf9, 0x0040d421); + status |= test__mulsf3(0x3fdb62f3, 0x3f7879c5, 0x3fd4f036); + status |= test__mulsf3(0x6fdb62f3, 0x4f7879c5, 0x7f800000); + status |= test__mulsf3(0x20db62f3, 0x1ef879c5, 0x006a781b); + status |= test__mulsf3(0x3faaea45, 0x3f8b6773, 0x3fba2489); + status |= test__mulsf3(0x6faaea45, 0x4f8b6773, 0x7f800000); + status |= test__mulsf3(0x20aaea45, 0x1f0b6773, 0x005d1244); + status |= test__mulsf3(0x3fafa7ec, 0x3f900000, 0x3fc59cea); + status |= test__mulsf3(0x6fafa7ec, 0x4f900000, 0x7f800000); + status |= test__mulsf3(0x20afa7ec, 0x1f100000, 0x0062ce75); + status |= test__mulsf3(0x3fcf8c8d, 0x3f271645, 0x3f8776be); + status |= test__mulsf3(0x6fcf8c8d, 0x4f271645, 0x7f800000); + status |= test__mulsf3(0x20cf8c8d, 0x1ea71645, 0x0043bb5f); + status |= test__mulsf3(0x3fc173ef, 0x3f901b0f, 0x3fd9cb52); + status |= test__mulsf3(0x6fc173ef, 0x4f901b0f, 0x7f800000); + status |= test__mulsf3(0x20c173ef, 0x1f101b0f, 0x006ce5a9); + status |= test__mulsf3(0x3fb48d33, 0x3f4a35fb, 0x3f8e9d7d); + status |= test__mulsf3(0x6fb48d33, 0x4f4a35fb, 0x7f800000); + status |= test__mulsf3(0x20b48d33, 0x1eca35fb, 0x00474ebe); + status |= test__mulsf3(0x3fc6f87b, 0x3f65d94d, 0x3fb2a52a); + status |= test__mulsf3(0x6fc6f87b, 0x4f65d94d, 0x7f800000); + status |= test__mulsf3(0x20c6f87b, 0x1ee5d94d, 0x00595295); + status |= test__mulsf3(0x3f860ae7, 0x3f969729, 0x3f9db312); + status |= test__mulsf3(0x6f860ae7, 0x4f969729, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f169729, 0x004ed989); + status |= test__mulsf3(0x3f860ae7, 0x3fc00000, 0x3fc9105a); + status |= 
test__mulsf3(0x6f860ae7, 0x4fc00000, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f400000, 0x0064882d); + status |= test__mulsf3(0x3f860ae7, 0x3fe968d7, 0x3ff46da3); + status |= test__mulsf3(0x6f860ae7, 0x4fe968d7, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f6968d7, 0x007a36d1); + status |= test__mulsf3(0x3f860ae7, 0x3f800000, 0x3f860ae7); + status |= test__mulsf3(0x6f860ae7, 0x4f800000, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f000000, 0x00430574); + status |= test__mulsf3(0x3f860ae7, 0x3fa968d7, 0x3fb1682f); + status |= test__mulsf3(0x6f860ae7, 0x4fa968d7, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f2968d7, 0x0058b418); + status |= test__mulsf3(0x3f860ae7, 0x3fd69729, 0x3fe0b886); + status |= test__mulsf3(0x6f860ae7, 0x4fd69729, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f569729, 0x00705c43); + status |= test__mulsf3(0x3f9aecdd, 0x3fb14b75, 0x3fd696de); + status |= test__mulsf3(0x6f9aecdd, 0x4fb14b75, 0x7f800000); + status |= test__mulsf3(0x209aecdd, 0x1f314b75, 0x006b4b6f); + status |= test__mulsf3(0x3f9aecdd, 0x3fceb48b, 0x3ffa2fb9); + status |= test__mulsf3(0x6f9aecdd, 0x4fceb48b, 0x7f800000); + status |= test__mulsf3(0x209aecdd, 0x1f4eb48b, 0x007d17dc); + status |= test__mulsf3(0x3f9aecdd, 0x3fc00000, 0x3fe8634c); + status |= test__mulsf3(0x6f9aecdd, 0x4fc00000, 0x7f800000); + status |= test__mulsf3(0x209aecdd, 0x1f400000, 0x007431a6); + status |= test__mulsf3(0x3fd65dc6, 0x3f400000, 0x3fa0c654); + status |= test__mulsf3(0x6fd65dc6, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20d65dc6, 0x1ec00000, 0x0050632a); + status |= test__mulsf3(0x3feecf03, 0x3f5f93ab, 0x3fd09014); + status |= test__mulsf3(0x6feecf03, 0x4f5f93ab, 0x7f800000); + status |= test__mulsf3(0x20eecf03, 0x1edf93ab, 0x0068480a); + status |= test__mulsf3(0x3feecf03, 0x3f206c55, 0x3f95a670); + status |= test__mulsf3(0x6feecf03, 0x4f206c55, 0x7f800000); + status |= test__mulsf3(0x20eecf03, 0x1ea06c55, 0x004ad338); + status |= test__mulsf3(0x3f98feed, 0x3f60f11b, 0x3f866f27); + status |= test__mulsf3(0x6f98feed, 0x4f60f11b, 0x7f800000); + status |= test__mulsf3(0x2098feed, 0x1ee0f11b, 0x00433794); + status |= test__mulsf3(0x3f9a1b9d, 0x3f9c42b5, 0x3fbc21f8); + status |= test__mulsf3(0x6f9a1b9d, 0x4f9c42b5, 0x7f800000); + status |= test__mulsf3(0x209a1b9d, 0x1f1c42b5, 0x005e10fc); + status |= test__mulsf3(0x3f9a1b9d, 0x3f5c42b5, 0x3f8497e3); + status |= test__mulsf3(0x6f9a1b9d, 0x4f5c42b5, 0x7f800000); + status |= test__mulsf3(0x209a1b9d, 0x1edc42b5, 0x00424bf2); + status |= test__mulsf3(0x3f947044, 0x3f600000, 0x3f81e23c); + status |= test__mulsf3(0x6f947044, 0x4f600000, 0x7f800000); + status |= test__mulsf3(0x20947044, 0x1ee00000, 0x0040f11e); + status |= test__mulsf3(0x3fa3fb77, 0x3f6eb1b9, 0x3f98e5a0); + status |= test__mulsf3(0x6fa3fb77, 0x4f6eb1b9, 0x7f800000); + status |= test__mulsf3(0x20a3fb77, 0x1eeeb1b9, 0x004c72d0); + status |= test__mulsf3(0x3fb291df, 0x3f466a1f, 0x3f8a66d9); + status |= test__mulsf3(0x6fb291df, 0x4f466a1f, 0x7f800000); + status |= test__mulsf3(0x20b291df, 0x1ec66a1f, 0x0045336c); + status |= test__mulsf3(0x3fde13d5, 0x3f6b7283, 0x3fcc3f8b); + status |= test__mulsf3(0x6fde13d5, 0x4f6b7283, 0x7f800000); + status |= test__mulsf3(0x20de13d5, 0x1eeb7283, 0x00661fc5); + status |= test__mulsf3(0x3fd5b211, 0x3f80810f, 0x3fd68987); + status |= test__mulsf3(0x6fd5b211, 0x4f80810f, 0x7f800000); + status |= test__mulsf3(0x20d5b211, 0x1f00810f, 0x006b44c4); + status |= test__mulsf3(0x3fd5b211, 0x3f3f7ef1, 0x3f9fd9d2); + status |= 
test__mulsf3(0x6fd5b211, 0x4f3f7ef1, 0x7f800000); + status |= test__mulsf3(0x20d5b211, 0x1ebf7ef1, 0x004fece9); + status |= test__mulsf3(0x3fadfbc4, 0x3f400000, 0x3f827cd3); + status |= test__mulsf3(0x6fadfbc4, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20adfbc4, 0x1ec00000, 0x00413e6a); + status |= test__mulsf3(0x3fd0ef03, 0x3f800000, 0x3fd0ef03); + status |= test__mulsf3(0x6fd0ef03, 0x4f800000, 0x7f800000); + status |= test__mulsf3(0x20d0ef03, 0x1f000000, 0x00687782); + status |= test__mulsf3(0x3fd0ef03, 0x3f8673ab, 0x3fdb7705); + status |= test__mulsf3(0x6fd0ef03, 0x4f8673ab, 0x7f800000); + status |= test__mulsf3(0x20d0ef03, 0x1f0673ab, 0x006dbb83); + status |= test__mulsf3(0x3fd0ef03, 0x3f798c55, 0x3fcbab02); + status |= test__mulsf3(0x6fd0ef03, 0x4f798c55, 0x7f800000); + status |= test__mulsf3(0x20d0ef03, 0x1ef98c55, 0x0065d581); + status |= test__mulsf3(0x3fdd1181, 0x3f8ad17f, 0x3fefc0b1); + status |= test__mulsf3(0x6fdd1181, 0x4f8ad17f, 0x7f800000); + status |= test__mulsf3(0x20dd1181, 0x1f0ad17f, 0x0077e058); + status |= test__mulsf3(0x3fdd1181, 0x3f752e81, 0x3fd3b9e9); + status |= test__mulsf3(0x6fdd1181, 0x4f752e81, 0x7f800000); + status |= test__mulsf3(0x20dd1181, 0x1ef52e81, 0x0069dcf5); + status |= test__mulsf3(0x3f92efc6, 0x3fa00000, 0x3fb7abb8); + status |= test__mulsf3(0x6f92efc6, 0x4fa00000, 0x7f800000); + status |= test__mulsf3(0x2092efc6, 0x1f200000, 0x005bd5dc); + status |= test__mulsf3(0x3fdcefe6, 0x3f400000, 0x3fa5b3ec); + status |= test__mulsf3(0x6fdcefe6, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20dcefe6, 0x1ec00000, 0x0052d9f6); + status |= test__mulsf3(0x3fad6507, 0x3fa2f8b7, 0x3fdcc4c9); + status |= test__mulsf3(0x6fad6507, 0x4fa2f8b7, 0x7f800000); + status |= test__mulsf3(0x20ad6507, 0x1f22f8b7, 0x006e6264); + status |= test__mulsf3(0x3fad6507, 0x3f62f8b7, 0x3f99bba6); + status |= test__mulsf3(0x6fad6507, 0x4f62f8b7, 0x7f800000); + status |= test__mulsf3(0x20ad6507, 0x1ee2f8b7, 0x004cddd3); + status |= test__mulsf3(0x3fbfde6b, 0x3f8721bd, 0x3fca8f27); + status |= test__mulsf3(0x6fbfde6b, 0x4f8721bd, 0x7f800000); + status |= test__mulsf3(0x20bfde6b, 0x1f0721bd, 0x00654794); + status |= test__mulsf3(0x3fbfde6b, 0x3f4721bd, 0x3f953f2e); + status |= test__mulsf3(0x6fbfde6b, 0x4f4721bd, 0x7f800000); + status |= test__mulsf3(0x20bfde6b, 0x1ec721bd, 0x004a9f97); + status |= test__mulsf3(0x3ff40db4, 0x3f400000, 0x3fb70a47); + status |= test__mulsf3(0x6ff40db4, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20f40db4, 0x1ec00000, 0x005b8524); + status |= test__mulsf3(0x3ff40db4, 0x3f600000, 0x3fd58bfe); + status |= test__mulsf3(0x6ff40db4, 0x4f600000, 0x7f800000); + status |= test__mulsf3(0x20f40db4, 0x1ee00000, 0x006ac5ff); + status |= test__mulsf3(0x3f9e20d3, 0x3f90c8a5, 0x3fb2dccc); + status |= test__mulsf3(0x6f9e20d3, 0x4f90c8a5, 0x7f800000); + status |= test__mulsf3(0x209e20d3, 0x1f10c8a5, 0x00596e66); + status |= test__mulsf3(0x3f9e20d3, 0x3fc00000, 0x3fed313c); + status |= test__mulsf3(0x6f9e20d3, 0x4fc00000, 0x7f800000); + status |= test__mulsf3(0x209e20d3, 0x1f400000, 0x0076989e); + status |= test__mulsf3(0x3f9e20d3, 0x3f50c8a5, 0x3f80f69b); + status |= test__mulsf3(0x6f9e20d3, 0x4f50c8a5, 0x7f800000); + status |= test__mulsf3(0x209e20d3, 0x1ed0c8a5, 0x00407b4d); + status |= test__mulsf3(0x3f82e641, 0x3f8fd63f, 0x3f931856); + status |= test__mulsf3(0x6f82e641, 0x4f8fd63f, 0x7f800000); + status |= test__mulsf3(0x2082e641, 0x1f0fd63f, 0x00498c2b); + status |= test__mulsf3(0x3f9a1901, 0x3f96e701, 0x3fb5ab68); + status |= 
test__mulsf3(0x6f9a1901, 0x4f96e701, 0x7f800000);
+  status |= test__mulsf3(0x209a1901, 0x1f16e701, 0x005ad5b4);
+  status |= test__mulsf3(0x3fa21aa1, 0x3f7c4961, 0x3f9fc0ae);
+  status |= test__mulsf3(0x6fa21aa1, 0x4f7c4961, 0x7f800000);
+  status |= test__mulsf3(0x20a21aa1, 0x1efc4961, 0x004fe057);
+  status |= test__mulsf3(0x3fcd0767, 0x3f782457, 0x3fc6bc47);
+  status |= test__mulsf3(0x6fcd0767, 0x4f782457, 0x7f800000);
+  status |= test__mulsf3(0x20cd0767, 0x1ef82457, 0x00635e23);
+  status |= test__mulsf3(0x3fb875e1, 0x3f968e21, 0x3fd8f6f6);
+  status |= test__mulsf3(0x6fb875e1, 0x4f968e21, 0x7f800000);
+  status |= test__mulsf3(0x20b875e1, 0x1f168e21, 0x006c7b7b);
+  status |= test__mulsf3(0x3fc2f0d7, 0x3f5efd19, 0x3fa9cd95);
+  status |= test__mulsf3(0x6fc2f0d7, 0x4f5efd19, 0x7f800000);
+  status |= test__mulsf3(0x20c2f0d7, 0x1edefd19, 0x0054e6cb);
+  status |= test__mulsf3(0x7f7ffffe, 0x3f800001, 0x7f800000);
+  status |= test__mulsf3(0x00000003, 0xc00fffff, 0x80000007);
+  status |= test__mulsf3(0x00000003, 0x400fffff, 0x00000007);
+  status |= test__mulsf3(0x80000003, 0xc00fffff, 0x00000007);
+  status |= test__mulsf3(0x80000003, 0x400fffff, 0x80000007);
+  status |= test__mulsf3(0x00000003, 0xc00ffffd, 0x80000007);
+  status |= test__mulsf3(0x00000003, 0x400ffffd, 0x00000007);
+  status |= test__mulsf3(0x80000003, 0xc00ffffd, 0x00000007);
+  status |= test__mulsf3(0x80000003, 0x400ffffd, 0x80000007);
+  status |= test__mulsf3(0x3e00007f, 0x017c0000, 0x003f003f);
+  status |= test__mulsf3(0xcf7fff00, 0xc0ffff00, 0x50fffe00);
+  status |= test__mulsf3(0x3fdf7f00, 0x3fffff00, 0x405f7e21);
+  status |= test__mulsf3(0x19b92144, 0x1a310000, 0x00000001);
+  status |= test__mulsf3(0x19ffc008, 0x1a002004, 0x00000001);
+  status |= test__mulsf3(0x7f7ffff0, 0xc0000008, 0xff800000);
+
+#if __arm__
+  // These tests depend on Arm-specific IEEE 754 implementation choices
+  // regarding NaNs, which are satisfied by arm/mulsf3.S but not guaranteed by
+  // other implementations:
+  //
+  // - a quiet NaN is distinguished by the top mantissa bit being 1
+  //
+  // - if a signalling NaN appears in the input, the output quiet NaN is
+  //   obtained by setting its top mantissa bit and leaving everything else
+  //   unchanged
+  //
+  // - if both operands are signalling NaNs then the output NaN is derived
+  //   from the first operand
+  //
+  // - if both operands are quiet NaNs then the output NaN is the first
+  //   operand.
+ + status |= test__mulsf3(0x00000000, 0x7fad4be3, 0x7fed4be3); + status |= test__mulsf3(0x00000000, 0x7fdf48c7, 0x7fdf48c7); + status |= test__mulsf3(0x00000001, 0x7f970eba, 0x7fd70eba); + status |= test__mulsf3(0x00000001, 0x7fc35716, 0x7fc35716); + status |= test__mulsf3(0x007fffff, 0x7fbf52d6, 0x7fff52d6); + status |= test__mulsf3(0x007fffff, 0x7fc7a2df, 0x7fc7a2df); + status |= test__mulsf3(0x3f800000, 0x7f987a85, 0x7fd87a85); + status |= test__mulsf3(0x3f800000, 0x7fc50124, 0x7fc50124); + status |= test__mulsf3(0x7f7fffff, 0x7f95fd6f, 0x7fd5fd6f); + status |= test__mulsf3(0x7f7fffff, 0x7ffc28dc, 0x7ffc28dc); + status |= test__mulsf3(0x7f800000, 0x00000000, 0x7fc00000); + status |= test__mulsf3(0x7f800000, 0x7f8dd790, 0x7fcdd790); + status |= test__mulsf3(0x7f800000, 0x7fd2ef2b, 0x7fd2ef2b); + status |= test__mulsf3(0x7f800000, 0x80000000, 0x7fc00000); + status |= test__mulsf3(0x7f99b09d, 0x00000000, 0x7fd9b09d); + status |= test__mulsf3(0x7f93541e, 0x00000001, 0x7fd3541e); + status |= test__mulsf3(0x7f9fc002, 0x007fffff, 0x7fdfc002); + status |= test__mulsf3(0x7fb5db77, 0x3f800000, 0x7ff5db77); + status |= test__mulsf3(0x7f9f5d92, 0x7f7fffff, 0x7fdf5d92); + status |= test__mulsf3(0x7fac7a36, 0x7f800000, 0x7fec7a36); + status |= test__mulsf3(0x7fb42008, 0x7fb0ee07, 0x7ff42008); + status |= test__mulsf3(0x7f8bd740, 0x7fc7aaf1, 0x7fcbd740); + status |= test__mulsf3(0x7f9bb57b, 0x80000000, 0x7fdbb57b); + status |= test__mulsf3(0x7f951a78, 0x80000001, 0x7fd51a78); + status |= test__mulsf3(0x7f9ba63b, 0x807fffff, 0x7fdba63b); + status |= test__mulsf3(0x7f89463c, 0xbf800000, 0x7fc9463c); + status |= test__mulsf3(0x7fb63563, 0xff7fffff, 0x7ff63563); + status |= test__mulsf3(0x7f90886e, 0xff800000, 0x7fd0886e); + status |= test__mulsf3(0x7fe8c15e, 0x00000000, 0x7fe8c15e); + status |= test__mulsf3(0x7fe915ae, 0x00000001, 0x7fe915ae); + status |= test__mulsf3(0x7ffa9b42, 0x007fffff, 0x7ffa9b42); + status |= test__mulsf3(0x7fdad0f5, 0x3f800000, 0x7fdad0f5); + status |= test__mulsf3(0x7fd10dcb, 0x7f7fffff, 0x7fd10dcb); + status |= test__mulsf3(0x7fd08e8a, 0x7f800000, 0x7fd08e8a); + status |= test__mulsf3(0x7fc3a9e6, 0x7f91a816, 0x7fd1a816); + status |= test__mulsf3(0x7fdb229c, 0x7fc26c68, 0x7fdb229c); + status |= test__mulsf3(0x7fc9f6bb, 0x80000000, 0x7fc9f6bb); + status |= test__mulsf3(0x7ffa178b, 0x80000001, 0x7ffa178b); + status |= test__mulsf3(0x7fef2a0b, 0x807fffff, 0x7fef2a0b); + status |= test__mulsf3(0x7ffc885b, 0xbf800000, 0x7ffc885b); + status |= test__mulsf3(0x7fd26e8c, 0xff7fffff, 0x7fd26e8c); + status |= test__mulsf3(0x7fc55329, 0xff800000, 0x7fc55329); + status |= test__mulsf3(0x80000000, 0x7f800000, 0x7fc00000); + status |= test__mulsf3(0x80000000, 0x7fa833ae, 0x7fe833ae); + status |= test__mulsf3(0x80000000, 0x7fc4df63, 0x7fc4df63); + status |= test__mulsf3(0x80000000, 0xff800000, 0x7fc00000); + status |= test__mulsf3(0x80000001, 0x7f98827d, 0x7fd8827d); + status |= test__mulsf3(0x80000001, 0x7fd7acc5, 0x7fd7acc5); + status |= test__mulsf3(0x807fffff, 0x7fad19c0, 0x7fed19c0); + status |= test__mulsf3(0x807fffff, 0x7ffe1907, 0x7ffe1907); + status |= test__mulsf3(0xbf800000, 0x7fa95487, 0x7fe95487); + status |= test__mulsf3(0xbf800000, 0x7fd2bbee, 0x7fd2bbee); + status |= test__mulsf3(0xff7fffff, 0x7f86ba21, 0x7fc6ba21); + status |= test__mulsf3(0xff7fffff, 0x7feb00d7, 0x7feb00d7); + status |= test__mulsf3(0xff800000, 0x7f857fdc, 0x7fc57fdc); + status |= test__mulsf3(0xff800000, 0x7fde0397, 0x7fde0397); +#endif // __arm__ + + return status; +} From 
8c7228fca9c7a5ede35783b634e25762389bca8d Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Thu, 2 Oct 2025 10:02:36 +0100 Subject: [PATCH 2/8] Fix the Thumb1 build which I forgot to test --- compiler-rt/lib/builtins/CMakeLists.txt | 3 +++ compiler-rt/lib/builtins/arm/thumb1/mulsf3.S | 14 +++++++------- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index d7a0a139c5ad6..5c577c19f25a3 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -474,6 +474,9 @@ set(thumb1_base_SOURCES if(COMPILER_RT_ARM_OPTIMIZED_FP) set(thumb1_base_SOURCES arm/thumb1/mulsf3.S + arm/fnan2.c + arm/fnorm2.c + arm/funder.c ${thumb1_base_SOURCES} ) endif() diff --git a/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S b/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S index 62a6d71011003..f1745a037e696 100644 --- a/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S +++ b/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S @@ -145,7 +145,7 @@ LOCAL_LABEL(outflow): // To handle underflow, we construct an intermediate value in the IEEE 754 // style (using our existing full-length mantissa, and bias the exponent by // +0xC0), and indicate whether that intermediate was rounded up, down or not - // at all. Then call the helper function __funder, which will denormalise and + // at all. Then call the helper function funder, which will denormalise and // re-round correctly. LSLS r1, r0, #7 // shift up the post-rounding mantissa SUBS r1, r3, r1 // and subtract it from the pre-rounding version @@ -159,7 +159,7 @@ LOCAL_LABEL(outflow): LSLS r3, r2, #23 ADDS r0, r0, r3 // put on the biased exponent - BL __funder + BL SYMBOL_NAME(__compiler_rt_funder) POP {r4,r5,r6,pc} LOCAL_LABEL(overflow): @@ -196,15 +196,15 @@ LOCAL_LABEL(zero): LSLS r0, r4, #31 // shift up the output sign to make the return value POP {r4,r5,r6,pc} - // Handle denormals via the helper function __fnorm2, which will break both + // Handle denormals via the helper function fnorm2, which will break both // inputs up into mantissa and exponent, renormalising and generating a // negative exponent if necessary. LOCAL_LABEL(denorm): PUSH {r0,r1,r2,r3} MOV r0, sp - BL __fnorm2 + BL SYMBOL_NAME(__compiler_rt_fnorm2) POP {r0,r1,r2,r3} - // Convert __fnorm2's return values into the right form to rejoin the main + // Convert fnorm2's return values into the right form to rejoin the main // code path. LSLS r0, r0, #1 LSLS r1, r1, #1 @@ -234,11 +234,11 @@ LOCAL_LABEL(naninf): // an appropriately signed infinity. B LOCAL_LABEL(overflow) // reuse the code there - // We come here if at least one input is a NaN. Hand off to __fnan2, which + // We come here if at least one input is a NaN. Hand off to fnan2, which // propagates an appropriate NaN to the output, dealing with the special // cases of signalling/quiet NaNs. LOCAL_LABEL(nan): - BL __fnan2 + BL SYMBOL_NAME(__compiler_rt_fnan2) POP {r4,r5,r6,pc} // Return a quiet NaN as the result of infinity * zero. 
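A note on the funder handoff that this patch fixes up: the underflow helper's
job, as described in the assembly comments, can be sketched in C. The hint
argument follows the convention those comments describe (positive if the true
result lies above the biased intermediate, negative if below, zero if exact);
the function name, the exact signature and everything else in this sketch are
illustrative assumptions, not the library's actual helper.

#include <stdint.h>

// Sketch only: re-round an underflowed single-precision result. 'biased' is
// the rounded intermediate with its exponent field increased by 192 so that
// it stays representable; 'hint' says where the infinitely precise value lies
// relative to that intermediate. Assumes a genuine underflow (shift >= 1).
static uint32_t denormal_reround_sketch(uint32_t biased, int32_t hint) {
  uint32_t sign = biased & 0x80000000u;
  int e = (int)((biased >> 23) & 0xFF) - 192 - 127;     // true unbiased exponent
  uint32_t mant = (biased & 0x007FFFFFu) | 0x00800000u; // 24-bit mantissa, 1.m

  int shift = -126 - e;       // distance from 1.m * 2^e down to the denormal scale
  if (shift >= 25)
    return sign;              // too small even to round up to 2^-149: signed zero
  uint32_t kept = mant >> shift;           // candidate denormal bits
  uint32_t dropped = mant << (32 - shift); // dropped bits, left-aligned

  int round_up;
  if (dropped > 0x80000000u)
    round_up = 1;             // clearly above the halfway point
  else if (dropped < 0x80000000u)
    round_up = 0;             // clearly below (or exact): round down
  else if (hint != 0)
    round_up = hint > 0;      // apparent tie: the hint breaks it
  else
    round_up = kept & 1;      // exact tie: round to even
  return sign | (kept + round_up); // may carry up to the smallest normal
}

The reason the hint matters is double rounding: the 24-bit intermediate has
already been rounded once, so when the dropped bits sit exactly on the halfway
point only the hint can say whether the infinitely precise result was really
above, below, or exactly on it.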
From 4edb28bc3d90232498e3c355d96b4822c3a1b26b Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Thu, 2 Oct 2025 10:50:26 +0100 Subject: [PATCH 3/8] Use DEFINE_COMPILERRT_THUMB_FUNCTION in Thumb1 --- compiler-rt/lib/builtins/arm/thumb1/mulsf3.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S b/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S index f1745a037e696..dc6b1f59adbec 100644 --- a/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S +++ b/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S @@ -21,7 +21,7 @@ DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_fmul, __mulsf3) -DEFINE_COMPILERRT_FUNCTION(__mulsf3) +DEFINE_COMPILERRT_THUMB_FUNCTION(__mulsf3) PUSH {r4,r5,r6,lr} // Get exponents of the inputs, and check for uncommon values. In the process From c73dfeaacafd5a641d9d4cf0efb551284a0638c4 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Thu, 2 Oct 2025 10:50:38 +0100 Subject: [PATCH 4/8] Tweak final return in fnan2 as suggested --- compiler-rt/lib/builtins/arm/fnan2.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/compiler-rt/lib/builtins/arm/fnan2.c b/compiler-rt/lib/builtins/arm/fnan2.c index 0563fe1fde38f..c2fbfa3974d6e 100644 --- a/compiler-rt/lib/builtins/arm/fnan2.c +++ b/compiler-rt/lib/builtins/arm/fnan2.c @@ -33,6 +33,5 @@ uint32_t __compiler_rt_fnan2(uint32_t a, uint32_t b) { return b | 0x00400000; // if so, return it with the quiet bit set if (aadj < 0x00800000) // a is a quiet NaN? return a; // if so, return it - else // expect (badj < 0x00800000) - return b; // in that case b must be a quiet NaN + return b; // otherwise we expect b must be a quiet NaN } From b23637240387ae983f0cad789861a4363ac7d13c Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Thu, 2 Oct 2025 10:52:40 +0100 Subject: [PATCH 5/8] Clarify comment about 4-byte boundary --- compiler-rt/lib/builtins/arm/divsf3.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/builtins/arm/divsf3.S b/compiler-rt/lib/builtins/arm/divsf3.S index bf07217cba907..d529c7da911a3 100644 --- a/compiler-rt/lib/builtins/arm/divsf3.S +++ b/compiler-rt/lib/builtins/arm/divsf3.S @@ -536,7 +536,7 @@ for prefix in range(64, 128): */ - .p2align 2 // make sure we start on a 32-bit boundary, even in Thumb + .p2align 2 // make sure we start on a 4-byte boundary, even in Thumb LOCAL_LABEL(tab): .byte 0xfe // input [0x800000,0x81ffff], candidate outputs [0xfd,0xff] .byte 0xfa // input [0x820000,0x83ffff], candidate outputs [0xf9,0xfc] From 7a245355470383c1907f8a570e445e072e035dde Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Thu, 2 Oct 2025 11:22:02 +0100 Subject: [PATCH 6/8] Lowercase instruction mnemonics and shifter operands This was Petr Hosek's comment on #154093, but if we're doing that, we should do it consistently. --- compiler-rt/lib/builtins/arm/divsf3.S | 216 +++++++++--------- compiler-rt/lib/builtins/arm/mulsf3.S | 150 ++++++------- compiler-rt/lib/builtins/arm/thumb1/mulsf3.S | 224 +++++++++---------- 3 files changed, 295 insertions(+), 295 deletions(-) diff --git a/compiler-rt/lib/builtins/arm/divsf3.S b/compiler-rt/lib/builtins/arm/divsf3.S index d529c7da911a3..2f37234457b7b 100644 --- a/compiler-rt/lib/builtins/arm/divsf3.S +++ b/compiler-rt/lib/builtins/arm/divsf3.S @@ -33,17 +33,17 @@ DEFINE_COMPILERRT_FUNCTION(__divsf3) // will have to redo it just in case. That saves an instruction here, // executed for _all_ inputs, and moves it to the uncommon path run for only // some inputs. 
- MOV r12, #0xFF0000 - ANDS r2, r12, r0, LSR #7 // r2 has exponent of numerator. (Is it 0?) - ANDSNE r3, r12, r1, LSR #7 // r3 has exponent of denominator. (Is it 0?) - TEQNE r2, r12 // if neither was 0, is one FF? - TEQNE r3, r12 // or the other? - BEQ LOCAL_LABEL(uncommon) // branch out of line if any answer was yes + mov r12, #0xFF0000 + ands r2, r12, r0, lsr #7 // r2 has exponent of numerator. (Is it 0?) + andsne r3, r12, r1, lsr #7 // r3 has exponent of denominator. (Is it 0?) + teqne r2, r12 // if neither was 0, is one FF? + teqne r3, r12 // or the other? + beq LOCAL_LABEL(uncommon) // branch out of line if any answer was yes // Calculate the output sign, which is always just the XOR of the input // signs. Store it in bit 8 of r2, below the numerator exponent. - TEQ r0, r1 // is the output sign bit 1? - ORRMI r2, r2, #0x100 // if so, set bit 8 of r2 + teq r0, r1 // is the output sign bit 1? + orrmi r2, r2, #0x100 // if so, set bit 8 of r2 // Isolate the mantissas of both values, by setting bit 23 of each one and // clearing the 8 bits above that. @@ -52,10 +52,10 @@ DEFINE_COMPILERRT_FUNCTION(__divsf3) // instructions if we do it as part of this manipulation). We want the // numerator not to be in r0, because r0 is where we'll build up the quotient // while subtracting things from the numerator. - ORR r12, r0, #1 << 23 - ORR r0, r1, #1 << 23 - BIC r1, r12, #0xFF000000 - BIC r0, r0, #0xFF000000 + orr r12, r0, #1 << 23 + orr r0, r1, #1 << 23 + bic r1, r12, #0xFF000000 + bic r0, r0, #0xFF000000 LOCAL_LABEL(div): // Start of the main division. We get here knowing that: @@ -65,11 +65,11 @@ LOCAL_LABEL(div): // r2 = (exponent of numerator << 16) + (result sign << 8) // r3 = (exponent of denominator << 16) - PUSH {r14} // we'll need an extra register + push {r14} // we'll need an extra register // Calculate the initial result exponent by just subtracting the two input // exponents. This doesn't affect the sign bit lower down in r2. - SUB r2, r2, r3 + sub r2, r2, r3 // That initial exponent might need to be adjusted by 1, depending on whether // dividing the mantissas gives a value >=1 or <1. We don't need to wait @@ -78,7 +78,7 @@ LOCAL_LABEL(div): // // The basic idea is to do the comparison in a way that sets the C flag if // numerator >= denominator. Then we recombine the sign and exponent by doing - // "ADC r2, r2, r2, ASR #16": the exponent in the top half of r2 is shifted + // "ADC r2, r2, r2, asr #16": the exponent in the top half of r2 is shifted // down to the low 8 bits, just below the sign bit, and using ADC rather than // ADD folds in the conditional increment from the mantissa comparison. // @@ -98,9 +98,9 @@ LOCAL_LABEL(div): // Putting all of that together, what we _want_ to do is this: // // [#1] CMP r1, r0 // set C if num >= den - // [#2] MOVLO r1, r1, LSL #1 // if num < den, shift num left + // [#2] MOVLO r1, r1, lsl #1 // if num < den, shift num left // [#3] ADD r2, r2, #0x7D0000 // rebias exponent - // [#4] ADC r2, r2, r2, ASR #16 // combine sign + exp + adjustment + // [#4] ADC r2, r2, r2, asr #16 // combine sign + exp + adjustment // // However, we only do the first of those four instructions right here. The // other three are distributed through the code below, after unrelated load @@ -109,7 +109,7 @@ LOCAL_LABEL(div): // // (Since instruction #4 depends on the flags set up by #2, we must avoid // clobbering the flags in _any_ of the instructions interleaved with this!) 
- CMP r1, r0 // exponent setup [#1] + cmp r1, r0 // exponent setup [#1] // Start the mantissa division by making an approximation to the reciprocal // of the denominator. We first obtain an 8-bit approximation using a table @@ -119,15 +119,15 @@ LOCAL_LABEL(div): // (r0 >> 17) is the table index, and its top bit is always set, so it ranges // from 64 to 127 inclusive. So we point the base register 64 bytes before // the actual table. - ADR r12, LOCAL_LABEL(tab) - 64 + adr r12, LOCAL_LABEL(tab) - 64 #if __thumb__ // Thumb can't do this particular shift+add+load in one instruction - it only // supports left shifts of 0 to 3 bits, not right shifts of 17. So we must // calculate the load offset separately. - ADD r14, r12, r0, LSR #17 - LDRB r14, [r14] + add r14, r12, r0, lsr #17 + ldrb r14, [r14] #else - LDRB r14, [r12, r0, LSR #17] + ldrb r14, [r12, r0, lsr #17] #endif // Now do an iteration of Newton-Raphson to improve that 8-bit approximation @@ -157,12 +157,12 @@ LOCAL_LABEL(div): // <2^24 with something <2^8; but we must shift it right before multiplying // by x again. - MUL r12, r0, r14 // r12 = dx - MOVLO r1, r1, LSL #1 // exponent setup [#2] in the MUL delay slot - MVN r12, r12, LSR #8 // r12 ~= -dx/2^8 - MUL r3, r12, r14 // r3 ~= -dx^2/2^8 - MOV r14, r14, LSL #9 // r14 = 2^9 x - ADD r14, r14, r3, ASR #15 // r14 ~= 2^9 x - dx^2 / 2^23 + mul r12, r0, r14 // r12 = dx + movlo r1, r1, lsl #1 // exponent setup [#2] in the MUL delay slot + mvn r12, r12, lsr #8 // r12 ~= -dx/2^8 + mul r3, r12, r14 // r3 ~= -dx^2/2^8 + mov r14, r14, lsl #9 // r14 = 2^9 x + add r14, r14, r3, asr #15 // r14 ~= 2^9 x - dx^2 / 2^23 // Now r14 is a 16-bit approximation to the reciprocal of the input mantissa, // scaled by 2^39 (so that the min mantissa 2^23 would have reciprocal 2^16 @@ -189,13 +189,13 @@ LOCAL_LABEL(div): // First iteration of long division. We shift the numerator left 11 bits, and // since the quotient approximation is scaled by 2^31, we must shift that // right by 20 to make the right product to subtract from the numerator. - MOV r12, r1, LSR #8 // shift the numerator down - MUL r12, r14, r12 // make the quotient approximation - MOV r1, r1, LSL #11 // shift numerator left, ready for subtraction - MOV r3, r12, LSR #20 // make first 12-bit block of quotient bits - MLS r1, r0, r3, r1 // subtract that multiple of den from num + mov r12, r1, lsr #8 // shift the numerator down + mul r12, r14, r12 // make the quotient approximation + mov r1, r1, lsl #11 // shift numerator left, ready for subtraction + mov r3, r12, lsr #20 // make first 12-bit block of quotient bits + mls r1, r0, r3, r1 // subtract that multiple of den from num - ADD r2, r2, #0x7D0000 // exponent setup [#3] in the MLS delay slot + add r2, r2, #0x7D0000 // exponent setup [#3] in the MLS delay slot // Second iteration of long division. Differences from the first step: this // time we shift the numerator 12 bits instead of 11, so that the total of @@ -203,13 +203,13 @@ LOCAL_LABEL(div): // the output mantissa. Also, the block of output quotient bits is left in a // different register: it was in r3 the first time, and this time it's in // r12, so that we still have both available at the end of the process. 
- MOV r12, r1, LSR #8 // shift the numerator down - MUL r12, r14, r12 // make the quotient approximation - MOV r1, r1, LSL #12 // shift numerator left, ready for subtraction - MOV r12, r12, LSR #19 // make second 11-bit block of quotient - MLS r1, r0, r12, r1 // subtract that multiple of den from num + mov r12, r1, lsr #8 // shift the numerator down + mul r12, r14, r12 // make the quotient approximation + mov r1, r1, lsl #12 // shift numerator left, ready for subtraction + mov r12, r12, lsr #19 // make second 11-bit block of quotient + mls r1, r0, r12, r1 // subtract that multiple of den from num - ADC r2, r2, r2, ASR #16 // exponent setup [#4] in the MLS delay slot + adc r2, r2, r2, asr #16 // exponent setup [#4] in the MLS delay slot // Now r1 contains the original numerator, shifted left 23, minus _some_ // multiple of the original denominator (which is still in r0). The bounds on @@ -222,12 +222,12 @@ LOCAL_LABEL(div): // recombines them, by doing the comparison in such a way that it sets the // carry flag if the increment is needed. - CMP r1, r0 // Set carry flag if num >= den - SUBHS r1, r1, r0 // If so, subtract den from num - ADC r3, r12, r3, LSL #12 // Recombine quotient halves, plus optional +1 + cmp r1, r0 // Set carry flag if num >= den + subhs r1, r1, r0 // If so, subtract den from num + adc r3, r12, r3, lsl #12 // Recombine quotient halves, plus optional +1 // We've finished with r14 as a temporary register, so we can unstack it now. - POP {r14} + pop {r14} // Now r3 contains the _rounded-down_ output quotient, and r1 contains the // remainder. That is, (denominator * r3 + r1) = (numerator << 23), and @@ -259,16 +259,16 @@ LOCAL_LABEL(div): // course in integers it's easier to compare 2*remainder with denominator. So // we start by shifting the remainder left by 1, and in the process, set Z if // it's exactly 0 (i.e. the result needs no rounding at all). - LSLS r1, r1, #1 + lsls r1, r1, #1 // Now trial-subtract the denominator. We don't do this at all if the result // was exact. If we do do it, r1 goes negative precisely if we need to round // up, which sets the C flag. (The previous instruction will have left C // clear, since r1 had its top 8 bits all clear. So now C is set _only_ if // we're rounding up.) - SUBSNE r1, r1, r0 + subsne r1, r1, r0 // Recombine the quotient with the sign + exponent, and use the C flag from // the previous instruction to increment the quotient if we're rounding up. - ADC r0, r3, r2, LSL #23 + adc r0, r3, r2, lsl #23 // If we haven't either overflowed or underflowed, we're done. We can // identify most of the safe cases by doing an unsigned comparison of the @@ -290,14 +290,14 @@ LOCAL_LABEL(div): // further); if the exponent was less than that then r2 wraps round and looks // like a very large positive integer from the point of view of this unsigned // comparison. - CMP r2, #0xFC0000 - BXLO lr + cmp r2, #0xFC0000 + bxlo lr // The same comparison will have set the N and V flags to reflect the result // of comparing r2 with 0xFC0000 as a _signed_ integer. That reliably // distinguishes potential underflow (r2 is negative) from potential overflow // (r2 is positive and at least 0xFC0000) - BGE LOCAL_LABEL(overflow) + bge LOCAL_LABEL(overflow) // Here we might or might not have underflow (but we know we don't have // overflow). To check more carefully, we look at the _bottom_ half of r2, @@ -322,17 +322,17 @@ LOCAL_LABEL(div): // 0x60 or greater for a (pre-rounding) underflow, and zero for a // non-underflow. 
- TST r2, #0xF0 - BXEQ lr // no underflow after all; return + tst r2, #0xF0 + bxeq lr // no underflow after all; return // Rebias the exponent for funder, which also corrects the sign bit. - ADD r0, r0, #192 << 23 + add r0, r0, #192 << 23 // Tell funder whether the true value is greater or less than the number in // r0. This is obtained from the sign of the remainder (still in r1), with // the only problem being that it's currently reversed. So negate r1 (leaving // 0 at 0 to indicate exactness). - RSBS r1, r1, #0 - B SYMBOL_NAME(__compiler_rt_funder) + rsbs r1, r1, #0 + b SYMBOL_NAME(__compiler_rt_funder) LOCAL_LABEL(overflow): // Here we might or might not have overflow (but we know we don't have @@ -346,18 +346,18 @@ LOCAL_LABEL(overflow): // (moving the exponent field to the top), increment it once more (so that // the smallest overflowed exponent 0xFF wraps round to 0), and then compare // against 0xFE000000 as an unsigned integer. - MOV r12, r0, LSL #1 - ADD r12, r12, #1 << 24 - CMP r12, #0xFE << 24 // Check for exp = 253 or 254 - BXHS lr + mov r12, r0, lsl #1 + add r12, r12, #1 << 24 + cmp r12, #0xFE << 24 // Check for exp = 253 or 254 + bxhs lr // We have actual overflow. Rebias r0 to bring the exponent back into range, // which ensures its sign is correct. Then make an infinity of that sign to // return. - SUBS r0, r0, #0xC0 << 23 - MOVS r12, #0xFF // exponent of infinity - ORRS r12, r12, r0, LSR #23 // exponent and sign at bottom of r12 - MOVS r0, r12, LSL #23 // shift it up to the top of r0 to return - BX lr + subs r0, r0, #0xC0 << 23 + movs r12, #0xFF // exponent of infinity + orrs r12, r12, r0, lsr #23 // exponent and sign at bottom of r12 + movs r0, r12, lsl #23 // shift it up to the top of r0 to return + bx lr LOCAL_LABEL(uncommon): // We come here from the start of the function if either input is an uncommon @@ -367,57 +367,57 @@ LOCAL_LABEL(uncommon): // in bits 16..23. But r3 doesn't necessarily contain the exponent of y, // because the instruction that set it up was conditional. So first we // unconditionally repeat it. - AND r3, r12, r1, LSR #7 + and r3, r12, r1, lsr #7 // In all cases not involving a NaN as output, the sign of the output is made // in the same way as for finite numbers, as the XOR of the input signs. So // repeat the sign setup from the main branch. - TEQ r0, r1 // is the output sign bit 1? - ORRMI r2, r2, #0x100 // if so, set bit 8 of r2 + teq r0, r1 // is the output sign bit 1? + orrmi r2, r2, #0x100 // if so, set bit 8 of r2 // Detect infinities and NaNs, by checking if either of r2 or r3 is at least // 0xFF0000. - CMP r2, #0xFF0000 - CMPLO r3, #0xFF0000 - BHS LOCAL_LABEL(inf_NaN) + cmp r2, #0xFF0000 + cmplo r3, #0xFF0000 + bhs LOCAL_LABEL(inf_NaN) // Now we know there are no infinities or NaNs, but there's at least one zero // or denormal. - MOVS r12, r1, LSL #1 // is y zero? - BEQ LOCAL_LABEL(divbyzero) // if so, go and handle division by zero - MOVS r12, r0, LSL #1 // is x zero? (now we know that y is not) - MOVEQ r0, r2, LSL #23 // if so, 0/nonzero is just 0 (of right sign) - BXEQ lr + movs r12, r1, lsl #1 // is y zero? + beq LOCAL_LABEL(divbyzero) // if so, go and handle division by zero + movs r12, r0, lsl #1 // is x zero? (now we know that y is not) + moveq r0, r2, lsl #23 // if so, 0/nonzero is just 0 (of right sign) + bxeq lr // Now we've eliminated zeroes as well, leaving only denormals: either x or // y, or both, is a denormal. Call fnorm2 to convert both into a normalised // mantissa and a (potentially small) exponent. 
- AND r12, r2, #0x100 // save the result sign from r2 - LSR r2, #16 // shift extracted exponents down to bit 0 - LSR r3, #16 // where fnorm2 will expect them - PUSH {r0, r1, r2, r3, r12, lr} - MOV r0, sp // tell fnorm2 where to find its data - BL SYMBOL_NAME(__compiler_rt_fnorm2) - POP {r0, r1, r2, r3, r12, lr} - LSL r3, #16 // shift exponents back up to bit 16 - ORR r2, r12, r2, LSL #16 // and put the result sign back in r2 + and r12, r2, #0x100 // save the result sign from r2 + lsr r2, #16 // shift extracted exponents down to bit 0 + lsr r3, #16 // where fnorm2 will expect them + push {r0, r1, r2, r3, r12, lr} + mov r0, sp // tell fnorm2 where to find its data + bl SYMBOL_NAME(__compiler_rt_fnorm2) + pop {r0, r1, r2, r3, r12, lr} + lsl r3, #16 // shift exponents back up to bit 16 + orr r2, r12, r2, lsl #16 // and put the result sign back in r2 // Now rejoin the main code path, having finished the setup it will expect: // swap x and y, and shift the fractions back down to the low 24 bits. - MOV r12, r0, LSR #8 - MOV r0, r1, LSR #8 - MOV r1, r12 - B LOCAL_LABEL(div) + mov r12, r0, lsr #8 + mov r0, r1, lsr #8 + mov r1, r12 + b LOCAL_LABEL(div) LOCAL_LABEL(inf_NaN): // We come here if at least one input is a NaN or infinity. If either or both // inputs are NaN then we hand off to fnan2 to propagate a NaN from the // input. - MOV r12, #0xFF000000 - CMP r12, r0, LSL #1 // if (r0 << 1) > 0xFF000000, r0 is a NaN - BLO SYMBOL_NAME(__compiler_rt_fnan2) - CMP r12, r1, LSL #1 - BLO SYMBOL_NAME(__compiler_rt_fnan2) + mov r12, #0xFF000000 + cmp r12, r0, lsl #1 // if (r0 << 1) > 0xFF000000, r0 is a NaN + blo SYMBOL_NAME(__compiler_rt_fnan2) + cmp r12, r1, lsl #1 + blo SYMBOL_NAME(__compiler_rt_fnan2) // No NaNs, so we have three options: inf/inf = NaN, inf/finite = inf, and // finite/inf = 0. @@ -425,26 +425,26 @@ LOCAL_LABEL(inf_NaN): // If both operands are infinity, we return a NaN. Since we know at // least _one_ is infinity, we can test this by checking if they're // equal apart from the sign bits. - EOR r3, r0, r1 - LSLS r3, #1 // were all bits of XOR zero other than top? - BEQ LOCAL_LABEL(invalid) // if so, both operands are infinity + eor r3, r0, r1 + lsls r3, #1 // were all bits of XOR zero other than top? + beq LOCAL_LABEL(invalid) // if so, both operands are infinity // See if x is infinite - CMP r12, r0, LSL #1 // (r0 << 1) == 0xFF000000? - BEQ LOCAL_LABEL(infret) // if so, infinity/finite = infinity + cmp r12, r0, lsl #1 // (r0 << 1) == 0xFF000000? + beq LOCAL_LABEL(infret) // if so, infinity/finite = infinity // y is infinite and x is not, so we return a zero of the // combined sign. - EOR r0, r0, r1 // calculate the right sign - AND r0, r0, #0x80000000 // throw away everything else - BX lr + eor r0, r0, r1 // calculate the right sign + and r0, r0, #0x80000000 // throw away everything else + bx lr LOCAL_LABEL(divbyzero): // Here, we know y is zero. But we don't know if x is zero or nonzero. So we // might be calculating 0/0 (invalid operation, generating a NaN), or // nonzero/0 (the IEEE "division by zero" exception, generating infinity). - MOVS r12, r0, LSL #1 // is x zero too? - BEQ LOCAL_LABEL(invalid) // if so, go and return a NaN + movs r12, r0, lsl #1 // is x zero too? 
+ beq LOCAL_LABEL(invalid) // if so, go and return a NaN LOCAL_LABEL(infret): // Here, we're either dividing infinity by a finite number, or dividing a @@ -460,15 +460,15 @@ LOCAL_LABEL(infret): // r2 contains the output sign in bit 8, which is a convenient place to find // it when making an infinity, because we can fill in the 8 exponent bits // below that and then shift it left. - ORR r2, r2, #0xff // sign + maximum exponent - LSL r0, r2, #23 // shift up to the top - BX lr + orr r2, r2, #0xff // sign + maximum exponent + lsl r0, r2, #23 // shift up to the top + bx lr LOCAL_LABEL(invalid): // Return the default NaN, from an invalid operation (either dividing // infinity by infinity, or 0 by 0). - LDR r0, =0x7FC00000 - BX lr + ldr r0, =0x7FC00000 + bx lr // Finally, the lookup table for the initial reciprocal approximation. // diff --git a/compiler-rt/lib/builtins/arm/mulsf3.S b/compiler-rt/lib/builtins/arm/mulsf3.S index dc1843615313a..b4f4c5e958c52 100644 --- a/compiler-rt/lib/builtins/arm/mulsf3.S +++ b/compiler-rt/lib/builtins/arm/mulsf3.S @@ -28,22 +28,22 @@ DEFINE_COMPILERRT_FUNCTION(__mulsf3) // also extracted the exponents of the input values r0/r1 into bits 16..23 of // r2/r3. But if we do, then that hasn't necessarily been done (because the // second AND might have been skipped). - MOV r12, #0xFF0000 - ANDS r2, r12, r0, LSR #7 // sets Z if exponent of x is 0 - ANDSNE r3, r12, r1, LSR #7 // otherwise, sets Z if exponent of y is 0 - TEQNE r2, r12 // otherwise, sets Z if exponent of x is FF - TEQNE r3, r12 // otherwise, sets Z if exponent of y is FF - BEQ LOCAL_LABEL(uncommon) // branch out of line to handle inf/NaN/0/denorm + mov r12, #0xFF0000 + ands r2, r12, r0, lsr #7 // sets Z if exponent of x is 0 + andsne r3, r12, r1, lsr #7 // otherwise, sets Z if exponent of y is 0 + teqne r2, r12 // otherwise, sets Z if exponent of x is FF + teqne r3, r12 // otherwise, sets Z if exponent of y is FF + beq LOCAL_LABEL(uncommon) // branch out of line to handle inf/NaN/0/denorm // Calculate the sign of the result, and put it in an unused bit of r2. - TEQ r0, r1 // sets N to the XOR of x and y's sign bits - ORRMI r2, r2, #0x100 // if N set, set bit 8 of r2 + teq r0, r1 // sets N to the XOR of x and y's sign bits + orrmi r2, r2, #0x100 // if N set, set bit 8 of r2 // Move the input mantissas to the high end of r0/r1, each with its leading // bit set explicitly, so that they're in the right form to be multiplied. - MOV r12, #0x80000000 - ORR r0, r12, r0, LSL #8 - ORR r1, r12, r1, LSL #8 + mov r12, #0x80000000 + orr r0, r12, r0, lsl #8 + orr r1, r12, r1, lsl #8 // Now we're ready to multiply mantissas. This is also the place we'll come // back to after decoding denormal inputs. The denormal decoding will also @@ -60,9 +60,9 @@ LOCAL_LABEL(mul): // The exponent is rebiased by subtracting 0x80, rather than the 0x7F you'd // expect. That compensates for the leading bit of the mantissa overlapping // it, when we recombine the exponent and mantissa by addition. - ADD r2, r2, r3 // r2 has sum of exponents, freeing up r3 - UMULL r1, r3, r0, r1 // r3:r1 has the double-width product - SUB r2, r2, #(0x80 << 16) // rebias the summed exponent + add r2, r2, r3 // r2 has sum of exponents, freeing up r3 + umull r1, r3, r0, r1 // r3:r1 has the double-width product + sub r2, r2, #(0x80 << 16) // rebias the summed exponent // Compress the double-word product into just the high-order word r3, by // setting its bit 0 if any bit of the low-order word is nonzero. 
This @@ -70,16 +70,16 @@ LOCAL_LABEL(mul): // rounding, because rounding only depends on the bit below the last output // bit, and the general question of whether _any_ nonzero bit exists below // that. - CMP r1, #0 // if low word of full product is nonzero - ORRNE r3, r3, #1 // then set LSB of high word + cmp r1, #0 // if low word of full product is nonzero + orrne r3, r3, #1 // then set LSB of high word // The two inputs to UMULL had their high bits set, that is, were at least // 0x80000000. So the 64-bit product was at least 0x4000000000000000, i.e. // the high bit of the product could be at the top of the word or one bit // below. Check which, by experimentally shifting left, and then undoing it // via RRX if we turned out to have shifted off a 1 bit. - LSLS r3, r3, #1 // shift left, setting C to the bit shifted off - RRXCS r3, r3 // if that bit was 1, put it back again + lsls r3, r3, #1 // shift left, setting C to the bit shifted off + rrxcs r3, r3 // if that bit was 1, put it back again // That ensured the leading 1 bit of the product is now the top of r3, but // also, set C if the leading 1 was _already_ in the top bit. So now we know @@ -92,14 +92,14 @@ LOCAL_LABEL(mul): // are exactly what we need to combine with the rounded mantissa. But the // full output exponent (with extra bits) is still available in the high half // of r2, so that we can check _whether_ we overflowed or underflowed. - ADC r2, r2, r2, ASR #16 + adc r2, r2, r2, asr #16 // Recombine the exponent and mantissa, doing most of the rounding as a side // effect: we shift the mantissa right so as to put the round bit into C, and // then we recombine with the exponent using ADC, to increment the mantissa // if C was set. - MOVS r12, r3, LSR #8 - ADC r0, r12, r2, LSL #23 + movs r12, r3, lsr #8 + adc r0, r12, r2, lsl #23 // To complete the rounding, we must check for the round-to-even tiebreaking // case, by checking if we're in the exact halfway case, which occurs if and @@ -130,8 +130,8 @@ LOCAL_LABEL(mul): // catches a few very large cases that _don't_ quite overflow (exponents of // 0xFC and above that don't get maximally unlucky); those will also be // handled by the slow path. - TSTCS r3, #0x7F - CMPNE r2, #0xFC0000 + tstcs r3, #0x7F + cmpne r2, #0xFC0000 #else // In Thumb, switching between different conditions has a higher cost due to // the (implicit in this code) IT instructions, so we prefer a strategy that @@ -155,13 +155,13 @@ LOCAL_LABEL(mul): // // The check for over/underflow is exactly as in the Arm branch above, except // based on a different condition. - CMPCS r12, r3, LSL #25 // now C is set iff we're rounding to even - CMPCC r2, #0xFC0000 // and now it's also set if we've over/underflowed + cmpcs r12, r3, lsl #25 // now C is set iff we're rounding to even + cmpcc r2, #0xFC0000 // and now it's also set if we've over/underflowed #endif // That's all the checks for difficult cases done. If C is clear, we can // return. - BXCC lr + bxcc lr // Now the slower path begins. We have to recover enough information to // handle all of round-to-even, overflow and underflow. @@ -175,20 +175,20 @@ LOCAL_LABEL(mul): // leave any unambiguous indicator of RTE, so we must retest by checking all // the bits shifted off the bottom of the mantissa to see if they're exactly // the half-way value. 
- LSL r12, r3, #24 // r12 = round bit and everything below - CMP r12, #0x80000000 // set Z if that is exactly 0x80000000 + lsl r12, r3, #24 // r12 = round bit and everything below + cmp r12, #0x80000000 // set Z if that is exactly 0x80000000 #endif // Now Z is clear iff we have already rounded up and now must replace that // with rounding to even, which is done by just clearing the low bit of the // mantissa. - BICEQ r0, r0, #1 + biceq r0, r0, #1 // Redo the over/underflow check (the same way as in both branches above), // and if it doesn't report a danger, we can return the rounded-to-even // answer. - CMP r2, #0xFC0000 // check for over/underflow - BXCC lr // and return if none. + cmp r2, #0xFC0000 // check for over/underflow + bxcc lr // and return if none. // Now we only have overflow and underflow left to handle. First, find out // which we're looking at. This is easy by testing the top bit of r2, but @@ -197,21 +197,21 @@ LOCAL_LABEL(mul): // the CMP above won't have made any difference. So the N flag output from // that comparison _already_ tells us which condition we have: if N is set we // have underflow, and if N is clear, overflow. - BPL LOCAL_LABEL(overflow) + bpl LOCAL_LABEL(overflow) // Here we're handling underflow. // Add the IEEE 754:1985 exponent bias which funder will expect. This also // brings the exponent back into a range where it can't possibly have carried // into the sign bit, so the output sign will now be right. - ADD r0, r0, #(0xC0 << 23) + add r0, r0, #(0xC0 << 23) // Determine whether we rounded up, down or not at all. - LSLS r2, r3, #1 // input mantissa, without its leading 1 - SUBS r1, r2, r0, LSL #9 // subtract the output mantissa (likewise) + lsls r2, r3, #1 // input mantissa, without its leading 1 + subs r1, r2, r0, lsl #9 // subtract the output mantissa (likewise) // And let funder handle the rest. - B SYMBOL_NAME(__compiler_rt_funder) + b SYMBOL_NAME(__compiler_rt_funder) LOCAL_LABEL(overflow): // We come here to handle overflow, but it's not guaranteed that an overflow @@ -222,22 +222,22 @@ LOCAL_LABEL(overflow): // the high bit clear, are overflows; 0xFE down to 0xFC are not overflows. // // The value in r0 is correct to return, if there's no overflow. - ADD r12, r0, #(1 << 23) // add 1 to the exponent so 0xFF wraps to 0 - MOVS r12, r12, LSL #1 // test the top bit of the modified value - BXMI lr // if top bit is still 1, not an overflow + add r12, r0, #(1 << 23) // add 1 to the exponent so 0xFF wraps to 0 + movs r12, r12, lsl #1 // test the top bit of the modified value + bxmi lr // if top bit is still 1, not an overflow // This is an overflow, so we need to replace it with an appropriately signed // infinity. First we correct the sign by applying a downward bias to the // exponent (the one suggested in IEEE 754:1985, which was chosen to bring // all possible overflowed results back into range). - SUBS r0, r0, #(0xC0 << 23) + subs r0, r0, #(0xC0 << 23) // Now the sign bit of r0 is correct. Replace everything else with the // encoding of an infinity. - MOV r1, #0xFF - AND r0, r0, #0x80000000 - ORR r0, r0, r1, LSL #23 - BX lr + mov r1, #0xFF + and r0, r0, #0x80000000 + orr r0, r0, r1, lsl #23 + bx lr LOCAL_LABEL(uncommon): // Handle zeros, denorms, infinities and NaNs. We arrive here knowing that @@ -248,37 +248,37 @@ LOCAL_LABEL(uncommon): // So, first repeat some instructions from the prologue, which were either // conditionally skipped in the sequence leading to the branch, or skipped // because they happened after the branch. 
- AND r3, r12, r1, LSR #7 // get exponent of y in r3 bits 16..23 - TEQ r0, r1 // calculate the sign of the result - ORRMI r2, r2, #0x100 // and put it in bit 8 of r2 as before + and r3, r12, r1, lsr #7 // get exponent of y in r3 bits 16..23 + teq r0, r1 // calculate the sign of the result + orrmi r2, r2, #0x100 // and put it in bit 8 of r2 as before // Check for infinities and NaNs, by testing each of r2,r3 to see if it's at // least 0xFF0000 (hence the exponent field is equal to 0xFF). - CMP r2, r12 - CMPLO r3, r12 - BHS LOCAL_LABEL(inf_NaN) + cmp r2, r12 + cmplo r3, r12 + bhs LOCAL_LABEL(inf_NaN) // If we didn't take that branch, then we have only finite numbers, but at // least one is denormal or zero. A zero makes the result easy (and also is a // more likely input than a denormal), so check those first, as fast as // possible. - MOVS r12, r0, LSL #1 // Z set if x == 0 - MOVSNE r12, r1, LSL #1 // now Z set if either input is 0 - MOVEQ r0, r2, LSL #23 // in either case, make 0 of the output sign - BXEQ lr // and return it + movs r12, r0, lsl #1 // Z set if x == 0 + movsne r12, r1, lsl #1 // now Z set if either input is 0 + moveq r0, r2, lsl #23 // in either case, make 0 of the output sign + bxeq lr // and return it // Now we know we only have denormals to deal with. Call fnorm2 to sort // them out, and rejoin the main code path above. - AND r12, r2, #0x100 // save the result sign from r2 - LSR r2, #16 // shift extracted exponents down to bit 0 - LSR r3, #16 // where fnorm2 will expect them - PUSH {r0, r1, r2, r3, r12, lr} - MOV r0, sp // tell fnorm2 where to find its data - BL SYMBOL_NAME(__compiler_rt_fnorm2) - POP {r0, r1, r2, r3, r12, lr} - LSL r3, #16 // shift exponents back up to bit 16 - ORR r2, r12, r2, LSL #16 // and put the result sign back in r2 - B LOCAL_LABEL(mul) + and r12, r2, #0x100 // save the result sign from r2 + lsr r2, #16 // shift extracted exponents down to bit 0 + lsr r3, #16 // where fnorm2 will expect them + push {r0, r1, r2, r3, r12, lr} + mov r0, sp // tell fnorm2 where to find its data + bl SYMBOL_NAME(__compiler_rt_fnorm2) + pop {r0, r1, r2, r3, r12, lr} + lsl r3, #16 // shift exponents back up to bit 16 + orr r2, r12, r2, lsl #16 // and put the result sign back in r2 + b LOCAL_LABEL(mul) LOCAL_LABEL(inf_NaN): // We come here if at least one input is a NaN or infinity. If either or both @@ -286,23 +286,23 @@ LOCAL_LABEL(inf_NaN): // the input; otherwise any multiplication involving infinity returns // infinity, unless it's infinity * 0 which is an invalid operation and // returns NaN again. - MOV r12, #0xFF000000 - CMP r12, r0, LSL #1 // if (r0 << 1) > 0xFF000000, r0 is a NaN - BLO SYMBOL_NAME(__compiler_rt_fnan2) - CMP r12, r1, LSL #1 - BLO SYMBOL_NAME(__compiler_rt_fnan2) + mov r12, #0xFF000000 + cmp r12, r0, lsl #1 // if (r0 << 1) > 0xFF000000, r0 is a NaN + blo SYMBOL_NAME(__compiler_rt_fnan2) + cmp r12, r1, lsl #1 + blo SYMBOL_NAME(__compiler_rt_fnan2) // NaNs are dealt with, so now we have at least one infinity. Check if the // other operand is 0. This is conveniently done by XORing the two: because // we know that the low 31 bits of one operand are exactly 0x7F800000, we can // test if the low 31 bits of the other one are all 0 by checking whether the // low 31 bits of (x XOR y) equal 0x7F800000. 
- EOR r3, r0, r1 - CMP r12, r3, LSL #1 // if inf * 0, this sets Z - LSR r0, r12, #1 // set up return value of +infinity - ORRNE r0, r0, r2, LSL #23 // if not inf * 0, put on the output sign - ORREQ r0, r0, #0x400000 // otherwise, set the 'quiet NaN' bit - BX lr // and return + eor r3, r0, r1 + cmp r12, r3, lsl #1 // if inf * 0, this sets Z + lsr r0, r12, #1 // set up return value of +infinity + orrne r0, r0, r2, lsl #23 // if not inf * 0, put on the output sign + orreq r0, r0, #0x400000 // otherwise, set the 'quiet NaN' bit + bx lr // and return END_COMPILERRT_FUNCTION(__mulsf3) diff --git a/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S b/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S index dc6b1f59adbec..f2ede1013a9e6 100644 --- a/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S +++ b/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S @@ -22,33 +22,33 @@ DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_fmul, __mulsf3) DEFINE_COMPILERRT_THUMB_FUNCTION(__mulsf3) - PUSH {r4,r5,r6,lr} + push {r4,r5,r6,lr} // Get exponents of the inputs, and check for uncommon values. In the process // of this we also compute the sign, because it's marginally quicker that // way. - LSLS r2, r0, #1 - ADCS r4, r4, r4 // set r4[0] to sign bit of x - LSLS r3, r1, #1 - ADCS r4, r4, r3 // set r4[0] to the output sign - LSRS r2, r2, #24 - BEQ LOCAL_LABEL(zerodenorm0) // still do the next LSRS - LSRS r3, r3, #24 - BEQ LOCAL_LABEL(zerodenorm) - CMP r2, #255 - BEQ LOCAL_LABEL(naninf) - CMP r3, #255 - BEQ LOCAL_LABEL(naninf) + lsls r2, r0, #1 + adcs r4, r4, r4 // set r4[0] to sign bit of x + lsls r3, r1, #1 + adcs r4, r4, r3 // set r4[0] to the output sign + lsrs r2, r2, #24 + beq LOCAL_LABEL(zerodenorm0) // still do the next LSRS + lsrs r3, r3, #24 + beq LOCAL_LABEL(zerodenorm) + cmp r2, #255 + beq LOCAL_LABEL(naninf) + cmp r3, #255 + beq LOCAL_LABEL(naninf) // Compute the output exponent. We'll be generating our product _without_ the // leading bit, so we subtract 0x7f rather than 0x80. - ADDS r2, r2, r3 - SUBS r2, r2, #0x7f + adds r2, r2, r3 + subs r2, r2, #0x7f // Blank off everything above the mantissas. - LSLS r0, r0, #9 - LSLS r1, r1, #9 + lsls r0, r0, #9 + lsls r1, r1, #9 LOCAL_LABEL(normalised): // we may come back here from zerodenorm - LSRS r0, r0, #9 - LSRS r1, r1, #9 + lsrs r0, r0, #9 + lsrs r1, r1, #9 // Multiply. r0 and r1 are the mantissas of the inputs but without their // leading bits, so the product we want in principle is P=(r0+2^23)(r1+2^23). // P is at most (2^24-1)^2 < 2^48, so it fits in a word and a half. @@ -63,55 +63,55 @@ LOCAL_LABEL(normalised): // we may come back here from zerodenorm // without their leading bits). I'll also decompose them as X = xh + xl and // Y = yh + yl, where xl and yl are in the range 0..2^8-1 and xh,yh are // multiples of 2^8. 
- ADDS r5, r0, r1 - LSLS r5, r5, #7 // r5 = (X+Y) << 7 - MOVS r6, r0 - MULS r6, r1, r6 // r6 is congruent mod 2^32 to X*Y - LSRS r0, r0, #8 - LSRS r1, r1, #8 - MULS r0, r1, r0 - LSLS r1, r0, #16 // r1 is congruent mod 2^32 to xh*yh - SUBS r3, r6, r1 // now r3 is congruent mod 2^32 to + adds r5, r0, r1 + lsls r5, r5, #7 // r5 = (X+Y) << 7 + movs r6, r0 + muls r6, r1, r6 // r6 is congruent mod 2^32 to X*Y + lsrs r0, r0, #8 + lsrs r1, r1, #8 + muls r0, r1, r0 + lsls r1, r0, #16 // r1 is congruent mod 2^32 to xh*yh + subs r3, r6, r1 // now r3 is congruent mod 2^32 to // (X*Y) - (xh*yh) = xh*yl + xl*yh + xl*yl // and hence, since that is at most 0xfeff0001, // is _exactly_ equal to that - ADDS r0, r0, r5 // r0 is now (xh*yh + (X+Y)<<23) >> 16 - LSRS r1, r3, #16 // r1 is the top 16 bits of r3, i.e. + adds r0, r0, r5 // r0 is now (xh*yh + (X+Y)<<23) >> 16 + lsrs r1, r3, #16 // r1 is the top 16 bits of r3, i.e. // (xh*yl + xl*yh + xl*yl) >> 16 - ADDS r3, r0, r1 // now r3 equals + adds r3, r0, r1 // now r3 equals // (xh*yh + xh*yl + xl*yh + xl*yl + (X+Y)<<23) >> 16 // i.e. (X*Y + (X+Y)<<23) >> 16, // i.e. (the right answer) >> 16. // Meanwhile, r6 is exactly the bottom 32 bits of the // right answer. // Renormalise if necessary. - LSRS r1, r3, #30 - BEQ LOCAL_LABEL(norenorm) + lsrs r1, r3, #30 + beq LOCAL_LABEL(norenorm) // Here we have to do something fiddly. Renormalisation would be a trivial // job if we had the leading mantissa bit - just note that it's one bit // position above where it should be, and shift right by one. But without // that bit, we currently have (2x - 2^30), and we want (x - 2^30); just // shifting right would of course give us (x - 2^29), so we must subtract an // extra 2^29 to fix this up. - LSRS r3, r3, #1 - MOVS r1, #1 - LSLS r1, r1, #29 - SUBS r3, r3, r1 - ADDS r2, r2, #1 + lsrs r3, r3, #1 + movs r1, #1 + lsls r1, r1, #29 + subs r3, r3, r1 + adds r2, r2, #1 LOCAL_LABEL(norenorm): // Round and shift down to the right bit position. - LSRS r0, r3, #7 // round bit goes into the carry flag - BCC LOCAL_LABEL(rounded) - ADDS r0, r0, #1 + lsrs r0, r3, #7 // round bit goes into the carry flag + bcc LOCAL_LABEL(rounded) + adds r0, r0, #1 // In the round-up branch, we must also check if we have to round to even, by // testing all the bits below the round bit. We will normally not expect to, // so we do RTE by branching out of line and back again to avoid spending a // branch in the common case. - LSLS r5, r3, #32-7+1 // check the bits shifted out of r3 above - BNE LOCAL_LABEL(rounded) // if any is nonzero, we're not rounding to even - LSLS r5, r6, #15 // check the bottom 17 bits of the low-order 32 + lsls r5, r3, #32-7+1 // check the bits shifted out of r3 above + bne LOCAL_LABEL(rounded) // if any is nonzero, we're not rounding to even + lsls r5, r6, #15 // check the bottom 17 bits of the low-order 32 // (enough to overlap r3 even if we renormalised) - BEQ LOCAL_LABEL(rte) // if any is nonzero, fall through, else RTE + beq LOCAL_LABEL(rte) // if any is nonzero, fall through, else RTE LOCAL_LABEL(rounded): // Put on the sign and exponent, check for underflow and overflow, and // return. @@ -123,52 +123,52 @@ LOCAL_LABEL(rounded): // via an unsigned comparison against 0xFF, which leaves the one case of a // zero exponent that we have to filter separately by testing the Z flag // after we shift the exponent back up into place. - CMP r2, #0xFF // check for most over/underflows - BHS LOCAL_LABEL(outflow) // ... 
and branch out of line for them - LSLS r5, r2, #23 // shift the exponent into its output location - BEQ LOCAL_LABEL(outflow) // ... and branch again if it was 0 - LSLS r4, r4, #31 // shift the output sign into place - ORRS r0, r0, r4 // and OR it in to the output - ADDS r0, r0, r5 // OR in the mantissa - POP {r4,r5,r6,pc} // and return + cmp r2, #0xFF // check for most over/underflows + bhs LOCAL_LABEL(outflow) // ... and branch out of line for them + lsls r5, r2, #23 // shift the exponent into its output location + beq LOCAL_LABEL(outflow) // ... and branch again if it was 0 + lsls r4, r4, #31 // shift the output sign into place + orrs r0, r0, r4 // and OR it in to the output + adds r0, r0, r5 // OR in the mantissa + pop {r4,r5,r6,pc} // and return LOCAL_LABEL(rte): // Out-of-line handler for the round-to-even case. Clear the low mantissa bit // and go back to the post-rounding code. - MOVS r5, #1 - BICS r0, r0, r5 - B LOCAL_LABEL(rounded) + movs r5, #1 + bics r0, r0, r5 + b LOCAL_LABEL(rounded) LOCAL_LABEL(outflow): - CMP r2, #0 - BGT LOCAL_LABEL(overflow) + cmp r2, #0 + bgt LOCAL_LABEL(overflow) // To handle underflow, we construct an intermediate value in the IEEE 754 // style (using our existing full-length mantissa, and bias the exponent by // +0xC0), and indicate whether that intermediate was rounded up, down or not // at all. Then call the helper function funder, which will denormalise and // re-round correctly. - LSLS r1, r0, #7 // shift up the post-rounding mantissa - SUBS r1, r3, r1 // and subtract it from the pre-rounding version - LSLS r6, r6, #15 - CMP r6, #1 // if the rest of the low bits are nonzero - ADCS r1, r1, r1 // then set an extra bit at the bottom + lsls r1, r0, #7 // shift up the post-rounding mantissa + subs r1, r3, r1 // and subtract it from the pre-rounding version + lsls r6, r6, #15 + cmp r6, #1 // if the rest of the low bits are nonzero + adcs r1, r1, r1 // then set an extra bit at the bottom - LSLS r4, r4, #31 - ORRS r0, r0, r4 // put on the sign - ADDS r2, r2, #192 // bias the exponent - LSLS r3, r2, #23 - ADDS r0, r0, r3 // put on the biased exponent + lsls r4, r4, #31 + orrs r0, r0, r4 // put on the sign + adds r2, r2, #192 // bias the exponent + lsls r3, r2, #23 + adds r0, r0, r3 // put on the biased exponent - BL SYMBOL_NAME(__compiler_rt_funder) - POP {r4,r5,r6,pc} + bl SYMBOL_NAME(__compiler_rt_funder) + pop {r4,r5,r6,pc} LOCAL_LABEL(overflow): // Handle overflow by returning an infinity of the correct sign. - LSLS r4, r4, #8 // move the sign up to bit 8 - MOVS r0, #0xff - ORRS r0, r0, r4 // fill in an exponent just below it - LSLS r0, r0, #23 // and shift those 9 bits up to the top of the word - POP {r4,r5,r6,pc} + lsls r4, r4, #8 // move the sign up to bit 8 + movs r0, #0xff + orrs r0, r0, r4 // fill in an exponent just below it + lsls r0, r0, #23 // and shift those 9 bits up to the top of the word + pop {r4,r5,r6,pc} // We come here if there's at least one zero or denormal. On the fast path // above, it was convenient to check these before checking NaNs and @@ -179,72 +179,72 @@ LOCAL_LABEL(overflow): // exponents. So if we branched after shifting-and-checking r2, we come to // this earlier entry point 'zerodenorm0' so that we still shift r3. 
LOCAL_LABEL(zerodenorm0): - LSRS r3, r3, #24 + lsrs r3, r3, #24 LOCAL_LABEL(zerodenorm): - CMP r2, #255 - BEQ LOCAL_LABEL(naninf) - CMP r3, #255 - BEQ LOCAL_LABEL(naninf) + cmp r2, #255 + beq LOCAL_LABEL(naninf) + cmp r3, #255 + beq LOCAL_LABEL(naninf) // Now we know we have at least one zero or denormal, and no NaN or infinity. // Check if either input is actually zero. We've ruled out 0 * infinity by // this point, so any zero input means we return zero of the correct sign. - LSLS r6, r0, #1 // is one input zero? - BEQ LOCAL_LABEL(zero) // yes, go and return zero - LSLS r6, r1, #1 // is the other one zero? - BNE LOCAL_LABEL(denorm) // if not, one must have been a denormal + lsls r6, r0, #1 // is one input zero? + beq LOCAL_LABEL(zero) // yes, go and return zero + lsls r6, r1, #1 // is the other one zero? + bne LOCAL_LABEL(denorm) // if not, one must have been a denormal LOCAL_LABEL(zero): - LSLS r0, r4, #31 // shift up the output sign to make the return value - POP {r4,r5,r6,pc} + lsls r0, r4, #31 // shift up the output sign to make the return value + pop {r4,r5,r6,pc} // Handle denormals via the helper function fnorm2, which will break both // inputs up into mantissa and exponent, renormalising and generating a // negative exponent if necessary. LOCAL_LABEL(denorm): - PUSH {r0,r1,r2,r3} - MOV r0, sp - BL SYMBOL_NAME(__compiler_rt_fnorm2) - POP {r0,r1,r2,r3} + push {r0,r1,r2,r3} + mov r0, sp + bl SYMBOL_NAME(__compiler_rt_fnorm2) + pop {r0,r1,r2,r3} // Convert fnorm2's return values into the right form to rejoin the main // code path. - LSLS r0, r0, #1 - LSLS r1, r1, #1 - ADDS r2, r2, r3 - SUBS r2, r2, #0x7f - B LOCAL_LABEL(normalised) + lsls r0, r0, #1 + lsls r1, r1, #1 + adds r2, r2, r3 + subs r2, r2, #0x7f + b LOCAL_LABEL(normalised) // We come here if at least one input is a NaN or infinity. There may still // be zeroes (or denormals, though they make no difference at this stage). LOCAL_LABEL(naninf): - MOVS r6, #0xff - LSLS r6, r6, #24 - LSLS r5, r0, #1 - CMP r5, r6 - BHI LOCAL_LABEL(nan) // first operand is a NaN - LSLS r5, r1, #1 - CMP r5, r6 - BHI LOCAL_LABEL(nan) // second operand is a NaN + movs r6, #0xff + lsls r6, r6, #24 + lsls r5, r0, #1 + cmp r5, r6 + bhi LOCAL_LABEL(nan) // first operand is a NaN + lsls r5, r1, #1 + cmp r5, r6 + bhi LOCAL_LABEL(nan) // second operand is a NaN // We know we have at least one infinity, and no NaNs. We might also have a // zero, in which case we return the default quiet NaN. - LSLS r6, r0, #1 - BEQ LOCAL_LABEL(infzero) // if r0 is a zero, r1 must be inf - LSLS r6, r1, #1 - BEQ LOCAL_LABEL(infzero) // if r1 is a zero, r0 must be inf + lsls r6, r0, #1 + beq LOCAL_LABEL(infzero) // if r0 is a zero, r1 must be inf + lsls r6, r1, #1 + beq LOCAL_LABEL(infzero) // if r1 is a zero, r0 must be inf // Otherwise we have infinity * infinity, or infinity * finite. Just return // an appropriately signed infinity. - B LOCAL_LABEL(overflow) // reuse the code there + b LOCAL_LABEL(overflow) // reuse the code there // We come here if at least one input is a NaN. Hand off to fnan2, which // propagates an appropriate NaN to the output, dealing with the special // cases of signalling/quiet NaNs. LOCAL_LABEL(nan): - BL SYMBOL_NAME(__compiler_rt_fnan2) - POP {r4,r5,r6,pc} + bl SYMBOL_NAME(__compiler_rt_fnan2) + pop {r4,r5,r6,pc} // Return a quiet NaN as the result of infinity * zero. 
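
The dispatch above leans on a single idiom throughout: shift the sign bit off the top of the operand and compare what remains against 0xff000000. A self-contained C sketch of that classification (the helper and enum names are invented here, not taken from the patch):

#include <assert.h>
#include <stdint.h>

enum fclass { FC_ZERO, FC_DENORMAL, FC_NORMAL, FC_INFINITY, FC_NAN };

// Classify an IEEE 754 single by its bit pattern, mirroring the shifted
// comparisons used in the assembly above.
static enum fclass classify(uint32_t rep) {
  uint32_t t = rep << 1;       // drop the sign bit
  if (t == 0)
    return FC_ZERO;
  if (t < 0x01000000u)
    return FC_DENORMAL;        // exponent 0, mantissa nonzero
  if (t == 0xff000000u)
    return FC_INFINITY;        // exponent FF, mantissa zero
  if (t > 0xff000000u)
    return FC_NAN;             // exponent FF, mantissa nonzero
  return FC_NORMAL;
}

int main(void) {
  assert(classify(0x80000000u) == FC_ZERO);     // -0
  assert(classify(0x00000001u) == FC_DENORMAL);
  assert(classify(0xff800000u) == FC_INFINITY); // -infinity
  assert(classify(0x7fad4be3u) == FC_NAN);      // a NaN used in the tests
  assert(classify(0x3f800000u) == FC_NORMAL);   // 1.0
  return 0;
}
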
LOCAL_LABEL(infzero): - LDR r0, =0x7fc00000 - POP {r4,r5,r6,pc} + ldr r0, =0x7fc00000 + pop {r4,r5,r6,pc} END_COMPILERRT_FUNCTION(__mulsf3) From 40e362146d8ee8cf248fe57175fda56dc4079cb6 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Thu, 2 Oct 2025 13:49:50 +0100 Subject: [PATCH 7/8] Mention size/speed tradeoff in the cmake option help --- compiler-rt/lib/builtins/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 5c577c19f25a3..a54abc64d381f 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -423,7 +423,7 @@ set(arm_or_thumb2_base_SOURCES ) option(COMPILER_RT_ARM_OPTIMIZED_FP - "On 32-bit Arm, use optimized assembly implementations of FP arithmetic" ON) + "On 32-bit Arm, use optimized assembly implementations of FP arithmetic. Likely to increase code size, but be faster." ON) if(COMPILER_RT_ARM_OPTIMIZED_FP) set(arm_or_thumb2_base_SOURCES From 66a3bcb6f234cc107c21331add001a77cde1407f Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Tue, 7 Oct 2025 15:53:12 +0100 Subject: [PATCH 8/8] Update build and test setup Now we should only test the extra NaN faithfulness in cases where it's provided by the library. Also tweaked the cmake setup to make it easier to add more assembly files later. Plus a missing piece of comment in fnan2.c. --- compiler-rt/lib/builtins/CMakeLists.txt | 10 ++-- compiler-rt/lib/builtins/arm/fnan2.c | 5 ++ compiler-rt/test/builtins/CMakeLists.txt | 4 ++ compiler-rt/test/builtins/Unit/divsf3_test.c | 48 +++++++++++++++++--- compiler-rt/test/builtins/Unit/mulsf3_test.c | 48 ++++++++++++++++---- 5 files changed, 97 insertions(+), 18 deletions(-) diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index a54abc64d381f..b622d226a6569 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -426,17 +426,19 @@ option(COMPILER_RT_ARM_OPTIMIZED_FP "On 32-bit Arm, use optimized assembly implementations of FP arithmetic. Likely to increase code size, but be faster." ON) if(COMPILER_RT_ARM_OPTIMIZED_FP) - set(arm_or_thumb2_base_SOURCES + set(assembly_files arm/mulsf3.S - arm/divsf3.S + arm/divsf3.S) + set_source_files_properties(${assembly_files} + PROPERTIES COMPILE_OPTIONS "-Wa,-mimplicit-it=always") + set(arm_or_thumb2_base_SOURCES + ${assembly_files} arm/fnan2.c arm/fnorm2.c arm/funder.c ${arm_or_thumb2_base_SOURCES} ) endif() -set_source_files_properties(arm/mulsf3.S arm/divsf3.S - PROPERTIES COMPILE_OPTIONS "-Wa,-mimplicit-it=always") set(arm_sync_SOURCES arm/sync_fetch_and_add_4.S diff --git a/compiler-rt/lib/builtins/arm/fnan2.c b/compiler-rt/lib/builtins/arm/fnan2.c index c2fbfa3974d6e..06bbd4339f171 100644 --- a/compiler-rt/lib/builtins/arm/fnan2.c +++ b/compiler-rt/lib/builtins/arm/fnan2.c @@ -13,6 +13,11 @@ // On input, a and b are floating-point numbers in IEEE 754 encoding, and at // least one of them must be a NaN. The return value is the correct output NaN. // +// A signalling NaN in the input (with bit 22 clear) takes priority over any +// quiet NaN, and is adjusted on return by setting bit 22 to make it quiet. If +// both inputs are the same type of NaN then the first input takes priority: +// the input a is used instead of b. 
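
A behavioural C sketch of that rule, handy for checking the expected values in the test vectors added later in this patch; is_nan, is_snan and propagate_nan are invented names, and this is not the shipped fnan2.c:

#include <assert.h>
#include <stdint.h>

static int is_nan(uint32_t rep) { return (rep << 1) > 0xff000000u; }
static int is_snan(uint32_t rep) { return is_nan(rep) && !(rep & 0x00400000u); }

// Assumes, as fnan2 does, that at least one of a and b is a NaN.
static uint32_t propagate_nan(uint32_t a, uint32_t b) {
  if (is_snan(a))
    return a | 0x00400000u;  // first signalling NaN wins, returned quietened
  if (is_snan(b))
    return b | 0x00400000u;
  return is_nan(a) ? a : b;  // otherwise prefer a if it is (also) a NaN
}

int main(void) {
  // Values taken from the divsf3 test vectors added below.
  assert(propagate_nan(0x00000000u, 0x7fad4be3u) == 0x7fed4be3u);
  assert(propagate_nan(0xff800000u, 0x7f857fdcu) == 0x7fc57fdcu);
  assert(propagate_nan(0xff800000u, 0x7fde0397u) == 0x7fde0397u);
  return 0;
}
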
+//
 //===----------------------------------------------------------------------===//
 
 #include
 
diff --git a/compiler-rt/test/builtins/CMakeLists.txt b/compiler-rt/test/builtins/CMakeLists.txt
index 63f4c94605c90..8e3cb35183ba7 100644
--- a/compiler-rt/test/builtins/CMakeLists.txt
+++ b/compiler-rt/test/builtins/CMakeLists.txt
@@ -35,6 +35,10 @@ if(APPLE)
   darwin_filter_host_archs(BUILTIN_SUPPORTED_ARCH BUILTIN_TEST_ARCH)
 endif()
 
+if(COMPILER_RT_ARM_OPTIMIZED_FP)
+  list(APPEND BUILTINS_TEST_TARGET_CFLAGS -DCOMPILER_RT_ARM_OPTIMIZED_FP)
+endif()
+
 foreach(arch ${BUILTIN_TEST_ARCH})
   set(BUILTINS_TEST_TARGET_ARCH ${arch})
   string(TOLOWER "-${arch}-${OS_NAME}" BUILTINS_TEST_CONFIG_SUFFIX)
diff --git a/compiler-rt/test/builtins/Unit/divsf3_test.c b/compiler-rt/test/builtins/Unit/divsf3_test.c
index b5ebb2d9b5093..12c5df5fdaae1 100644
--- a/compiler-rt/test/builtins/Unit/divsf3_test.c
+++ b/compiler-rt/test/builtins/Unit/divsf3_test.c
@@ -11,13 +11,27 @@
 
 #include "fp_test.h"
 
+// By default this test uses compareResultF to check the returned floats, which
+// accepts any returned NaN if the expected result is the canonical NaN value
+// 0x7fc00000. For the Arm optimized FP implementation, which commits to a more
+// detailed handling of NaNs, we tighten up the check and include some extra
+// test cases specific to that NaN policy.
+#if (__arm__ && !(__thumb__ && !__thumb2__)) && COMPILER_RT_ARM_OPTIMIZED_FP
+#  define EXPECT_EXACT_RESULTS
+#  define ARM_NAN_HANDLING
+#endif
+
 // Returns: a / b
 COMPILER_RT_ABI float __divsf3(float a, float b);
 
 int test__divsf3(uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) {
   float a = fromRep32(a_rep), b = fromRep32(b_rep);
   float x = __divsf3(a, b);
+#ifdef EXPECT_EXACT_RESULTS
+  int ret = toRep32(x) == expected_rep;
+#else
   int ret = compareResultF(x, expected_rep);
+#endif
 
   if (ret) {
     printf("error in test__divsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32
@@ -309,10 +323,29 @@ int main(void) {
   status |= test__divsf3(0x2cbed883, 0x333f6113, 0x38ff4953);
   status |= test__divsf3(0x3f87ffff, 0x7f001000, 0x0043f781);
 
-#if __thumb__ && !__thumb2__
-  // These tests depend on Arm-specific IEEE 754 implementation choices
-  // regarding NaNs, which are satisfied by arm/mulsf3.S but not guaranteed by
-  // other implementations:
+  // Test that the result of an operation is a NaN at all when it should be.
+  //
+  // In most configurations these tests' results are checked using
+  // compareResultF, so we set all the answers to the canonical NaN 0x7fc00000,
+  // which causes compareResultF to accept any NaN encoding. We also use the
+  // same value as the input NaN in tests that have one, so that even in
+  // EXPECT_EXACT_RESULTS mode these tests should pass, because 0x7fc00000 is
+  // still the exact expected NaN.
+ status |= test__divsf3(0x00000000, 0x00000000, 0x7fc00000); + status |= test__divsf3(0x00000000, 0x80000000, 0x7fc00000); + status |= test__divsf3(0x7f800000, 0x7f800000, 0x7fc00000); + status |= test__divsf3(0x7f800000, 0xff800000, 0x7fc00000); + status |= test__divsf3(0x80000000, 0x00000000, 0x7fc00000); + status |= test__divsf3(0x80000000, 0x80000000, 0x7fc00000); + status |= test__divsf3(0xff800000, 0x7f800000, 0x7fc00000); + status |= test__divsf3(0xff800000, 0xff800000, 0x7fc00000); + status |= test__divsf3(0x3f800000, 0x7fc00000, 0x7fc00000); + status |= test__divsf3(0x7fc00000, 0x3f800000, 0x7fc00000); + status |= test__divsf3(0x7fc00000, 0x7fc00000, 0x7fc00000); + +#ifdef ARM_NAN_HANDLING + // Tests specific to the NaN handling of Arm hardware, mimicked by + // arm/divsf3.S: // // - a quiet NaN is distinguished by the top mantissa bit being 1 // @@ -324,7 +357,10 @@ int main(void) { // from the first operand // // - if both operands are quiet NaNs then the output NaN is the first - // operand. + // operand + // + // - invalid operations not involving an input NaN return the quiet + // NaN with fewest bits set, 0x7fc00000. status |= test__divsf3(0x00000000, 0x00000000, 0x7fc00000); status |= test__divsf3(0x00000000, 0x7fad4be3, 0x7fed4be3); @@ -386,7 +422,7 @@ int main(void) { status |= test__divsf3(0xff800000, 0x7f857fdc, 0x7fc57fdc); status |= test__divsf3(0xff800000, 0x7fde0397, 0x7fde0397); status |= test__divsf3(0xff800000, 0xff800000, 0x7fc00000); -#endif // __arm__ +#endif // ARM_NAN_HANDLING return status; } diff --git a/compiler-rt/test/builtins/Unit/mulsf3_test.c b/compiler-rt/test/builtins/Unit/mulsf3_test.c index d18674e974149..7dc7c8ad39c32 100644 --- a/compiler-rt/test/builtins/Unit/mulsf3_test.c +++ b/compiler-rt/test/builtins/Unit/mulsf3_test.c @@ -11,13 +11,27 @@ #include "fp_test.h" -// Returns: a + b +// By default this test uses compareResultF to check the returned floats, which +// accepts any returned NaN if the expected result is the canonical NaN value +// 0x7fc00000. For the Arm optimized FP implementation, which commits to a more +// detailed handling of NaNs, we tighten up the check and include some extra +// test cases specific to that NaN policy. +#if (__arm__ && !(__thumb__ && !__thumb2__)) && COMPILER_RT_ARM_OPTIMIZED_FP +# define EXPECT_EXACT_RESULTS +# define ARM_NAN_HANDLING +#endif + +// Returns: a * b COMPILER_RT_ABI float __mulsf3(float a, float b); int test__mulsf3(uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) { float a = fromRep32(a_rep), b = fromRep32(b_rep); float x = __mulsf3(a, b); +#ifdef EXPECT_EXACT_RESULTS + int ret = toRep32(x) == expected_rep; +#else int ret = compareResultF(x, expected_rep); +#endif if (ret) { printf("error in test__mulsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32 @@ -27,7 +41,7 @@ int test__mulsf3(uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) { return ret; } -int main() { +int main(void) { int status = 0; status |= test__mulsf3(0x00000000, 0x00000000, 0x00000000); @@ -505,10 +519,25 @@ int main() { status |= test__mulsf3(0x19ffc008, 0x1a002004, 0x00000001); status |= test__mulsf3(0x7f7ffff0, 0xc0000008, 0xff800000); -#if __thumb__ && !__thumb2__ - // These tests depend on Arm-specific IEEE 754 implementation choices - // regarding NaNs, which are satisfied by arm/mulsf3.S but not guaranteed by - // other implementations: + // Test that the result of an operation is a NaN at all when it should be. 
+  //
+  // In most configurations these tests' results are checked using
+  // compareResultF, so we set all the answers to the canonical NaN 0x7fc00000,
+  // which causes compareResultF to accept any NaN encoding. We also use the
+  // same value as the input NaN in tests that have one, so that even in
+  // EXPECT_EXACT_RESULTS mode these tests should pass, because 0x7fc00000 is
+  // still the exact expected NaN.
+  status |= test__mulsf3(0x7f800000, 0x00000000, 0x7fc00000);
+  status |= test__mulsf3(0x7f800000, 0x80000000, 0x7fc00000);
+  status |= test__mulsf3(0x80000000, 0x7f800000, 0x7fc00000);
+  status |= test__mulsf3(0x80000000, 0xff800000, 0x7fc00000);
+  status |= test__mulsf3(0x3f800000, 0x7fc00000, 0x7fc00000);
+  status |= test__mulsf3(0x7fc00000, 0x3f800000, 0x7fc00000);
+  status |= test__mulsf3(0x7fc00000, 0x7fc00000, 0x7fc00000);
+
+#ifdef ARM_NAN_HANDLING
+  // Tests specific to the NaN handling of Arm hardware, mimicked by
+  // arm/mulsf3.S:
   //
   // - a quiet NaN is distinguished by the top mantissa bit being 1
   //
@@ -520,7 +549,10 @@ int main() {
   //   from the first operand
   //
   // - if both operands are quiet NaNs then the output NaN is the first
-  //   operand.
+  //   operand
+  //
+  // - invalid operations not involving an input NaN return the quiet
+  //   NaN with fewest bits set, 0x7fc00000.
 
   status |= test__mulsf3(0x00000000, 0x7fad4be3, 0x7fed4be3);
   status |= test__mulsf3(0x00000000, 0x7fdf48c7, 0x7fdf48c7);
@@ -578,7 +610,7 @@ int main() {
   status |= test__mulsf3(0xff7fffff, 0x7feb00d7, 0x7feb00d7);
   status |= test__mulsf3(0xff800000, 0x7f857fdc, 0x7fc57fdc);
   status |= test__mulsf3(0xff800000, 0x7fde0397, 0x7fde0397);
-#endif // __arm__
+#endif // ARM_NAN_HANDLING
 
   return status;
 }
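
Finally, the rounding step in the Thumb1 __mulsf3 fast path above (round bit at bit 6 of the high word, sticky information from the remaining low bits of the high word plus the bottom 17 bits of the low word) corresponds roughly to the following C model. The helper name and argument layout are invented here, and the mantissa is assumed to have been renormalised already:

#include <assert.h>
#include <stdint.h>

// hi holds the product shifted down by 16, so the candidate mantissa sits at
// bit 7 and above; lo holds the low-order word of the product.
static uint32_t round_nearest_even(uint32_t hi, uint32_t lo) {
  uint32_t mant = hi >> 7;                          // candidate mantissa
  uint32_t round = (hi >> 6) & 1;                   // bit just below it
  uint32_t sticky = (hi & 0x3f) | (lo & 0x1ffff);   // everything further down
  if (round) {
    mant += 1;                                      // round up...
    if (sticky == 0)
      mant &= ~1u;                                  // ...but ties go to even
  }
  return mant;
}

int main(void) {
  // Exactly halfway with an even mantissa: stays where it is.
  assert(round_nearest_even(0x20000140u, 0x00000000u) == 0x00400002u);
  // Just over halfway (a sticky bit set in the low word): rounds up.
  assert(round_nearest_even(0x20000140u, 0x00000001u) == 0x00400003u);
  return 0;
}
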