diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 9095b056ae782..b622d226a6569 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -422,6 +422,24 @@ set(arm_or_thumb2_base_SOURCES ${GENERIC_SOURCES} ) +option(COMPILER_RT_ARM_OPTIMIZED_FP + "On 32-bit Arm, use optimized assembly implementations of FP arithmetic. Likely to increase code size, but be faster." ON) + +if(COMPILER_RT_ARM_OPTIMIZED_FP) + set(assembly_files + arm/mulsf3.S + arm/divsf3.S) + set_source_files_properties(${assembly_files} + PROPERTIES COMPILE_OPTIONS "-Wa,-mimplicit-it=always") + set(arm_or_thumb2_base_SOURCES + ${assembly_files} + arm/fnan2.c + arm/fnorm2.c + arm/funder.c + ${arm_or_thumb2_base_SOURCES} + ) +endif() + set(arm_sync_SOURCES arm/sync_fetch_and_add_4.S arm/sync_fetch_and_add_8.S @@ -455,6 +473,16 @@ set(thumb1_base_SOURCES ${GENERIC_SOURCES} ) +if(COMPILER_RT_ARM_OPTIMIZED_FP) + set(thumb1_base_SOURCES + arm/thumb1/mulsf3.S + arm/fnan2.c + arm/fnorm2.c + arm/funder.c + ${thumb1_base_SOURCES} + ) +endif() + set(arm_EABI_RT_SOURCES arm/aeabi_cdcmp.S arm/aeabi_cdcmpeq_check_nan.c diff --git a/compiler-rt/lib/builtins/arm/divsf3.S b/compiler-rt/lib/builtins/arm/divsf3.S new file mode 100644 index 0000000000000..2f37234457b7b --- /dev/null +++ b/compiler-rt/lib/builtins/arm/divsf3.S @@ -0,0 +1,608 @@ +//===-- divsf3.S - single-precision floating point division ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements single-precision soft-float division with the IEEE-754 +// default rounding (to nearest, ties to even), in optimized AArch32 assembly +// language suitable to be built as either Arm or Thumb2. +// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" + + + .syntax unified + .text + .p2align 2 + +DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_fdiv, __divsf3) + +DEFINE_COMPILERRT_FUNCTION(__divsf3) + // Extract the exponents of the inputs into r2 and r3, occupying bits 16-23 + // of each register so that there will be space lower down to store extra + // data without exponent arithmetic carrying into it. In the process, check + // both exponents for 00 or FF and branch out of line to handle all the + // uncommon types of value (infinity, NaN, zero, denormals). + // + // Chaining conditional instructions like this means that the second + // instruction (setting up r3) might not be executed at all, so fdiv_uncommon + // will have to redo it just in case. That saves an instruction here, + // executed for _all_ inputs, and moves it to the uncommon path run for only + // some inputs. + mov r12, #0xFF0000 + ands r2, r12, r0, lsr #7 // r2 has exponent of numerator. (Is it 0?) + andsne r3, r12, r1, lsr #7 // r3 has exponent of denominator. (Is it 0?) + teqne r2, r12 // if neither was 0, is one FF? + teqne r3, r12 // or the other? + beq LOCAL_LABEL(uncommon) // branch out of line if any answer was yes + + // Calculate the output sign, which is always just the XOR of the input + // signs. Store it in bit 8 of r2, below the numerator exponent. + teq r0, r1 // is the output sign bit 1? + orrmi r2, r2, #0x100 // if so, set bit 8 of r2 + + // Isolate the mantissas of both values, by setting bit 23 of each one and + // clearing the 8 bits above that. + // + // In the process, swap the register allocations (which doesn't cost extra + // instructions if we do it as part of this manipulation). We want the + // numerator not to be in r0, because r0 is where we'll build up the quotient + // while subtracting things from the numerator. + orr r12, r0, #1 << 23 + orr r0, r1, #1 << 23 + bic r1, r12, #0xFF000000 + bic r0, r0, #0xFF000000 + +LOCAL_LABEL(div): + // Start of the main division. We get here knowing that: + // + // r0 = mantissa of denominator, with the leading 1 at bit 23 + // r1 = mantissa of numerator, similarly + // r2 = (exponent of numerator << 16) + (result sign << 8) + // r3 = (exponent of denominator << 16) + + push {r14} // we'll need an extra register + + // Calculate the initial result exponent by just subtracting the two input + // exponents. This doesn't affect the sign bit lower down in r2. + sub r2, r2, r3 + + // That initial exponent might need to be adjusted by 1, depending on whether + // dividing the mantissas gives a value >=1 or <1. We don't need to wait + // until the division is finished to work that out: we can tell immediately + // by just comparing the mantissas. + // + // The basic idea is to do the comparison in a way that sets the C flag if + // numerator >= denominator. Then we recombine the sign and exponent by doing + // "ADC r2, r2, r2, asr #16": the exponent in the top half of r2 is shifted + // down to the low 8 bits, just below the sign bit, and using ADC rather than + // ADD folds in the conditional increment from the mantissa comparison. + // + // If we're not incrementing the output exponent, we instead shift the + // numerator mantissa left by 1, so that it _is_ greater than the denominator + // mantissa. Otherwise we'd generate only a 22-bit quotient, instead of 23. + // + // The exponent also needs to be rebiased, so that dividing two numbers the + // same gives an output exponent of 0x7F. If the two inputs have the same + // exponent then we'll have computed an exponent of 0 via the SUB instruction + // above; if the mantissas are the same as well then the ADC will increment + // it; also, the leading bit of the quotient will increment the exponent + // again when we recombine it with the output mantissa later. So we need to + // add (0x7F - 2) to the mantissa now, to make an exponent of 0 from the SUB + // come to 0x7F after both of those increments. + // + // Putting all of that together, what we _want_ to do is this: + // + // [#1] CMP r1, r0 // set C if num >= den + // [#2] MOVLO r1, r1, lsl #1 // if num < den, shift num left + // [#3] ADD r2, r2, #0x7D0000 // rebias exponent + // [#4] ADC r2, r2, r2, asr #16 // combine sign + exp + adjustment + // + // However, we only do the first of those four instructions right here. The + // other three are distributed through the code below, after unrelated load + // or multiply instructions which will have a result delay slot on simple + // CPUs. Each is labelled "exponent setup [#n]" in a comment. + // + // (Since instruction #4 depends on the flags set up by #2, we must avoid + // clobbering the flags in _any_ of the instructions interleaved with this!) + cmp r1, r0 // exponent setup [#1] + + // Start the mantissa division by making an approximation to the reciprocal + // of the denominator. We first obtain an 8-bit approximation using a table + // lookup indexed by the top 7 denominator bits (counting the leading 1, so + // really there are only 6 bits in the table index). + // + // (r0 >> 17) is the table index, and its top bit is always set, so it ranges + // from 64 to 127 inclusive. So we point the base register 64 bytes before + // the actual table. + adr r12, LOCAL_LABEL(tab) - 64 +#if __thumb__ + // Thumb can't do this particular shift+add+load in one instruction - it only + // supports left shifts of 0 to 3 bits, not right shifts of 17. So we must + // calculate the load offset separately. + add r14, r12, r0, lsr #17 + ldrb r14, [r14] +#else + ldrb r14, [r12, r0, lsr #17] +#endif + + // Now do an iteration of Newton-Raphson to improve that 8-bit approximation + // to have 15-16 accurate bits. + // + // Basics of Newton-Raphson for finding a reciprocal: if you want to find 1/d + // and you have some approximation x, your next approximation is X = x(2-dx). + // Looked at one way, this is the result of applying the N-R formula + // X=x-f(x)/f'(x) to the function f(x) = 1/x - d. Another way to look at it + // is to suppose that dx = 1 - e, for some e which is small (because dx is + // already reasonably close to 1). Then you want to double the number of + // correct bits in the next approximation, i.e. square the error. So you want + // dX = 1-e^2 = (1-e)(1+e) = dx(2-dx). Cancelling d gives X = x(2-dx) again. + // + // In this situation, we're working in fixed-point integers rather than real + // numbers, and all the scales are different: + // * our input denominator d is in the range [2^23,2^24) + // * our input approximation x is in the range [2^7,2^8) + // * we want the output approximation to be in the range [2^15,2^16) + // Those factors combine to mean that we want + // x(2^32-dx) / 2^23 + // = (2^9 x) - (dx^2 / 2^23) + // + // But we also want to compute this using ordinary MUL, not a long multiply + // instruction (those are slower). So we need to worry about the product + // overflowing. dx fits in 32 bits, because it's the product of something + // <2^24 with something <2^8; but we must shift it right before multiplying + // by x again. + + mul r12, r0, r14 // r12 = dx + movlo r1, r1, lsl #1 // exponent setup [#2] in the MUL delay slot + mvn r12, r12, lsr #8 // r12 ~= -dx/2^8 + mul r3, r12, r14 // r3 ~= -dx^2/2^8 + mov r14, r14, lsl #9 // r14 = 2^9 x + add r14, r14, r3, asr #15 // r14 ~= 2^9 x - dx^2 / 2^23 + + // Now r14 is a 16-bit approximation to the reciprocal of the input mantissa, + // scaled by 2^39 (so that the min mantissa 2^23 would have reciprocal 2^16 + // in principle, and the max mantissa 2^24-1 would have reciprocal just over + // 2^15). The error is always negative (r14 is an underestimate of the true + // value), and the maximum error is 6 and a bit ULP (that is, the true + // reciprocal is strictly less than (r14+7)). Also, r14 is always strictly + // less than 0x10000 (even in the case of the min mantissa, where the true + // value would be _exactly_ 0x10000), which eliminates a case of integer + // overflow. + // + // All of these properties of the reciprocal approximation are checked by + // exhaustively iterating over all 2^23 possible input mantissas. (The nice + // thing about doing this in single rather than double precision!) + // + // Now we extract most of the quotient by two steps of long division, using + // the reciprocal estimate to identify a multiple of the denominator to + // subtract from the numerator. To avoid integer overflow, the numerator + // mantissa is shifted down 8 bits so that it's less than 0x10000. After we + // calculate an approximate quotient, we shift the numerator left and + // subtract that multiple of the denominator, moving the next portion of the + // numerator into range for the next iteration. + + // First iteration of long division. We shift the numerator left 11 bits, and + // since the quotient approximation is scaled by 2^31, we must shift that + // right by 20 to make the right product to subtract from the numerator. + mov r12, r1, lsr #8 // shift the numerator down + mul r12, r14, r12 // make the quotient approximation + mov r1, r1, lsl #11 // shift numerator left, ready for subtraction + mov r3, r12, lsr #20 // make first 12-bit block of quotient bits + mls r1, r0, r3, r1 // subtract that multiple of den from num + + add r2, r2, #0x7D0000 // exponent setup [#3] in the MLS delay slot + + // Second iteration of long division. Differences from the first step: this + // time we shift the numerator 12 bits instead of 11, so that the total of + // both steps is 23 bits, i.e. we've shifted up by exactly the full width of + // the output mantissa. Also, the block of output quotient bits is left in a + // different register: it was in r3 the first time, and this time it's in + // r12, so that we still have both available at the end of the process. + mov r12, r1, lsr #8 // shift the numerator down + mul r12, r14, r12 // make the quotient approximation + mov r1, r1, lsl #12 // shift numerator left, ready for subtraction + mov r12, r12, lsr #19 // make second 11-bit block of quotient + mls r1, r0, r12, r1 // subtract that multiple of den from num + + adc r2, r2, r2, asr #16 // exponent setup [#4] in the MLS delay slot + + // Now r1 contains the original numerator, shifted left 23, minus _some_ + // multiple of the original denominator (which is still in r0). The bounds on + // the error in the above steps should make the error at most 1: that is, we + // may have to subtract the denominator one more time to make r1 < r0, and + // increment the quotient by one more. + // + // Our quotient is still in two pieces, computed separately in the above long + // division steps. We fold the final increment into the same instruction that + // recombines them, by doing the comparison in such a way that it sets the + // carry flag if the increment is needed. + + cmp r1, r0 // Set carry flag if num >= den + subhs r1, r1, r0 // If so, subtract den from num + adc r3, r12, r3, lsl #12 // Recombine quotient halves, plus optional +1 + + // We've finished with r14 as a temporary register, so we can unstack it now. + pop {r14} + + // Now r3 contains the _rounded-down_ output quotient, and r1 contains the + // remainder. That is, (denominator * r3 + r1) = (numerator << 23), and + // 0 <= r1 < denominator. + // + // Next we must round to nearest, by checking if r1 is greater than half the + // denominator. In division, it's not possible to hit an exact round-to-even + // halfway case, so we don't need to spend any time checking for it. + // + // Proof of no round-to-even: define the 'width' of a dyadic rational to be + // the distance between the lowest and highest 1 bits in its binary + // representation, or equivalently, the index of its high bit if you scale it + // by a power of 2 to make it an odd integer. E.g. any actual power of 2 has + // width 0, and all of 0b11110, 0b1111, 0b11.11 and 0b0.01111 have width 3. + // Then for any dyadic rationals a,b, width(ab) >= width(a)+width(b). Let w + // be the maximum width that the input precision supports (so that for single + // precision, w=23). Then if some division n/d were a round-to-even case, the + // true quotient q=n/d would have width exactly w+1. But we have qd=n, so + // width(n) >= width(q)+width(d) > w, which can't happen, because n is in the + // input precision, hence had width <= w.) + // + // So we don't need to check for an exact _halfway_ case and clear the low + // bit of the quotient after rounding up, as addition and multiplication both + // need to do. But we do need to remember if the quotient itself was exact, + // that is, if there was no remainder at all. That's needed in underflow + // handling. + + // The rounding check wants to compare remainder with denominator/2. But of + // course in integers it's easier to compare 2*remainder with denominator. So + // we start by shifting the remainder left by 1, and in the process, set Z if + // it's exactly 0 (i.e. the result needs no rounding at all). + lsls r1, r1, #1 + // Now trial-subtract the denominator. We don't do this at all if the result + // was exact. If we do do it, r1 goes negative precisely if we need to round + // up, which sets the C flag. (The previous instruction will have left C + // clear, since r1 had its top 8 bits all clear. So now C is set _only_ if + // we're rounding up.) + subsne r1, r1, r0 + // Recombine the quotient with the sign + exponent, and use the C flag from + // the previous instruction to increment the quotient if we're rounding up. + adc r0, r3, r2, lsl #23 + + // If we haven't either overflowed or underflowed, we're done. We can + // identify most of the safe cases by doing an unsigned comparison of the + // initial output exponent (in the top half of r2) with 0xFC: if 0 <= r2 < + // 0xFC0000 then we have neither underflow nor overflow. + // + // Rationale: the value in the top half of r2 had three chances to be + // incremented before becoming the exponent field of the actual output float. + // It was incremented if we found the numerator mantissa was >= the + // denominator (producing the value in the _bottom_ half of r2, which we just + // ADCed into the output). Then it gets unconditionally incremented again + // when the ADC combines it with the leading mantissa bit. And finally, + // round-up might increment it a third time. So 0xFC is the smallest value + // that can possibly turn into the overflowed value 0xFF after all those + // increments. + // + // On the underflow side, (top half of r2) = 0 corresponds to a value of 1 in + // the final result's exponent field (and then rounding might increase it + // further); if the exponent was less than that then r2 wraps round and looks + // like a very large positive integer from the point of view of this unsigned + // comparison. + cmp r2, #0xFC0000 + bxlo lr + + // The same comparison will have set the N and V flags to reflect the result + // of comparing r2 with 0xFC0000 as a _signed_ integer. That reliably + // distinguishes potential underflow (r2 is negative) from potential overflow + // (r2 is positive and at least 0xFC0000) + bge LOCAL_LABEL(overflow) + + // Here we might or might not have underflow (but we know we don't have + // overflow). To check more carefully, we look at the _bottom_ half of r2, + // which contains the exponent after the first adjustment (for num >= denom), + // That is, it's still off by 1 (compensating for the leading quotient bit), + // and is also before rounding. + // + // We neglect the effect of rounding: division results that are tiny (less + // than the smallest normalised number) before rounding, but then round up to + // the smallest normal number, are an acceptable edge case to handle slowly. + // We pass those to funder without worrying about them. + // + // So we want to check whether the bottom half of r2 was negative. It would + // be nice to check bits 8-15 of it, but unfortunately, it's already been + // combined with the sign (at bit 8), so those bits don't tell us anything + // useful. Instead we look at the top 4 bits of the exponent field, i.e. the + // 0xF0 bits. The largest _non_-overflowing exponent that might reach here is + // less than 3, so it doesn't reach those bits; the smallest possible + // underflow, obtained by dividing the smallest denormal by the largest + // finite number, is -151 (before the leading bit increments it), which will + // set the low 8 bits of r2 to 0x69. That is, the 0xF0 nibble of r2 will be + // 0x60 or greater for a (pre-rounding) underflow, and zero for a + // non-underflow. + + tst r2, #0xF0 + bxeq lr // no underflow after all; return + + // Rebias the exponent for funder, which also corrects the sign bit. + add r0, r0, #192 << 23 + // Tell funder whether the true value is greater or less than the number in + // r0. This is obtained from the sign of the remainder (still in r1), with + // the only problem being that it's currently reversed. So negate r1 (leaving + // 0 at 0 to indicate exactness). + rsbs r1, r1, #0 + b SYMBOL_NAME(__compiler_rt_funder) + +LOCAL_LABEL(overflow): + // Here we might or might not have overflow (but we know we don't have + // underflow). We must check whether we really have overflowed. + // + // For this it's easiest to check the exponent field in the actual output + // value in r0, after _all_ the adjustments have been completed. The largest + // overflowed exponent is 0x193, and the smallest exponent that can reach + // this is 0xFD (we checked against 0xFC above, but then the leading quotient + // bit incremented it). So it's enough to shift the output left by one + // (moving the exponent field to the top), increment it once more (so that + // the smallest overflowed exponent 0xFF wraps round to 0), and then compare + // against 0xFE000000 as an unsigned integer. + mov r12, r0, lsl #1 + add r12, r12, #1 << 24 + cmp r12, #0xFE << 24 // Check for exp = 253 or 254 + bxhs lr + // We have actual overflow. Rebias r0 to bring the exponent back into range, + // which ensures its sign is correct. Then make an infinity of that sign to + // return. + subs r0, r0, #0xC0 << 23 + movs r12, #0xFF // exponent of infinity + orrs r12, r12, r0, lsr #23 // exponent and sign at bottom of r12 + movs r0, r12, lsl #23 // shift it up to the top of r0 to return + bx lr + +LOCAL_LABEL(uncommon): + // We come here from the start of the function if either input is an uncommon + // value: zero, denormal, infinity or NaN. + // + // We arrive here with r12 = 0xFF000000, and r2 containing the exponent of x + // in bits 16..23. But r3 doesn't necessarily contain the exponent of y, + // because the instruction that set it up was conditional. So first we + // unconditionally repeat it. + and r3, r12, r1, lsr #7 + + // In all cases not involving a NaN as output, the sign of the output is made + // in the same way as for finite numbers, as the XOR of the input signs. So + // repeat the sign setup from the main branch. + teq r0, r1 // is the output sign bit 1? + orrmi r2, r2, #0x100 // if so, set bit 8 of r2 + + // Detect infinities and NaNs, by checking if either of r2 or r3 is at least + // 0xFF0000. + cmp r2, #0xFF0000 + cmplo r3, #0xFF0000 + bhs LOCAL_LABEL(inf_NaN) + + // Now we know there are no infinities or NaNs, but there's at least one zero + // or denormal. + movs r12, r1, lsl #1 // is y zero? + beq LOCAL_LABEL(divbyzero) // if so, go and handle division by zero + movs r12, r0, lsl #1 // is x zero? (now we know that y is not) + moveq r0, r2, lsl #23 // if so, 0/nonzero is just 0 (of right sign) + bxeq lr + + // Now we've eliminated zeroes as well, leaving only denormals: either x or + // y, or both, is a denormal. Call fnorm2 to convert both into a normalised + // mantissa and a (potentially small) exponent. + and r12, r2, #0x100 // save the result sign from r2 + lsr r2, #16 // shift extracted exponents down to bit 0 + lsr r3, #16 // where fnorm2 will expect them + push {r0, r1, r2, r3, r12, lr} + mov r0, sp // tell fnorm2 where to find its data + bl SYMBOL_NAME(__compiler_rt_fnorm2) + pop {r0, r1, r2, r3, r12, lr} + lsl r3, #16 // shift exponents back up to bit 16 + orr r2, r12, r2, lsl #16 // and put the result sign back in r2 + + // Now rejoin the main code path, having finished the setup it will expect: + // swap x and y, and shift the fractions back down to the low 24 bits. + mov r12, r0, lsr #8 + mov r0, r1, lsr #8 + mov r1, r12 + b LOCAL_LABEL(div) + +LOCAL_LABEL(inf_NaN): + // We come here if at least one input is a NaN or infinity. If either or both + // inputs are NaN then we hand off to fnan2 to propagate a NaN from the + // input. + mov r12, #0xFF000000 + cmp r12, r0, lsl #1 // if (r0 << 1) > 0xFF000000, r0 is a NaN + blo SYMBOL_NAME(__compiler_rt_fnan2) + cmp r12, r1, lsl #1 + blo SYMBOL_NAME(__compiler_rt_fnan2) + + // No NaNs, so we have three options: inf/inf = NaN, inf/finite = inf, and + // finite/inf = 0. + + // If both operands are infinity, we return a NaN. Since we know at + // least _one_ is infinity, we can test this by checking if they're + // equal apart from the sign bits. + eor r3, r0, r1 + lsls r3, #1 // were all bits of XOR zero other than top? + beq LOCAL_LABEL(invalid) // if so, both operands are infinity + + // See if x is infinite + cmp r12, r0, lsl #1 // (r0 << 1) == 0xFF000000? + beq LOCAL_LABEL(infret) // if so, infinity/finite = infinity + + // y is infinite and x is not, so we return a zero of the + // combined sign. + eor r0, r0, r1 // calculate the right sign + and r0, r0, #0x80000000 // throw away everything else + bx lr + +LOCAL_LABEL(divbyzero): + // Here, we know y is zero. But we don't know if x is zero or nonzero. So we + // might be calculating 0/0 (invalid operation, generating a NaN), or + // nonzero/0 (the IEEE "division by zero" exception, generating infinity). + movs r12, r0, lsl #1 // is x zero too? + beq LOCAL_LABEL(invalid) // if so, go and return a NaN + +LOCAL_LABEL(infret): + // Here, we're either dividing infinity by a finite number, or dividing a + // nonzero number by 0. (Or both, if we're dividing infinity by 0.) In all + // these cases we return infinity with the sign from r2. + // + // If we were implementing IEEE exceptions, we'd have to separate these + // cases: infinity / finite is not an _exception_, it just returns infinity, + // whereas (finite and nonzero) / 0 is a division-by-zero exception. But here + // we're not implementing exceptions, so we can treat all three cases the + // same. + // + // r2 contains the output sign in bit 8, which is a convenient place to find + // it when making an infinity, because we can fill in the 8 exponent bits + // below that and then shift it left. + orr r2, r2, #0xff // sign + maximum exponent + lsl r0, r2, #23 // shift up to the top + bx lr + +LOCAL_LABEL(invalid): + // Return the default NaN, from an invalid operation (either dividing + // infinity by infinity, or 0 by 0). + ldr r0, =0x7FC00000 + bx lr + +// Finally, the lookup table for the initial reciprocal approximation. +// +// The table index is made from the top 7 bits of the denominator mantissa. But +// the topmost bit is always 1, so only the other 6 bits vary. So it only has +// 64 entries, not 128. +// +// Each table entry is a single byte, with its top bit set. So the table +// entries correspond to the reciprocal of a 7-bit mantissa prefix scaled up by +// 2^14, or the reciprocal of a whole 24-bit mantissa scaled up by 2^31. +// +// Each of these 64 entries corresponds to a large interval of possible +// mantissas. For example, if the top 7 bits are 1000001 then the overall +// mantissa could be anything from 0x820000 to 0x83FFFF. And because the output +// of this table provides more bits than the input, there are several choices +// of 8-bit reciprocal approximation for a number in that interval. The +// reciprocal of 0x820000 starts with 0xFC plus a fraction, and the reciprocal +// of 0x83FFFF starts with 0xF9 minus a fraction, so there are four reasonable +// choices for that table entry: F9, FA, FB or FC. Which do we pick? +// +// The table below is generated by choosing whichever value minimises the +// maximum possible error _after_ the approximation is improved by the +// Newton-Raphson step. In the example above, we end up with FA. +// +// The Python code below will regenerate the table, complete with the per-entry +// comments. + +/* + +for prefix in range(64, 128): + best = None + + # Max and min 23-bit mantissas with this 7-bit prefix + mmin, mmax = prefix * 2**17, (prefix + 1) * 2**17 - 1 + + # Max and min table entry corresponding to the reciprocal of something in + # that range of mantissas: round up the reciprocal of mmax, and round down + # the reciprocal of mmin. Also clamp to the range [0x80,0xff], because + # 0x100 can't be used as a table entry due to not fitting in a byte, even + # though it's the exact reciprocal of the overall-smallest mantissa + # 0x800000. + gmin = max(128, (2**31 + mmin - 1) // mmax) + gmax = min(255, 2**31 // mmin) + + # For each of those table entries, compute the result of starting from that + # value and doing a Newton-Raphson iteration, with the mantissa at each end + # of the mantissa interval. One of these will be the worst possible error. + # Choose the table entry whose worst error is as small as possible. + # + # (To find the extreme values of a more general function on an interval, + # you must consider its values not only at the interval endpoints but also + # any turning points within the interval. Here, the function has only one + # turning point, and by construction it takes value 0 there, so we needn't + # worry.) + g = max( + range(gmin, gmax + 1), + key=lambda g: min( + (g * (2**32 - d * g) / 2**23 - 2**39 / d) for d in [mmin, mmax] + ), + ) + + print(f" .byte 0x{g:02x} // input [0x{mmin:06x},0x{mmax:06x}]" + f", candidate outputs [0x{gmin:02x},0x{gmax:02x}]" + ) + +*/ + + .p2align 2 // make sure we start on a 4-byte boundary, even in Thumb +LOCAL_LABEL(tab): + .byte 0xfe // input [0x800000,0x81ffff], candidate outputs [0xfd,0xff] + .byte 0xfa // input [0x820000,0x83ffff], candidate outputs [0xf9,0xfc] + .byte 0xf6 // input [0x840000,0x85ffff], candidate outputs [0xf5,0xf8] + .byte 0xf3 // input [0x860000,0x87ffff], candidate outputs [0xf1,0xf4] + .byte 0xef // input [0x880000,0x89ffff], candidate outputs [0xee,0xf0] + .byte 0xec // input [0x8a0000,0x8bffff], candidate outputs [0xeb,0xed] + .byte 0xe8 // input [0x8c0000,0x8dffff], candidate outputs [0xe7,0xea] + .byte 0xe5 // input [0x8e0000,0x8fffff], candidate outputs [0xe4,0xe6] + .byte 0xe2 // input [0x900000,0x91ffff], candidate outputs [0xe1,0xe3] + .byte 0xdf // input [0x920000,0x93ffff], candidate outputs [0xde,0xe0] + .byte 0xdc // input [0x940000,0x95ffff], candidate outputs [0xdb,0xdd] + .byte 0xd9 // input [0x960000,0x97ffff], candidate outputs [0xd8,0xda] + .byte 0xd6 // input [0x980000,0x99ffff], candidate outputs [0xd5,0xd7] + .byte 0xd3 // input [0x9a0000,0x9bffff], candidate outputs [0xd3,0xd4] + .byte 0xd1 // input [0x9c0000,0x9dffff], candidate outputs [0xd0,0xd2] + .byte 0xce // input [0x9e0000,0x9fffff], candidate outputs [0xcd,0xcf] + .byte 0xcc // input [0xa00000,0xa1ffff], candidate outputs [0xcb,0xcc] + .byte 0xc9 // input [0xa20000,0xa3ffff], candidate outputs [0xc8,0xca] + .byte 0xc7 // input [0xa40000,0xa5ffff], candidate outputs [0xc6,0xc7] + .byte 0xc4 // input [0xa60000,0xa7ffff], candidate outputs [0xc4,0xc5] + .byte 0xc2 // input [0xa80000,0xa9ffff], candidate outputs [0xc1,0xc3] + .byte 0xc0 // input [0xaa0000,0xabffff], candidate outputs [0xbf,0xc0] + .byte 0xbd // input [0xac0000,0xadffff], candidate outputs [0xbd,0xbe] + .byte 0xbb // input [0xae0000,0xafffff], candidate outputs [0xbb,0xbc] + .byte 0xb9 // input [0xb00000,0xb1ffff], candidate outputs [0xb9,0xba] + .byte 0xb7 // input [0xb20000,0xb3ffff], candidate outputs [0xb7,0xb8] + .byte 0xb5 // input [0xb40000,0xb5ffff], candidate outputs [0xb5,0xb6] + .byte 0xb3 // input [0xb60000,0xb7ffff], candidate outputs [0xb3,0xb4] + .byte 0xb1 // input [0xb80000,0xb9ffff], candidate outputs [0xb1,0xb2] + .byte 0xaf // input [0xba0000,0xbbffff], candidate outputs [0xaf,0xb0] + .byte 0xad // input [0xbc0000,0xbdffff], candidate outputs [0xad,0xae] + .byte 0xac // input [0xbe0000,0xbfffff], candidate outputs [0xab,0xac] + .byte 0xaa // input [0xc00000,0xc1ffff], candidate outputs [0xa9,0xaa] + .byte 0xa8 // input [0xc20000,0xc3ffff], candidate outputs [0xa8,0xa8] + .byte 0xa6 // input [0xc40000,0xc5ffff], candidate outputs [0xa6,0xa7] + .byte 0xa5 // input [0xc60000,0xc7ffff], candidate outputs [0xa4,0xa5] + .byte 0xa3 // input [0xc80000,0xc9ffff], candidate outputs [0xa3,0xa3] + .byte 0xa1 // input [0xca0000,0xcbffff], candidate outputs [0xa1,0xa2] + .byte 0xa0 // input [0xcc0000,0xcdffff], candidate outputs [0xa0,0xa0] + .byte 0x9e // input [0xce0000,0xcfffff], candidate outputs [0x9e,0x9f] + .byte 0x9d // input [0xd00000,0xd1ffff], candidate outputs [0x9d,0x9d] + .byte 0x9b // input [0xd20000,0xd3ffff], candidate outputs [0x9b,0x9c] + .byte 0x9a // input [0xd40000,0xd5ffff], candidate outputs [0x9a,0x9a] + .byte 0x98 // input [0xd60000,0xd7ffff], candidate outputs [0x98,0x99] + .byte 0x97 // input [0xd80000,0xd9ffff], candidate outputs [0x97,0x97] + .byte 0x96 // input [0xda0000,0xdbffff], candidate outputs [0x95,0x96] + .byte 0x94 // input [0xdc0000,0xddffff], candidate outputs [0x94,0x94] + .byte 0x93 // input [0xde0000,0xdfffff], candidate outputs [0x93,0x93] + .byte 0x92 // input [0xe00000,0xe1ffff], candidate outputs [0x91,0x92] + .byte 0x90 // input [0xe20000,0xe3ffff], candidate outputs [0x90,0x90] + .byte 0x8f // input [0xe40000,0xe5ffff], candidate outputs [0x8f,0x8f] + .byte 0x8e // input [0xe60000,0xe7ffff], candidate outputs [0x8e,0x8e] + .byte 0x8d // input [0xe80000,0xe9ffff], candidate outputs [0x8d,0x8d] + .byte 0x8b // input [0xea0000,0xebffff], candidate outputs [0x8b,0x8c] + .byte 0x8a // input [0xec0000,0xedffff], candidate outputs [0x8a,0x8a] + .byte 0x89 // input [0xee0000,0xefffff], candidate outputs [0x89,0x89] + .byte 0x88 // input [0xf00000,0xf1ffff], candidate outputs [0x88,0x88] + .byte 0x87 // input [0xf20000,0xf3ffff], candidate outputs [0x87,0x87] + .byte 0x86 // input [0xf40000,0xf5ffff], candidate outputs [0x86,0x86] + .byte 0x85 // input [0xf60000,0xf7ffff], candidate outputs [0x85,0x85] + .byte 0x84 // input [0xf80000,0xf9ffff], candidate outputs [0x84,0x84] + .byte 0x83 // input [0xfa0000,0xfbffff], candidate outputs [0x83,0x83] + .byte 0x82 // input [0xfc0000,0xfdffff], candidate outputs [0x82,0x82] + .byte 0x81 // input [0xfe0000,0xffffff], candidate outputs [0x80,0x81] + +END_COMPILERRT_FUNCTION(__divsf3) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/fnan2.c b/compiler-rt/lib/builtins/arm/fnan2.c new file mode 100644 index 0000000000000..06bbd4339f171 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/fnan2.c @@ -0,0 +1,42 @@ +//===-- fnan2.c - Handle single-precision NaN inputs to binary operation --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This helper function is available for use by single-precision float +// arithmetic implementations to handle propagating NaNs from the input +// operands to the output, in a way that matches Arm hardware FP. +// +// On input, a and b are floating-point numbers in IEEE 754 encoding, and at +// least one of them must be a NaN. The return value is the correct output NaN. +// +// A signalling NaN in the input (with bit 22 clear) takes priority over any +// quiet NaN, and is adjusted on return by setting bit 22 to make it quiet. If +// both inputs are the same type of NaN then the first input takes priority: +// the input a is used instead of b. +// +//===----------------------------------------------------------------------===// + +#include + +uint32_t __compiler_rt_fnan2(uint32_t a, uint32_t b) { + // Make shifted-left copies of a and b to discard the sign bit. Then add 1 at + // the bit position where the quiet vs signalling bit ended up. This squashes + // all the signalling NaNs to the top of the range of 32-bit values, from + // 0xff800001 to 0xffffffff inclusive; meanwhile, all the quiet NaN values + // wrap round to the bottom, from 0 to 0x007fffff inclusive. So we can detect + // a signalling NaN by asking if it's greater than 0xff800000, and a quiet + // one by asking if it's less than 0x00800000. + uint32_t aadj = (a << 1) + 0x00800000; + uint32_t badj = (b << 1) + 0x00800000; + if (aadj > 0xff800000) // a is a signalling NaN? + return a | 0x00400000; // if so, return it with the quiet bit set + if (badj > 0xff800000) // b is a signalling NaN? + return b | 0x00400000; // if so, return it with the quiet bit set + if (aadj < 0x00800000) // a is a quiet NaN? + return a; // if so, return it + return b; // otherwise we expect b must be a quiet NaN +} diff --git a/compiler-rt/lib/builtins/arm/fnorm2.c b/compiler-rt/lib/builtins/arm/fnorm2.c new file mode 100644 index 0000000000000..29eba1cbde59d --- /dev/null +++ b/compiler-rt/lib/builtins/arm/fnorm2.c @@ -0,0 +1,62 @@ +//===-- fnorm2.c - Handle single-precision denormal inputs to binary op ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This helper function is available for use by single-precision float +// arithmetic implementations, to handle denormal inputs on entry by +// renormalizing the mantissa and modifying the exponent to match. +// +//===----------------------------------------------------------------------===// + +#include + +// Structure containing the function's inputs and outputs. +// +// On entry: a, b are two input floating-point numbers, still in IEEE 754 +// encoding. expa and expb are the 8-bit exponents of those numbers, extracted +// and shifted down to the low 8 bits of the word, with no other change. +// Neither value should be zero, or have the maximum exponent (indicating an +// infinity or NaN). +// +// On exit: each of a and b contains the mantissa of the input value, with the +// leading 1 bit made explicit, and shifted up to the top of the word. If expa +// was zero (indicating that a was denormal) then it is now represented as a +// normalized number with an out-of-range exponent (zero or negative). The same +// applies to expb and b. +struct fnorm2 { + uint32_t a, b, expa, expb; +}; + +void __compiler_rt_fnorm2(struct fnorm2 *values) { + // Shift the mantissas of a and b to the right place to follow a leading 1 in + // the top bit, if there is one. + values->a <<= 8; + values->b <<= 8; + + // Test if a is denormal. + if (values->expa == 0) { + // If so, decide how much further up to shift its mantissa, and adjust its + // exponent to match. This brings the leading 1 of the denormal mantissa to + // the top of values->a. + uint32_t shift = __builtin_clz(values->a); + values->a <<= shift; + values->expa = 1 - shift; + } else { + // Otherwise, leave the mantissa of a in its current position, and OR in + // the explicit leading 1. + values->a |= 0x80000000; + } + + // Do the same operation on b. + if (values->expb == 0) { + uint32_t shift = __builtin_clz(values->b); + values->b <<= shift; + values->expb = 1 - shift; + } else { + values->b |= 0x80000000; + } +} diff --git a/compiler-rt/lib/builtins/arm/funder.c b/compiler-rt/lib/builtins/arm/funder.c new file mode 100644 index 0000000000000..fd29e157328a3 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/funder.c @@ -0,0 +1,78 @@ +//===-- funder.c - Handle single-precision floating-point underflow -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This helper function is available for use by single-precision float +// arithmetic implementations to handle underflowed output values, if they were +// computed in the form of a normalized mantissa and an out-of-range exponent. +// +// On input: x should be a complete IEEE 754 floating-point value representing +// the desired output scaled up by 2^192 (the same value that would have been +// passed to an underflow trap handler in IEEE 754:1985). +// +// This isn't enough information to re-round to the correct output denormal +// without also knowing whether x itself has already been rounded, and which +// way. 'errsign' gives this information, by indicating the sign of the value +// (true result - x). That is, if errsign > 0 it means the true value was +// larger (x was rounded down); if errsign < 0 then x was rounded up; if +// errsign == 0 then x represents the _exact_ desired output value. +// +//===----------------------------------------------------------------------===// + +#include + +#define SIGNBIT 0x80000000 +#define MANTSIZE 23 +#define BIAS 0xc0 + +uint32_t __compiler_rt_funder(uint32_t x, uint32_t errsign) { + uint32_t sign = x & SIGNBIT; + uint32_t exponent = (x << 1) >> 24; + + // Rule out exponents so small (or large!) that no denormalisation + // is needed. + if (exponent > BIAS) { + // Exponent 0xc1 or above means a normalised number got here by + // mistake, so we just remove the 0xc0 exponent bias and go + // straight home. + return x - (BIAS << MANTSIZE); + } + uint32_t bits_lost = BIAS + 1 - exponent; + if (bits_lost > MANTSIZE + 1) { + // The implicit leading 1 of the intermediate value's mantissa is + // below the lowest mantissa bit of a denormal by at least 2 bits. + // Round down to 0 unconditionally. + return sign; + } + + // Make the full mantissa (with leading bit) at the top of the word. + uint32_t mantissa = 0x80000000 | (x << 8); + // Adjust by 1 depending on the sign of the error. + mantissa -= errsign >> 31; + mantissa += (-errsign) >> 31; + + // Shift down to the output position, keeping the bits shifted off. + uint32_t outmant, shifted_off; + if (bits_lost == MANTSIZE + 1) { + // Special case for the exponent where we have to shift the whole + // of 'mantissa' off the bottom of the word. + outmant = 0; + shifted_off = mantissa; + } else { + outmant = mantissa >> (8 + bits_lost); + shifted_off = mantissa << (32 - (8 + bits_lost)); + } + + // Re-round. + if (shifted_off >> 31) { + outmant++; + if (!(shifted_off << 1)) + outmant &= ~1; // halfway case: round to even + } + + return sign | outmant; +} diff --git a/compiler-rt/lib/builtins/arm/mulsf3.S b/compiler-rt/lib/builtins/arm/mulsf3.S new file mode 100644 index 0000000000000..b4f4c5e958c52 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/mulsf3.S @@ -0,0 +1,309 @@ +//===-- mulsf3.S - single-precision floating point multiplication ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements single-precision soft-float multiplication with the +// IEEE-754 default rounding (to nearest, ties to even), in optimized AArch32 +// assembly language suitable to be built as either Arm or Thumb2. +// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" + + + .syntax unified + .text + .p2align 2 + +DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_fmul, __mulsf3) + +DEFINE_COMPILERRT_FUNCTION(__mulsf3) + + // Check if either input exponent is 00 or FF (i.e. not a normalized number), + // and if so, branch out of line. If we don't branch out of line, then we've + // also extracted the exponents of the input values r0/r1 into bits 16..23 of + // r2/r3. But if we do, then that hasn't necessarily been done (because the + // second AND might have been skipped). + mov r12, #0xFF0000 + ands r2, r12, r0, lsr #7 // sets Z if exponent of x is 0 + andsne r3, r12, r1, lsr #7 // otherwise, sets Z if exponent of y is 0 + teqne r2, r12 // otherwise, sets Z if exponent of x is FF + teqne r3, r12 // otherwise, sets Z if exponent of y is FF + beq LOCAL_LABEL(uncommon) // branch out of line to handle inf/NaN/0/denorm + + // Calculate the sign of the result, and put it in an unused bit of r2. + teq r0, r1 // sets N to the XOR of x and y's sign bits + orrmi r2, r2, #0x100 // if N set, set bit 8 of r2 + + // Move the input mantissas to the high end of r0/r1, each with its leading + // bit set explicitly, so that they're in the right form to be multiplied. + mov r12, #0x80000000 + orr r0, r12, r0, lsl #8 + orr r1, r12, r1, lsl #8 + + // Now we're ready to multiply mantissas. This is also the place we'll come + // back to after decoding denormal inputs. The denormal decoding will also + // have to set up the same register contents: + // - decoded fractions at the top of r0 and r1 + // - exponents in r2 and r3, starting at bit 16 + // - output sign in r2 bit 8 +LOCAL_LABEL(mul): + + // Here we multiply the mantissas, and compute the output exponent by adding + // the input exponents and rebiasing. These operations are interleaved to + // use a delay slot. + // + // The exponent is rebiased by subtracting 0x80, rather than the 0x7F you'd + // expect. That compensates for the leading bit of the mantissa overlapping + // it, when we recombine the exponent and mantissa by addition. + add r2, r2, r3 // r2 has sum of exponents, freeing up r3 + umull r1, r3, r0, r1 // r3:r1 has the double-width product + sub r2, r2, #(0x80 << 16) // rebias the summed exponent + + // Compress the double-word product into just the high-order word r3, by + // setting its bit 0 if any bit of the low-order word is nonzero. This + // changes the represented value, but not by nearly enough to affect + // rounding, because rounding only depends on the bit below the last output + // bit, and the general question of whether _any_ nonzero bit exists below + // that. + cmp r1, #0 // if low word of full product is nonzero + orrne r3, r3, #1 // then set LSB of high word + + // The two inputs to UMULL had their high bits set, that is, were at least + // 0x80000000. So the 64-bit product was at least 0x4000000000000000, i.e. + // the high bit of the product could be at the top of the word or one bit + // below. Check which, by experimentally shifting left, and then undoing it + // via RRX if we turned out to have shifted off a 1 bit. + lsls r3, r3, #1 // shift left, setting C to the bit shifted off + rrxcs r3, r3 // if that bit was 1, put it back again + + // That ensured the leading 1 bit of the product is now the top of r3, but + // also, set C if the leading 1 was _already_ in the top bit. So now we know + // whether to increment the exponent. The following instruction does the + // conditional increment (because it's ADC), but also, copies the exponent + // field from bit 16 of r2 into bit 0, so as to place it just below the + // output sign bit. + // + // So, if the number hasn't overflowed or underflowed, the low 9 bits of r2 + // are exactly what we need to combine with the rounded mantissa. But the + // full output exponent (with extra bits) is still available in the high half + // of r2, so that we can check _whether_ we overflowed or underflowed. + adc r2, r2, r2, asr #16 + + // Recombine the exponent and mantissa, doing most of the rounding as a side + // effect: we shift the mantissa right so as to put the round bit into C, and + // then we recombine with the exponent using ADC, to increment the mantissa + // if C was set. + movs r12, r3, lsr #8 + adc r0, r12, r2, lsl #23 + + // To complete the rounding, we must check for the round-to-even tiebreaking + // case, by checking if we're in the exact halfway case, which occurs if and + // only if we _did_ round up (we can tell this because C is still set from + // the MOVS), and also, no bit of r3 is set _below_ the round bit. + // + // We combine this with an overflow check, so that C ends up set if anything + // weird happened, and clear if we're completely finished and can return. + // + // The best instruction sequence for this part varies between Arm and Thumb. +#if !__thumb__ + // Arm state: if C was set then we check the low bits of r3, so that Z ends + // up set if we need to round to even. + // + // (We rely here on Z reliably being clear to begin with, because shifting + // down the output mantissa definitely gave a nonzero output. Also, the TST + // doesn't change C, so if Z does end up set, then C was also set.) + // + // Then, if we're not rounding to even, we do a CMP which sets C if there's + // been an overflow or an underflow. An overflow could occur for an output + // exponent as low as 0xFC, because we might increment the exponent by 1 when + // renormalizing, by another when recombining with the mantissa, and by one + // more if rounding up causes a carry off the top of the mantissa. An + // underflow occurs only if the output exponent is negative (because it's + // offset by 1, so an exponent of 0 will be incremented to 1), in which case + // the top 8 bits of r2 will all be set. Therefore, an unsigned comparison to + // see if r2 > 0xFC0000 will catch all overflow and underflow cases. It also + // catches a few very large cases that _don't_ quite overflow (exponents of + // 0xFC and above that don't get maximally unlucky); those will also be + // handled by the slow path. + tstcs r3, #0x7F + cmpne r2, #0xFC0000 +#else + // In Thumb, switching between different conditions has a higher cost due to + // the (implicit in this code) IT instructions, so we prefer a strategy that + // uses CC and CS conditions throughout, at the cost of requiring some extra + // cleanup instructions on the slow path. + // + // If C is set (and hence round-to-even is a possibility), the basic idea is + // to shift the full result word (r3) left by 25, leaving only its bottom 7 + // bits, which are now the top 7 bits; then we want to set C iff these are 0. + // + // The "CMP x,y" instruction sets C if y > x (as unsigned integers). So this + // could be done in one instruction if only we had a register to use as x, + // which has 0 in the top 7 bits and at least one nonzero. Then we could + // compare that against the shifted-up value of r3, setting C precisely if + // the top 7 bits of y are greater than 0. And happily, we _do_ have such a + // register! r12 contains the shifted-down mantissa, which is guaranteed to + // have a 1 in bit 23, and 0 above that. + // + // The shift of r3 happens only in the second operand of the compare, so we + // don't lose the original value of r3 in this process. + // + // The check for over/underflow is exactly as in the Arm branch above, except + // based on a different condition. + cmpcs r12, r3, lsl #25 // now C is set iff we're rounding to even + cmpcc r2, #0xFC0000 // and now it's also set if we've over/underflowed +#endif + + // That's all the checks for difficult cases done. If C is clear, we can + // return. + bxcc lr + + // Now the slower path begins. We have to recover enough information to + // handle all of round-to-even, overflow and underflow. + // + // Round to even is the most likely of these, so we detect it first and + // handle it as fast as possible. + +#if __thumb__ + // First, Thumb-specific compensation code. The Arm branch of the #if above + // will have set Z=0 to indicate round to even, but the Thumb branch didn't + // leave any unambiguous indicator of RTE, so we must retest by checking all + // the bits shifted off the bottom of the mantissa to see if they're exactly + // the half-way value. + lsl r12, r3, #24 // r12 = round bit and everything below + cmp r12, #0x80000000 // set Z if that is exactly 0x80000000 +#endif + + // Now Z is clear iff we have already rounded up and now must replace that + // with rounding to even, which is done by just clearing the low bit of the + // mantissa. + biceq r0, r0, #1 + + // Redo the over/underflow check (the same way as in both branches above), + // and if it doesn't report a danger, we can return the rounded-to-even + // answer. + cmp r2, #0xFC0000 // check for over/underflow + bxcc lr // and return if none. + + // Now we only have overflow and underflow left to handle. First, find out + // which we're looking at. This is easy by testing the top bit of r2, but + // even easier by using the fact that the possible positive and negative + // values of r2 are widely enough separated that the 0xFC0000 subtracted by + // the CMP above won't have made any difference. So the N flag output from + // that comparison _already_ tells us which condition we have: if N is set we + // have underflow, and if N is clear, overflow. + bpl LOCAL_LABEL(overflow) + + // Here we're handling underflow. + + // Add the IEEE 754:1985 exponent bias which funder will expect. This also + // brings the exponent back into a range where it can't possibly have carried + // into the sign bit, so the output sign will now be right. + add r0, r0, #(0xC0 << 23) + + // Determine whether we rounded up, down or not at all. + lsls r2, r3, #1 // input mantissa, without its leading 1 + subs r1, r2, r0, lsl #9 // subtract the output mantissa (likewise) + + // And let funder handle the rest. + b SYMBOL_NAME(__compiler_rt_funder) + +LOCAL_LABEL(overflow): + // We come here to handle overflow, but it's not guaranteed that an overflow + // has actually happened: our check on the fast path erred on the side of + // caution, by catching any output exponent that _could_ cause an overflow. + // So first check whether this really is an overflow, by extracting the + // output exponent. Exponent 0xFF, or anything that wrapped round to having + // the high bit clear, are overflows; 0xFE down to 0xFC are not overflows. + // + // The value in r0 is correct to return, if there's no overflow. + add r12, r0, #(1 << 23) // add 1 to the exponent so 0xFF wraps to 0 + movs r12, r12, lsl #1 // test the top bit of the modified value + bxmi lr // if top bit is still 1, not an overflow + + // This is an overflow, so we need to replace it with an appropriately signed + // infinity. First we correct the sign by applying a downward bias to the + // exponent (the one suggested in IEEE 754:1985, which was chosen to bring + // all possible overflowed results back into range). + subs r0, r0, #(0xC0 << 23) + + // Now the sign bit of r0 is correct. Replace everything else with the + // encoding of an infinity. + mov r1, #0xFF + and r0, r0, #0x80000000 + orr r0, r0, r1, lsl #23 + bx lr + +LOCAL_LABEL(uncommon): + // Handle zeros, denorms, infinities and NaNs. We arrive here knowing that + // we've at least done the first _two_ instructions from the entry point, + // even if all the rest were skipped. So r2 contains the sign and exponent of + // x in bits 16..23, and r12 = 0xFF << 16. + // + // So, first repeat some instructions from the prologue, which were either + // conditionally skipped in the sequence leading to the branch, or skipped + // because they happened after the branch. + and r3, r12, r1, lsr #7 // get exponent of y in r3 bits 16..23 + teq r0, r1 // calculate the sign of the result + orrmi r2, r2, #0x100 // and put it in bit 8 of r2 as before + + // Check for infinities and NaNs, by testing each of r2,r3 to see if it's at + // least 0xFF0000 (hence the exponent field is equal to 0xFF). + cmp r2, r12 + cmplo r3, r12 + bhs LOCAL_LABEL(inf_NaN) + + // If we didn't take that branch, then we have only finite numbers, but at + // least one is denormal or zero. A zero makes the result easy (and also is a + // more likely input than a denormal), so check those first, as fast as + // possible. + movs r12, r0, lsl #1 // Z set if x == 0 + movsne r12, r1, lsl #1 // now Z set if either input is 0 + moveq r0, r2, lsl #23 // in either case, make 0 of the output sign + bxeq lr // and return it + + // Now we know we only have denormals to deal with. Call fnorm2 to sort + // them out, and rejoin the main code path above. + and r12, r2, #0x100 // save the result sign from r2 + lsr r2, #16 // shift extracted exponents down to bit 0 + lsr r3, #16 // where fnorm2 will expect them + push {r0, r1, r2, r3, r12, lr} + mov r0, sp // tell fnorm2 where to find its data + bl SYMBOL_NAME(__compiler_rt_fnorm2) + pop {r0, r1, r2, r3, r12, lr} + lsl r3, #16 // shift exponents back up to bit 16 + orr r2, r12, r2, lsl #16 // and put the result sign back in r2 + b LOCAL_LABEL(mul) + +LOCAL_LABEL(inf_NaN): + // We come here if at least one input is a NaN or infinity. If either or both + // inputs are NaN then we hand off to fnan2 which will propagate a NaN from + // the input; otherwise any multiplication involving infinity returns + // infinity, unless it's infinity * 0 which is an invalid operation and + // returns NaN again. + mov r12, #0xFF000000 + cmp r12, r0, lsl #1 // if (r0 << 1) > 0xFF000000, r0 is a NaN + blo SYMBOL_NAME(__compiler_rt_fnan2) + cmp r12, r1, lsl #1 + blo SYMBOL_NAME(__compiler_rt_fnan2) + + // NaNs are dealt with, so now we have at least one infinity. Check if the + // other operand is 0. This is conveniently done by XORing the two: because + // we know that the low 31 bits of one operand are exactly 0x7F800000, we can + // test if the low 31 bits of the other one are all 0 by checking whether the + // low 31 bits of (x XOR y) equal 0x7F800000. + eor r3, r0, r1 + cmp r12, r3, lsl #1 // if inf * 0, this sets Z + lsr r0, r12, #1 // set up return value of +infinity + orrne r0, r0, r2, lsl #23 // if not inf * 0, put on the output sign + orreq r0, r0, #0x400000 // otherwise, set the 'quiet NaN' bit + bx lr // and return + +END_COMPILERRT_FUNCTION(__mulsf3) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S b/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S new file mode 100644 index 0000000000000..f2ede1013a9e6 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/thumb1/mulsf3.S @@ -0,0 +1,251 @@ +//===-- mulsf3.S - single-precision floating point multiplication ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements single-precision soft-float multiplication with the +// IEEE-754 default rounding (to nearest, ties to even), in optimized Thumb1 +// assembly language. +// +//===----------------------------------------------------------------------===// + +#include "../../assembly.h" + + .syntax unified + .text + .thumb + .p2align 2 + +DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_fmul, __mulsf3) + +DEFINE_COMPILERRT_THUMB_FUNCTION(__mulsf3) + push {r4,r5,r6,lr} + + // Get exponents of the inputs, and check for uncommon values. In the process + // of this we also compute the sign, because it's marginally quicker that + // way. + lsls r2, r0, #1 + adcs r4, r4, r4 // set r4[0] to sign bit of x + lsls r3, r1, #1 + adcs r4, r4, r3 // set r4[0] to the output sign + lsrs r2, r2, #24 + beq LOCAL_LABEL(zerodenorm0) // still do the next LSRS + lsrs r3, r3, #24 + beq LOCAL_LABEL(zerodenorm) + cmp r2, #255 + beq LOCAL_LABEL(naninf) + cmp r3, #255 + beq LOCAL_LABEL(naninf) + // Compute the output exponent. We'll be generating our product _without_ the + // leading bit, so we subtract 0x7f rather than 0x80. + adds r2, r2, r3 + subs r2, r2, #0x7f + // Blank off everything above the mantissas. + lsls r0, r0, #9 + lsls r1, r1, #9 +LOCAL_LABEL(normalised): // we may come back here from zerodenorm + lsrs r0, r0, #9 + lsrs r1, r1, #9 + // Multiply. r0 and r1 are the mantissas of the inputs but without their + // leading bits, so the product we want in principle is P=(r0+2^23)(r1+2^23). + // P is at most (2^24-1)^2 < 2^48, so it fits in a word and a half. + // + // The technique below will actually compute P - 2^46, by not adding on the + // term where the two 2^23 are multiplied. The 48-bit result will be + // delivered in two output registers, one containing its bottom 32 bits and + // the other containing the top 32, so they overlap in the middle 16 bits. + // This is done using only two multiply instructions and some bookkeeping. + // + // In the comments I'll write X and Y for the original input mantissas (again + // without their leading bits). I'll also decompose them as X = xh + xl and + // Y = yh + yl, where xl and yl are in the range 0..2^8-1 and xh,yh are + // multiples of 2^8. + adds r5, r0, r1 + lsls r5, r5, #7 // r5 = (X+Y) << 7 + movs r6, r0 + muls r6, r1, r6 // r6 is congruent mod 2^32 to X*Y + lsrs r0, r0, #8 + lsrs r1, r1, #8 + muls r0, r1, r0 + lsls r1, r0, #16 // r1 is congruent mod 2^32 to xh*yh + subs r3, r6, r1 // now r3 is congruent mod 2^32 to + // (X*Y) - (xh*yh) = xh*yl + xl*yh + xl*yl + // and hence, since that is at most 0xfeff0001, + // is _exactly_ equal to that + adds r0, r0, r5 // r0 is now (xh*yh + (X+Y)<<23) >> 16 + lsrs r1, r3, #16 // r1 is the top 16 bits of r3, i.e. + // (xh*yl + xl*yh + xl*yl) >> 16 + adds r3, r0, r1 // now r3 equals + // (xh*yh + xh*yl + xl*yh + xl*yl + (X+Y)<<23) >> 16 + // i.e. (X*Y + (X+Y)<<23) >> 16, + // i.e. (the right answer) >> 16. + // Meanwhile, r6 is exactly the bottom 32 bits of the + // right answer. + // Renormalise if necessary. + lsrs r1, r3, #30 + beq LOCAL_LABEL(norenorm) + // Here we have to do something fiddly. Renormalisation would be a trivial + // job if we had the leading mantissa bit - just note that it's one bit + // position above where it should be, and shift right by one. But without + // that bit, we currently have (2x - 2^30), and we want (x - 2^30); just + // shifting right would of course give us (x - 2^29), so we must subtract an + // extra 2^29 to fix this up. + lsrs r3, r3, #1 + movs r1, #1 + lsls r1, r1, #29 + subs r3, r3, r1 + adds r2, r2, #1 +LOCAL_LABEL(norenorm): + // Round and shift down to the right bit position. + lsrs r0, r3, #7 // round bit goes into the carry flag + bcc LOCAL_LABEL(rounded) + adds r0, r0, #1 + // In the round-up branch, we must also check if we have to round to even, by + // testing all the bits below the round bit. We will normally not expect to, + // so we do RTE by branching out of line and back again to avoid spending a + // branch in the common case. + lsls r5, r3, #32-7+1 // check the bits shifted out of r3 above + bne LOCAL_LABEL(rounded) // if any is nonzero, we're not rounding to even + lsls r5, r6, #15 // check the bottom 17 bits of the low-order 32 + // (enough to overlap r3 even if we renormalised) + beq LOCAL_LABEL(rte) // if any is nonzero, fall through, else RTE +LOCAL_LABEL(rounded): + // Put on the sign and exponent, check for underflow and overflow, and + // return. + // + // Underflow occurs iff r2 (the output exponent) <= 0. Overflow occurs if + // it's >= 0xFF. (Also if it's 0xFE and we rounded up to overflow, but since + // this code doesn't report exceptions, we can ignore this case because it'll + // happen to return the right answer regardless). So we handle most of this + // via an unsigned comparison against 0xFF, which leaves the one case of a + // zero exponent that we have to filter separately by testing the Z flag + // after we shift the exponent back up into place. + cmp r2, #0xFF // check for most over/underflows + bhs LOCAL_LABEL(outflow) // ... and branch out of line for them + lsls r5, r2, #23 // shift the exponent into its output location + beq LOCAL_LABEL(outflow) // ... and branch again if it was 0 + lsls r4, r4, #31 // shift the output sign into place + orrs r0, r0, r4 // and OR it in to the output + adds r0, r0, r5 // OR in the mantissa + pop {r4,r5,r6,pc} // and return + +LOCAL_LABEL(rte): + // Out-of-line handler for the round-to-even case. Clear the low mantissa bit + // and go back to the post-rounding code. + movs r5, #1 + bics r0, r0, r5 + b LOCAL_LABEL(rounded) + +LOCAL_LABEL(outflow): + cmp r2, #0 + bgt LOCAL_LABEL(overflow) + // To handle underflow, we construct an intermediate value in the IEEE 754 + // style (using our existing full-length mantissa, and bias the exponent by + // +0xC0), and indicate whether that intermediate was rounded up, down or not + // at all. Then call the helper function funder, which will denormalise and + // re-round correctly. + lsls r1, r0, #7 // shift up the post-rounding mantissa + subs r1, r3, r1 // and subtract it from the pre-rounding version + lsls r6, r6, #15 + cmp r6, #1 // if the rest of the low bits are nonzero + adcs r1, r1, r1 // then set an extra bit at the bottom + + lsls r4, r4, #31 + orrs r0, r0, r4 // put on the sign + adds r2, r2, #192 // bias the exponent + lsls r3, r2, #23 + adds r0, r0, r3 // put on the biased exponent + + bl SYMBOL_NAME(__compiler_rt_funder) + pop {r4,r5,r6,pc} + +LOCAL_LABEL(overflow): + // Handle overflow by returning an infinity of the correct sign. + lsls r4, r4, #8 // move the sign up to bit 8 + movs r0, #0xff + orrs r0, r0, r4 // fill in an exponent just below it + lsls r0, r0, #23 // and shift those 9 bits up to the top of the word + pop {r4,r5,r6,pc} + + // We come here if there's at least one zero or denormal. On the fast path + // above, it was convenient to check these before checking NaNs and + // infinities, but NaNs take precedence, so now we're off the fast path, we + // must still check for those. + // + // At the main entry point 'zerodenorm' we want r2 and r3 to be the two input + // exponents. So if we branched after shifting-and-checking r2, we come to + // this earlier entry point 'zerodenorm0' so that we still shift r3. +LOCAL_LABEL(zerodenorm0): + lsrs r3, r3, #24 +LOCAL_LABEL(zerodenorm): + cmp r2, #255 + beq LOCAL_LABEL(naninf) + cmp r3, #255 + beq LOCAL_LABEL(naninf) + // Now we know we have at least one zero or denormal, and no NaN or infinity. + // Check if either input is actually zero. We've ruled out 0 * infinity by + // this point, so any zero input means we return zero of the correct sign. + lsls r6, r0, #1 // is one input zero? + beq LOCAL_LABEL(zero) // yes, go and return zero + lsls r6, r1, #1 // is the other one zero? + bne LOCAL_LABEL(denorm) // if not, one must have been a denormal +LOCAL_LABEL(zero): + lsls r0, r4, #31 // shift up the output sign to make the return value + pop {r4,r5,r6,pc} + + // Handle denormals via the helper function fnorm2, which will break both + // inputs up into mantissa and exponent, renormalising and generating a + // negative exponent if necessary. +LOCAL_LABEL(denorm): + push {r0,r1,r2,r3} + mov r0, sp + bl SYMBOL_NAME(__compiler_rt_fnorm2) + pop {r0,r1,r2,r3} + // Convert fnorm2's return values into the right form to rejoin the main + // code path. + lsls r0, r0, #1 + lsls r1, r1, #1 + adds r2, r2, r3 + subs r2, r2, #0x7f + b LOCAL_LABEL(normalised) + + // We come here if at least one input is a NaN or infinity. There may still + // be zeroes (or denormals, though they make no difference at this stage). +LOCAL_LABEL(naninf): + movs r6, #0xff + lsls r6, r6, #24 + lsls r5, r0, #1 + cmp r5, r6 + bhi LOCAL_LABEL(nan) // first operand is a NaN + lsls r5, r1, #1 + cmp r5, r6 + bhi LOCAL_LABEL(nan) // second operand is a NaN + + // We know we have at least one infinity, and no NaNs. We might also have a + // zero, in which case we return the default quiet NaN. + lsls r6, r0, #1 + beq LOCAL_LABEL(infzero) // if r0 is a zero, r1 must be inf + lsls r6, r1, #1 + beq LOCAL_LABEL(infzero) // if r1 is a zero, r0 must be inf + // Otherwise we have infinity * infinity, or infinity * finite. Just return + // an appropriately signed infinity. + b LOCAL_LABEL(overflow) // reuse the code there + + // We come here if at least one input is a NaN. Hand off to fnan2, which + // propagates an appropriate NaN to the output, dealing with the special + // cases of signalling/quiet NaNs. +LOCAL_LABEL(nan): + bl SYMBOL_NAME(__compiler_rt_fnan2) + pop {r4,r5,r6,pc} + + // Return a quiet NaN as the result of infinity * zero. +LOCAL_LABEL(infzero): + ldr r0, =0x7fc00000 + pop {r4,r5,r6,pc} + +END_COMPILERRT_FUNCTION(__mulsf3) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/test/builtins/CMakeLists.txt b/compiler-rt/test/builtins/CMakeLists.txt index 63f4c94605c90..8e3cb35183ba7 100644 --- a/compiler-rt/test/builtins/CMakeLists.txt +++ b/compiler-rt/test/builtins/CMakeLists.txt @@ -35,6 +35,10 @@ if(APPLE) darwin_filter_host_archs(BUILTIN_SUPPORTED_ARCH BUILTIN_TEST_ARCH) endif() +if(COMPILER_RT_ARM_OPTIMIZED_FP) + list(APPEND BUILTINS_TEST_TARGET_CFLAGS -DCOMPILER_RT_ARM_OPTIMIZED_FP) +endif() + foreach(arch ${BUILTIN_TEST_ARCH}) set(BUILTINS_TEST_TARGET_ARCH ${arch}) string(TOLOWER "-${arch}-${OS_NAME}" BUILTINS_TEST_CONFIG_SUFFIX) diff --git a/compiler-rt/test/builtins/Unit/divsf3_test.c b/compiler-rt/test/builtins/Unit/divsf3_test.c index f8cb6169ac283..12c5df5fdaae1 100644 --- a/compiler-rt/test/builtins/Unit/divsf3_test.c +++ b/compiler-rt/test/builtins/Unit/divsf3_test.c @@ -1,115 +1,428 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + // RUN: %clang_builtins %s %librt -o %t && %run %t // REQUIRES: librt_has_divsf3 #include "int_lib.h" +#include #include #include "fp_test.h" +// By default this test uses compareResultF to check the returned floats, which +// accepts any returned NaN if the expected result is the canonical NaN value +// 0x7fc00000. For the Arm optimized FP implementation, which commits to a more +// detailed handling of NaNs, we tighten up the check and include some extra +// test cases specific to that NaN policy. +#if (__arm__ && !(__thumb__ && !__thumb2__)) && COMPILER_RT_ARM_OPTIMIZED_FP +# define EXPECT_EXACT_RESULTS +# define ARM_NAN_HANDLING +#endif + // Returns: a / b COMPILER_RT_ABI float __divsf3(float a, float b); -int test__divsf3(float a, float b, uint32_t expected) -{ - float x = __divsf3(a, b); - int ret = compareResultF(x, expected); +int test__divsf3(uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) { + float a = fromRep32(a_rep), b = fromRep32(b_rep); + float x = __divsf3(a, b); +#ifdef EXPECT_EXACT_RESULTS + int ret = toRep32(x) == expected_rep; +#else + int ret = compareResultF(x, expected_rep); +#endif - if (ret){ - printf("error in test__divsf3(%.20e, %.20e) = %.20e, " - "expected %.20e\n", a, b, x, - fromRep32(expected)); - } - return ret; + if (ret) { + printf("error in test__divsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32 + ", expected %08" PRIx32 "\n", + a_rep, b_rep, toRep32(x), expected_rep); + } + return ret; } -int main() -{ - // Returned NaNs are assumed to be qNaN by default - - // qNaN / any = qNaN - if (test__divsf3(makeQNaN32(), 3.F, UINT32_C(0x7fc00000))) - return 1; - // NaN / any = NaN - if (test__divsf3(makeNaN32(UINT32_C(0x123)), 3.F, UINT32_C(0x7fc00000))) - return 1; - // any / qNaN = qNaN - if (test__divsf3(3.F, makeQNaN32(), UINT32_C(0x7fc00000))) - return 1; - // any / NaN = NaN - if (test__divsf3(3.F, makeNaN32(UINT32_C(0x123)), UINT32_C(0x7fc00000))) - return 1; - - // +Inf / positive = +Inf - if (test__divsf3(makeInf32(), 3.F, UINT32_C(0x7f800000))) - return 1; - // +Inf / negative = -Inf - if (test__divsf3(makeInf32(), -3.F, UINT32_C(0xff800000))) - return 1; - // -Inf / positive = -Inf - if (test__divsf3(makeNegativeInf32(), 3.F, UINT32_C(0xff800000))) - return 1; - // -Inf / negative = +Inf - if (test__divsf3(makeNegativeInf32(), -3.F, UINT32_C(0x7f800000))) - return 1; - - // Inf / Inf = NaN - if (test__divsf3(makeInf32(), makeInf32(), UINT32_C(0x7fc00000))) - return 1; - // 0.0 / 0.0 = NaN - if (test__divsf3(+0x0.0p+0F, +0x0.0p+0F, UINT32_C(0x7fc00000))) - return 1; - // +0.0 / +Inf = +0.0 - if (test__divsf3(+0x0.0p+0F, makeInf32(), UINT32_C(0x0))) - return 1; - // +Inf / +0.0 = +Inf - if (test__divsf3(makeInf32(), +0x0.0p+0F, UINT32_C(0x7f800000))) - return 1; - - // positive / +0.0 = +Inf - if (test__divsf3(+1.F, +0x0.0p+0F, UINT32_C(0x7f800000))) - return 1; - // positive / -0.0 = -Inf - if (test__divsf3(+1.F, -0x0.0p+0F, UINT32_C(0xff800000))) - return 1; - // negative / +0.0 = -Inf - if (test__divsf3(-1.F, +0x0.0p+0F, UINT32_C(0xff800000))) - return 1; - // negative / -0.0 = +Inf - if (test__divsf3(-1.F, -0x0.0p+0F, UINT32_C(0x7f800000))) - return 1; - - // 1/3 - if (test__divsf3(1.F, 3.F, UINT32_C(0x3eaaaaab))) - return 1; - // smallest normal result - if (test__divsf3(0x1.0p-125F, 2.F, UINT32_C(0x00800000))) - return 1; +int main(void) { + int status = 0; - // divisor is exactly 1.0 - if (test__divsf3(0x1.0p+0F, 0x1.0p+0F, UINT32_C(0x3f800000))) - return 1; - // divisor is truncated to exactly 1.0 in UQ1.15 - if (test__divsf3(0x1.0p+0F, 0x1.0001p+0F, UINT32_C(0x3f7fff00))) - return 1; + status |= test__divsf3(0x00000000, 0x00000001, 0x00000000); + status |= test__divsf3(0x00000000, 0x007fffff, 0x00000000); + status |= test__divsf3(0x00000000, 0x00800000, 0x00000000); + status |= test__divsf3(0x00000000, 0x00ffffff, 0x00000000); + status |= test__divsf3(0x00000000, 0x3f800000, 0x00000000); + status |= test__divsf3(0x00000000, 0x40a00000, 0x00000000); + status |= test__divsf3(0x00000000, 0x7effffff, 0x00000000); + status |= test__divsf3(0x00000000, 0x7f000000, 0x00000000); + status |= test__divsf3(0x00000000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x00000000, 0x80000002, 0x80000000); + status |= test__divsf3(0x00000000, 0x807fffff, 0x80000000); + status |= test__divsf3(0x00000000, 0x80800001, 0x80000000); + status |= test__divsf3(0x00000000, 0x81000000, 0x80000000); + status |= test__divsf3(0x00000000, 0xc0400000, 0x80000000); + status |= test__divsf3(0x00000000, 0xc0e00000, 0x80000000); + status |= test__divsf3(0x00000000, 0xfe7fffff, 0x80000000); + status |= test__divsf3(0x00000000, 0xff000000, 0x80000000); + status |= test__divsf3(0x00000000, 0xff800000, 0x80000000); + status |= test__divsf3(0x00000001, 0x00000000, 0x7f800000); + status |= test__divsf3(0x00000001, 0x3e000000, 0x00000008); + status |= test__divsf3(0x00000001, 0x3f000000, 0x00000002); + status |= test__divsf3(0x00000001, 0x40000000, 0x00000000); + status |= test__divsf3(0x00000001, 0x7f7fffff, 0x00000000); + status |= test__divsf3(0x00000001, 0x7f800000, 0x00000000); + status |= test__divsf3(0x00000001, 0xc0000000, 0x80000000); + status |= test__divsf3(0x00000001, 0xff7fffff, 0x80000000); + status |= test__divsf3(0x00000002, 0x80000000, 0xff800000); + status |= test__divsf3(0x00000002, 0xff800000, 0x80000000); + status |= test__divsf3(0x00000009, 0x41100000, 0x00000001); + status |= test__divsf3(0x00000009, 0xc1100000, 0x80000001); + status |= test__divsf3(0x007ffff7, 0x3f7ffffe, 0x007ffff8); + status |= test__divsf3(0x007ffffe, 0x3f7ffffe, 0x007fffff); + status |= test__divsf3(0x007fffff, 0x00000000, 0x7f800000); + status |= test__divsf3(0x007fffff, 0x3b000000, 0x04fffffe); + status |= test__divsf3(0x007fffff, 0x3f000000, 0x00fffffe); + status |= test__divsf3(0x007fffff, 0x3f800000, 0x007fffff); + status |= test__divsf3(0x007fffff, 0x3f800002, 0x007ffffd); + status |= test__divsf3(0x007fffff, 0x7f800000, 0x00000000); + status |= test__divsf3(0x007fffff, 0x80000000, 0xff800000); + status |= test__divsf3(0x007fffff, 0xbf800000, 0x807fffff); + status |= test__divsf3(0x007fffff, 0xff800000, 0x80000000); + status |= test__divsf3(0x00800000, 0x00000000, 0x7f800000); + status |= test__divsf3(0x00800000, 0x3f800001, 0x007fffff); + status |= test__divsf3(0x00800000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x00800001, 0x3f800002, 0x007fffff); + status |= test__divsf3(0x00800001, 0x80000000, 0xff800000); + status |= test__divsf3(0x00800001, 0xff800000, 0x80000000); + status |= test__divsf3(0x00800002, 0x3f800006, 0x007ffffc); + status |= test__divsf3(0x00fffffe, 0x40000000, 0x007fffff); + status |= test__divsf3(0x00ffffff, 0x00000000, 0x7f800000); + status |= test__divsf3(0x00ffffff, 0x40000000, 0x00800000); + status |= test__divsf3(0x00ffffff, 0x7f800000, 0x00000000); + status |= test__divsf3(0x01000000, 0x00800000, 0x40000000); + status |= test__divsf3(0x01000000, 0x80000000, 0xff800000); + status |= test__divsf3(0x01000000, 0xc0000000, 0x80800000); + status |= test__divsf3(0x01000000, 0xff800000, 0x80000000); + status |= test__divsf3(0x01000001, 0x00800001, 0x40000000); + status |= test__divsf3(0x01000001, 0xc0000000, 0x80800001); + status |= test__divsf3(0x01000003, 0x80800003, 0xc0000000); + status |= test__divsf3(0x01000003, 0xc0000000, 0x80800003); + status |= test__divsf3(0x3f7ffff7, 0x3f7ffffb, 0x3f7ffffc); + status |= test__divsf3(0x3f7ffff7, 0x3f7ffffe, 0x3f7ffff9); + status |= test__divsf3(0x3f7ffff8, 0x3f7ffffc, 0x3f7ffffc); + status |= test__divsf3(0x3f7ffff8, 0x3f7ffffd, 0x3f7ffffb); + status |= test__divsf3(0x3f7ffffa, 0x3f7ffff9, 0x3f800001); + status |= test__divsf3(0x3f7ffffb, 0x3f7ffff9, 0x3f800001); + status |= test__divsf3(0x3f7ffffc, 0x3f7ffff9, 0x3f800002); + status |= test__divsf3(0x3f7ffffc, 0x3f7ffffd, 0x3f7fffff); + status |= test__divsf3(0x3f7ffffc, 0x3f7ffffe, 0x3f7ffffe); + status |= test__divsf3(0x3f7ffffc, 0x3f7fffff, 0x3f7ffffd); + status |= test__divsf3(0x3f7ffffc, 0x3f800001, 0x3f7ffffa); + status |= test__divsf3(0x3f7ffffd, 0x3f7ffff9, 0x3f800002); + status |= test__divsf3(0x3f7ffffd, 0x3f7ffffc, 0x3f800001); + status |= test__divsf3(0x3f7ffffd, 0x3f7ffffe, 0x3f7fffff); + status |= test__divsf3(0x3f7ffffd, 0x3f7fffff, 0x3f7ffffe); + status |= test__divsf3(0x3f7ffffd, 0x3f800001, 0x3f7ffffb); + status |= test__divsf3(0x3f7ffffd, 0x3f800002, 0x3f7ffff9); + status |= test__divsf3(0x3f7ffffe, 0x3f7ffff9, 0x3f800003); + status |= test__divsf3(0x3f7ffffe, 0x3f7ffffc, 0x3f800001); + status |= test__divsf3(0x3f7ffffe, 0x3f7ffffd, 0x3f800001); + status |= test__divsf3(0x3f7ffffe, 0x3f7fffff, 0x3f7fffff); + status |= test__divsf3(0x3f7ffffe, 0x3f800001, 0x3f7ffffc); + status |= test__divsf3(0x3f7ffffe, 0x3f800002, 0x3f7ffffa); + status |= test__divsf3(0x3f7ffffe, 0x3f800003, 0x3f7ffff8); + status |= test__divsf3(0x3f7fffff, 0x3f7ffff9, 0x3f800003); + status |= test__divsf3(0x3f7fffff, 0x3f7ffffc, 0x3f800002); + status |= test__divsf3(0x3f7fffff, 0x3f7ffffd, 0x3f800001); + status |= test__divsf3(0x3f7fffff, 0x3f7ffffe, 0x3f800001); + status |= test__divsf3(0x3f7fffff, 0x3f800001, 0x3f7ffffd); + status |= test__divsf3(0x3f7fffff, 0x3f800002, 0x3f7ffffb); + status |= test__divsf3(0x3f7fffff, 0x3f800003, 0x3f7ffff9); + status |= test__divsf3(0x3f7fffff, 0x3f800004, 0x3f7ffff7); + status |= test__divsf3(0x3f800000, 0x00000000, 0x7f800000); + status |= test__divsf3(0x3f800000, 0x3f7ffff7, 0x3f800005); + status |= test__divsf3(0x3f800000, 0x3f7ffff8, 0x3f800004); + status |= test__divsf3(0x3f800000, 0x3f7ffffb, 0x3f800003); + status |= test__divsf3(0x3f800000, 0x3f7ffffc, 0x3f800002); + status |= test__divsf3(0x3f800000, 0x3f7ffffd, 0x3f800002); + status |= test__divsf3(0x3f800000, 0x3f7ffffe, 0x3f800001); + status |= test__divsf3(0x3f800000, 0x3f7fffff, 0x3f800001); + status |= test__divsf3(0x3f800000, 0x3f800000, 0x3f800000); + status |= test__divsf3(0x3f800000, 0x3f800001, 0x3f7ffffe); + status |= test__divsf3(0x3f800000, 0x3f800002, 0x3f7ffffc); + status |= test__divsf3(0x3f800000, 0x3f800003, 0x3f7ffffa); + status |= test__divsf3(0x3f800000, 0x3f800004, 0x3f7ffff8); + status |= test__divsf3(0x3f800000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x3f800001, 0x3f7ffffb, 0x3f800004); + status |= test__divsf3(0x3f800001, 0x3f7ffffd, 0x3f800003); + status |= test__divsf3(0x3f800001, 0x3f7ffffe, 0x3f800002); + status |= test__divsf3(0x3f800001, 0x3f7fffff, 0x3f800002); + status |= test__divsf3(0x3f800001, 0x3f800002, 0x3f7ffffe); + status |= test__divsf3(0x3f800001, 0x3f800003, 0x3f7ffffc); + status |= test__divsf3(0x3f800002, 0x3f7ffffc, 0x3f800004); + status |= test__divsf3(0x3f800002, 0x3f7ffffd, 0x3f800004); + status |= test__divsf3(0x3f800002, 0x3f7ffffe, 0x3f800003); + status |= test__divsf3(0x3f800002, 0x3f7fffff, 0x3f800003); + status |= test__divsf3(0x3f800002, 0x3f800001, 0x3f800001); + status |= test__divsf3(0x3f800002, 0x3f800003, 0x3f7ffffe); + status |= test__divsf3(0x3f800003, 0x3f7ffffd, 0x3f800005); + status |= test__divsf3(0x3f800003, 0x3f7ffffe, 0x3f800004); + status |= test__divsf3(0x3f800003, 0x3f7fffff, 0x3f800004); + status |= test__divsf3(0x3f800003, 0x3f800001, 0x3f800002); + status |= test__divsf3(0x3f800004, 0x3f7ffffe, 0x3f800005); + status |= test__divsf3(0x3f800004, 0x3f800001, 0x3f800003); + status |= test__divsf3(0x3f800004, 0x3f800007, 0x3f7ffffa); + status |= test__divsf3(0x3f800005, 0x3f7fffff, 0x3f800006); + status |= test__divsf3(0x3f800006, 0x3f800008, 0x3f7ffffc); + status |= test__divsf3(0x3f800007, 0x3f800002, 0x3f800005); + status |= test__divsf3(0x3f800009, 0x3f800008, 0x3f800001); + status |= test__divsf3(0x40000000, 0x3f800000, 0x40000000); + status |= test__divsf3(0x40000000, 0xbf800000, 0xc0000000); + status |= test__divsf3(0x40400000, 0x80000000, 0xff800000); + status |= test__divsf3(0x40400000, 0xc0400000, 0xbf800000); + status |= test__divsf3(0x40400000, 0xff800000, 0x80000000); + status |= test__divsf3(0x40a00000, 0x00000000, 0x7f800000); + status |= test__divsf3(0x40a00000, 0x40a00000, 0x3f800000); + status |= test__divsf3(0x40a00000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x40e00000, 0x80000000, 0xff800000); + status |= test__divsf3(0x40e00000, 0xff800000, 0x80000000); + status |= test__divsf3(0x41000000, 0x40000000, 0x40800000); + status |= test__divsf3(0x41100000, 0x40400000, 0x40400000); + status |= test__divsf3(0x7b000000, 0x05000000, 0x7f800000); + status |= test__divsf3(0x7e7fffff, 0x80000000, 0xff800000); + status |= test__divsf3(0x7efffffd, 0xc0000000, 0xfe7ffffd); + status |= test__divsf3(0x7effffff, 0x00000000, 0x7f800000); + status |= test__divsf3(0x7effffff, 0x7f800000, 0x00000000); + status |= test__divsf3(0x7f000000, 0x00000000, 0x7f800000); + status |= test__divsf3(0x7f000000, 0x007fffff, 0x7f800000); + status |= test__divsf3(0x7f000000, 0x3f000000, 0x7f800000); + status |= test__divsf3(0x7f000000, 0x40000000, 0x7e800000); + status |= test__divsf3(0x7f000000, 0x7f800000, 0x00000000); + status |= test__divsf3(0x7f000000, 0x80000000, 0xff800000); + status |= test__divsf3(0x7f000000, 0xbf000000, 0xff800000); + status |= test__divsf3(0x7f000000, 0xc0000000, 0xfe800000); + status |= test__divsf3(0x7f000000, 0xff800000, 0x80000000); + status |= test__divsf3(0x7f000003, 0xfe800003, 0xc0000000); + status |= test__divsf3(0x7f7ffffd, 0x40800000, 0x7e7ffffd); + status |= test__divsf3(0x7f7ffffd, 0xc0800000, 0xfe7ffffd); + status |= test__divsf3(0x7f7fffff, 0x00000001, 0x7f800000); + status |= test__divsf3(0x7f7fffff, 0x3f7fffff, 0x7f800000); + status |= test__divsf3(0x7f7fffff, 0x7e7fffff, 0x40800000); + status |= test__divsf3(0x7f7fffff, 0x7effffff, 0x40000000); + status |= test__divsf3(0x7f7fffff, 0xc0000000, 0xfeffffff); + status |= test__divsf3(0x7f7fffff, 0xfe7fffff, 0xc0800000); + status |= test__divsf3(0x7f7fffff, 0xff800000, 0x80000000); + status |= test__divsf3(0x7f800000, 0x00000000, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x00000001, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x007fffff, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x00800000, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x00ffffff, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x3f800000, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x40a00000, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x7effffff, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x7f000000, 0x7f800000); + status |= test__divsf3(0x7f800000, 0x80000000, 0xff800000); + status |= test__divsf3(0x7f800000, 0x80000002, 0xff800000); + status |= test__divsf3(0x7f800000, 0x807fffff, 0xff800000); + status |= test__divsf3(0x7f800000, 0x80800001, 0xff800000); + status |= test__divsf3(0x7f800000, 0x81000000, 0xff800000); + status |= test__divsf3(0x7f800000, 0xc0400000, 0xff800000); + status |= test__divsf3(0x7f800000, 0xc0e00000, 0xff800000); + status |= test__divsf3(0x7f800000, 0xfe7fffff, 0xff800000); + status |= test__divsf3(0x7f800000, 0xff000000, 0xff800000); + status |= test__divsf3(0x7f800000, 0xff7fffff, 0xff800000); + status |= test__divsf3(0x80000000, 0x00000003, 0x80000000); + status |= test__divsf3(0x80000000, 0x007fffff, 0x80000000); + status |= test__divsf3(0x80000000, 0x00800001, 0x80000000); + status |= test__divsf3(0x80000000, 0x01000000, 0x80000000); + status |= test__divsf3(0x80000000, 0x40000000, 0x80000000); + status |= test__divsf3(0x80000000, 0x40c00000, 0x80000000); + status |= test__divsf3(0x80000000, 0x7e7fffff, 0x80000000); + status |= test__divsf3(0x80000000, 0x7e800000, 0x80000000); + status |= test__divsf3(0x80000000, 0x7f800000, 0x80000000); + status |= test__divsf3(0x80000000, 0x80000004, 0x00000000); + status |= test__divsf3(0x80000000, 0x807fffff, 0x00000000); + status |= test__divsf3(0x80000000, 0x80800000, 0x00000000); + status |= test__divsf3(0x80000000, 0x80ffffff, 0x00000000); + status |= test__divsf3(0x80000000, 0xc0800000, 0x00000000); + status |= test__divsf3(0x80000000, 0xc1000000, 0x00000000); + status |= test__divsf3(0x80000000, 0xfe800000, 0x00000000); + status |= test__divsf3(0x80000000, 0xfeffffff, 0x00000000); + status |= test__divsf3(0x80000000, 0xff800000, 0x00000000); + status |= test__divsf3(0x80000001, 0x3f000000, 0x80000002); + status |= test__divsf3(0x80000001, 0x40000000, 0x80000000); + status |= test__divsf3(0x80000001, 0x7f7fffff, 0x80000000); + status |= test__divsf3(0x80000001, 0xc0000000, 0x00000000); + status |= test__divsf3(0x80000001, 0xff7fffff, 0x00000000); + status |= test__divsf3(0x80000003, 0x00000000, 0xff800000); + status |= test__divsf3(0x80000003, 0x7f800000, 0x80000000); + status |= test__divsf3(0x80000004, 0x80000000, 0x7f800000); + status |= test__divsf3(0x80000004, 0xff800000, 0x00000000); + status |= test__divsf3(0x807ffff8, 0x3f7ffffe, 0x807ffff9); + status |= test__divsf3(0x807fffff, 0x00000000, 0xff800000); + status |= test__divsf3(0x807fffff, 0x7f800000, 0x80000000); + status |= test__divsf3(0x807fffff, 0x80000000, 0x7f800000); + status |= test__divsf3(0x807fffff, 0xff800000, 0x00000000); + status |= test__divsf3(0x80800000, 0x3f800001, 0x807fffff); + status |= test__divsf3(0x80800000, 0x80000000, 0x7f800000); + status |= test__divsf3(0x80800000, 0xff800000, 0x00000000); + status |= test__divsf3(0x80800001, 0x00000000, 0xff800000); + status |= test__divsf3(0x80800001, 0x7f800000, 0x80000000); + status |= test__divsf3(0x80ffffff, 0x80000000, 0x7f800000); + status |= test__divsf3(0x80ffffff, 0xff800000, 0x00000000); + status |= test__divsf3(0x81000000, 0x00000000, 0xff800000); + status |= test__divsf3(0x81000000, 0x7f800000, 0x80000000); + status |= test__divsf3(0x81000001, 0x00800001, 0xc0000000); + status |= test__divsf3(0x81000005, 0x00800005, 0xc0000000); + status |= test__divsf3(0xbf800000, 0x3f800000, 0xbf800000); + status |= test__divsf3(0xbf800000, 0xbf800000, 0x3f800000); + status |= test__divsf3(0xc0000000, 0x00000000, 0xff800000); + status |= test__divsf3(0xc0000000, 0x3f800000, 0xc0000000); + status |= test__divsf3(0xc0000000, 0x7f800000, 0x80000000); + status |= test__divsf3(0xc0000000, 0xbf800000, 0x40000000); + status |= test__divsf3(0xc0800000, 0x80000000, 0x7f800000); + status |= test__divsf3(0xc0800000, 0xff800000, 0x00000000); + status |= test__divsf3(0xc0c00000, 0x00000000, 0xff800000); + status |= test__divsf3(0xc0c00000, 0x7f800000, 0x80000000); + status |= test__divsf3(0xc0c00000, 0xc0400000, 0x40000000); + status |= test__divsf3(0xc0e00000, 0x40e00000, 0xbf800000); + status |= test__divsf3(0xc1000000, 0x40000000, 0xc0800000); + status |= test__divsf3(0xc1000000, 0x80000000, 0x7f800000); + status |= test__divsf3(0xc1000000, 0xff800000, 0x00000000); + status |= test__divsf3(0xc1100000, 0xc0400000, 0x40400000); + status |= test__divsf3(0xfe7fffff, 0x00000000, 0xff800000); + status |= test__divsf3(0xfe7fffff, 0x7f800000, 0x80000000); + status |= test__divsf3(0xfe800000, 0x00000000, 0xff800000); + status |= test__divsf3(0xfe800000, 0x7f800000, 0x80000000); + status |= test__divsf3(0xfe800000, 0x80000000, 0x7f800000); + status |= test__divsf3(0xfe800000, 0xff800000, 0x00000000); + status |= test__divsf3(0xfeffffff, 0x40000000, 0xfe7fffff); + status |= test__divsf3(0xfeffffff, 0x80000000, 0x7f800000); + status |= test__divsf3(0xff000000, 0x3f000000, 0xff800000); + status |= test__divsf3(0xff000000, 0xbf000000, 0x7f800000); + status |= test__divsf3(0xff000001, 0x7e800001, 0xc0000000); + status |= test__divsf3(0xff7ffffd, 0x40800000, 0xfe7ffffd); + status |= test__divsf3(0xff7ffffd, 0xc0800000, 0x7e7ffffd); + status |= test__divsf3(0xff7fffff, 0x7e7fffff, 0xc0800000); + status |= test__divsf3(0xff7fffff, 0xfe7fffff, 0x40800000); + status |= test__divsf3(0xff7fffff, 0xff800000, 0x00000000); + status |= test__divsf3(0xff800000, 0x00000000, 0xff800000); + status |= test__divsf3(0xff800000, 0x00000003, 0xff800000); + status |= test__divsf3(0xff800000, 0x007fffff, 0xff800000); + status |= test__divsf3(0xff800000, 0x00800001, 0xff800000); + status |= test__divsf3(0xff800000, 0x01000000, 0xff800000); + status |= test__divsf3(0xff800000, 0x40000000, 0xff800000); + status |= test__divsf3(0xff800000, 0x40c00000, 0xff800000); + status |= test__divsf3(0xff800000, 0x7e800000, 0xff800000); + status |= test__divsf3(0xff800000, 0x80000000, 0x7f800000); + status |= test__divsf3(0xff800000, 0x80000004, 0x7f800000); + status |= test__divsf3(0xff800000, 0x807fffff, 0x7f800000); + status |= test__divsf3(0xff800000, 0x80800000, 0x7f800000); + status |= test__divsf3(0xff800000, 0x80ffffff, 0x7f800000); + status |= test__divsf3(0xff800000, 0xc0800000, 0x7f800000); + status |= test__divsf3(0xff800000, 0xc1000000, 0x7f800000); + status |= test__divsf3(0xff800000, 0xfe800000, 0x7f800000); + status |= test__divsf3(0xff800000, 0xff7fffff, 0x7f800000); + status |= test__divsf3(0x2cbed883, 0x333f6113, 0x38ff4953); + status |= test__divsf3(0x3f87ffff, 0x7f001000, 0x0043f781); - // smallest normal value divided by 2.0 - if (test__divsf3(0x1.0p-126F, 2.0F, UINT32_C(0x00400000))) - return 1; - // smallest subnormal result - if (test__divsf3(0x1.0p-126F, 0x1p+23F, UINT32_C(0x00000001))) - return 1; + // Test that the result of an operation is a NaN at all when it should be. + // + // In most configurations these tests' results are checked compared using + // compareResultF, so we set all the answers to the canonical NaN 0x7fc00000, + // which causes compareResultF to accept any NaN encoding. We also use the + // same value as the input NaN in tests that have one, so that even in + // EXPECT_EXACT_RESULTS mode these tests should pass, because 0x7fc00000 is + // still the exact expected NaN. + status |= test__divsf3(0x00000000, 0x00000000, 0x7fc00000); + status |= test__divsf3(0x00000000, 0x80000000, 0x7fc00000); + status |= test__divsf3(0x7f800000, 0x7f800000, 0x7fc00000); + status |= test__divsf3(0x7f800000, 0xff800000, 0x7fc00000); + status |= test__divsf3(0x80000000, 0x00000000, 0x7fc00000); + status |= test__divsf3(0x80000000, 0x80000000, 0x7fc00000); + status |= test__divsf3(0xff800000, 0x7f800000, 0x7fc00000); + status |= test__divsf3(0xff800000, 0xff800000, 0x7fc00000); + status |= test__divsf3(0x3f800000, 0x7fc00000, 0x7fc00000); + status |= test__divsf3(0x7fc00000, 0x3f800000, 0x7fc00000); + status |= test__divsf3(0x7fc00000, 0x7fc00000, 0x7fc00000); - // some misc test cases obtained by fuzzing against h/w implementation - if (test__divsf3(-0x1.3e75e6p-108F, -0x1.cf372p+38F, UINT32_C(0x00000006))) - return 1; - if (test__divsf3(0x1.e77c54p+81F, -0x1.e77c52p-47F, UINT32_C(0xff800000))) - return 1; - if (test__divsf3(0x1.fffffep-126F, 2.F, UINT32_C(0x00800000))) - return 1; +#ifdef ARM_NAN_HANDLING + // Tests specific to the NaN handling of Arm hardware, mimicked by + // arm/divsf3.S: + // + // - a quiet NaN is distinguished by the top mantissa bit being 1 + // + // - if a signalling NaN appears in the input, the output quiet NaN is + // obtained by setting its top mantissa bit and leaving everything else + // unchanged + // + // - if both operands are signalling NaNs then the output NaN is derived + // from the first operand + // + // - if both operands are quiet NaNs then the output NaN is the first + // operand + // + // - invalid operations not involving an input NaN return the quiet + // NaN with fewest bits set, 0x7fc00000. - // test 1 / (1 - eps(0.5)) = 1 + eps(1) - if (test__divsf3(1.0F, 0x1.fffffep-1F, UINT32_C(0x3f800001))) - return 1; + status |= test__divsf3(0x00000000, 0x00000000, 0x7fc00000); + status |= test__divsf3(0x00000000, 0x7fad4be3, 0x7fed4be3); + status |= test__divsf3(0x00000000, 0x7fdf48c7, 0x7fdf48c7); + status |= test__divsf3(0x00000000, 0x80000000, 0x7fc00000); + status |= test__divsf3(0x00000001, 0x7f970eba, 0x7fd70eba); + status |= test__divsf3(0x00000001, 0x7fc35716, 0x7fc35716); + status |= test__divsf3(0x007fffff, 0x7fbf52d6, 0x7fff52d6); + status |= test__divsf3(0x007fffff, 0x7fc7a2df, 0x7fc7a2df); + status |= test__divsf3(0x3f800000, 0x7f987a85, 0x7fd87a85); + status |= test__divsf3(0x3f800000, 0x7fc50124, 0x7fc50124); + status |= test__divsf3(0x7f7fffff, 0x7f95fd6f, 0x7fd5fd6f); + status |= test__divsf3(0x7f7fffff, 0x7ffc28dc, 0x7ffc28dc); + status |= test__divsf3(0x7f800000, 0x7f800000, 0x7fc00000); + status |= test__divsf3(0x7f800000, 0x7f8dd790, 0x7fcdd790); + status |= test__divsf3(0x7f800000, 0x7fd2ef2b, 0x7fd2ef2b); + status |= test__divsf3(0x7f800000, 0xff800000, 0x7fc00000); + status |= test__divsf3(0x7f99b09d, 0x00000000, 0x7fd9b09d); + status |= test__divsf3(0x7f93541e, 0x00000001, 0x7fd3541e); + status |= test__divsf3(0x7f9fc002, 0x007fffff, 0x7fdfc002); + status |= test__divsf3(0x7fb5db77, 0x3f800000, 0x7ff5db77); + status |= test__divsf3(0x7f9f5d92, 0x7f7fffff, 0x7fdf5d92); + status |= test__divsf3(0x7fac7a36, 0x7f800000, 0x7fec7a36); + status |= test__divsf3(0x7fb42008, 0x7fb0ee07, 0x7ff42008); + status |= test__divsf3(0x7f8bd740, 0x7fc7aaf1, 0x7fcbd740); + status |= test__divsf3(0x7f9bb57b, 0x80000000, 0x7fdbb57b); + status |= test__divsf3(0x7f951a78, 0x80000001, 0x7fd51a78); + status |= test__divsf3(0x7f9ba63b, 0x807fffff, 0x7fdba63b); + status |= test__divsf3(0x7f89463c, 0xbf800000, 0x7fc9463c); + status |= test__divsf3(0x7fb63563, 0xff7fffff, 0x7ff63563); + status |= test__divsf3(0x7f90886e, 0xff800000, 0x7fd0886e); + status |= test__divsf3(0x7fe8c15e, 0x00000000, 0x7fe8c15e); + status |= test__divsf3(0x7fe915ae, 0x00000001, 0x7fe915ae); + status |= test__divsf3(0x7ffa9b42, 0x007fffff, 0x7ffa9b42); + status |= test__divsf3(0x7fdad0f5, 0x3f800000, 0x7fdad0f5); + status |= test__divsf3(0x7fd10dcb, 0x7f7fffff, 0x7fd10dcb); + status |= test__divsf3(0x7fd08e8a, 0x7f800000, 0x7fd08e8a); + status |= test__divsf3(0x7fc3a9e6, 0x7f91a816, 0x7fd1a816); + status |= test__divsf3(0x7fdb229c, 0x7fc26c68, 0x7fdb229c); + status |= test__divsf3(0x7fc9f6bb, 0x80000000, 0x7fc9f6bb); + status |= test__divsf3(0x7ffa178b, 0x80000001, 0x7ffa178b); + status |= test__divsf3(0x7fef2a0b, 0x807fffff, 0x7fef2a0b); + status |= test__divsf3(0x7ffc885b, 0xbf800000, 0x7ffc885b); + status |= test__divsf3(0x7fd26e8c, 0xff7fffff, 0x7fd26e8c); + status |= test__divsf3(0x7fc55329, 0xff800000, 0x7fc55329); + status |= test__divsf3(0x80000000, 0x00000000, 0x7fc00000); + status |= test__divsf3(0x80000000, 0x7fa833ae, 0x7fe833ae); + status |= test__divsf3(0x80000000, 0x7fc4df63, 0x7fc4df63); + status |= test__divsf3(0x80000000, 0x80000000, 0x7fc00000); + status |= test__divsf3(0x80000001, 0x7f98827d, 0x7fd8827d); + status |= test__divsf3(0x80000001, 0x7fd7acc5, 0x7fd7acc5); + status |= test__divsf3(0x807fffff, 0x7fad19c0, 0x7fed19c0); + status |= test__divsf3(0x807fffff, 0x7ffe1907, 0x7ffe1907); + status |= test__divsf3(0xbf800000, 0x7fa95487, 0x7fe95487); + status |= test__divsf3(0xbf800000, 0x7fd2bbee, 0x7fd2bbee); + status |= test__divsf3(0xff7fffff, 0x7f86ba21, 0x7fc6ba21); + status |= test__divsf3(0xff7fffff, 0x7feb00d7, 0x7feb00d7); + status |= test__divsf3(0xff800000, 0x7f800000, 0x7fc00000); + status |= test__divsf3(0xff800000, 0x7f857fdc, 0x7fc57fdc); + status |= test__divsf3(0xff800000, 0x7fde0397, 0x7fde0397); + status |= test__divsf3(0xff800000, 0xff800000, 0x7fc00000); +#endif // ARM_NAN_HANDLING - return 0; + return status; } diff --git a/compiler-rt/test/builtins/Unit/mulsf3_test.c b/compiler-rt/test/builtins/Unit/mulsf3_test.c new file mode 100644 index 0000000000000..7dc7c8ad39c32 --- /dev/null +++ b/compiler-rt/test/builtins/Unit/mulsf3_test.c @@ -0,0 +1,616 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// RUN: %clang_builtins %s %librt -o %t && %run %t +// REQUIRES: librt_has_mulsf3 + +#include "int_lib.h" +#include +#include + +#include "fp_test.h" + +// By default this test uses compareResultF to check the returned floats, which +// accepts any returned NaN if the expected result is the canonical NaN value +// 0x7fc00000. For the Arm optimized FP implementation, which commits to a more +// detailed handling of NaNs, we tighten up the check and include some extra +// test cases specific to that NaN policy. +#if (__arm__ && !(__thumb__ && !__thumb2__)) && COMPILER_RT_ARM_OPTIMIZED_FP +# define EXPECT_EXACT_RESULTS +# define ARM_NAN_HANDLING +#endif + +// Returns: a * b +COMPILER_RT_ABI float __mulsf3(float a, float b); + +int test__mulsf3(uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) { + float a = fromRep32(a_rep), b = fromRep32(b_rep); + float x = __mulsf3(a, b); +#ifdef EXPECT_EXACT_RESULTS + int ret = toRep32(x) == expected_rep; +#else + int ret = compareResultF(x, expected_rep); +#endif + + if (ret) { + printf("error in test__mulsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32 + ", expected %08" PRIx32 "\n", + a_rep, b_rep, toRep32(x), expected_rep); + } + return ret; +} + +int main(void) { + int status = 0; + + status |= test__mulsf3(0x00000000, 0x00000000, 0x00000000); + status |= test__mulsf3(0x00000000, 0x007fffff, 0x00000000); + status |= test__mulsf3(0x00000000, 0x00ffffff, 0x00000000); + status |= test__mulsf3(0x00000000, 0x3f800000, 0x00000000); + status |= test__mulsf3(0x00000000, 0x7effffff, 0x00000000); + status |= test__mulsf3(0x00000000, 0x80000000, 0x80000000); + status |= test__mulsf3(0x00000000, 0x80000002, 0x80000000); + status |= test__mulsf3(0x00000000, 0x807fffff, 0x80000000); + status |= test__mulsf3(0x00000000, 0x80800001, 0x80000000); + status |= test__mulsf3(0x00000000, 0x81000000, 0x80000000); + status |= test__mulsf3(0x00000000, 0xc0400000, 0x80000000); + status |= test__mulsf3(0x00000000, 0xfe7fffff, 0x80000000); + status |= test__mulsf3(0x00000000, 0xff000000, 0x80000000); + status |= test__mulsf3(0x00000000, 0xff7fffff, 0x80000000); + status |= test__mulsf3(0x00000001, 0x00000000, 0x00000000); + status |= test__mulsf3(0x00000001, 0x00000001, 0x00000000); + status |= test__mulsf3(0x00000001, 0x3f000000, 0x00000000); + status |= test__mulsf3(0x00000001, 0x3f7fffff, 0x00000001); + status |= test__mulsf3(0x00000001, 0x3f800000, 0x00000001); + status |= test__mulsf3(0x00000001, 0x40000000, 0x00000002); + status |= test__mulsf3(0x00000001, 0x7f800000, 0x7f800000); + status |= test__mulsf3(0x00000001, 0xbf7fffff, 0x80000001); + status |= test__mulsf3(0x00000006, 0x3f000000, 0x00000003); + status |= test__mulsf3(0x00000006, 0xbf000000, 0x80000003); + status |= test__mulsf3(0x00000008, 0x3e000000, 0x00000001); + status |= test__mulsf3(0x007ffff7, 0x81000003, 0x80000000); + status |= test__mulsf3(0x007ffff8, 0x3f800001, 0x007ffff9); + status |= test__mulsf3(0x007ffff8, 0x3f800008, 0x00800000); + status |= test__mulsf3(0x007ffff8, 0xbf800001, 0x807ffff9); + status |= test__mulsf3(0x007ffff8, 0xbf800008, 0x80800000); + status |= test__mulsf3(0x007ffffc, 0x40000000, 0x00fffff8); + status |= test__mulsf3(0x007ffffe, 0x3f7ffffc, 0x007ffffc); + status |= test__mulsf3(0x007ffffe, 0x3f800001, 0x007fffff); + status |= test__mulsf3(0x007ffffe, 0xbf800001, 0x807fffff); + status |= test__mulsf3(0x007fffff, 0x007ffffe, 0x00000000); + status |= test__mulsf3(0x007fffff, 0x3f800001, 0x00800000); + status |= test__mulsf3(0x007fffff, 0x40000000, 0x00fffffe); + status |= test__mulsf3(0x00800000, 0x00000000, 0x00000000); + status |= test__mulsf3(0x00800000, 0x00800000, 0x00000000); + status |= test__mulsf3(0x00800000, 0x3f7ffffe, 0x007fffff); + status |= test__mulsf3(0x00800000, 0x7f800000, 0x7f800000); + status |= test__mulsf3(0x00800000, 0x80800000, 0x80000000); + status |= test__mulsf3(0x00800000, 0xc0000000, 0x81000000); + status |= test__mulsf3(0x00800001, 0x3f7ffffa, 0x007ffffe); + status |= test__mulsf3(0x00800001, 0x3f7ffffe, 0x00800000); + status |= test__mulsf3(0x00800001, 0xc0000000, 0x81000001); + status |= test__mulsf3(0x00800002, 0x3f7ffffc, 0x00800000); + status |= test__mulsf3(0x00fffff8, 0x3f000000, 0x007ffffc); + status |= test__mulsf3(0x00fffffe, 0x3f000000, 0x007fffff); + status |= test__mulsf3(0x00fffffe, 0xbf000000, 0x807fffff); + status |= test__mulsf3(0x00ffffff, 0x3f000000, 0x00800000); + status |= test__mulsf3(0x00ffffff, 0xbf000000, 0x80800000); + status |= test__mulsf3(0x3f000000, 0x80000001, 0x80000000); + status |= test__mulsf3(0x3f800000, 0x007ffffd, 0x007ffffd); + status |= test__mulsf3(0x3f800000, 0x01000003, 0x01000003); + status |= test__mulsf3(0x3f800000, 0x3f800000, 0x3f800000); + status |= test__mulsf3(0x3f800000, 0x40000000, 0x40000000); + status |= test__mulsf3(0x3f800000, 0x80000001, 0x80000001); + status |= test__mulsf3(0x3f800000, 0x80000009, 0x80000009); + status |= test__mulsf3(0x3f800001, 0x3f800001, 0x3f800002); + status |= test__mulsf3(0x3f800001, 0xbf800001, 0xbf800002); + status |= test__mulsf3(0x3f800001, 0xbf800002, 0xbf800003); + status |= test__mulsf3(0x3f800002, 0x3f800001, 0x3f800003); + status |= test__mulsf3(0x3f800002, 0x7f7ffffe, 0x7f800000); + status |= test__mulsf3(0x3f800001, 0x7f7ffffe, 0x7f800000); + status |= test__mulsf3(0x40000000, 0x00800000, 0x01000000); + status |= test__mulsf3(0x40000000, 0x00800001, 0x01000001); + status |= test__mulsf3(0x40000000, 0x3f800000, 0x40000000); + status |= test__mulsf3(0x40000000, 0x40400000, 0x40c00000); + status |= test__mulsf3(0x40000000, 0x7e800000, 0x7f000000); + status |= test__mulsf3(0x40000000, 0x7effffff, 0x7f7fffff); + status |= test__mulsf3(0x40000000, 0x807ffffd, 0x80fffffa); + status |= test__mulsf3(0x40000000, 0x80800003, 0x81000003); + status |= test__mulsf3(0x40000000, 0x80800005, 0x81000005); + status |= test__mulsf3(0x40000000, 0xbf800000, 0xc0000000); + status |= test__mulsf3(0x40000000, 0xfe7ffffd, 0xfefffffd); + status |= test__mulsf3(0x40000000, 0xfe800003, 0xff000003); + status |= test__mulsf3(0x403fffff, 0x3f7ffffd, 0x403ffffd); + status |= test__mulsf3(0x403fffff, 0x3f7ffffe, 0x403ffffe); + status |= test__mulsf3(0x403fffff, 0x3f7fffff, 0x403ffffe); + status |= test__mulsf3(0x403fffff, 0xbf7ffffd, 0xc03ffffd); + status |= test__mulsf3(0x40400000, 0x00000002, 0x00000006); + status |= test__mulsf3(0x40400000, 0x40000000, 0x40c00000); + status |= test__mulsf3(0x40400000, 0x40400000, 0x41100000); + status |= test__mulsf3(0x40400000, 0xc0000000, 0xc0c00000); + status |= test__mulsf3(0x40400001, 0x3f800001, 0x40400003); + status |= test__mulsf3(0x40400001, 0x3f800003, 0x40400006); + status |= test__mulsf3(0x40400001, 0xbf800003, 0xc0400006); + status |= test__mulsf3(0x40800000, 0x00000002, 0x00000008); + status |= test__mulsf3(0x40800000, 0x7e7fffff, 0x7f7fffff); + status |= test__mulsf3(0x40800000, 0xfe7fffff, 0xff7fffff); + status |= test__mulsf3(0x409fffff, 0x3f7fffff, 0x409ffffe); + status |= test__mulsf3(0x40a00000, 0x00000000, 0x00000000); + status |= test__mulsf3(0x40a00000, 0x7f800000, 0x7f800000); + status |= test__mulsf3(0x40a00001, 0x3f800001, 0x40a00002); + status |= test__mulsf3(0x40dfffff, 0x3f7ffffc, 0x40dffffc); + status |= test__mulsf3(0x40dfffff, 0x3f7fffff, 0x40dffffe); + status |= test__mulsf3(0x40e00000, 0x80000000, 0x80000000); + status |= test__mulsf3(0x40e00000, 0xff800000, 0xff800000); + status |= test__mulsf3(0x40e00001, 0x3f800001, 0x40e00003); + status |= test__mulsf3(0x7e7ffffd, 0x40800000, 0x7f7ffffd); + status |= test__mulsf3(0x7e7ffffd, 0xc0800000, 0xff7ffffd); + status |= test__mulsf3(0x7e800000, 0xc0000000, 0xff000000); + status |= test__mulsf3(0x7efffffd, 0xc0000008, 0xff800000); + status |= test__mulsf3(0x7effffff, 0xc0000000, 0xff7fffff); + status |= test__mulsf3(0x7f000000, 0x00000000, 0x00000000); + status |= test__mulsf3(0x7f000000, 0x40000000, 0x7f800000); + status |= test__mulsf3(0x7f000000, 0x7f000000, 0x7f800000); + status |= test__mulsf3(0x7f000000, 0x7f7ffffe, 0x7f800000); + status |= test__mulsf3(0x7f000000, 0x7f800000, 0x7f800000); + status |= test__mulsf3(0x7f000000, 0xfe800000, 0xff800000); + status |= test__mulsf3(0x7f000000, 0xfe800004, 0xff800000); + status |= test__mulsf3(0x7f000000, 0xff000000, 0xff800000); + status |= test__mulsf3(0x7f000009, 0x7f7ffffa, 0x7f800000); + status |= test__mulsf3(0x7f000009, 0xc0c00002, 0xff800000); + status |= test__mulsf3(0x7f7fffff, 0x00000000, 0x00000000); + status |= test__mulsf3(0x7f800000, 0x007fffff, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x00ffffff, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x3f800000, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x7effffff, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x7f800000, 0x7f800000); + status |= test__mulsf3(0x7f800000, 0x80000002, 0xff800000); + status |= test__mulsf3(0x7f800000, 0x807fffff, 0xff800000); + status |= test__mulsf3(0x7f800000, 0x80800001, 0xff800000); + status |= test__mulsf3(0x7f800000, 0x81000000, 0xff800000); + status |= test__mulsf3(0x7f800000, 0xc0400000, 0xff800000); + status |= test__mulsf3(0x7f800000, 0xff000000, 0xff800000); + status |= test__mulsf3(0x7f800000, 0xff7fffff, 0xff800000); + status |= test__mulsf3(0x7f800000, 0xff800000, 0xff800000); + status |= test__mulsf3(0x80000000, 0x00000000, 0x80000000); + status |= test__mulsf3(0x80000000, 0x40c00000, 0x80000000); + status |= test__mulsf3(0x80000000, 0x7f7fffff, 0x80000000); + status |= test__mulsf3(0x80000000, 0x80000000, 0x00000000); + status |= test__mulsf3(0x80000000, 0x80000004, 0x00000000); + status |= test__mulsf3(0x80000000, 0x80800000, 0x00000000); + status |= test__mulsf3(0x80000000, 0xc1000000, 0x00000000); + status |= test__mulsf3(0x80000000, 0xfe800000, 0x00000000); + status |= test__mulsf3(0x80000001, 0x00000001, 0x80000000); + status |= test__mulsf3(0x80000001, 0x40a00000, 0x80000005); + status |= test__mulsf3(0x80000002, 0x3f800000, 0x80000002); + status |= test__mulsf3(0x80000003, 0x00000000, 0x80000000); + status |= test__mulsf3(0x80000003, 0x7f800000, 0xff800000); + status |= test__mulsf3(0x80000004, 0xbf800000, 0x00000004); + status |= test__mulsf3(0x80000008, 0x3e000000, 0x80000001); + status |= test__mulsf3(0x807ffff7, 0x01000003, 0x80000000); + status |= test__mulsf3(0x807ffff7, 0x3f800001, 0x807ffff8); + status |= test__mulsf3(0x807ffffd, 0xc0000000, 0x00fffffa); + status |= test__mulsf3(0x807fffff, 0x00000000, 0x80000000); + status |= test__mulsf3(0x807fffff, 0x3f800001, 0x80800000); + status |= test__mulsf3(0x807fffff, 0x7f800000, 0xff800000); + status |= test__mulsf3(0x807fffff, 0x80000000, 0x00000000); + status |= test__mulsf3(0x807fffff, 0x807ffffe, 0x00000000); + status |= test__mulsf3(0x807fffff, 0xbf800000, 0x007fffff); + status |= test__mulsf3(0x807fffff, 0xff800000, 0x7f800000); + status |= test__mulsf3(0x80800000, 0x00800000, 0x80000000); + status |= test__mulsf3(0x80800000, 0x80800000, 0x00000000); + status |= test__mulsf3(0x80800001, 0x00000000, 0x80000000); + status |= test__mulsf3(0x80800001, 0x7f800000, 0xff800000); + status |= test__mulsf3(0x80800001, 0xbf800000, 0x00800001); + status |= test__mulsf3(0x80fffffc, 0x3f000000, 0x807ffffe); + status |= test__mulsf3(0x80fffffc, 0xbf000000, 0x007ffffe); + status |= test__mulsf3(0x80fffffe, 0x3f800000, 0x80fffffe); + status |= test__mulsf3(0x80ffffff, 0x80000000, 0x00000000); + status |= test__mulsf3(0x80ffffff, 0xff800000, 0x7f800000); + status |= test__mulsf3(0x81000000, 0x00000000, 0x80000000); + status |= test__mulsf3(0x81000000, 0x7f800000, 0xff800000); + status |= test__mulsf3(0xbf7fffff, 0xff7fffff, 0x7f7ffffe); + status |= test__mulsf3(0xbf800000, 0x00000009, 0x80000009); + status |= test__mulsf3(0xbf800000, 0x00800009, 0x80800009); + status |= test__mulsf3(0xbf800000, 0x3f800000, 0xbf800000); + status |= test__mulsf3(0xbf800000, 0x40000000, 0xc0000000); + status |= test__mulsf3(0xbf800000, 0xbf800000, 0x3f800000); + status |= test__mulsf3(0xbf800000, 0xc0000000, 0x40000000); + status |= test__mulsf3(0xbf800001, 0x3f800001, 0xbf800002); + status |= test__mulsf3(0xbf800001, 0xbf800001, 0x3f800002); + status |= test__mulsf3(0xbf800001, 0xbf800002, 0x3f800003); + status |= test__mulsf3(0xbf800002, 0x3f800001, 0xbf800003); + status |= test__mulsf3(0xbf800002, 0xbf800001, 0x3f800003); + status |= test__mulsf3(0xc0000000, 0x00000000, 0x80000000); + status |= test__mulsf3(0xc0000000, 0x007ffffd, 0x80fffffa); + status |= test__mulsf3(0xc0000000, 0x00800001, 0x81000001); + status |= test__mulsf3(0xc0000000, 0x00800005, 0x81000005); + status |= test__mulsf3(0xc0000000, 0x00800009, 0x81000009); + status |= test__mulsf3(0xc0000000, 0x40400000, 0xc0c00000); + status |= test__mulsf3(0xc0000000, 0x7e7fffff, 0xfeffffff); + status |= test__mulsf3(0xc0000000, 0x7e800001, 0xff000001); + status |= test__mulsf3(0xc0000000, 0x7f800000, 0xff800000); + status |= test__mulsf3(0xc0000000, 0xbf800000, 0x40000000); + status |= test__mulsf3(0xc0000000, 0xc0400000, 0x40c00000); + status |= test__mulsf3(0xc03ffffe, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xc03fffff, 0x3f7fffff, 0xc03ffffe); + status |= test__mulsf3(0xc0400000, 0x40400000, 0xc1100000); + status |= test__mulsf3(0xc0400000, 0xc0000000, 0x40c00000); + status |= test__mulsf3(0xc0400000, 0xc0400000, 0x41100000); + status |= test__mulsf3(0xc0400000, 0xff000000, 0x7f800000); + status |= test__mulsf3(0xc0400001, 0x3f800001, 0xc0400003); + status |= test__mulsf3(0xc0800000, 0x7e7fffff, 0xff7fffff); + status |= test__mulsf3(0xc0800000, 0x80000000, 0x00000000); + status |= test__mulsf3(0xc0800000, 0xfe7fffff, 0x7f7fffff); + status |= test__mulsf3(0xc0800000, 0xff800000, 0x7f800000); + status |= test__mulsf3(0xc09ffffe, 0xff000000, 0x7f800000); + status |= test__mulsf3(0xc09fffff, 0xbf7fffff, 0x409ffffe); + status |= test__mulsf3(0xc0a00001, 0xbf800001, 0x40a00002); + status |= test__mulsf3(0xc0dffff9, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xc1100000, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xc1100001, 0xff000000, 0x7f800000); + status |= test__mulsf3(0xfe7ffff9, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xfe7ffff9, 0xc07fffff, 0x7f7ffff8); + status |= test__mulsf3(0xfe7ffffd, 0x40800000, 0xff7ffffd); + status |= test__mulsf3(0xfe7ffffd, 0xc0800000, 0x7f7ffffd); + status |= test__mulsf3(0xfe7fffff, 0x00000000, 0x80000000); + status |= test__mulsf3(0xfe7fffff, 0x40000001, 0xff000000); + status |= test__mulsf3(0xfe7fffff, 0x7f800000, 0xff800000); + status |= test__mulsf3(0xfe800000, 0x00000000, 0x80000000); + status |= test__mulsf3(0xfe800000, 0x7f800000, 0xff800000); + status |= test__mulsf3(0xfefffff7, 0x7e800001, 0xff800000); + status |= test__mulsf3(0xfeffffff, 0x3f800001, 0xff000000); + status |= test__mulsf3(0xfeffffff, 0x80000000, 0x00000000); + status |= test__mulsf3(0xff000005, 0xff000001, 0x7f800000); + status |= test__mulsf3(0xff7ffffd, 0x7f000000, 0xff800000); + status |= test__mulsf3(0xff7ffffd, 0xc0400001, 0x7f800000); + status |= test__mulsf3(0xff7ffffd, 0xff000001, 0x7f800000); + status |= test__mulsf3(0xff7fffff, 0x80000000, 0x00000000); + status |= test__mulsf3(0xff7fffff, 0xff7fffff, 0x7f800000); + status |= test__mulsf3(0xff7fffff, 0xff800000, 0x7f800000); + status |= test__mulsf3(0xff800000, 0x40c00000, 0xff800000); + status |= test__mulsf3(0xff800000, 0x7f800000, 0xff800000); + status |= test__mulsf3(0xff800000, 0x80000004, 0x7f800000); + status |= test__mulsf3(0xff800000, 0x80800000, 0x7f800000); + status |= test__mulsf3(0xff800000, 0xc1000000, 0x7f800000); + status |= test__mulsf3(0xff800000, 0xfe800000, 0x7f800000); + status |= test__mulsf3(0xff800000, 0xff800000, 0x7f800000); + status |= test__mulsf3(0x3089705f, 0x0ef36390, 0x0041558f); + status |= test__mulsf3(0x3089705f, 0x0e936390, 0x0027907d); + status |= test__mulsf3(0x3109705f, 0x0ef36390, 0x0082ab1e); + status |= test__mulsf3(0x3109705f, 0x0e936390, 0x004f20fa); + status |= test__mulsf3(0x3189705f, 0x0ef36390, 0x0102ab1e); + status |= test__mulsf3(0x3189705f, 0x0e936390, 0x009e41f5); + status |= test__mulsf3(0xb089705f, 0x0ef36390, 0x8041558f); + status |= test__mulsf3(0xb089705f, 0x0e936390, 0x8027907d); + status |= test__mulsf3(0xb109705f, 0x0ef36390, 0x8082ab1e); + status |= test__mulsf3(0xb109705f, 0x0e936390, 0x804f20fa); + status |= test__mulsf3(0xb189705f, 0x0ef36390, 0x8102ab1e); + status |= test__mulsf3(0xb189705f, 0x0e936390, 0x809e41f5); + status |= test__mulsf3(0x3089705f, 0x8ef36390, 0x8041558f); + status |= test__mulsf3(0x3089705f, 0x8e936390, 0x8027907d); + status |= test__mulsf3(0x3109705f, 0x8ef36390, 0x8082ab1e); + status |= test__mulsf3(0x3109705f, 0x8e936390, 0x804f20fa); + status |= test__mulsf3(0x3189705f, 0x8ef36390, 0x8102ab1e); + status |= test__mulsf3(0x3189705f, 0x8e936390, 0x809e41f5); + status |= test__mulsf3(0xb089705f, 0x8ef36390, 0x0041558f); + status |= test__mulsf3(0xb089705f, 0x8e936390, 0x0027907d); + status |= test__mulsf3(0xb109705f, 0x8ef36390, 0x0082ab1e); + status |= test__mulsf3(0xb109705f, 0x8e936390, 0x004f20fa); + status |= test__mulsf3(0xb189705f, 0x8ef36390, 0x0102ab1e); + status |= test__mulsf3(0xb189705f, 0x8e936390, 0x009e41f5); + status |= test__mulsf3(0x1f800001, 0x1fc00000, 0x00300000); + status |= test__mulsf3(0x1f800003, 0x1fc00000, 0x00300001); + status |= test__mulsf3(0x1f800001, 0x1fc00800, 0x00300200); + status |= test__mulsf3(0x1f800003, 0x1fc00800, 0x00300201); + status |= test__mulsf3(0x36e4588a, 0x29b47cbd, 0x2120fd85); + status |= test__mulsf3(0x3fea3b26, 0x3f400000, 0x3fafac5c); + status |= test__mulsf3(0x6fea3b26, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20ea3b26, 0x1ec00000, 0x0057d62e); + status |= test__mulsf3(0x3f8f11bb, 0x3fc00000, 0x3fd69a98); + status |= test__mulsf3(0x6f8f11bb, 0x4fc00000, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f400000, 0x006b4d4c); + status |= test__mulsf3(0x3f8f11bb, 0x3f800000, 0x3f8f11bb); + status |= test__mulsf3(0x6f8f11bb, 0x4f800000, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f000000, 0x004788de); + status |= test__mulsf3(0x3f8f11bb, 0x3fd7f48d, 0x3ff1611f); + status |= test__mulsf3(0x6f8f11bb, 0x4fd7f48d, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f57f48d, 0x0078b090); + status |= test__mulsf3(0x3f8f11bb, 0x3fa80b73, 0x3fbbd412); + status |= test__mulsf3(0x6f8f11bb, 0x4fa80b73, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f280b73, 0x005dea09); + status |= test__mulsf3(0x3f8f11bb, 0x3f97f48d, 0x3fa9d842); + status |= test__mulsf3(0x6f8f11bb, 0x4f97f48d, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1f17f48d, 0x0054ec21); + status |= test__mulsf3(0x3f8f11bb, 0x3f680b73, 0x3f81ae78); + status |= test__mulsf3(0x6f8f11bb, 0x4f680b73, 0x7f800000); + status |= test__mulsf3(0x208f11bb, 0x1ee80b73, 0x0040d73c); + status |= test__mulsf3(0x3fff5dd8, 0x3f600000, 0x3fdf721d); + status |= test__mulsf3(0x6fff5dd8, 0x4f600000, 0x7f800000); + status |= test__mulsf3(0x20ff5dd8, 0x1ee00000, 0x006fb90e); + status |= test__mulsf3(0x3fff5dd8, 0x3f100000, 0x3f8fa4ca); + status |= test__mulsf3(0x6fff5dd8, 0x4f100000, 0x7f800000); + status |= test__mulsf3(0x20ff5dd8, 0x1e900000, 0x0047d265); + status |= test__mulsf3(0x3fffe96b, 0x3f7efb43, 0x3ffee4c5); + status |= test__mulsf3(0x6fffe96b, 0x4f7efb43, 0x7f800000); + status |= test__mulsf3(0x20ffe96b, 0x1efefb43, 0x007f7263); + status |= test__mulsf3(0x3fffe96b, 0x3f0104bd, 0x3f80f95b); + status |= test__mulsf3(0x6fffe96b, 0x4f0104bd, 0x7f800000); + status |= test__mulsf3(0x20ffe96b, 0x1e8104bd, 0x00407cae); + status |= test__mulsf3(0x3f8fbbb7, 0x3fa6edf9, 0x3fbb72aa); + status |= test__mulsf3(0x6f8fbbb7, 0x4fa6edf9, 0x7f800000); + status |= test__mulsf3(0x208fbbb7, 0x1f26edf9, 0x005db955); + status |= test__mulsf3(0x3f8fbbb7, 0x3fd91207, 0x3ff3c07b); + status |= test__mulsf3(0x6f8fbbb7, 0x4fd91207, 0x7f800000); + status |= test__mulsf3(0x208fbbb7, 0x1f591207, 0x0079e03d); + status |= test__mulsf3(0x3f8fbbb7, 0x3f991207, 0x3fabe29f); + status |= test__mulsf3(0x6f8fbbb7, 0x4f991207, 0x7f800000); + status |= test__mulsf3(0x208fbbb7, 0x1f191207, 0x0055f150); + status |= test__mulsf3(0x3f8fbbb7, 0x3f66edf9, 0x3f81a843); + status |= test__mulsf3(0x6f8fbbb7, 0x4f66edf9, 0x7f800000); + status |= test__mulsf3(0x208fbbb7, 0x1ee6edf9, 0x0040d421); + status |= test__mulsf3(0x3fdb62f3, 0x3f7879c5, 0x3fd4f036); + status |= test__mulsf3(0x6fdb62f3, 0x4f7879c5, 0x7f800000); + status |= test__mulsf3(0x20db62f3, 0x1ef879c5, 0x006a781b); + status |= test__mulsf3(0x3faaea45, 0x3f8b6773, 0x3fba2489); + status |= test__mulsf3(0x6faaea45, 0x4f8b6773, 0x7f800000); + status |= test__mulsf3(0x20aaea45, 0x1f0b6773, 0x005d1244); + status |= test__mulsf3(0x3fafa7ec, 0x3f900000, 0x3fc59cea); + status |= test__mulsf3(0x6fafa7ec, 0x4f900000, 0x7f800000); + status |= test__mulsf3(0x20afa7ec, 0x1f100000, 0x0062ce75); + status |= test__mulsf3(0x3fcf8c8d, 0x3f271645, 0x3f8776be); + status |= test__mulsf3(0x6fcf8c8d, 0x4f271645, 0x7f800000); + status |= test__mulsf3(0x20cf8c8d, 0x1ea71645, 0x0043bb5f); + status |= test__mulsf3(0x3fc173ef, 0x3f901b0f, 0x3fd9cb52); + status |= test__mulsf3(0x6fc173ef, 0x4f901b0f, 0x7f800000); + status |= test__mulsf3(0x20c173ef, 0x1f101b0f, 0x006ce5a9); + status |= test__mulsf3(0x3fb48d33, 0x3f4a35fb, 0x3f8e9d7d); + status |= test__mulsf3(0x6fb48d33, 0x4f4a35fb, 0x7f800000); + status |= test__mulsf3(0x20b48d33, 0x1eca35fb, 0x00474ebe); + status |= test__mulsf3(0x3fc6f87b, 0x3f65d94d, 0x3fb2a52a); + status |= test__mulsf3(0x6fc6f87b, 0x4f65d94d, 0x7f800000); + status |= test__mulsf3(0x20c6f87b, 0x1ee5d94d, 0x00595295); + status |= test__mulsf3(0x3f860ae7, 0x3f969729, 0x3f9db312); + status |= test__mulsf3(0x6f860ae7, 0x4f969729, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f169729, 0x004ed989); + status |= test__mulsf3(0x3f860ae7, 0x3fc00000, 0x3fc9105a); + status |= test__mulsf3(0x6f860ae7, 0x4fc00000, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f400000, 0x0064882d); + status |= test__mulsf3(0x3f860ae7, 0x3fe968d7, 0x3ff46da3); + status |= test__mulsf3(0x6f860ae7, 0x4fe968d7, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f6968d7, 0x007a36d1); + status |= test__mulsf3(0x3f860ae7, 0x3f800000, 0x3f860ae7); + status |= test__mulsf3(0x6f860ae7, 0x4f800000, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f000000, 0x00430574); + status |= test__mulsf3(0x3f860ae7, 0x3fa968d7, 0x3fb1682f); + status |= test__mulsf3(0x6f860ae7, 0x4fa968d7, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f2968d7, 0x0058b418); + status |= test__mulsf3(0x3f860ae7, 0x3fd69729, 0x3fe0b886); + status |= test__mulsf3(0x6f860ae7, 0x4fd69729, 0x7f800000); + status |= test__mulsf3(0x20860ae7, 0x1f569729, 0x00705c43); + status |= test__mulsf3(0x3f9aecdd, 0x3fb14b75, 0x3fd696de); + status |= test__mulsf3(0x6f9aecdd, 0x4fb14b75, 0x7f800000); + status |= test__mulsf3(0x209aecdd, 0x1f314b75, 0x006b4b6f); + status |= test__mulsf3(0x3f9aecdd, 0x3fceb48b, 0x3ffa2fb9); + status |= test__mulsf3(0x6f9aecdd, 0x4fceb48b, 0x7f800000); + status |= test__mulsf3(0x209aecdd, 0x1f4eb48b, 0x007d17dc); + status |= test__mulsf3(0x3f9aecdd, 0x3fc00000, 0x3fe8634c); + status |= test__mulsf3(0x6f9aecdd, 0x4fc00000, 0x7f800000); + status |= test__mulsf3(0x209aecdd, 0x1f400000, 0x007431a6); + status |= test__mulsf3(0x3fd65dc6, 0x3f400000, 0x3fa0c654); + status |= test__mulsf3(0x6fd65dc6, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20d65dc6, 0x1ec00000, 0x0050632a); + status |= test__mulsf3(0x3feecf03, 0x3f5f93ab, 0x3fd09014); + status |= test__mulsf3(0x6feecf03, 0x4f5f93ab, 0x7f800000); + status |= test__mulsf3(0x20eecf03, 0x1edf93ab, 0x0068480a); + status |= test__mulsf3(0x3feecf03, 0x3f206c55, 0x3f95a670); + status |= test__mulsf3(0x6feecf03, 0x4f206c55, 0x7f800000); + status |= test__mulsf3(0x20eecf03, 0x1ea06c55, 0x004ad338); + status |= test__mulsf3(0x3f98feed, 0x3f60f11b, 0x3f866f27); + status |= test__mulsf3(0x6f98feed, 0x4f60f11b, 0x7f800000); + status |= test__mulsf3(0x2098feed, 0x1ee0f11b, 0x00433794); + status |= test__mulsf3(0x3f9a1b9d, 0x3f9c42b5, 0x3fbc21f8); + status |= test__mulsf3(0x6f9a1b9d, 0x4f9c42b5, 0x7f800000); + status |= test__mulsf3(0x209a1b9d, 0x1f1c42b5, 0x005e10fc); + status |= test__mulsf3(0x3f9a1b9d, 0x3f5c42b5, 0x3f8497e3); + status |= test__mulsf3(0x6f9a1b9d, 0x4f5c42b5, 0x7f800000); + status |= test__mulsf3(0x209a1b9d, 0x1edc42b5, 0x00424bf2); + status |= test__mulsf3(0x3f947044, 0x3f600000, 0x3f81e23c); + status |= test__mulsf3(0x6f947044, 0x4f600000, 0x7f800000); + status |= test__mulsf3(0x20947044, 0x1ee00000, 0x0040f11e); + status |= test__mulsf3(0x3fa3fb77, 0x3f6eb1b9, 0x3f98e5a0); + status |= test__mulsf3(0x6fa3fb77, 0x4f6eb1b9, 0x7f800000); + status |= test__mulsf3(0x20a3fb77, 0x1eeeb1b9, 0x004c72d0); + status |= test__mulsf3(0x3fb291df, 0x3f466a1f, 0x3f8a66d9); + status |= test__mulsf3(0x6fb291df, 0x4f466a1f, 0x7f800000); + status |= test__mulsf3(0x20b291df, 0x1ec66a1f, 0x0045336c); + status |= test__mulsf3(0x3fde13d5, 0x3f6b7283, 0x3fcc3f8b); + status |= test__mulsf3(0x6fde13d5, 0x4f6b7283, 0x7f800000); + status |= test__mulsf3(0x20de13d5, 0x1eeb7283, 0x00661fc5); + status |= test__mulsf3(0x3fd5b211, 0x3f80810f, 0x3fd68987); + status |= test__mulsf3(0x6fd5b211, 0x4f80810f, 0x7f800000); + status |= test__mulsf3(0x20d5b211, 0x1f00810f, 0x006b44c4); + status |= test__mulsf3(0x3fd5b211, 0x3f3f7ef1, 0x3f9fd9d2); + status |= test__mulsf3(0x6fd5b211, 0x4f3f7ef1, 0x7f800000); + status |= test__mulsf3(0x20d5b211, 0x1ebf7ef1, 0x004fece9); + status |= test__mulsf3(0x3fadfbc4, 0x3f400000, 0x3f827cd3); + status |= test__mulsf3(0x6fadfbc4, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20adfbc4, 0x1ec00000, 0x00413e6a); + status |= test__mulsf3(0x3fd0ef03, 0x3f800000, 0x3fd0ef03); + status |= test__mulsf3(0x6fd0ef03, 0x4f800000, 0x7f800000); + status |= test__mulsf3(0x20d0ef03, 0x1f000000, 0x00687782); + status |= test__mulsf3(0x3fd0ef03, 0x3f8673ab, 0x3fdb7705); + status |= test__mulsf3(0x6fd0ef03, 0x4f8673ab, 0x7f800000); + status |= test__mulsf3(0x20d0ef03, 0x1f0673ab, 0x006dbb83); + status |= test__mulsf3(0x3fd0ef03, 0x3f798c55, 0x3fcbab02); + status |= test__mulsf3(0x6fd0ef03, 0x4f798c55, 0x7f800000); + status |= test__mulsf3(0x20d0ef03, 0x1ef98c55, 0x0065d581); + status |= test__mulsf3(0x3fdd1181, 0x3f8ad17f, 0x3fefc0b1); + status |= test__mulsf3(0x6fdd1181, 0x4f8ad17f, 0x7f800000); + status |= test__mulsf3(0x20dd1181, 0x1f0ad17f, 0x0077e058); + status |= test__mulsf3(0x3fdd1181, 0x3f752e81, 0x3fd3b9e9); + status |= test__mulsf3(0x6fdd1181, 0x4f752e81, 0x7f800000); + status |= test__mulsf3(0x20dd1181, 0x1ef52e81, 0x0069dcf5); + status |= test__mulsf3(0x3f92efc6, 0x3fa00000, 0x3fb7abb8); + status |= test__mulsf3(0x6f92efc6, 0x4fa00000, 0x7f800000); + status |= test__mulsf3(0x2092efc6, 0x1f200000, 0x005bd5dc); + status |= test__mulsf3(0x3fdcefe6, 0x3f400000, 0x3fa5b3ec); + status |= test__mulsf3(0x6fdcefe6, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20dcefe6, 0x1ec00000, 0x0052d9f6); + status |= test__mulsf3(0x3fad6507, 0x3fa2f8b7, 0x3fdcc4c9); + status |= test__mulsf3(0x6fad6507, 0x4fa2f8b7, 0x7f800000); + status |= test__mulsf3(0x20ad6507, 0x1f22f8b7, 0x006e6264); + status |= test__mulsf3(0x3fad6507, 0x3f62f8b7, 0x3f99bba6); + status |= test__mulsf3(0x6fad6507, 0x4f62f8b7, 0x7f800000); + status |= test__mulsf3(0x20ad6507, 0x1ee2f8b7, 0x004cddd3); + status |= test__mulsf3(0x3fbfde6b, 0x3f8721bd, 0x3fca8f27); + status |= test__mulsf3(0x6fbfde6b, 0x4f8721bd, 0x7f800000); + status |= test__mulsf3(0x20bfde6b, 0x1f0721bd, 0x00654794); + status |= test__mulsf3(0x3fbfde6b, 0x3f4721bd, 0x3f953f2e); + status |= test__mulsf3(0x6fbfde6b, 0x4f4721bd, 0x7f800000); + status |= test__mulsf3(0x20bfde6b, 0x1ec721bd, 0x004a9f97); + status |= test__mulsf3(0x3ff40db4, 0x3f400000, 0x3fb70a47); + status |= test__mulsf3(0x6ff40db4, 0x4f400000, 0x7f800000); + status |= test__mulsf3(0x20f40db4, 0x1ec00000, 0x005b8524); + status |= test__mulsf3(0x3ff40db4, 0x3f600000, 0x3fd58bfe); + status |= test__mulsf3(0x6ff40db4, 0x4f600000, 0x7f800000); + status |= test__mulsf3(0x20f40db4, 0x1ee00000, 0x006ac5ff); + status |= test__mulsf3(0x3f9e20d3, 0x3f90c8a5, 0x3fb2dccc); + status |= test__mulsf3(0x6f9e20d3, 0x4f90c8a5, 0x7f800000); + status |= test__mulsf3(0x209e20d3, 0x1f10c8a5, 0x00596e66); + status |= test__mulsf3(0x3f9e20d3, 0x3fc00000, 0x3fed313c); + status |= test__mulsf3(0x6f9e20d3, 0x4fc00000, 0x7f800000); + status |= test__mulsf3(0x209e20d3, 0x1f400000, 0x0076989e); + status |= test__mulsf3(0x3f9e20d3, 0x3f50c8a5, 0x3f80f69b); + status |= test__mulsf3(0x6f9e20d3, 0x4f50c8a5, 0x7f800000); + status |= test__mulsf3(0x209e20d3, 0x1ed0c8a5, 0x00407b4d); + status |= test__mulsf3(0x3f82e641, 0x3f8fd63f, 0x3f931856); + status |= test__mulsf3(0x6f82e641, 0x4f8fd63f, 0x7f800000); + status |= test__mulsf3(0x2082e641, 0x1f0fd63f, 0x00498c2b); + status |= test__mulsf3(0x3f9a1901, 0x3f96e701, 0x3fb5ab68); + status |= test__mulsf3(0x6f9a1901, 0x4f96e701, 0x7f800000); + status |= test__mulsf3(0x209a1901, 0x1f16e701, 0x005ad5b4); + status |= test__mulsf3(0x3fa21aa1, 0x3f7c4961, 0x3f9fc0ae); + status |= test__mulsf3(0x6fa21aa1, 0x4f7c4961, 0x7f800000); + status |= test__mulsf3(0x20a21aa1, 0x1efc4961, 0x004fe057); + status |= test__mulsf3(0x3fcd0767, 0x3f782457, 0x3fc6bc47); + status |= test__mulsf3(0x6fcd0767, 0x4f782457, 0x7f800000); + status |= test__mulsf3(0x20cd0767, 0x1ef82457, 0x00635e23); + status |= test__mulsf3(0x3fb875e1, 0x3f968e21, 0x3fd8f6f6); + status |= test__mulsf3(0x6fb875e1, 0x4f968e21, 0x7f800000); + status |= test__mulsf3(0x20b875e1, 0x1f168e21, 0x006c7b7b); + status |= test__mulsf3(0x3fc2f0d7, 0x3f5efd19, 0x3fa9cd95); + status |= test__mulsf3(0x6fc2f0d7, 0x4f5efd19, 0x7f800000); + status |= test__mulsf3(0x20c2f0d7, 0x1edefd19, 0x0054e6cb); + status |= test__mulsf3(0x7f7ffffe, 0x3f800001, 0x7f800000); + status |= test__mulsf3(0x00000003, 0xc00fffff, 0x80000007); + status |= test__mulsf3(0x00000003, 0x400fffff, 0x00000007); + status |= test__mulsf3(0x80000003, 0xc00fffff, 0x00000007); + status |= test__mulsf3(0x80000003, 0x400fffff, 0x80000007); + status |= test__mulsf3(0x00000003, 0xc00ffffd, 0x80000007); + status |= test__mulsf3(0x00000003, 0x400ffffd, 0x00000007); + status |= test__mulsf3(0x80000003, 0xc00ffffd, 0x00000007); + status |= test__mulsf3(0x80000003, 0x400ffffd, 0x80000007); + status |= test__mulsf3(0x3e00007f, 0x017c0000, 0x003f003f); + status |= test__mulsf3(0xcf7fff00, 0xc0ffff00, 0x50fffe00); + status |= test__mulsf3(0x3fdf7f00, 0x3fffff00, 0x405f7e21); + status |= test__mulsf3(0x19b92144, 0x1a310000, 0x00000001); + status |= test__mulsf3(0x19ffc008, 0x1a002004, 0x00000001); + status |= test__mulsf3(0x7f7ffff0, 0xc0000008, 0xff800000); + + // Test that the result of an operation is a NaN at all when it should be. + // + // In most configurations these tests' results are checked compared using + // compareResultF, so we set all the answers to the canonical NaN 0x7fc00000, + // which causes compareResultF to accept any NaN encoding. We also use the + // same value as the input NaN in tests that have one, so that even in + // EXPECT_EXACT_RESULTS mode these tests should pass, because 0x7fc00000 is + // still the exact expected NaN. + status |= test__mulsf3(0x7f800000, 0x00000000, 0x7fc00000); + status |= test__mulsf3(0x7f800000, 0x80000000, 0x7fc00000); + status |= test__mulsf3(0x80000000, 0x7f800000, 0x7fc00000); + status |= test__mulsf3(0x80000000, 0xff800000, 0x7fc00000); + status |= test__mulsf3(0x3f800000, 0x7fc00000, 0x7fc00000); + status |= test__mulsf3(0x7fc00000, 0x3f800000, 0x7fc00000); + status |= test__mulsf3(0x7fc00000, 0x7fc00000, 0x7fc00000); + +#ifdef ARM_NAN_HANDLING + // Tests specific to the NaN handling of Arm hardware, mimicked by + // arm/mulsf3.S: + // + // - a quiet NaN is distinguished by the top mantissa bit being 1 + // + // - if a signalling NaN appears in the input, the output quiet NaN is + // obtained by setting its top mantissa bit and leaving everything else + // unchanged + // + // - if both operands are signalling NaNs then the output NaN is derived + // from the first operand + // + // - if both operands are quiet NaNs then the output NaN is the first + // operand + // + // - invalid operations not involving an input NaN return the quiet + // NaN with fewest bits set, 0x7fc00000. + + status |= test__mulsf3(0x00000000, 0x7fad4be3, 0x7fed4be3); + status |= test__mulsf3(0x00000000, 0x7fdf48c7, 0x7fdf48c7); + status |= test__mulsf3(0x00000001, 0x7f970eba, 0x7fd70eba); + status |= test__mulsf3(0x00000001, 0x7fc35716, 0x7fc35716); + status |= test__mulsf3(0x007fffff, 0x7fbf52d6, 0x7fff52d6); + status |= test__mulsf3(0x007fffff, 0x7fc7a2df, 0x7fc7a2df); + status |= test__mulsf3(0x3f800000, 0x7f987a85, 0x7fd87a85); + status |= test__mulsf3(0x3f800000, 0x7fc50124, 0x7fc50124); + status |= test__mulsf3(0x7f7fffff, 0x7f95fd6f, 0x7fd5fd6f); + status |= test__mulsf3(0x7f7fffff, 0x7ffc28dc, 0x7ffc28dc); + status |= test__mulsf3(0x7f800000, 0x00000000, 0x7fc00000); + status |= test__mulsf3(0x7f800000, 0x7f8dd790, 0x7fcdd790); + status |= test__mulsf3(0x7f800000, 0x7fd2ef2b, 0x7fd2ef2b); + status |= test__mulsf3(0x7f800000, 0x80000000, 0x7fc00000); + status |= test__mulsf3(0x7f99b09d, 0x00000000, 0x7fd9b09d); + status |= test__mulsf3(0x7f93541e, 0x00000001, 0x7fd3541e); + status |= test__mulsf3(0x7f9fc002, 0x007fffff, 0x7fdfc002); + status |= test__mulsf3(0x7fb5db77, 0x3f800000, 0x7ff5db77); + status |= test__mulsf3(0x7f9f5d92, 0x7f7fffff, 0x7fdf5d92); + status |= test__mulsf3(0x7fac7a36, 0x7f800000, 0x7fec7a36); + status |= test__mulsf3(0x7fb42008, 0x7fb0ee07, 0x7ff42008); + status |= test__mulsf3(0x7f8bd740, 0x7fc7aaf1, 0x7fcbd740); + status |= test__mulsf3(0x7f9bb57b, 0x80000000, 0x7fdbb57b); + status |= test__mulsf3(0x7f951a78, 0x80000001, 0x7fd51a78); + status |= test__mulsf3(0x7f9ba63b, 0x807fffff, 0x7fdba63b); + status |= test__mulsf3(0x7f89463c, 0xbf800000, 0x7fc9463c); + status |= test__mulsf3(0x7fb63563, 0xff7fffff, 0x7ff63563); + status |= test__mulsf3(0x7f90886e, 0xff800000, 0x7fd0886e); + status |= test__mulsf3(0x7fe8c15e, 0x00000000, 0x7fe8c15e); + status |= test__mulsf3(0x7fe915ae, 0x00000001, 0x7fe915ae); + status |= test__mulsf3(0x7ffa9b42, 0x007fffff, 0x7ffa9b42); + status |= test__mulsf3(0x7fdad0f5, 0x3f800000, 0x7fdad0f5); + status |= test__mulsf3(0x7fd10dcb, 0x7f7fffff, 0x7fd10dcb); + status |= test__mulsf3(0x7fd08e8a, 0x7f800000, 0x7fd08e8a); + status |= test__mulsf3(0x7fc3a9e6, 0x7f91a816, 0x7fd1a816); + status |= test__mulsf3(0x7fdb229c, 0x7fc26c68, 0x7fdb229c); + status |= test__mulsf3(0x7fc9f6bb, 0x80000000, 0x7fc9f6bb); + status |= test__mulsf3(0x7ffa178b, 0x80000001, 0x7ffa178b); + status |= test__mulsf3(0x7fef2a0b, 0x807fffff, 0x7fef2a0b); + status |= test__mulsf3(0x7ffc885b, 0xbf800000, 0x7ffc885b); + status |= test__mulsf3(0x7fd26e8c, 0xff7fffff, 0x7fd26e8c); + status |= test__mulsf3(0x7fc55329, 0xff800000, 0x7fc55329); + status |= test__mulsf3(0x80000000, 0x7f800000, 0x7fc00000); + status |= test__mulsf3(0x80000000, 0x7fa833ae, 0x7fe833ae); + status |= test__mulsf3(0x80000000, 0x7fc4df63, 0x7fc4df63); + status |= test__mulsf3(0x80000000, 0xff800000, 0x7fc00000); + status |= test__mulsf3(0x80000001, 0x7f98827d, 0x7fd8827d); + status |= test__mulsf3(0x80000001, 0x7fd7acc5, 0x7fd7acc5); + status |= test__mulsf3(0x807fffff, 0x7fad19c0, 0x7fed19c0); + status |= test__mulsf3(0x807fffff, 0x7ffe1907, 0x7ffe1907); + status |= test__mulsf3(0xbf800000, 0x7fa95487, 0x7fe95487); + status |= test__mulsf3(0xbf800000, 0x7fd2bbee, 0x7fd2bbee); + status |= test__mulsf3(0xff7fffff, 0x7f86ba21, 0x7fc6ba21); + status |= test__mulsf3(0xff7fffff, 0x7feb00d7, 0x7feb00d7); + status |= test__mulsf3(0xff800000, 0x7f857fdc, 0x7fc57fdc); + status |= test__mulsf3(0xff800000, 0x7fde0397, 0x7fde0397); +#endif // ARM_NAN_HANDLING + + return status; +}