diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 8676060eb3db7..2372d7dfe7c3c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17,6 +17,7 @@
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
+#include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/IntervalMap.h"
@@ -18873,6 +18874,8 @@ SDValue DAGCombiner::visitFPOW(SDNode *N) {
 static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
                                const TargetLowering &TLI) {
   // We can fold the fpto[us]i -> [us]itofp pattern into a single ftrunc.
+  // Additionally, if there are clamps ([us]min or [us]max) around
+  // the fpto[us]i, we can fold those into fminnum/fmaxnum around the ftrunc.
   // If NoSignedZerosFPMath is enabled, this is a direct replacement.
   // Otherwise, for strict math, we must handle edge cases:
   // 1. For unsigned conversions, use FABS to handle negative cases. Take -0.0
@@ -18884,28 +18887,68 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
   if (!TLI.isOperationLegal(ISD::FTRUNC, VT))
     return SDValue();
 
-  // fptosi/fptoui round towards zero, so converting from FP to integer and
-  // back is the same as an 'ftrunc': [us]itofp (fpto[us]i X) --> ftrunc X
-  SDValue N0 = N->getOperand(0);
-  if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT &&
-      N0.getOperand(0).getValueType() == VT) {
-    if (DAG.getTarget().Options.NoSignedZerosFPMath)
-      return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
-  }
+  bool IsUnsigned = N->getOpcode() == ISD::UINT_TO_FP;
+  bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP;
+  assert(IsSigned || IsUnsigned);
 
-  if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT &&
-      N0.getOperand(0).getValueType() == VT) {
-    if (DAG.getTarget().Options.NoSignedZerosFPMath)
-      return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
+  bool IsSignedZeroSafe = DAG.getTarget().Options.NoSignedZerosFPMath;
+  // For signed conversions: the optimization changes signed-zero behavior.
+  if (IsSigned && !IsSignedZeroSafe)
+    return SDValue();
+  // For unsigned conversions, we need FABS to canonicalize -0.0 to +0.0
+  // (unless NoSignedZerosFPMath is set).
+  if (IsUnsigned && !IsSignedZeroSafe && !TLI.isFAbsFree(VT))
+    return SDValue();
 
-    // Strict math: use FABS to handle negative inputs correctly.
-    if (TLI.isFAbsFree(VT)) {
-      SDValue Abs = DAG.getNode(ISD::FABS, DL, VT, N0.getOperand(0));
-      return DAG.getNode(ISD::FTRUNC, DL, VT, Abs);
-    }
+  // Collect potential clamp operations (outermost to innermost) and peel.
+  struct ClampOp {
+    unsigned Opcode;
+    SDValue Constant;
+  };
+  SmallVector<ClampOp, 2> Clamps;
+  unsigned MinOp = IsUnsigned ? ISD::UMIN : ISD::SMIN;
+  unsigned MaxOp = IsUnsigned ? ISD::UMAX : ISD::SMAX;
+  SDValue IntVal = N->getOperand(0);
+  constexpr unsigned MaxClampLevels = 2;
+  for (unsigned Level = 0; Level < MaxClampLevels; ++Level) {
+    if (!IntVal.hasOneUse() ||
+        (IntVal.getOpcode() != MinOp && IntVal.getOpcode() != MaxOp))
+      break;
+    unsigned FPClampOp =
+        (IntVal.getOpcode() == MinOp) ? ISD::FMINNUM : ISD::FMAXNUM;
+    if (!TLI.isOperationLegal(FPClampOp, VT))
+      return SDValue();
+    auto *IntConstNode = dyn_cast<ConstantSDNode>(IntVal.getOperand(1));
+    if (!IntConstNode)
+      return SDValue();
+    APFloat FPConst(VT.getFltSemantics());
+    APInt IntConst = IntConstNode->getAPIntValue();
+    FPConst.convertFromAPInt(IntConst, IsSigned, APFloat::rmNearestTiesToEven);
+    // Verify roundtrip exactness: bail out if the clamp constant is not
+    // exactly representable in the FP type.
+    APSInt RoundTrip(IntConst.getBitWidth(), IsUnsigned);
+    bool IsExact;
+    if (FPConst.convertToInteger(RoundTrip, APFloat::rmTowardZero, &IsExact) !=
+            APFloat::opOK ||
+        !IsExact || static_cast<APInt>(RoundTrip) != IntConst)
+      return SDValue();
+    Clamps.push_back({FPClampOp, DAG.getConstantFP(FPConst, DL, VT)});
+    IntVal = IntVal.getOperand(0);
   }
 
-  return SDValue();
+  // Check that the sequence ends with an fpto[us]i of the right type.
+  unsigned FPToIntOp = IsUnsigned ? ISD::FP_TO_UINT : ISD::FP_TO_SINT;
+  if (IntVal.getOpcode() != FPToIntOp ||
+      IntVal.getOperand(0).getValueType() != VT)
+    return SDValue();
+
+  SDValue Result = IntVal.getOperand(0);
+  if (IsUnsigned && !IsSignedZeroSafe && TLI.isFAbsFree(VT))
+    Result = DAG.getNode(ISD::FABS, DL, VT, Result);
+  Result = DAG.getNode(ISD::FTRUNC, DL, VT, Result);
+  // Apply clamps, if any, in reverse order (innermost first).
+  for (auto I = Clamps.rbegin(), E = Clamps.rend(); I != E; ++I)
+    Result = DAG.getNode(I->Opcode, DL, VT, Result, I->Constant);
+  return Result;
 }
 
 SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
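To make the exactness bailout concrete, here is a minimal standalone sketch (not part of the patch) exercising the same APFloat/APSInt calls the combine uses; checkExact and the constants are illustrative only:

    #include "llvm/ADT/APFloat.h"
    #include "llvm/ADT/APSInt.h"
    #include <cstdio>
    using namespace llvm;

    // Mirrors the combine's roundtrip test: i32 -> f32 -> i32 must be lossless.
    static bool checkExact(int64_t C) {
      APInt IntConst(32, C, /*isSigned=*/true);
      APFloat FPConst(APFloat::IEEEsingle());
      FPConst.convertFromAPInt(IntConst, /*IsSigned=*/true,
                               APFloat::rmNearestTiesToEven);
      APSInt RoundTrip(32, /*isUnsigned=*/false);
      bool IsExact;
      return FPConst.convertToInteger(RoundTrip, APFloat::rmTowardZero,
                                      &IsExact) == APFloat::opOK &&
             IsExact && static_cast<APInt>(RoundTrip) == IntConst;
    }

    int main() {
      std::printf("%d\n", checkExact(1023));     // 1: fits in f32's 24-bit significand
      std::printf("%d\n", checkExact(16777217)); // 0: 2^24 + 1 rounds to 2^24
    }

Note that for 16777217 the conversion back to integer is itself exact (16777216.0 converts cleanly), which is why the code also compares the roundtripped value against the original constant rather than relying on IsExact alone.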
diff --git a/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll b/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll
new file mode 100644
index 0000000000000..9a8c555953611
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll
@@ -0,0 +1,140 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64 --enable-no-signed-zeros-fp-math < %s | FileCheck %s --check-prefix=NO-SIGNED-ZEROS
+
+; Test folding of float->int->float roundtrips into float-only operations.
+; The optimization converts patterns like:
+;   sitofp(fptosi(x))          -> ftrunc(x)
+;   sitofp(smin(fptosi(x), C)) -> fminnum(ftrunc(x), (float)C)
+; This is relevant for AArch64 as it avoids bouncing values through GPRs
+; and keeps the computation in SIMD/FP registers.
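As a reference for what the tests below check, a plain C++ sketch of the intended equivalence (viaInt/viaFP are hypothetical helper names, not from the patch), valid for non-NaN inputs whose truncation fits in i32:

    #include <algorithm>
    #include <cassert>
    #include <cmath>

    // Integer path: fptosi, clamp in i32, sitofp back.
    static float viaInt(float x) {
      int i = static_cast<int>(x); // truncates toward zero, like fptosi
      return static_cast<float>(std::clamp(i, -512, 1023));
    }

    // FP-only path the combine produces: ftrunc, then fmaxnm/fminnm.
    static float viaFP(float x) {
      return std::fmin(std::fmax(std::trunc(x), -512.0f), 1023.0f);
    }

    int main() {
      for (float x : {-1000.5f, -0.75f, 0.25f, 88.9f, 2048.0f})
        assert(viaInt(x) == viaFP(x));
      // The only divergence is the sign of zero: viaFP(-0.75f) is -0.0f while
      // viaInt(-0.75f) is +0.0f. That is why the combine is gated on
      // NoSignedZerosFPMath (or inserts FABS in the unsigned case).
    }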
+
+define float @test_signed_basic(float %x) {
+; CHECK-LABEL: test_signed_basic:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzs s0, s0
+; CHECK-NEXT:    scvtf s0, s0
+; CHECK-NEXT:    ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_signed_basic:
+; NO-SIGNED-ZEROS:       // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT:    frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT:    ret
+entry:
+  %i = fptosi float %x to i32
+  %f = sitofp i32 %i to float
+  ret float %f
+}
+
+define float @test_unsigned_basic(float %x) {
+; CHECK-LABEL: test_unsigned_basic:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzu s0, s0
+; CHECK-NEXT:    ucvtf s0, s0
+; CHECK-NEXT:    ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_unsigned_basic:
+; NO-SIGNED-ZEROS:       // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT:    frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT:    ret
+entry:
+  %i = fptoui float %x to i32
+  %f = uitofp i32 %i to float
+  ret float %f
+}
+
+define float @test_signed_min_max(float %x) {
+; CHECK-LABEL: test_signed_min_max:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzs w9, s0
+; CHECK-NEXT:    mov w8, #-512 // =0xfffffe00
+; CHECK-NEXT:    cmn w9, #512
+; CHECK-NEXT:    csel w8, w9, w8, gt
+; CHECK-NEXT:    mov w9, #1023 // =0x3ff
+; CHECK-NEXT:    cmp w8, #1023
+; CHECK-NEXT:    csel w8, w8, w9, lt
+; CHECK-NEXT:    scvtf s0, w8
+; CHECK-NEXT:    ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_signed_min_max:
+; NO-SIGNED-ZEROS:       // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT:    movi v1.2s, #196, lsl #24
+; NO-SIGNED-ZEROS-NEXT:    frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT:    mov w8, #49152 // =0xc000
+; NO-SIGNED-ZEROS-NEXT:    movk w8, #17535, lsl #16
+; NO-SIGNED-ZEROS-NEXT:    fmaxnm s0, s0, s1
+; NO-SIGNED-ZEROS-NEXT:    fmov s1, w8
+; NO-SIGNED-ZEROS-NEXT:    fminnm s0, s0, s1
+; NO-SIGNED-ZEROS-NEXT:    ret
+entry:
+  %i = fptosi float %x to i32
+  %lower = call i32 @llvm.smax.i32(i32 %i, i32 -512)
+  %clamped = call i32 @llvm.smin.i32(i32 %lower, i32 1023)
+  %f = sitofp i32 %clamped to float
+  ret float %f
+}
+
+define float @test_unsigned_min_max(float %x) {
+; CHECK-LABEL: test_unsigned_min_max:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzu w9, s0
+; CHECK-NEXT:    mov w8, #512 // =0x200
+; CHECK-NEXT:    cmp w9, #512
+; CHECK-NEXT:    csel w8, w9, w8, hi
+; CHECK-NEXT:    mov w9, #1023 // =0x3ff
+; CHECK-NEXT:    cmp w8, #1023
+; CHECK-NEXT:    csel w8, w8, w9, lo
+; CHECK-NEXT:    ucvtf s0, w8
+; CHECK-NEXT:    ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_unsigned_min_max:
+; NO-SIGNED-ZEROS:       // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT:    movi v1.2s, #68, lsl #24
+; NO-SIGNED-ZEROS-NEXT:    frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT:    mov w8, #49152 // =0xc000
+; NO-SIGNED-ZEROS-NEXT:    movk w8, #17535, lsl #16
+; NO-SIGNED-ZEROS-NEXT:    fmaxnm s0, s0, s1
+; NO-SIGNED-ZEROS-NEXT:    fmov s1, w8
+; NO-SIGNED-ZEROS-NEXT:    fminnm s0, s0, s1
+; NO-SIGNED-ZEROS-NEXT:    ret
+entry:
+  %i = fptoui float %x to i32
+  %lower = call i32 @llvm.umax.i32(i32 %i, i32 512)
+  %clamped = call i32 @llvm.umin.i32(i32 %lower, i32 1023)
+  %f = uitofp i32 %clamped to float
+  ret float %f
+}
+
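One subtlety the clamp tests rely on: min/max do not commute when the bounds cross, which is why foldFPToIntToFP replays the peeled clamps in their original nesting order (innermost first). A tiny illustration in plain C++ (not part of the patch):

    #include <algorithm>
    #include <cassert>

    int main() {
      // With crossing bounds, nesting order changes the result for every x:
      int x = 0;
      assert(std::max(std::min(x, 5), 10) == 10); // min applied first, then max
      assert(std::min(std::max(x, 10), 5) == 5);  // max applied first, then min
    }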
+; 16777217 = 2^24 + 1 is NOT exactly representable in f32, so the smin must
+; stay in the integer domain and the combine must not fire.
+define float @test_inexact_16777217(float %x) {
+; CHECK-LABEL: test_inexact_16777217:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzs w8, s0
+; CHECK-NEXT:    mov w9, #16777216 // =0x1000000
+; CHECK-NEXT:    cmp w8, w9
+; CHECK-NEXT:    mov w9, #1 // =0x1
+; CHECK-NEXT:    movk w9, #256, lsl #16
+; CHECK-NEXT:    csel w8, w8, w9, le
+; CHECK-NEXT:    scvtf s0, w8
+; CHECK-NEXT:    ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_inexact_16777217:
+; NO-SIGNED-ZEROS:       // %bb.0: // %entry
+; NO-SIGNED-ZEROS-NEXT:    fcvtzs w8, s0
+; NO-SIGNED-ZEROS-NEXT:    mov w9, #16777216 // =0x1000000
+; NO-SIGNED-ZEROS-NEXT:    cmp w8, w9
+; NO-SIGNED-ZEROS-NEXT:    mov w9, #1 // =0x1
+; NO-SIGNED-ZEROS-NEXT:    movk w9, #256, lsl #16
+; NO-SIGNED-ZEROS-NEXT:    csel w8, w8, w9, le
+; NO-SIGNED-ZEROS-NEXT:    scvtf s0, w8
+; NO-SIGNED-ZEROS-NEXT:    ret
+entry:
+  %i = fptosi float %x to i32
+  %clamped = call i32 @llvm.smin.i32(i32 %i, i32 16777217)
+  %f = sitofp i32 %clamped to float
+  ret float %f
+}
+
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.smax.i32(i32, i32)
+declare i32 @llvm.umin.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)
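Finally, a quick plain-C++ illustration (not from the test file) of the signed-zero edge case that motivates the FABS in the unsigned path: a bare ftrunc can produce -0.0 where the fptoui/uitofp roundtrip produced +0.0.

    #include <cmath>
    #include <cstdio>

    int main() {
      float x = -0.25f;
      // Integer roundtrip: the unsigned conversion yields 0, back to +0.0.
      float viaInt = static_cast<float>(static_cast<unsigned>(x));
      // A bare ftrunc yields -0.0, so strict math layers fabs on top.
      float bare = std::trunc(x);
      float fixed = std::fabs(std::trunc(x));
      std::printf("%g %g %g\n", viaInt, bare, fixed); // 0 -0 0
      std::printf("%d\n", std::signbit(bare));        // 1: the sign bit differs
    }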