From 51bf419075bc7a1e2f8d1e109893c844a03a2ce6 Mon Sep 17 00:00:00 2001
From: Yi-Chi Lee
Date: Sun, 28 Sep 2025 10:41:29 -0500
Subject: [PATCH 1/5] [DAGCombiner] Extend FP-to-Int cast without requiring nsz

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 37 ++++++++++++++++---
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a6ba6e518899f..d3798eedc82a4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18869,20 +18869,45 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
   // FIXME: We should be able to use node-level FMF here.
   // TODO: If strict math, should we use FABS (+ range check for signed cast)?
   EVT VT = N->getValueType(0);
-  if (!TLI.isOperationLegal(ISD::FTRUNC, VT) ||
-      !DAG.getTarget().Options.NoSignedZerosFPMath)
+  if (!TLI.isOperationLegal(ISD::FTRUNC, VT))
     return SDValue();
 
   // fptosi/fptoui round towards zero, so converting from FP to integer and
   // back is the same as an 'ftrunc': [us]itofp (fpto[us]i X) --> ftrunc X
   SDValue N0 = N->getOperand(0);
   if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT &&
-      N0.getOperand(0).getValueType() == VT)
-    return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
+      N0.getOperand(0).getValueType() == VT) {
+    if (DAG.getTarget().Options.NoSignedZerosFPMath)
+      return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
+
+    unsigned IntWidth = N0.getValueSizeInBits();
+    APInt APMax = APInt::getSignedMaxValue(IntWidth);
+    APInt APMin = APInt::getSignedMinValue(IntWidth);
+
+    APFloat MaxAPF(VT.getFltSemantics());
+    MaxAPF.convertFromAPInt(APMax, true, APFloat::rmTowardZero);
+    APFloat MinAPF(VT.getFltSemantics());
+    MinAPF.convertFromAPInt(APMin, true, APFloat::rmTowardZero);
+
+    SDValue MaxFP = DAG.getConstantFP(MaxAPF, DL, VT);
+    SDValue MinFP = DAG.getConstantFP(MinAPF, DL, VT);
+
+    SDValue Clamped = DAG.getNode(ISD::FMINNUM, DL, VT,
+        DAG.getNode(ISD::FMAXNUM, DL, VT, N0->getOperand(0), MinFP),
+        MaxFP);
+    return DAG.getNode(ISD::FTRUNC, DL, VT, Clamped);
+  }
 
   if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT &&
-      N0.getOperand(0).getValueType() == VT)
-    return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
+      N0.getOperand(0).getValueType() == VT) {
+    if (DAG.getTarget().Options.NoSignedZerosFPMath)
+      return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
+
+    if (TLI.isFAbsFree(VT)) {
+      SDValue Abs = DAG.getNode(ISD::FABS, DL, VT, N0.getOperand(0));
+      return DAG.getNode(ISD::FTRUNC, DL, VT, Abs);
+    }
+  }
 
   return SDValue();
 }

From cccc07319d26ed660107eaa53bb363547ef09c43 Mon Sep 17 00:00:00 2001
From: Yi-Chi Lee
Date: Sun, 28 Sep 2025 14:46:41 -0500
Subject: [PATCH 2/5] [DAGCombiner] Modify the comment to fit the current implementation and apply clang-format

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d3798eedc82a4..65cea64e0982d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18862,12 +18862,15 @@ SDValue DAGCombiner::visitFPOW(SDNode *N) {
 
 static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
                                const TargetLowering &TLI) {
-  // We only do this if the target has legal ftrunc. Otherwise, we'd likely be
-  // replacing casts with a libcall. We also must be allowed to ignore -0.0
-  // because FTRUNC will return -0.0 for (-1.0, -0.0), but using integer
-  // conversions would return +0.0.
+  // We can fold the fpto[us]i -> [us]itofp pattern into a single ftrunc.
+  // If NoSignedZerosFPMath is enabled, this is a direct replacement.
+  // Otherwise, for strict math, we must handle edge cases:
+  // 1. For signed conversions, clamp out-of-range values to the valid
+  // integer range before the trunc.
+  // 2. For unsigned conversions, use FABS. A negative float becomes integer 0,
+  // which must convert back to +0.0. FTRUNC on its own could produce -0.0.
+
   // FIXME: We should be able to use node-level FMF here.
-  // TODO: If strict math, should we use FABS (+ range check for signed cast)?
   EVT VT = N->getValueType(0);
   if (!TLI.isOperationLegal(ISD::FTRUNC, VT))
     return SDValue();
@@ -18880,6 +18883,7 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
     if (DAG.getTarget().Options.NoSignedZerosFPMath)
       return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
 
+    // Strict math: clamp to the signed integer range before truncating.
     unsigned IntWidth = N0.getValueSizeInBits();
     APInt APMax = APInt::getSignedMaxValue(IntWidth);
     APInt APMin = APInt::getSignedMinValue(IntWidth);
@@ -18892,9 +18896,9 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
     SDValue MaxFP = DAG.getConstantFP(MaxAPF, DL, VT);
     SDValue MinFP = DAG.getConstantFP(MinAPF, DL, VT);
 
-    SDValue Clamped = DAG.getNode(ISD::FMINNUM, DL, VT,
-        DAG.getNode(ISD::FMAXNUM, DL, VT, N0->getOperand(0), MinFP),
-        MaxFP);
+    SDValue Clamped = DAG.getNode(
+        ISD::FMINNUM, DL, VT,
+        DAG.getNode(ISD::FMAXNUM, DL, VT, N0->getOperand(0), MinFP), MaxFP);
     return DAG.getNode(ISD::FTRUNC, DL, VT, Clamped);
   }
 
@@ -18903,6 +18907,7 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
     if (DAG.getTarget().Options.NoSignedZerosFPMath)
       return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
 
+    // Strict math: use FABS to handle negative inputs correctly.
if (TLI.isFAbsFree(VT)) { SDValue Abs = DAG.getNode(ISD::FABS, DL, VT, N0.getOperand(0)); return DAG.getNode(ISD::FTRUNC, DL, VT, Abs); From 37413f35756e67d43d5b3f54c0b0e94670cca5cc Mon Sep 17 00:00:00 2001 From: Yi-Chi Lee Date: Sun, 28 Sep 2025 22:19:06 -0500 Subject: [PATCH 3/5] update testcases --- llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll | 49 ++++++++-- .../sve-streaming-mode-cvt-fp-int-fp.ll | 98 ++++++++++++++----- .../amdgpu-simplify-libcall-pow-codegen.ll | 37 +++---- 3 files changed, 133 insertions(+), 51 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll index 1207de746894b..f68188fdf54ce 100644 --- a/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll +++ b/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll @@ -4,8 +4,13 @@ define double @t1(double %x) { ; CHECK-LABEL: t1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fcvtzs d0, d0 -; CHECK-NEXT: scvtf d0, d0 +; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 +; CHECK-NEXT: fmov d1, x8 +; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff +; CHECK-NEXT: fmaxnm d0, d0, d1 +; CHECK-NEXT: fmov d1, x8 +; CHECK-NEXT: fminnm d0, d0, d1 +; CHECK-NEXT: frintz d0, d0 ; CHECK-NEXT: ret entry: %conv = fptosi double %x to i64 @@ -16,8 +21,12 @@ entry: define float @t2(float %x) { ; CHECK-LABEL: t2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fcvtzs s0, s0 -; CHECK-NEXT: scvtf s0, s0 +; CHECK-NEXT: movi v1.2s, #207, lsl #24 +; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff +; CHECK-NEXT: fmaxnm s0, s0, s1 +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: fminnm s0, s0, s1 +; CHECK-NEXT: frintz s0, s0 ; CHECK-NEXT: ret entry: %conv = fptosi float %x to i32 @@ -28,8 +37,13 @@ entry: define half @t3(half %x) { ; CHECK-LABEL: t3: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fcvtzs h0, h0 -; CHECK-NEXT: scvtf h0, h0 +; CHECK-NEXT: mov w8, #64511 // =0xfbff +; CHECK-NEXT: fmov h1, w8 +; CHECK-NEXT: mov w8, #31743 // =0x7bff +; CHECK-NEXT: fmaxnm h0, h0, h1 +; CHECK-NEXT: fmov h1, w8 +; CHECK-NEXT: fminnm h0, h0, h1 +; CHECK-NEXT: frintz h0, h0 ; CHECK-NEXT: ret entry: %conv = fptosi half %x to i32 @@ -170,8 +184,14 @@ entry: define i64 @tests_f64_multiuse(double %x) { ; CHECK-LABEL: tests_f64_multiuse: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 +; CHECK-NEXT: fmov d1, x8 +; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff +; CHECK-NEXT: fmov d2, x8 ; CHECK-NEXT: fcvtzs x8, d0 -; CHECK-NEXT: scvtf d1, x8 +; CHECK-NEXT: fmaxnm d1, d0, d1 +; CHECK-NEXT: fminnm d1, d1, d2 +; CHECK-NEXT: frintz d1, d1 ; CHECK-NEXT: fcmp d0, d1 ; CHECK-NEXT: csel x0, x8, xzr, eq ; CHECK-NEXT: ret @@ -186,8 +206,13 @@ entry: define i32 @tests_f32_multiuse(float %x) { ; CHECK-LABEL: tests_f32_multiuse: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v1.2s, #207, lsl #24 +; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff +; CHECK-NEXT: fmov s2, w8 ; CHECK-NEXT: fcvtzs w8, s0 -; CHECK-NEXT: scvtf s1, w8 +; CHECK-NEXT: fmaxnm s1, s0, s1 +; CHECK-NEXT: fminnm s1, s1, s2 +; CHECK-NEXT: frintz s1, s1 ; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: csel w0, w8, wzr, eq ; CHECK-NEXT: ret @@ -202,8 +227,14 @@ entry: define i32 @tests_f16_multiuse(half %x) { ; CHECK-LABEL: tests_f16_multiuse: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #64511 // =0xfbff +; CHECK-NEXT: fmov h1, w8 +; CHECK-NEXT: mov w8, #31743 // =0x7bff +; CHECK-NEXT: fmov h2, w8 ; CHECK-NEXT: fcvtzs w8, h0 -; CHECK-NEXT: scvtf h1, w8 +; CHECK-NEXT: fmaxnm h1, h0, h1 +; 
CHECK-NEXT: fminnm h1, h1, h2 +; CHECK-NEXT: frintz h1, h1 ; CHECK-NEXT: fcmp h0, h1 ; CHECK-NEXT: csel w0, w8, wzr, eq ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll index f4ae66a3b2259..bed62a428939e 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll @@ -9,23 +9,35 @@ target triple = "aarch64-unknown-linux-gnu" define double @t1(double %x) { ; CHECK-LABEL: t1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d -; CHECK-NEXT: scvtf z0.d, p0/m, z0.d -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 +; CHECK-NEXT: fmov d1, x8 +; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff +; CHECK-NEXT: fmaxnm d0, d0, d1 +; CHECK-NEXT: fmov d1, x8 +; CHECK-NEXT: fminnm d0, d0, d1 +; CHECK-NEXT: frintz d0, d0 ; CHECK-NEXT: ret ; ; USE-NEON-NO-GPRS-LABEL: t1: ; USE-NEON-NO-GPRS: // %bb.0: // %entry -; USE-NEON-NO-GPRS-NEXT: fcvtzs d0, d0 -; USE-NEON-NO-GPRS-NEXT: scvtf d0, d0 +; USE-NEON-NO-GPRS-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 +; USE-NEON-NO-GPRS-NEXT: fmov d1, x8 +; USE-NEON-NO-GPRS-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff +; USE-NEON-NO-GPRS-NEXT: fmaxnm d0, d0, d1 +; USE-NEON-NO-GPRS-NEXT: fmov d1, x8 +; USE-NEON-NO-GPRS-NEXT: fminnm d0, d0, d1 +; USE-NEON-NO-GPRS-NEXT: frintz d0, d0 ; USE-NEON-NO-GPRS-NEXT: ret ; ; NONEON-NOSVE-LABEL: t1: ; NONEON-NOSVE: // %bb.0: // %entry -; NONEON-NOSVE-NEXT: fcvtzs x8, d0 -; NONEON-NOSVE-NEXT: scvtf d0, x8 +; NONEON-NOSVE-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 +; NONEON-NOSVE-NEXT: fmov d1, x8 +; NONEON-NOSVE-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff +; NONEON-NOSVE-NEXT: fmaxnm d0, d0, d1 +; NONEON-NOSVE-NEXT: fmov d1, x8 +; NONEON-NOSVE-NEXT: fminnm d0, d0, d1 +; NONEON-NOSVE-NEXT: frintz d0, d0 ; NONEON-NOSVE-NEXT: ret entry: %conv = fptosi double %x to i64 @@ -36,23 +48,35 @@ entry: define float @t2(float %x) { ; CHECK-LABEL: t2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 -; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s -; CHECK-NEXT: scvtf z0.s, p0/m, z0.s -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 +; CHECK-NEXT: mov w8, #-822083584 // =0xcf000000 +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff +; CHECK-NEXT: fmaxnm s0, s0, s1 +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: fminnm s0, s0, s1 +; CHECK-NEXT: frintz s0, s0 ; CHECK-NEXT: ret ; ; USE-NEON-NO-GPRS-LABEL: t2: ; USE-NEON-NO-GPRS: // %bb.0: // %entry -; USE-NEON-NO-GPRS-NEXT: fcvtzs s0, s0 -; USE-NEON-NO-GPRS-NEXT: scvtf s0, s0 +; USE-NEON-NO-GPRS-NEXT: mov w8, #-822083584 // =0xcf000000 +; USE-NEON-NO-GPRS-NEXT: fmov s1, w8 +; USE-NEON-NO-GPRS-NEXT: mov w8, #1325400063 // =0x4effffff +; USE-NEON-NO-GPRS-NEXT: fmaxnm s0, s0, s1 +; USE-NEON-NO-GPRS-NEXT: fmov s1, w8 +; USE-NEON-NO-GPRS-NEXT: fminnm s0, s0, s1 +; USE-NEON-NO-GPRS-NEXT: frintz s0, s0 ; USE-NEON-NO-GPRS-NEXT: ret ; ; NONEON-NOSVE-LABEL: t2: ; NONEON-NOSVE: // %bb.0: // %entry -; NONEON-NOSVE-NEXT: fcvtzs w8, s0 -; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: mov w8, #-822083584 // =0xcf000000 +; NONEON-NOSVE-NEXT: fmov s1, w8 +; NONEON-NOSVE-NEXT: mov w8, #1325400063 // =0x4effffff +; 
NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fmov s1, w8 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: frintz s0, s0 ; NONEON-NOSVE-NEXT: ret entry: %conv = fptosi float %x to i32 @@ -63,18 +87,36 @@ entry: define half @t3(half %x) { ; CHECK-LABEL: t3: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 -; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h -; CHECK-NEXT: scvtf z0.h, p0/m, z0.s -; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 +; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: ldr h1, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: adrp x8, .LCPI2_1 +; CHECK-NEXT: fmaxnm h0, h0, h1 +; CHECK-NEXT: ldr h1, [x8, :lo12:.LCPI2_1] +; CHECK-NEXT: fminnm h0, h0, h1 +; CHECK-NEXT: frintz h0, h0 ; CHECK-NEXT: ret ; +; USE-NEON-NO-GPRS-LABEL: t3: +; USE-NEON-NO-GPRS: // %bb.0: // %entry +; USE-NEON-NO-GPRS-NEXT: adrp x8, .LCPI2_0 +; USE-NEON-NO-GPRS-NEXT: ldr h1, [x8, :lo12:.LCPI2_0] +; USE-NEON-NO-GPRS-NEXT: adrp x8, .LCPI2_1 +; USE-NEON-NO-GPRS-NEXT: fmaxnm h0, h0, h1 +; USE-NEON-NO-GPRS-NEXT: ldr h1, [x8, :lo12:.LCPI2_1] +; USE-NEON-NO-GPRS-NEXT: fminnm h0, h0, h1 +; USE-NEON-NO-GPRS-NEXT: frintz h0, h0 +; USE-NEON-NO-GPRS-NEXT: ret +; ; NONEON-NOSVE-LABEL: t3: ; NONEON-NOSVE: // %bb.0: // %entry ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvtzs w8, s0 -; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: mov w8, #-822083584 // =0xcf000000 +; NONEON-NOSVE-NEXT: fmov s1, w8 +; NONEON-NOSVE-NEXT: mov w8, #1325400063 // =0x4effffff +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fmov s1, w8 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: frintz s0, s0 ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: ret entry: @@ -147,6 +189,12 @@ define half @t6(half %x) { ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret ; +; USE-NEON-NO-GPRS-LABEL: t6: +; USE-NEON-NO-GPRS: // %bb.0: // %entry +; USE-NEON-NO-GPRS-NEXT: fcvtzu h0, h0 +; USE-NEON-NO-GPRS-NEXT: ucvtf h0, h0 +; USE-NEON-NO-GPRS-NEXT: ret +; ; NONEON-NOSVE-LABEL: t6: ; NONEON-NOSVE: // %bb.0: // %entry ; NONEON-NOSVE-NEXT: fcvt s0, h0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll index 0329f23ea434f..85f99698951b2 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll @@ -60,15 +60,16 @@ define half @test_pow_fast_f16__integral_y(half %x, i32 %y.i) { ; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1 ; CHECK-NEXT: v_log_f16_e64 v3, |v0| ; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CHECK-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CHECK-NEXT: v_cvt_i32_f32_e32 v1, v1 -; CHECK-NEXT: v_cvt_f32_i32_e32 v2, v1 -; CHECK-NEXT: v_lshlrev_b16_e32 v1, 15, v1 -; CHECK-NEXT: v_and_b32_e32 v0, v1, v0 -; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_f16_e32 v2, v3, v2 -; CHECK-NEXT: v_exp_f16_e32 v2, v2 -; CHECK-NEXT: v_or_b32_e32 v0, v0, v2 +; CHECK-NEXT: v_cvt_f32_f16_e32 v2, v1 +; CHECK-NEXT: v_max_f16_e32 v1, 0xfbff, v1 +; CHECK-NEXT: v_min_f16_e32 v1, 0x7bff, v1 +; CHECK-NEXT: v_trunc_f16_e32 v1, v1 +; CHECK-NEXT: v_cvt_i32_f32_e32 v2, v2 +; CHECK-NEXT: v_mul_f16_e32 v1, v3, v1 +; CHECK-NEXT: v_exp_f16_e32 v1, v1 +; CHECK-NEXT: v_lshlrev_b16_e32 v2, 15, v2 +; CHECK-NEXT: v_and_b32_e32 v0, v2, v0 +; CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; CHECK-NEXT: s_setpc_b64 s[30:31] %y = sitofp i32 %y.i to half %pow = tail call fast half @_Z3powDhDh(half %x, half 
%y)
@@ -79,28 +80,30 @@ define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
 ; CHECK-LABEL: test_pow_fast_f32__integral_y:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1
 ; CHECK-NEXT: s_mov_b32 s4, 0x800000
 ; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 32, vcc
-; CHECK-NEXT: v_cvt_i32_f32_e32 v1, v1
 ; CHECK-NEXT: v_ldexp_f32 v3, |v0|, v3
 ; CHECK-NEXT: v_log_f32_e32 v3, v3
+; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1
 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000
-; CHECK-NEXT: v_cvt_f32_i32_e32 v4, v1
 ; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
 ; CHECK-NEXT: v_sub_f32_e32 v2, v3, v2
+; CHECK-NEXT: v_max_f32_e32 v3, 0xcf000000, v1
+; CHECK-NEXT: v_min_f32_e32 v3, 0x4effffff, v3
+; CHECK-NEXT: v_trunc_f32_e32 v3, v3
+; CHECK-NEXT: v_mul_f32_e32 v4, v2, v3
 ; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000
-; CHECK-NEXT: v_mul_f32_e32 v3, v2, v4
 ; CHECK-NEXT: v_mov_b32_e32 v5, 0x42800000
-; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3
-; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
-; CHECK-NEXT: v_fma_f32 v2, v2, v4, v3
+; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v4
+; CHECK-NEXT: v_cndmask_b32_e32 v4, 0, v5, vcc
+; CHECK-NEXT: v_fma_f32 v2, v2, v3, v4
 ; CHECK-NEXT: v_exp_f32_e32 v2, v2
+; CHECK-NEXT: v_cvt_i32_f32_e32 v1, v1
 ; CHECK-NEXT: v_not_b32_e32 v3, 63
 ; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; CHECK-NEXT: v_lshlrev_b32_e32 v1, 31, v1
 ; CHECK-NEXT: v_ldexp_f32 v2, v2, v3
+; CHECK-NEXT: v_lshlrev_b32_e32 v1, 31, v1
 ; CHECK-NEXT: v_and_or_b32 v0, v1, v0, v2
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
   %y = sitofp i32 %y.i to float

From b12708128e25a1540a4476ac89367e144d96e8a9 Mon Sep 17 00:00:00 2001
From: Yi-Chi Lee
Date: Mon, 29 Sep 2025 11:23:43 -0500
Subject: [PATCH 4/5] [DAGCombiner] Fold fp-uint-fp to fabs + ftrunc

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  25 +--
 llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll    |  49 +---
 .../sve-streaming-mode-cvt-fp-int-fp.ll       |  90 +++-----
 .../amdgpu-simplify-libcall-pow-codegen.ll    |  37 ++--
 llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll     | 209 ++++++++++++++++++
 5 files changed, 265 insertions(+), 145 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 65cea64e0982d..3f6ac9d1e6404 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18865,10 +18865,9 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
   // We can fold the fpto[us]i -> [us]itofp pattern into a single ftrunc.
   // If NoSignedZerosFPMath is enabled, this is a direct replacement.
   // Otherwise, for strict math, we must handle edge cases:
-  // 1. For signed conversions, clamp out-of-range values to the valid
-  // integer range before the trunc.
-  // 2. For unsigned conversions, use FABS. A negative float becomes integer 0,
-  // which must convert back to +0.0. FTRUNC on its own could produce -0.0.
+  // For unsigned conversions, use FABS to handle negative inputs: -0.0, for
+  // example, first becomes integer 0, which converts back to +0.0, whereas
+  // FTRUNC on its own could produce -0.0.
 
   // FIXME: We should be able to use node-level FMF here.
EVT VT = N->getValueType(0); @@ -18882,24 +18881,6 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, N0.getOperand(0).getValueType() == VT) { if (DAG.getTarget().Options.NoSignedZerosFPMath) return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0)); - - // Strict math: clamp to the signed integer range before truncating. - unsigned IntWidth = N0.getValueSizeInBits(); - APInt APMax = APInt::getSignedMaxValue(IntWidth); - APInt APMin = APInt::getSignedMinValue(IntWidth); - - APFloat MaxAPF(VT.getFltSemantics()); - MaxAPF.convertFromAPInt(APMax, true, APFloat::rmTowardZero); - APFloat MinAPF(VT.getFltSemantics()); - MinAPF.convertFromAPInt(APMin, true, APFloat::rmTowardZero); - - SDValue MaxFP = DAG.getConstantFP(MaxAPF, DL, VT); - SDValue MinFP = DAG.getConstantFP(MinAPF, DL, VT); - - SDValue Clamped = DAG.getNode( - ISD::FMINNUM, DL, VT, - DAG.getNode(ISD::FMAXNUM, DL, VT, N0->getOperand(0), MinFP), MaxFP); - return DAG.getNode(ISD::FTRUNC, DL, VT, Clamped); } if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT && diff --git a/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll index f68188fdf54ce..1207de746894b 100644 --- a/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll +++ b/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll @@ -4,13 +4,8 @@ define double @t1(double %x) { ; CHECK-LABEL: t1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: fmov d1, x8 -; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff -; CHECK-NEXT: fmaxnm d0, d0, d1 -; CHECK-NEXT: fmov d1, x8 -; CHECK-NEXT: fminnm d0, d0, d1 -; CHECK-NEXT: frintz d0, d0 +; CHECK-NEXT: fcvtzs d0, d0 +; CHECK-NEXT: scvtf d0, d0 ; CHECK-NEXT: ret entry: %conv = fptosi double %x to i64 @@ -21,12 +16,8 @@ entry: define float @t2(float %x) { ; CHECK-LABEL: t2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.2s, #207, lsl #24 -; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff -; CHECK-NEXT: fmaxnm s0, s0, s1 -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: fminnm s0, s0, s1 -; CHECK-NEXT: frintz s0, s0 +; CHECK-NEXT: fcvtzs s0, s0 +; CHECK-NEXT: scvtf s0, s0 ; CHECK-NEXT: ret entry: %conv = fptosi float %x to i32 @@ -37,13 +28,8 @@ entry: define half @t3(half %x) { ; CHECK-LABEL: t3: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #64511 // =0xfbff -; CHECK-NEXT: fmov h1, w8 -; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: fmaxnm h0, h0, h1 -; CHECK-NEXT: fmov h1, w8 -; CHECK-NEXT: fminnm h0, h0, h1 -; CHECK-NEXT: frintz h0, h0 +; CHECK-NEXT: fcvtzs h0, h0 +; CHECK-NEXT: scvtf h0, h0 ; CHECK-NEXT: ret entry: %conv = fptosi half %x to i32 @@ -184,14 +170,8 @@ entry: define i64 @tests_f64_multiuse(double %x) { ; CHECK-LABEL: tests_f64_multiuse: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: fmov d1, x8 -; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff -; CHECK-NEXT: fmov d2, x8 ; CHECK-NEXT: fcvtzs x8, d0 -; CHECK-NEXT: fmaxnm d1, d0, d1 -; CHECK-NEXT: fminnm d1, d1, d2 -; CHECK-NEXT: frintz d1, d1 +; CHECK-NEXT: scvtf d1, x8 ; CHECK-NEXT: fcmp d0, d1 ; CHECK-NEXT: csel x0, x8, xzr, eq ; CHECK-NEXT: ret @@ -206,13 +186,8 @@ entry: define i32 @tests_f32_multiuse(float %x) { ; CHECK-LABEL: tests_f32_multiuse: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: movi v1.2s, #207, lsl #24 -; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff -; CHECK-NEXT: fmov s2, w8 ; CHECK-NEXT: fcvtzs w8, s0 -; CHECK-NEXT: fmaxnm s1, 
s0, s1 -; CHECK-NEXT: fminnm s1, s1, s2 -; CHECK-NEXT: frintz s1, s1 +; CHECK-NEXT: scvtf s1, w8 ; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: csel w0, w8, wzr, eq ; CHECK-NEXT: ret @@ -227,14 +202,8 @@ entry: define i32 @tests_f16_multiuse(half %x) { ; CHECK-LABEL: tests_f16_multiuse: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #64511 // =0xfbff -; CHECK-NEXT: fmov h1, w8 -; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: fmov h2, w8 ; CHECK-NEXT: fcvtzs w8, h0 -; CHECK-NEXT: fmaxnm h1, h0, h1 -; CHECK-NEXT: fminnm h1, h1, h2 -; CHECK-NEXT: frintz h1, h1 +; CHECK-NEXT: scvtf h1, w8 ; CHECK-NEXT: fcmp h0, h1 ; CHECK-NEXT: csel w0, w8, wzr, eq ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll index bed62a428939e..4ad5b38b256fe 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll @@ -9,35 +9,23 @@ target triple = "aarch64-unknown-linux-gnu" define double @t1(double %x) { ; CHECK-LABEL: t1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: fmov d1, x8 -; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff -; CHECK-NEXT: fmaxnm d0, d0, d1 -; CHECK-NEXT: fmov d1, x8 -; CHECK-NEXT: fminnm d0, d0, d1 -; CHECK-NEXT: frintz d0, d0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: scvtf z0.d, p0/m, z0.d +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; ; USE-NEON-NO-GPRS-LABEL: t1: ; USE-NEON-NO-GPRS: // %bb.0: // %entry -; USE-NEON-NO-GPRS-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 -; USE-NEON-NO-GPRS-NEXT: fmov d1, x8 -; USE-NEON-NO-GPRS-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff -; USE-NEON-NO-GPRS-NEXT: fmaxnm d0, d0, d1 -; USE-NEON-NO-GPRS-NEXT: fmov d1, x8 -; USE-NEON-NO-GPRS-NEXT: fminnm d0, d0, d1 -; USE-NEON-NO-GPRS-NEXT: frintz d0, d0 +; USE-NEON-NO-GPRS-NEXT: fcvtzs d0, d0 +; USE-NEON-NO-GPRS-NEXT: scvtf d0, d0 ; USE-NEON-NO-GPRS-NEXT: ret ; ; NONEON-NOSVE-LABEL: t1: ; NONEON-NOSVE: // %bb.0: // %entry -; NONEON-NOSVE-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 -; NONEON-NOSVE-NEXT: fmov d1, x8 -; NONEON-NOSVE-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff -; NONEON-NOSVE-NEXT: fmaxnm d0, d0, d1 -; NONEON-NOSVE-NEXT: fmov d1, x8 -; NONEON-NOSVE-NEXT: fminnm d0, d0, d1 -; NONEON-NOSVE-NEXT: frintz d0, d0 +; NONEON-NOSVE-NEXT: fcvtzs x8, d0 +; NONEON-NOSVE-NEXT: scvtf d0, x8 ; NONEON-NOSVE-NEXT: ret entry: %conv = fptosi double %x to i64 @@ -48,35 +36,23 @@ entry: define float @t2(float %x) { ; CHECK-LABEL: t2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #-822083584 // =0xcf000000 -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff -; CHECK-NEXT: fmaxnm s0, s0, s1 -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: fminnm s0, s0, s1 -; CHECK-NEXT: frintz s0, s0 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s +; CHECK-NEXT: scvtf z0.s, p0/m, z0.s +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 ; CHECK-NEXT: ret ; ; USE-NEON-NO-GPRS-LABEL: t2: ; USE-NEON-NO-GPRS: // %bb.0: // %entry -; USE-NEON-NO-GPRS-NEXT: mov w8, #-822083584 // =0xcf000000 -; USE-NEON-NO-GPRS-NEXT: fmov s1, w8 -; USE-NEON-NO-GPRS-NEXT: mov w8, #1325400063 // =0x4effffff -; USE-NEON-NO-GPRS-NEXT: 
fmaxnm s0, s0, s1 -; USE-NEON-NO-GPRS-NEXT: fmov s1, w8 -; USE-NEON-NO-GPRS-NEXT: fminnm s0, s0, s1 -; USE-NEON-NO-GPRS-NEXT: frintz s0, s0 +; USE-NEON-NO-GPRS-NEXT: fcvtzs s0, s0 +; USE-NEON-NO-GPRS-NEXT: scvtf s0, s0 ; USE-NEON-NO-GPRS-NEXT: ret ; ; NONEON-NOSVE-LABEL: t2: ; NONEON-NOSVE: // %bb.0: // %entry -; NONEON-NOSVE-NEXT: mov w8, #-822083584 // =0xcf000000 -; NONEON-NOSVE-NEXT: fmov s1, w8 -; NONEON-NOSVE-NEXT: mov w8, #1325400063 // =0x4effffff -; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 -; NONEON-NOSVE-NEXT: fmov s1, w8 -; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 -; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: scvtf s0, w8 ; NONEON-NOSVE-NEXT: ret entry: %conv = fptosi float %x to i32 @@ -87,36 +63,24 @@ entry: define half @t3(half %x) { ; CHECK-LABEL: t3: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: adrp x8, .LCPI2_0 -; CHECK-NEXT: ldr h1, [x8, :lo12:.LCPI2_0] -; CHECK-NEXT: adrp x8, .LCPI2_1 -; CHECK-NEXT: fmaxnm h0, h0, h1 -; CHECK-NEXT: ldr h1, [x8, :lo12:.LCPI2_1] -; CHECK-NEXT: fminnm h0, h0, h1 -; CHECK-NEXT: frintz h0, h0 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h +; CHECK-NEXT: scvtf z0.h, p0/m, z0.s +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 ; CHECK-NEXT: ret ; ; USE-NEON-NO-GPRS-LABEL: t3: ; USE-NEON-NO-GPRS: // %bb.0: // %entry -; USE-NEON-NO-GPRS-NEXT: adrp x8, .LCPI2_0 -; USE-NEON-NO-GPRS-NEXT: ldr h1, [x8, :lo12:.LCPI2_0] -; USE-NEON-NO-GPRS-NEXT: adrp x8, .LCPI2_1 -; USE-NEON-NO-GPRS-NEXT: fmaxnm h0, h0, h1 -; USE-NEON-NO-GPRS-NEXT: ldr h1, [x8, :lo12:.LCPI2_1] -; USE-NEON-NO-GPRS-NEXT: fminnm h0, h0, h1 -; USE-NEON-NO-GPRS-NEXT: frintz h0, h0 +; USE-NEON-NO-GPRS-NEXT: fcvtzs h0, h0 +; USE-NEON-NO-GPRS-NEXT: scvtf h0, h0 ; USE-NEON-NO-GPRS-NEXT: ret ; ; NONEON-NOSVE-LABEL: t3: ; NONEON-NOSVE: // %bb.0: // %entry ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov w8, #-822083584 // =0xcf000000 -; NONEON-NOSVE-NEXT: fmov s1, w8 -; NONEON-NOSVE-NEXT: mov w8, #1325400063 // =0x4effffff -; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 -; NONEON-NOSVE-NEXT: fmov s1, w8 -; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 -; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: scvtf s0, w8 ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll index 85f99698951b2..0329f23ea434f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll @@ -60,16 +60,15 @@ define half @test_pow_fast_f16__integral_y(half %x, i32 %y.i) { ; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1 ; CHECK-NEXT: v_log_f16_e64 v3, |v0| ; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CHECK-NEXT: v_cvt_f32_f16_e32 v2, v1 -; CHECK-NEXT: v_max_f16_e32 v1, 0xfbff, v1 -; CHECK-NEXT: v_min_f16_e32 v1, 0x7bff, v1 -; CHECK-NEXT: v_trunc_f16_e32 v1, v1 -; CHECK-NEXT: v_cvt_i32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_f16_e32 v1, v3, v1 -; CHECK-NEXT: v_exp_f16_e32 v1, v1 -; CHECK-NEXT: v_lshlrev_b16_e32 v2, 15, v2 -; CHECK-NEXT: v_and_b32_e32 v0, v2, v0 -; CHECK-NEXT: v_or_b32_e32 v0, v0, v1 +; CHECK-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CHECK-NEXT: v_cvt_i32_f32_e32 v1, v1 +; CHECK-NEXT: v_cvt_f32_i32_e32 v2, v1 +; CHECK-NEXT: v_lshlrev_b16_e32 v1, 15, v1 +; CHECK-NEXT: v_and_b32_e32 v0, v1, v0 +; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CHECK-NEXT: 
v_mul_f16_e32 v2, v3, v2 +; CHECK-NEXT: v_exp_f16_e32 v2, v2 +; CHECK-NEXT: v_or_b32_e32 v0, v0, v2 ; CHECK-NEXT: s_setpc_b64 s[30:31] %y = sitofp i32 %y.i to half %pow = tail call fast half @_Z3powDhDh(half %x, half %y) @@ -80,30 +79,28 @@ define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) { ; CHECK-LABEL: test_pow_fast_f32__integral_y: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1 ; CHECK-NEXT: s_mov_b32 s4, 0x800000 ; CHECK-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 32, vcc +; CHECK-NEXT: v_cvt_i32_f32_e32 v1, v1 ; CHECK-NEXT: v_ldexp_f32 v3, |v0|, v3 ; CHECK-NEXT: v_log_f32_e32 v3, v3 -; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1 ; CHECK-NEXT: v_mov_b32_e32 v2, 0x42000000 +; CHECK-NEXT: v_cvt_f32_i32_e32 v4, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; CHECK-NEXT: v_sub_f32_e32 v2, v3, v2 -; CHECK-NEXT: v_max_f32_e32 v3, 0xcf000000, v1 -; CHECK-NEXT: v_min_f32_e32 v3, 0x4effffff, v3 -; CHECK-NEXT: v_trunc_f32_e32 v3, v3 -; CHECK-NEXT: v_mul_f32_e32 v4, v2, v3 ; CHECK-NEXT: s_mov_b32 s4, 0xc2fc0000 +; CHECK-NEXT: v_mul_f32_e32 v3, v2, v4 ; CHECK-NEXT: v_mov_b32_e32 v5, 0x42800000 -; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v4, 0, v5, vcc -; CHECK-NEXT: v_fma_f32 v2, v2, v3, v4 +; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3 +; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; CHECK-NEXT: v_fma_f32 v2, v2, v4, v3 ; CHECK-NEXT: v_exp_f32_e32 v2, v2 -; CHECK-NEXT: v_cvt_i32_f32_e32 v1, v1 ; CHECK-NEXT: v_not_b32_e32 v3, 63 ; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc -; CHECK-NEXT: v_ldexp_f32 v2, v2, v3 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; CHECK-NEXT: v_ldexp_f32 v2, v2, v3 ; CHECK-NEXT: v_and_or_b32 v0, v1, v0, v2 ; CHECK-NEXT: s_setpc_b64 s[30:31] %y = sitofp i32 %y.i to float diff --git a/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll b/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll new file mode 100644 index 0000000000000..4087a76b18dc4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll @@ -0,0 +1,209 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=amdgcn | FileCheck %s + +define amdgpu_kernel void @fptoui_f32_to_i16_to_f32(ptr addrspace(1) %out, float %x) { +; CHECK-LABEL: fptoui_f32_to_i16_to_f32: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dword s6, s[4:5], 0xb +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CHECK-NEXT: s_mov_b32 s3, 0xf000 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_trunc_f32_e64 v0, |s6| +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CHECK-NEXT: s_endpgm +entry: + %ui = fptoui float %x to i16 + %fp = uitofp i16 %ui to float + store float %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f32_to_i32_to_f32(ptr addrspace(1) %out, float %x) { +; CHECK-LABEL: fptoui_f32_to_i32_to_f32: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dword s6, s[4:5], 0xb +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CHECK-NEXT: s_mov_b32 s3, 0xf000 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_trunc_f32_e64 v0, |s6| +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CHECK-NEXT: s_endpgm +entry: + %ui = fptoui float %x to i32 + %fp = uitofp i32 %ui to float + store float %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f32_to_i64_to_f32(ptr addrspace(1) %out, float %x) { +; CHECK-LABEL: 
fptoui_f32_to_i64_to_f32: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dword s6, s[4:5], 0xb +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CHECK-NEXT: s_mov_b32 s3, 0xf000 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_trunc_f32_e64 v0, |s6| +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CHECK-NEXT: s_endpgm +entry: + %ui = fptoui float %x to i64 + %fp = uitofp i64 %ui to float + store float %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f16_to_i16_to_f16(ptr addrspace(1) %out, half %x) { +; CHECK-LABEL: fptoui_f16_to_i16_to_f16: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dword s6, s[4:5], 0xb +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CHECK-NEXT: s_mov_b32 s3, 0xf000 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_cvt_f32_f16_e32 v0, s6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0 +; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 +; CHECK-NEXT: s_endpgm +entry: + %ui = fptoui half %x to i16 + %fp = uitofp i16 %ui to half + store half %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f16_to_i32_to_f16(ptr addrspace(1) %out, half %x) { +; CHECK-LABEL: fptoui_f16_to_i32_to_f16: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dword s6, s[4:5], 0xb +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CHECK-NEXT: s_mov_b32 s3, 0xf000 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_cvt_f32_f16_e64 v0, |s6| +; CHECK-NEXT: v_trunc_f32_e32 v0, v0 +; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 +; CHECK-NEXT: s_endpgm +entry: + %ui = fptoui half %x to i32 + %fp = uitofp i32 %ui to half + store half %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f16_to_i64_to_f16(ptr addrspace(1) %out, half %x) { +; CHECK-LABEL: fptoui_f16_to_i64_to_f16: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dword s6, s[4:5], 0xb +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; CHECK-NEXT: s_mov_b32 s3, 0xf000 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_cvt_f32_f16_e64 v0, |s6| +; CHECK-NEXT: v_trunc_f32_e32 v0, v0 +; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 +; CHECK-NEXT: s_endpgm +entry: + %ui = fptoui half %x to i64 + %fp = uitofp i64 %ui to half + store half %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f64_to_i16_to_f64(ptr addrspace(1) %out, double %x) { +; CHECK-LABEL: fptoui_f64_to_i16_to_f64: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CHECK-NEXT: s_mov_b32 s7, 0xf000 +; CHECK-NEXT: s_mov_b32 s6, -1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, s0 +; CHECK-NEXT: s_mov_b32 s5, s1 +; CHECK-NEXT: v_cvt_u32_f64_e32 v0, s[2:3] +; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CHECK-NEXT: s_endpgm +entry: + %ui = fptoui double %x to i16 + %fp = uitofp i16 %ui to double + store double %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f64_to_i32_to_f64(ptr addrspace(1) %out, double %x) { +; CHECK-LABEL: fptoui_f64_to_i32_to_f64: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; CHECK-NEXT: s_mov_b32 s7, 0xf000 +; CHECK-NEXT: s_mov_b32 s6, -1 +; CHECK-NEXT: s_waitcnt 
lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, s0 +; CHECK-NEXT: s_mov_b32 s5, s1 +; CHECK-NEXT: v_cvt_u32_f64_e32 v0, s[2:3] +; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; CHECK-NEXT: s_endpgm +entry: + %ui = fptoui double %x to i32 + %fp = uitofp i32 %ui to double + store double %fp, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @fptoui_f64_to_i64_to_f64(ptr addrspace(1) %out, double %x) { +; CHECK-LABEL: fptoui_f64_to_i64_to_f64: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; CHECK-NEXT: s_mov_b32 s3, 0xf000 +; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: s_mov_b32 s9, 0xfffff +; CHECK-NEXT: v_not_b32_e32 v2, 31 +; CHECK-NEXT: v_mov_b32_e32 v0, -1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0x3fefffff +; CHECK-NEXT: s_mov_b32 s10, 0 +; CHECK-NEXT: s_mov_b32 s11, 0xc1f00000 +; CHECK-NEXT: s_mov_b32 s8, s2 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s0, s4 +; CHECK-NEXT: s_mov_b32 s1, s5 +; CHECK-NEXT: s_bfe_u32 s4, s7, 0xb0014 +; CHECK-NEXT: s_and_b32 s12, s7, 0x80000000 +; CHECK-NEXT: s_add_i32 s13, s4, 0xfffffc01 +; CHECK-NEXT: s_lshr_b64 s[4:5], s[8:9], s13 +; CHECK-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] +; CHECK-NEXT: s_cmp_lt_i32 s13, 0 +; CHECK-NEXT: s_cselect_b32 s4, 0, s4 +; CHECK-NEXT: s_cselect_b32 s5, s12, s5 +; CHECK-NEXT: s_cmp_gt_i32 s13, 51 +; CHECK-NEXT: s_cselect_b32 s5, s7, s5 +; CHECK-NEXT: s_cselect_b32 s4, s6, s4 +; CHECK-NEXT: v_ldexp_f64 v[2:3], s[4:5], v2 +; CHECK-NEXT: v_mov_b32_e32 v4, s4 +; CHECK-NEXT: v_mov_b32_e32 v5, s5 +; CHECK-NEXT: v_fract_f64_e32 v[6:7], v[2:3] +; CHECK-NEXT: v_min_f64 v[0:1], v[6:7], v[0:1] +; CHECK-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 3 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; CHECK-NEXT: v_add_f64 v[0:1], v[2:3], -v[0:1] +; CHECK-NEXT: v_fma_f64 v[2:3], v[0:1], s[10:11], v[4:5] +; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1] +; CHECK-NEXT: v_cvt_u32_f64_e32 v2, v[2:3] +; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; CHECK-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 +; CHECK-NEXT: v_cvt_f64_u32_e32 v[2:3], v2 +; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CHECK-NEXT: s_endpgm +entry: + %ui = fptoui double %x to i64 + %fp = uitofp i64 %ui to double + store double %fp, ptr addrspace(1) %out + ret void +} From 36447e7643d81e16160e2404887f413a105b4530 Mon Sep 17 00:00:00 2001 From: Yi-Chi Lee Date: Fri, 10 Oct 2025 09:23:16 -0500 Subject: [PATCH 5/5] update testcase (test with and without legal 16-bit operations) --- llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll | 359 ++++++++++++++-------- 1 file changed, 223 insertions(+), 136 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll b/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll index 4087a76b18dc4..49204f84acb85 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoui_uitofp.ll @@ -1,17 +1,28 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc < %s -mtriple=amdgcn | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s define amdgpu_kernel void @fptoui_f32_to_i16_to_f32(ptr addrspace(1) %out, float %x) { -; CHECK-LABEL: fptoui_f32_to_i16_to_f32: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s6, s[4:5], 0xb -; CHECK-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x9 -; CHECK-NEXT: s_mov_b32 s3, 0xf000 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_trunc_f32_e64 v0, |s6| -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; CHECK-NEXT: s_endpgm +; GFX6-LABEL: fptoui_f32_to_i16_to_f32: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_trunc_f32_e64 v0, |s6| +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f32_to_i16_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f32_e64 v1, |s2| +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm entry: %ui = fptoui float %x to i16 %fp = uitofp i16 %ui to float @@ -20,16 +31,26 @@ entry: } define amdgpu_kernel void @fptoui_f32_to_i32_to_f32(ptr addrspace(1) %out, float %x) { -; CHECK-LABEL: fptoui_f32_to_i32_to_f32: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s6, s[4:5], 0xb -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; CHECK-NEXT: s_mov_b32 s3, 0xf000 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_trunc_f32_e64 v0, |s6| -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; CHECK-NEXT: s_endpgm +; GFX6-LABEL: fptoui_f32_to_i32_to_f32: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_trunc_f32_e64 v0, |s6| +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f32_to_i32_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f32_e64 v1, |s2| +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm entry: %ui = fptoui float %x to i32 %fp = uitofp i32 %ui to float @@ -38,16 +59,26 @@ entry: } define amdgpu_kernel void @fptoui_f32_to_i64_to_f32(ptr addrspace(1) %out, float %x) { -; CHECK-LABEL: fptoui_f32_to_i64_to_f32: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s6, s[4:5], 0xb -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; CHECK-NEXT: s_mov_b32 s3, 0xf000 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_trunc_f32_e64 v0, |s6| -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; CHECK-NEXT: s_endpgm +; GFX6-LABEL: fptoui_f32_to_i64_to_f32: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_trunc_f32_e64 v0, |s6| +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f32_to_i64_to_f32: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f32_e64 v1, |s2| +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm entry: %ui = fptoui 
float %x to i64 %fp = uitofp i64 %ui to float @@ -56,19 +87,30 @@ entry: } define amdgpu_kernel void @fptoui_f16_to_i16_to_f16(ptr addrspace(1) %out, half %x) { -; CHECK-LABEL: fptoui_f16_to_i16_to_f16: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s6, s[4:5], 0xb -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; CHECK-NEXT: s_mov_b32 s3, 0xf000 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cvt_f32_f16_e32 v0, s6 -; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 -; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0 -; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 -; CHECK-NEXT: s_endpgm +; GFX6-LABEL: fptoui_f16_to_i16_to_f16: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f16_to_i16_to_f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f16_e64 v1, |s2| +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm entry: %ui = fptoui half %x to i16 %fp = uitofp i16 %ui to half @@ -77,18 +119,29 @@ entry: } define amdgpu_kernel void @fptoui_f16_to_i32_to_f16(ptr addrspace(1) %out, half %x) { -; CHECK-LABEL: fptoui_f16_to_i32_to_f16: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s6, s[4:5], 0xb -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; CHECK-NEXT: s_mov_b32 s3, 0xf000 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cvt_f32_f16_e64 v0, |s6| -; CHECK-NEXT: v_trunc_f32_e32 v0, v0 -; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 -; CHECK-NEXT: s_endpgm +; GFX6-LABEL: fptoui_f16_to_i32_to_f16: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e64 v0, |s0| +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: v_trunc_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f16_to_i32_to_f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f16_e64 v1, |s2| +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm entry: %ui = fptoui half %x to i32 %fp = uitofp i32 %ui to half @@ -97,18 +150,29 @@ entry: } define amdgpu_kernel void @fptoui_f16_to_i64_to_f16(ptr addrspace(1) %out, half %x) { -; CHECK-LABEL: fptoui_f16_to_i64_to_f16: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s6, s[4:5], 0xb -; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; CHECK-NEXT: s_mov_b32 s3, 0xf000 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_cvt_f32_f16_e64 v0, |s6| -; CHECK-NEXT: v_trunc_f32_e32 v0, v0 -; 
CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 -; CHECK-NEXT: s_endpgm +; GFX6-LABEL: fptoui_f16_to_i64_to_f16: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e64 v0, |s0| +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: v_trunc_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f16_to_i64_to_f16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f16_e64 v1, |s2| +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm entry: %ui = fptoui half %x to i64 %fp = uitofp i64 %ui to half @@ -117,18 +181,25 @@ entry: } define amdgpu_kernel void @fptoui_f64_to_i16_to_f64(ptr addrspace(1) %out, double %x) { -; CHECK-LABEL: fptoui_f64_to_i16_to_f64: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CHECK-NEXT: s_mov_b32 s7, 0xf000 -; CHECK-NEXT: s_mov_b32 s6, -1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s4, s0 -; CHECK-NEXT: s_mov_b32 s5, s1 -; CHECK-NEXT: v_cvt_u32_f64_e32 v0, s[2:3] -; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; CHECK-NEXT: s_endpgm +; GFX6-LABEL: fptoui_f64_to_i16_to_f64: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cvt_u32_f64_e32 v0, s[2:3] +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f64_to_i16_to_f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f64_e64 v[0:1], |s[2:3]| +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm entry: %ui = fptoui double %x to i16 %fp = uitofp i16 %ui to double @@ -137,18 +208,25 @@ entry: } define amdgpu_kernel void @fptoui_f64_to_i32_to_f64(ptr addrspace(1) %out, double %x) { -; CHECK-LABEL: fptoui_f64_to_i32_to_f64: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; CHECK-NEXT: s_mov_b32 s7, 0xf000 -; CHECK-NEXT: s_mov_b32 s6, -1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s4, s0 -; CHECK-NEXT: s_mov_b32 s5, s1 -; CHECK-NEXT: v_cvt_u32_f64_e32 v0, s[2:3] -; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; CHECK-NEXT: s_endpgm +; GFX6-LABEL: fptoui_f64_to_i32_to_f64: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cvt_u32_f64_e32 v0, s[2:3] +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f64_to_i32_to_f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f64_e64 v[0:1], |s[2:3]| 
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm entry: %ui = fptoui double %x to i32 %fp = uitofp i32 %ui to double @@ -157,50 +235,59 @@ entry: } define amdgpu_kernel void @fptoui_f64_to_i64_to_f64(ptr addrspace(1) %out, double %x) { -; CHECK-LABEL: fptoui_f64_to_i64_to_f64: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 -; CHECK-NEXT: s_mov_b32 s3, 0xf000 -; CHECK-NEXT: s_mov_b32 s2, -1 -; CHECK-NEXT: s_mov_b32 s9, 0xfffff -; CHECK-NEXT: v_not_b32_e32 v2, 31 -; CHECK-NEXT: v_mov_b32_e32 v0, -1 -; CHECK-NEXT: v_mov_b32_e32 v1, 0x3fefffff -; CHECK-NEXT: s_mov_b32 s10, 0 -; CHECK-NEXT: s_mov_b32 s11, 0xc1f00000 -; CHECK-NEXT: s_mov_b32 s8, s2 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s0, s4 -; CHECK-NEXT: s_mov_b32 s1, s5 -; CHECK-NEXT: s_bfe_u32 s4, s7, 0xb0014 -; CHECK-NEXT: s_and_b32 s12, s7, 0x80000000 -; CHECK-NEXT: s_add_i32 s13, s4, 0xfffffc01 -; CHECK-NEXT: s_lshr_b64 s[4:5], s[8:9], s13 -; CHECK-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] -; CHECK-NEXT: s_cmp_lt_i32 s13, 0 -; CHECK-NEXT: s_cselect_b32 s4, 0, s4 -; CHECK-NEXT: s_cselect_b32 s5, s12, s5 -; CHECK-NEXT: s_cmp_gt_i32 s13, 51 -; CHECK-NEXT: s_cselect_b32 s5, s7, s5 -; CHECK-NEXT: s_cselect_b32 s4, s6, s4 -; CHECK-NEXT: v_ldexp_f64 v[2:3], s[4:5], v2 -; CHECK-NEXT: v_mov_b32_e32 v4, s4 -; CHECK-NEXT: v_mov_b32_e32 v5, s5 -; CHECK-NEXT: v_fract_f64_e32 v[6:7], v[2:3] -; CHECK-NEXT: v_min_f64 v[0:1], v[6:7], v[0:1] -; CHECK-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 3 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; CHECK-NEXT: v_add_f64 v[0:1], v[2:3], -v[0:1] -; CHECK-NEXT: v_fma_f64 v[2:3], v[0:1], s[10:11], v[4:5] -; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1] -; CHECK-NEXT: v_cvt_u32_f64_e32 v2, v[2:3] -; CHECK-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 -; CHECK-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 -; CHECK-NEXT: v_cvt_f64_u32_e32 v[2:3], v2 -; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] -; CHECK-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; CHECK-NEXT: s_endpgm +; GFX6-LABEL: fptoui_f64_to_i64_to_f64: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s5, 0xfffff +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: v_not_b32_e32 v0, 31 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_u32 s7, s3, 0xb0014 +; GFX6-NEXT: s_addk_i32 s7, 0xfc01 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s7 +; GFX6-NEXT: s_and_b32 s8, s3, 0x80000000 +; GFX6-NEXT: s_andn2_b64 s[4:5], s[2:3], s[4:5] +; GFX6-NEXT: s_cmp_lt_i32 s7, 0 +; GFX6-NEXT: s_cselect_b32 s4, 0, s4 +; GFX6-NEXT: s_cselect_b32 s5, s8, s5 +; GFX6-NEXT: s_cmp_gt_i32 s7, 51 +; GFX6-NEXT: s_cselect_b32 s3, s3, s5 +; GFX6-NEXT: s_cselect_b32 s2, s2, s4 +; GFX6-NEXT: v_ldexp_f64 v[0:1], s[2:3], v0 +; GFX6-NEXT: v_mov_b32_e32 v4, -1 +; GFX6-NEXT: v_fract_f64_e32 v[2:3], v[0:1] +; GFX6-NEXT: v_mov_b32_e32 v5, 0x3fefffff +; GFX6-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] +; GFX6-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 3 +; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v2, s2 +; GFX6-NEXT: s_mov_b32 s5, 0xc1f00000 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: v_fma_f64 v[2:3], v[0:1], s[4:5], v[2:3] +; GFX6-NEXT: v_cvt_u32_f64_e32 v0, v[0:1] +; GFX6-NEXT: v_cvt_u32_f64_e32 v2, v[2:3] +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 
s4, s0 +; GFX6-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 +; GFX6-NEXT: v_cvt_f64_u32_e32 v[2:3], v2 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32 +; GFX6-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX9-LABEL: fptoui_f64_to_i64_to_f64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_trunc_f64_e64 v[0:1], |s[2:3]| +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm entry: %ui = fptoui double %x to i64 %fp = uitofp i64 %ui to double