[AMDGPU] Fix legalization of frem(-0.0, y) #70448

jayfoad · 2023-10-27T12:14:51Z

We legalized frem(x, y) -> x - y * trunc(x / y). When x is -0.0 this
evaluates to +0.0 but the result should be -0.0.

Fix this by legalizing to copysign(x - y * trunc(x / y), x).

We legalized frem(x, y) -> x - y * trunc(x / y). When x is -0.0 this evaluates to +0.0 but the result should be -0.0. Fix this by legalizing to copysign(x - y * trunc(x / y), x).

llvmbot · 2023-10-27T12:15:58Z

@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-llvm-globalisel

Author: Jay Foad (jayfoad)

Changes

We legalized frem(x, y) -> x - y * trunc(x / y). When x is -0.0 this
evaluates to +0.0 but the result should be -0.0.

Fix this by legalizing to copysign(x - y * trunc(x / y), x).

Patch is 139.68 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/70448.diff

5 Files Affected:

(modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+6-2)
(modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+13-1)
(modified) llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll (+164-21)
(modified) llvm/test/CodeGen/AMDGPU/frem.ll (+544-336)
(modified) llvm/test/CodeGen/AMDGPU/wave32.ll (+8-4)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index adf4e0139e03c1d..28f6ad62f2c37ed 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2267,19 +2267,23 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
   return DAG.getMergeValues(Res, DL);
 }
 
-// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
+// (frem x, y) -> (fcopysign (fma (fneg (ftrunc (fdiv x, y))), y, x), x)
+// The fcopysign is only required to get the correct result -0.0 when x is -0.0
+// (and y is non-zero). With NSZ it can be dropped.
 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
   EVT VT = Op.getValueType();
   auto Flags = Op->getFlags();
   SDValue X = Op.getOperand(0);
   SDValue Y = Op.getOperand(1);
+  bool NSZ = mayIgnoreSignedZero(Op);
 
   SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
   SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
   // TODO: For f32 use FMAD instead if !hasFastFMA32?
-  return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
+  SDValue FMA = DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
+  return NSZ ? FMA : DAG.getNode(ISD::FCOPYSIGN , SL, VT, FMA, X);
 }
 
 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 3d70ed150df12f8..0775d8a3c2396f7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2341,6 +2341,14 @@ bool AMDGPULegalizerInfo::legalizeFceil(
   return true;
 }
 
+static bool allowNoSignedZeros(const MachineFunction &MF, unsigned Flags) {
+  return (Flags & MachineInstr::FmNsz) ||
+      MF.getTarget().Options.NoSignedZerosFPMath;
+}
+
+// Legalize frem(x, y) -> copysign(x - y * trunc(x / y), x)
+// The copysign is only required to get the correct result -0.0 when x is -0.0
+// (and y is non-zero). With NSZ it can be dropped.
 bool AMDGPULegalizerInfo::legalizeFrem(
   MachineInstr &MI, MachineRegisterInfo &MRI,
   MachineIRBuilder &B) const {
@@ -2349,11 +2357,15 @@ bool AMDGPULegalizerInfo::legalizeFrem(
     Register Src1Reg = MI.getOperand(2).getReg();
     auto Flags = MI.getFlags();
     LLT Ty = MRI.getType(DstReg);
+    bool NSZ = allowNoSignedZeros(B.getMF(), Flags);
 
     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
-    B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
+    DstOp FMADst = NSZ ? DstOp(DstReg) : DstOp(Ty);
+    auto FMA = B.buildFMA(FMADst, Neg, Src1Reg, Src0Reg, Flags);
+    if (!NSZ)
+      B.buildFCopysign(DstReg, FMA, Src0Reg);
     MI.eraseFromParent();
     return true;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index 0417b97a53c0ff3..da7fca95d5eef07 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -29,7 +29,10 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
 ; CI-NEXT:    v_trunc_f32_e32 v2, v2
-; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
+; CI-NEXT:    v_fma_f32 v1, -v2, v1, v0
+; CI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; CI-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
+; CI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    buffer_store_short v0, off, s[4:7], 0
@@ -46,12 +49,15 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s0
+; VI-NEXT:    s_and_b32 s0, s2, 0xffff8000
 ; VI-NEXT:    v_rcp_f32_e32 v2, v2
 ; VI-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; VI-NEXT:    v_div_fixup_f16 v0, v0, v1, s2
 ; VI-NEXT:    v_trunc_f16_e32 v0, v0
-; VI-NEXT:    v_fma_f16 v2, -v0, v1, s2
+; VI-NEXT:    v_fma_f16 v0, -v0, v1, s2
+; VI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; VI-NEXT:    v_or_b32_e32 v2, s0, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_short v[0:1], v2
@@ -126,7 +132,10 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; CI-NEXT:    v_rcp_f32_e32 v2, v1
 ; CI-NEXT:    v_mul_f32_e32 v2, v0, v2
 ; CI-NEXT:    v_trunc_f32_e32 v2, v2
-; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
+; CI-NEXT:    v_fma_f32 v1, -v2, v1, v0
+; CI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; CI-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
+; CI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; CI-NEXT:    s_endpgm
@@ -143,7 +152,10 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    v_rcp_f16_e32 v0, s0
 ; VI-NEXT:    v_mul_f16_e32 v0, s2, v0
 ; VI-NEXT:    v_trunc_f16_e32 v0, v0
-; VI-NEXT:    v_fma_f16 v2, -v0, s0, v1
+; VI-NEXT:    v_fma_f16 v0, -v0, s0, v1
+; VI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; VI-NEXT:    s_and_b32 s0, s2, 0xffff8000
+; VI-NEXT:    v_or_b32_e32 v2, s0, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_short v[0:1], v2
@@ -178,11 +190,14 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; CI-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; CI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
+; CI-NEXT:    s_and_b32 s0, s2, 0x80000000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    v_div_fixup_f32 v1, v1, v0, s2
 ; CI-NEXT:    v_trunc_f32_e32 v1, v1
 ; CI-NEXT:    v_fma_f32 v0, -v1, v0, s2
+; CI-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; CI-NEXT:    v_or_b32_e32 v0, s0, v0
 ; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; CI-NEXT:    s_endpgm
 ;
@@ -207,9 +222,12 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
+; VI-NEXT:    s_and_b32 s0, s2, 0x80000000
 ; VI-NEXT:    v_div_fixup_f32 v1, v1, v0, s2
 ; VI-NEXT:    v_trunc_f32_e32 v1, v1
-; VI-NEXT:    v_fma_f32 v2, -v1, v0, s2
+; VI-NEXT:    v_fma_f32 v0, -v1, v0, s2
+; VI-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; VI-NEXT:    v_or_b32_e32 v2, s0, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -282,6 +300,9 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
 ; CI-NEXT:    v_mul_f32_e32 v0, s2, v0
 ; CI-NEXT:    v_trunc_f32_e32 v0, v0
 ; CI-NEXT:    v_fma_f32 v0, -v0, s0, v1
+; CI-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; CI-NEXT:    s_and_b32 s0, s2, 0x80000000
+; CI-NEXT:    v_or_b32_e32 v0, s0, v0
 ; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; CI-NEXT:    s_endpgm
 ;
@@ -297,7 +318,10 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    v_rcp_f32_e32 v0, s0
 ; VI-NEXT:    v_mul_f32_e32 v0, s2, v0
 ; VI-NEXT:    v_trunc_f32_e32 v0, v0
-; VI-NEXT:    v_fma_f32 v2, -v0, s0, v1
+; VI-NEXT:    v_fma_f32 v0, -v0, s0, v1
+; VI-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; VI-NEXT:    s_and_b32 s0, s2, 0x80000000
+; VI-NEXT:    v_or_b32_e32 v2, s0, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -325,6 +349,9 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], s[2:3]
 ; CI-NEXT:    v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3]
+; CI-NEXT:    s_mov_b32 s0, 0
+; CI-NEXT:    s_brev_b32 s1, 1
+; CI-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
 ; CI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; CI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
 ; CI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
@@ -336,6 +363,9 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; CI-NEXT:    v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3]
 ; CI-NEXT:    v_trunc_f64_e32 v[2:3], v[2:3]
 ; CI-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3]
+; CI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; CI-NEXT:    v_or_b32_e32 v0, s0, v0
+; CI-NEXT:    v_or_b32_e32 v1, s1, v1
 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; CI-NEXT:    s_endpgm
 ;
@@ -351,6 +381,9 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], s[2:3]
 ; VI-NEXT:    v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3]
+; VI-NEXT:    s_mov_b32 s0, 0
+; VI-NEXT:    s_brev_b32 s1, 1
+; VI-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
 ; VI-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
 ; VI-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
 ; VI-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
@@ -364,6 +397,9 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; VI-NEXT:    v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; VI-NEXT:    v_or_b32_e32 v0, s0, v0
+; VI-NEXT:    v_or_b32_e32 v1, s1, v1
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
    %r0 = load double, ptr addrspace(1) %in1, align 8
@@ -453,6 +489,12 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; CI-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
 ; CI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
 ; CI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
+; CI-NEXT:    s_mov_b32 s0, 0
+; CI-NEXT:    s_brev_b32 s1, 1
+; CI-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; CI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; CI-NEXT:    v_or_b32_e32 v0, s0, v0
+; CI-NEXT:    v_or_b32_e32 v1, s1, v1
 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; CI-NEXT:    s_endpgm
 ;
@@ -476,8 +518,14 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; VI-NEXT:    v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
 ; VI-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
 ; VI-NEXT:    v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
+; VI-NEXT:    s_mov_b32 s0, 0
+; VI-NEXT:    s_brev_b32 s1, 1
+; VI-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
+; VI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; VI-NEXT:    v_or_b32_e32 v0, s0, v0
+; VI-NEXT:    v_or_b32_e32 v1, s1, v1
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
                              ptr addrspace(1) %in2) #1 {
@@ -515,7 +563,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
 ; CI-NEXT:    v_trunc_f32_e32 v2, v2
-; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
+; CI-NEXT:    v_fma_f32 v1, -v2, v1, v0
+; CI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; CI-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
+; CI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, s6
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
@@ -536,7 +587,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
 ; CI-NEXT:    v_trunc_f32_e32 v3, v3
-; CI-NEXT:    v_fma_f32 v1, -v3, v2, v1
+; CI-NEXT:    v_fma_f32 v2, -v3, v2, v1
+; CI-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v2
+; CI-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
+; CI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; CI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -559,6 +613,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_rcp_f32_e32 v2, v2
 ; VI-NEXT:    s_lshr_b32 s1, s2, 16
 ; VI-NEXT:    v_rcp_f32_e32 v3, v3
+; VI-NEXT:    s_and_b32 s0, s2, 0xffff8000
 ; VI-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; VI-NEXT:    v_mov_b32_e32 v2, s3
@@ -566,13 +621,19 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_trunc_f16_e32 v0, v0
 ; VI-NEXT:    v_fma_f16 v0, -v0, v1, s2
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, s1
+; VI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; VI-NEXT:    v_or_b32_e32 v0, s0, v0
+; VI-NEXT:    s_and_b32 s0, s1, 0xffff8000
 ; VI-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; VI-NEXT:    v_div_fixup_f16 v1, v1, v2, s1
 ; VI-NEXT:    v_trunc_f16_e32 v1, v1
 ; VI-NEXT:    v_fma_f16 v1, -v1, v2, s1
+; VI-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; VI-NEXT:    v_or_b32_e32 v1, s0, v1
+; VI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT:    v_or_b32_e32 v2, v0, v1
+; VI-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -614,7 +675,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
 ; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, v0
 ; CI-NEXT:    v_trunc_f32_e32 v2, v2
-; CI-NEXT:    v_fma_f32 v0, -v2, v1, v0
+; CI-NEXT:    v_fma_f32 v1, -v2, v1, v0
+; CI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; CI-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
+; CI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, s8
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, s10
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
@@ -633,7 +697,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
 ; CI-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
 ; CI-NEXT:    v_trunc_f32_e32 v3, v3
-; CI-NEXT:    v_fma_f32 v1, -v3, v2, v1
+; CI-NEXT:    v_fma_f32 v2, -v3, v2, v1
+; CI-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v2
+; CI-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
+; CI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, s3
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, s1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -651,7 +718,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_div_fmas_f32 v4, v4, v6, v7
 ; CI-NEXT:    v_div_fixup_f32 v4, v4, v3, v2
 ; CI-NEXT:    v_trunc_f32_e32 v4, v4
-; CI-NEXT:    v_fma_f32 v2, -v4, v3, v2
+; CI-NEXT:    v_fma_f32 v3, -v4, v3, v2
+; CI-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v3
+; CI-NEXT:    v_and_b32_e32 v2, 0x80000000, v2
+; CI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, s9
 ; CI-NEXT:    v_cvt_f32_f16_e32 v4, s11
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -673,7 +743,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    v_div_fixup_f32 v5, v5, v4, v3
 ; CI-NEXT:    v_trunc_f32_e32 v5, v5
-; CI-NEXT:    v_fma_f32 v3, -v5, v4, v3
+; CI-NEXT:    v_fma_f32 v4, -v5, v4, v3
+; CI-NEXT:    v_and_b32_e32 v4, 0x7fffffff, v4
+; CI-NEXT:    v_and_b32_e32 v3, 0x80000000, v3
+; CI-NEXT:    v_or_b32_e32 v3, v4, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; CI-NEXT:    v_or_b32_e32 v1, v2, v1
@@ -708,30 +781,44 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, s6
 ; VI-NEXT:    v_cvt_f32_f16_e32 v5, s9
 ; VI-NEXT:    s_lshr_b32 s7, s3, 16
+; VI-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; VI-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_rcp_f32_e32 v5, v5
+; VI-NEXT:    s_and_b32 s0, s2, 0xffff8000
 ; VI-NEXT:    v_div_fixup_f16 v1, v1, v2, s6
 ; VI-NEXT:    v_trunc_f16_e32 v1, v1
 ; VI-NEXT:    v_fma_f16 v1, -v1, v2, s6
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, s3
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    v_or_b32_e32 v0, s0, v0
+; VI-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; VI-NEXT:    s_and_b32 s0, s6, 0xffff8000
 ; VI-NEXT:    v_mul_f32_e32 v2, v2, v4
 ; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; VI-NEXT:    v_mov_b32_e32 v4, s9
+; VI-NEXT:    v_or_b32_e32 v1, s0, v1
+; VI-NEXT:    s_and_b32 s0, s3, 0xffff8000
 ; VI-NEXT:    v_div_fixup_f16 v2, v2, v3, s3
 ; VI-NEXT:    v_trunc_f16_e32 v2, v2
 ; VI-NEXT:    v_fma_f16 v2, -v2, v3, s3
 ; VI-NEXT:    v_cvt_f32_f16_e32 v3, s7
+; VI-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
+; VI-NEXT:    v_or_b32_e32 v2, s0, v2
+; VI-NEXT:    s_and_b32 s0, s7, 0xffff8000
 ; VI-NEXT:    v_mul_f32_e32 v3, v3, v5
 ; VI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; VI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_div_fixup_f16 v3, v3, v4, s7
 ; VI-NEXT:    v_trunc_f16_e32 v3, v3
 ; VI-NEXT:    v_fma_f16 v3, -v3, v4, s7
-; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
-; VI-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
+; VI-NEXT:    v_or_b32_e32 v3, s0, v3
+; VI-NEXT:    v_and_b32_e32 v1, 0xffff, v3
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -766,10 +853,13 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; CI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
+; CI-NEXT:    s_and_b32 s0, s2, 0x80000000
 ; CI-NEXT:    v_div_fixup_f32 v1, v1, v0, s2
 ; CI-NEXT:    v_trunc_f32_e32 v1, v1
 ; CI-NEXT:    v_fma_f32 v0, -v1, v0, s2
+; CI-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_or_b32_e32 v0, s0, v0
 ; CI-NEXT:    v_div_scale_f32 v2, s[0:1], v1, v1, s3
 ; CI-NEXT:    v_div_scale_f32 v3, vcc, s3, v1, s3
 ; CI-NEXT:    v_rcp_f32_e32 v4, v2
@@ -782,11 +872,14 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; CI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; CI-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
+; CI-NEXT:    s_and_b32 s0, s3, 0x80000000
 ; CI-NEXT:    s_mov_b32 s6, -1
 ; CI-NEXT:    s_mov_b32 s7, 0xf000
 ; CI-NEXT:    v_div_fixup_f32 v2, v2, v1, s3
 ; CI-NEXT:    v_trunc_f32_e32 v2, v2
 ; CI-NEXT:    v_fma_f32 v1, -v2, v1, s3
+; CI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; CI-NEXT:    v_or_b32_e32 v1, s0, v1
 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; CI-NEXT:    s_endpgm
 ;
@@ -811,10 +904,13 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_fma_f32 v1, -v1, v4, v2
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; VI-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
+; VI-NEXT:    s_and_b32 s0, s2, 0x80000000
 ; VI-NEXT:    v_div_fixup_f32 v1, v1, v0, s2
 ; VI-NEXT:    v_trunc_f32_e32 v1, v1
 ; VI-NEXT:    v_fma_f32 v0, -v1, v0, s2
+; VI-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_or_b32_e32 v0, s0, v0
 ; VI-NEXT:    v_div_scale_f32 v2, s[0:1], v1, v1, s3
 ; VI-NEXT:    v_div_scale_f32 v3, vcc, s3, v1, s3
 ; VI-NEXT:    v_rcp_f32_e32 v4, v2
@@ -827,10 +923,13 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; VI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4...
[truncated]

jayfoad · 2023-10-27T12:16:30Z

llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll

+; CI-NEXT:    v_fma_f32 v1, -v2, v1, v0
+; CI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; CI-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
+; CI-NEXT:    v_or_b32_e32 v0, v1, v0


Note that the fix is applied here in frem_f16 and in unsafe_frem_f16 but not in fast_frem_f16.

jayfoad · 2023-10-27T12:17:23Z

@b-sumner FYI.

github-actions · 2023-10-27T12:25:21Z

✅ With the latest revision this PR passed the C/C++ code formatter.

jayfoad · 2023-10-30T16:57:55Z

It has been pointed out to me that this also changes the sign of the result when the division x / y overflows.

Previously: frem(huge, tiny) -> infinity with opposite sign from x
Now: frem(huge, tiny) -> infinity with same sign as x

I'm not sure if this is a good thing or a bad thing.

arsenm · 2023-11-08T04:21:26Z

I think the frem lowering is just broken in general for large values.

arsenm

Pretty sure frem is just generally broken, and this at least fixes one way it's broken. We never wired up OpenCL to use frem for example, so I didn't know anything was actually using this.

jayfoad · 2023-11-08T09:52:08Z

The motivation for this has gone away. I wanted it for Vulkan/SPIR-V but it turns out that SPIR-V does not specify the sign of a zero result from OpFRem.

jayfoad · 2023-11-14T14:32:36Z

Actually there is still interest in this, either for its own sake (to follow the LLVM definition in LangRef) or because the SPIR-V definition may change to match the LLVM definition.

jayfoad · 2023-11-14T14:36:40Z

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

+
+// Legalize frem(x, y) -> copysign(x - y * trunc(x / y), x)
+// The copysign is only required to get the correct result -0.0 when x is -0.0
+// (and y is non-zero). With NSZ it can be dropped.


Could also drop it if isKnownNeverZero(x)?

arsenm · 2023-11-30T14:07:32Z

Should probably do this if it follows the LangRef. But again, this lowering is wrong to begin with.

jayfoad · 2023-11-30T21:24:08Z

> Actually there is still interest in this, either for its own sake (to follow the LLVM definition in LangRef) or because the SPIR-V definition may change to match the LLVM definition.

The last I heard from Khronos is that the SPIR-V definition is not going to change - the sign of a zero result will remain undefined.

[AMDGPU] Fix legalization of frem(-0.0, y)

8c60b73

We legalized frem(x, y) -> x - y * trunc(x / y). When x is -0.0 this evaluates to +0.0 but the result should be -0.0. Fix this by legalizing to copysign(x - y * trunc(x / y), x).

llvmbot added backend:AMDGPU llvm:globalisel labels Oct 27, 2023

jayfoad commented Oct 27, 2023

View reviewed changes

clang-format

7b6faba

amdrexu mentioned this pull request Oct 27, 2023

Fix signed zero issues of FRem GPUOpen-Drivers/llpc#2782

Closed

arsenm approved these changes Nov 8, 2023

View reviewed changes

jayfoad closed this Nov 8, 2023

jayfoad reopened this Nov 14, 2023

jayfoad mentioned this pull request Nov 14, 2023

Fix signed zero issues of FRem GPUOpen-Drivers/llpc#2813

Closed

jayfoad commented Nov 14, 2023

View reviewed changes

jayfoad closed this by deleting the head repository Jan 29, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[AMDGPU] Fix legalization of frem(-0.0, y) #70448

[AMDGPU] Fix legalization of frem(-0.0, y) #70448

jayfoad commented Oct 27, 2023

llvmbot commented Oct 27, 2023 •

edited

jayfoad Oct 27, 2023

jayfoad commented Oct 27, 2023

github-actions bot commented Oct 27, 2023 •

edited

jayfoad commented Oct 30, 2023

arsenm commented Nov 8, 2023

arsenm left a comment

jayfoad commented Nov 8, 2023

jayfoad commented Nov 14, 2023

jayfoad Nov 14, 2023

arsenm commented Nov 30, 2023

jayfoad commented Nov 30, 2023

[AMDGPU] Fix legalization of frem(-0.0, y) #70448

[AMDGPU] Fix legalization of frem(-0.0, y) #70448

Conversation

jayfoad commented Oct 27, 2023

llvmbot commented Oct 27, 2023 • edited

jayfoad Oct 27, 2023

Choose a reason for hiding this comment

jayfoad commented Oct 27, 2023

github-actions bot commented Oct 27, 2023 • edited

jayfoad commented Oct 30, 2023

arsenm commented Nov 8, 2023

arsenm left a comment

Choose a reason for hiding this comment

jayfoad commented Nov 8, 2023

jayfoad commented Nov 14, 2023

jayfoad Nov 14, 2023

Choose a reason for hiding this comment

arsenm commented Nov 30, 2023

jayfoad commented Nov 30, 2023

llvmbot commented Oct 27, 2023 •

edited

github-actions bot commented Oct 27, 2023 •

edited