[CodeGenPrepare] Reverse the canonicalization of isInf/isNanOrInf #81572

dtcxzyw · 2024-02-13T05:34:22Z

In commit 2b58244, we canonicalize the isInf/isNanOrInf idiom into fabs+fcmp for better analysis/codegen (See also the discussion in #76338).

This patch reverses the fabs+fcmp to is.fpclass. If the is.fpclass is not supported by the target, it will be expanded by TLI.

Fixes the regression introduced by 2b58244 and #80414 (comment).

It is an alternative to #81404.

llvmbot · 2024-02-13T05:34:52Z

@llvm/pr-subscribers-backend-powerpc
@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-backend-aarch64

Author: Yingwei Zheng (dtcxzyw)

Changes

In commit 2b58244, we canonicalize the isInf/isNanOrInf idiom into fabs+fcmp for better analysis/codegen (See also the discussion in #76338).

This patch reverses the fabs+fcmp to is.fpclass. If the is.fpclass is not supported by the target, it will be expanded by TLI.

Fixes the regression introduced by 2b58244 and #80414 (comment).

It is an alternative to #81404.

Patch is 66.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/81572.diff

7 Files Affected:

(modified) llvm/lib/CodeGen/CodeGenPrepare.cpp (+27)
(modified) llvm/test/CodeGen/AArch64/isinf.ll (+7-15)
(modified) llvm/test/CodeGen/AMDGPU/fp-classify.ll (+114-82)
(modified) llvm/test/CodeGen/AMDGPU/fract-match.ll (+130-129)
(added) llvm/test/Transforms/CodeGenPrepare/AArch64/fpclass-test.ll (+134)
(added) llvm/test/Transforms/CodeGenPrepare/RISCV/fpclass-test.ll (+134)
(added) llvm/test/Transforms/CodeGenPrepare/X86/fpclass-test.ll (+178)

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 09c4922d8822cc..33e4aedb392d75 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -1943,6 +1943,30 @@ static bool swapICmpOperandsToExposeCSEOpportunities(CmpInst *Cmp) {
   return false;
 }
 
+static bool foldFCmpToFPClassTest(CmpInst *Cmp) {
+  FCmpInst *FCmp = dyn_cast<FCmpInst>(Cmp);
+  if (!FCmp)
+    return false;
+
+  // Reverse the canonicalization if it is a FP class test
+  auto ShouldReverseTransform = [](FPClassTest ClassTest) {
+    return ClassTest == fcInf || ClassTest == (fcInf | fcNan);
+  };
+  auto [ClassVal, ClassTest] =
+      fcmpToClassTest(FCmp->getPredicate(), *FCmp->getParent()->getParent(),
+                      FCmp->getOperand(0), FCmp->getOperand(1));
+  if (ClassVal && (ShouldReverseTransform(ClassTest) ||
+                   ShouldReverseTransform(~ClassTest))) {
+    IRBuilder<> Builder(Cmp);
+    Value *IsFPClass = Builder.createIsFPClass(ClassVal, ClassTest);
+    Cmp->replaceAllUsesWith(IsFPClass);
+    RecursivelyDeleteTriviallyDeadInstructions(Cmp);
+    return true;
+  }
+
+  return false;
+}
+
 bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
   if (sinkCmpExpression(Cmp, *TLI))
     return true;
@@ -1959,6 +1983,9 @@ bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
   if (swapICmpOperandsToExposeCSEOpportunities(Cmp))
     return true;
 
+  if (foldFCmpToFPClassTest(Cmp))
+    return true;
+
   return false;
 }
 
diff --git a/llvm/test/CodeGen/AArch64/isinf.ll b/llvm/test/CodeGen/AArch64/isinf.ll
index 458bd7eeba16cf..834417b98743a8 100644
--- a/llvm/test/CodeGen/AArch64/isinf.ll
+++ b/llvm/test/CodeGen/AArch64/isinf.ll
@@ -58,22 +58,14 @@ define i32 @replace_isinf_call_f64(double %x) {
 define i32 @replace_isinf_call_f128(fp128 %x) {
 ; CHECK-LABEL: replace_isinf_call_f128:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #32
-; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 32
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    str q0, [sp]
-; CHECK-NEXT:    ldrb w8, [sp, #15]
-; CHECK-NEXT:    and w8, w8, #0x7f
-; CHECK-NEXT:    strb w8, [sp, #15]
-; CHECK-NEXT:    adrp x8, .LCPI3_0
-; CHECK-NEXT:    ldr q0, [sp]
-; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI3_0]
-; CHECK-NEXT:    bl __eqtf2
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    str q0, [sp, #-16]!
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ldp x9, x8, [sp], #16
+; CHECK-NEXT:    and x8, x8, #0x7fffffffffffffff
+; CHECK-NEXT:    eor x8, x8, #0x7fff000000000000
+; CHECK-NEXT:    orr x8, x9, x8
+; CHECK-NEXT:    cmp x8, #0
 ; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    add sp, sp, #32
 ; CHECK-NEXT:    ret
   %abs = tail call fp128 @llvm.fabs.f128(fp128 %x)
   %cmpinf = fcmp oeq fp128 %abs, 0xL00000000000000007FFF000000000000
diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
index 6fa7df913812a3..ed9ce4d62383b1 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
@@ -61,10 +61,10 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture %
 ; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v0, 0x7f800000
+; SI-NEXT:    v_mov_b32_e32 v0, 0x207
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_cmp_nlg_f32_e64 s[0:1], |s0|, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT:    v_cmp_class_f32_e32 vcc, s0, v0
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -72,11 +72,11 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    v_mov_b32_e32 v0, 0x7f800000
+; VI-NEXT:    v_mov_b32_e32 v0, 0x207
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_cmp_nlg_f32_e64 s[2:3], |s2|, v0
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[2:3]
+; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -88,7 +88,7 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture %
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_nlg_f32_e64 s2, 0x7f800000, |s2|
+; GFX11-NEXT:    v_cmp_class_f32_e64 s2, s2, 0x207
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
@@ -143,25 +143,29 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o
 ; SI-LABEL: test_isfinite_pattern_0:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
+; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v0, 0x1f8
+; SI-NEXT:    v_mov_b32_e32 v0, 0x1fb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_cmp_class_f32_e32 vcc, s0, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT:    v_cmp_o_f32_e64 s[0:1], s2, s2
+; SI-NEXT:    v_cmp_class_f32_e32 vcc, s2, v0
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_isfinite_pattern_0:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    v_mov_b32_e32 v0, 0x1f8
+; VI-NEXT:    v_mov_b32_e32 v0, 0x1fb
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_cmp_class_f32_e32 vcc, s2, v0
+; VI-NEXT:    v_cmp_o_f32_e64 s[2:3], s4, s4
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT:    s_and_b64 s[2:3], s[2:3], vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -173,8 +177,10 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_class_f32_e64 s2, s2, 0x1f8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_o_f32_e64 s3, s2, s2
+; GFX11-NEXT:    v_cmp_class_f32_e64 s2, s2, 0x1fb
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s3, s2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
@@ -349,13 +355,13 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v0, 0x7f800000
+; SI-NEXT:    v_mov_b32_e32 v0, 0x1fb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    v_cmp_o_f32_e64 s[0:1], s2, s2
-; SI-NEXT:    v_cmp_neq_f32_e64 s[2:3], |s3|, v0
-; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    v_cmp_class_f32_e32 vcc, s3, v0
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -363,11 +369,11 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur
 ; VI-LABEL: test_isfinite_not_pattern_2:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT:    v_mov_b32_e32 v0, 0x7f800000
+; VI-NEXT:    v_mov_b32_e32 v0, 0x1fb
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cmp_o_f32_e64 s[4:5], s2, s2
-; VI-NEXT:    v_cmp_neq_f32_e64 s[2:3], |s3|, v0
-; VI-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s3, v0
+; VI-NEXT:    s_and_b64 s[2:3], s[4:5], vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -380,7 +386,7 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_o_f32_e64 s2, s2, s2
-; GFX11-NEXT:    v_cmp_neq_f32_e64 s3, 0x7f800000, |s3|
+; GFX11-NEXT:    v_cmp_class_f32_e64 s3, s3, 0x1fb
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s2, s2, s3
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
@@ -405,11 +411,11 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur
 ; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    v_mov_b32_e32 v0, 0x7f800000
+; SI-NEXT:    v_mov_b32_e32 v0, 0x1fb
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cmp_u_f32_e64 s[0:1], s2, s2
-; SI-NEXT:    v_cmp_neq_f32_e64 s[2:3], |s2|, v0
-; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    v_cmp_class_f32_e32 vcc, s2, v0
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -418,11 +424,11 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    v_mov_b32_e32 v0, 0x7f800000
+; VI-NEXT:    v_mov_b32_e32 v0, 0x1fb
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cmp_u_f32_e64 s[2:3], s4, s4
-; VI-NEXT:    v_cmp_neq_f32_e64 s[4:5], |s4|, v0
-; VI-NEXT:    s_and_b64 s[2:3], s[2:3], s[4:5]
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT:    s_and_b64 s[2:3], s[2:3], vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -437,7 +443,7 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_cmp_u_f32_e64 s3, s2, s2
-; GFX11-NEXT:    v_cmp_neq_f32_e64 s2, 0x7f800000, |s2|
+; GFX11-NEXT:    v_cmp_class_f32_e64 s2, s2, 0x1fb
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s2, s3, s2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
@@ -458,25 +464,29 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o
 ; SI-LABEL: test_isfinite_pattern_4:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
+; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, 0x1f8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_cmp_class_f32_e32 vcc, s0, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT:    v_cmp_o_f32_e64 s[0:1], s2, s2
+; SI-NEXT:    v_cmp_class_f32_e32 vcc, s2, v0
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_isfinite_pattern_4:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x1f8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_cmp_class_f32_e32 vcc, s2, v0
+; VI-NEXT:    v_cmp_o_f32_e64 s[2:3], s4, s4
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT:    s_and_b64 s[2:3], s[2:3], vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -488,8 +498,10 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_o_f32_e64 s3, s2, s2
 ; GFX11-NEXT:    v_cmp_class_f32_e64 s2, s2, 0x1f8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s3, s2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
@@ -508,25 +520,29 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1)
 ; SI-LABEL: test_isfinite_pattern_4_commute_and:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
+; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, 0x1f8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_cmp_class_f32_e32 vcc, s0, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT:    v_cmp_o_f32_e64 s[0:1], s2, s2
+; SI-NEXT:    v_cmp_class_f32_e32 vcc, s2, v0
+; SI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_isfinite_pattern_4_commute_and:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x1f8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_cmp_class_f32_e32 vcc, s2, v0
+; VI-NEXT:    v_cmp_o_f32_e64 s[2:3], s4, s4
+; VI-NEXT:    v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT:    s_and_b64 s[2:3], vcc, s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -538,8 +554,10 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1)
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_cmp_o_f32_e64 s3, s2, s2
 ; GFX11-NEXT:    v_cmp_class_f32_e64 s2, s2, 0x1f8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s2, s3
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
@@ -618,16 +636,16 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp
 define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %out, half %x) #0 {
 ; SI-LABEL: test_isinf_pattern_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_mov_b32 s1, 0x7f800000
+; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e64 v0, |s0|
-; SI-NEXT:    v_cmp_eq_f32_e32 vcc, s1, v0
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    s_and_b32 s4, s4, 0x7fff
+; SI-NEXT:    s_cmpk_eq_i32 s4, 0x7c00
+; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_isinf_pattern_f16:
@@ -667,27 +685,32 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou
 define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocapture %out, half %x) #0 {
 ; SI-LABEL: test_isfinite_pattern_0_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_movk_i32 s1, 0x1f8
+; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
-; SI-NEXT:    v_cmp_class_f32_e64 s[0:1], v0, s1
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0x7fff
+; SI-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; SI-NEXT:    s_cmpk_lg_i32 s4, 0x7c00
+; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_isfinite_pattern_0_f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT:    v_mov_b32_e32 v0, 0x1f8
+; VI-NEXT:    v_mov_b32_e32 v0, 0x1fb
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_cmp_class_f16_e32 vcc, s2, v0
+; VI-NEXT:    v_cmp_o_f16_e64 s[2:3], s4, s4
+; VI-NEXT:    v_cmp_class_f16_e32 vcc, s4, v0
+; VI-NEXT:    s_and_b64 s[2:3], s[2:3], vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -699,8 +722,10 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_cmp_class_f16_e64 s2, s2, 0x1f8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_o_f16_e64 s3, s2, s2
+; GFX11-NEXT:    v_cmp_class_f16_e64 s2, s2, 0x1fb
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s2, s3, s2
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_nop 0
@@ -718,27 +743,32 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur
 define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocapture %out, half %x) #0 {
 ; SI-LABEL: test_isfinite_pattern_4_f16:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
-; SI-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_movk_i32 s1, 0x1f8
+; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, s0
-; SI-NEXT:    v_cmp_class_f32_e64 s[0:1], v0, s1
-; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0x7fff
+; SI-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; SI-NEXT:    s_cmpk_lt_i32 s4, 0x7c00
+; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: test_isfinite_...
[truncated]

andykaylor · 2024-02-14T23:33:06Z

llvm/test/Transforms/CodeGenPrepare/AArch64/fpclass-test.ll

+define i1 @test_fp128_is_inf_or_nan(fp128 %arg) {
+; CHECK-LABEL: define i1 @test_fp128_is_inf_or_nan(
+; CHECK-SAME: fp128 [[ARG:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 519)


This doesn't fully reverse the canonicalization. We still lower @llvm.is.fpclass.f128() using runtime library functions in the x86 backend.

If is.fpclass is not natively supported, SDAG will expand it.
Godbolt: https://godbolt.org/z/dY3r3cr7j

dtcxzyw · 2024-02-19T09:50:08Z

Ping.

dtcxzyw · 2024-02-25T13:33:48Z

Ping.

llvm/lib/CodeGen/CodeGenPrepare.cpp

llvm/test/CodeGen/AMDGPU/fp-classify.ll

FreddyLeaf · 2024-03-11T13:04:52Z

ping for review. X86 required this patch to fix below regression: https://gcc.godbolt.org/z/8P1do36qd

arsenm · 2024-03-12T05:25:25Z

ping for review. X86 required this patch to fix below regression: https://gcc.godbolt.org/z/8P1do36qd

Still has the minor outstanding comment about checking legality

dtcxzyw · 2024-03-12T08:30:58Z

ping for review. X86 required this patch to fix below regression: https://gcc.godbolt.org/z/8P1do36qd

Sorry for the delay.

llvm/lib/CodeGen/CodeGenPrepare.cpp

dtcxzyw · 2024-03-13T12:03:50Z

Ping @andykaylor

FreddyLeaf

LGTM for x86 tests.

dtcxzyw · 2024-03-14T08:45:12Z

Failed Tests (1):
LLVM :: CodeGen/PowerPC/fp-classify.ll

I will fix it later.

arsenm · 2024-03-15T09:44:21Z

llvm/test/CodeGen/PowerPC/fp-classify.ll

+; P8-NEXT:    xscvdpspn 0, 1
+; P8-NEXT:    lis 4, 32639
+; P8-NEXT:    ori 4, 4, 65535
+; P8-NEXT:    mffprwz 3, 0
+; P8-NEXT:    clrlwi 3, 3, 1
+; P8-NEXT:    sub 3, 4, 3
+; P8-NEXT:    rldicl 3, 3, 1, 63


This looks worse. The cost of expand for some compares isn't any more than legal operations, since it's just commuting.

Ping for review from PowerPC maintainers.

This looks worse. The cost of expand for some compares isn't any more than legal operations, since it's just commuting.

Before:

Iterations: 100 Instructions: 600 Total Cycles: 1905 Total uOps: 1200 Dispatch Width: 8 uOps Per Cycle: 0.63 IPC: 0.31 Block RThroughput: 1.5

After:

Iterations: 100 Instructions: 700 Total Cycles: 614 Total uOps: 1400 Dispatch Width: 8 uOps Per Cycle: 2.28 IPC: 1.14 Block RThroughput: 1.8

It looks better :)

Ping @bzEq @chenzheng1030 @ecnelises @stefanp-ibm

TBH, I prefer the transformed one. The unchanged one uses TOC entry for the inf constant. And the lfs is time consuming on PPC and has memory access overhead.

chenzheng1030 · 2024-03-18T09:05:47Z

llvm/test/CodeGen/PowerPC/fp-classify.ll

+; P8-NEXT:    xor 4, 4, 5
+; P8-NEXT:    or 3, 3, 4
+; P8-NEXT:    cntlzd 3, 3
+; P8-NEXT:    rldicl 3, 3, 58, 63


Nice improvement.

dtcxzyw · 2024-03-18T09:09:51Z

@chenzheng1030 Can I merge this PR?

chenzheng1030

LGTM for the PPC case changes.

dtcxzyw added 2 commits February 13, 2024 13:04

[CodeGenPrepare] Add pre-commit tests. NFC.

676c1c6

[CodeGenPrepare] Reverse the canonicalization of isInf/isNanOrInf

7be9878

dtcxzyw requested review from arsenm, RKSimon, davemgreen, topperc, andykaylor and jcranmer-intel February 13, 2024 05:34

llvmbot added backend:AArch64 backend:AMDGPU llvm:transforms labels Feb 13, 2024

andykaylor reviewed Feb 14, 2024

View reviewed changes

arsenm reviewed Feb 25, 2024

View reviewed changes

dtcxzyw added 2 commits March 12, 2024 16:55

Merge branch 'main' into cgp-isfpclass-reverse

20f42fd

fixup! [CodeGenPrepare] Reverse the canonicalization of isInf/isNanOrInf

f427c71

arsenm approved these changes Mar 13, 2024

View reviewed changes

llvm/lib/CodeGen/CodeGenPrepare.cpp Outdated Show resolved Hide resolved

FreddyLeaf approved these changes Mar 13, 2024

View reviewed changes

dtcxzyw added 2 commits March 14, 2024 15:59

Merge branch 'main' into cgp-isfpclass-reverse

ea15b96

fixup! [CodeGenPrepare] Reverse the canonicalization of isInf/isNanOrInf

b4ddbe6

arsenm approved these changes Mar 14, 2024

View reviewed changes

dtcxzyw mentioned this pull request Mar 14, 2024

[SDAG] Reverse the canonicalization of isInf/isNanOrInf #81404

Closed

[CodeGenPrepare] Fix PowerPC tests.

64cd209

llvmbot added the backend:PowerPC label Mar 14, 2024

dtcxzyw requested review from bzEq, chenzheng1030, ecnelises and stefanp-ibm March 14, 2024 11:04

arsenm reviewed Mar 15, 2024

View reviewed changes

chenzheng1030 reviewed Mar 18, 2024

View reviewed changes

dtcxzyw merged commit 38a44bd into llvm:main Mar 18, 2024
5 checks passed

dtcxzyw deleted the cgp-isfpclass-reverse branch March 18, 2024 10:27

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[CodeGenPrepare] Reverse the canonicalization of isInf/isNanOrInf #81572

[CodeGenPrepare] Reverse the canonicalization of isInf/isNanOrInf #81572

dtcxzyw commented Feb 13, 2024

llvmbot commented Feb 13, 2024 •

edited

andykaylor Feb 14, 2024

dtcxzyw Feb 15, 2024

dtcxzyw commented Feb 19, 2024

dtcxzyw commented Feb 25, 2024

FreddyLeaf commented Mar 11, 2024

arsenm commented Mar 12, 2024

dtcxzyw commented Mar 12, 2024

dtcxzyw commented Mar 13, 2024

FreddyLeaf left a comment

dtcxzyw commented Mar 14, 2024

arsenm Mar 15, 2024

dtcxzyw Mar 15, 2024

dtcxzyw Mar 15, 2024

dtcxzyw Mar 18, 2024

chenzheng1030 Mar 18, 2024

chenzheng1030 Mar 18, 2024

dtcxzyw commented Mar 18, 2024

chenzheng1030 left a comment

[CodeGenPrepare] Reverse the canonicalization of isInf/isNanOrInf #81572

[CodeGenPrepare] Reverse the canonicalization of isInf/isNanOrInf #81572

Conversation

dtcxzyw commented Feb 13, 2024

llvmbot commented Feb 13, 2024 • edited

Choose a reason for hiding this comment

Choose a reason for hiding this comment

dtcxzyw commented Feb 19, 2024

dtcxzyw commented Feb 25, 2024

FreddyLeaf commented Mar 11, 2024

arsenm commented Mar 12, 2024

dtcxzyw commented Mar 12, 2024

dtcxzyw commented Mar 13, 2024

FreddyLeaf left a comment

Choose a reason for hiding this comment

dtcxzyw commented Mar 14, 2024

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

dtcxzyw commented Mar 18, 2024

chenzheng1030 left a comment

Choose a reason for hiding this comment

llvmbot commented Feb 13, 2024 •

edited