[CodeGen] ExpandReductions - add reduce_and/or(<X x i1> V) -> icmp(iX…

… bitcast(<X x i1> V)) canonicalization. This already exists in InstCombine but was missing from the late-stage ExpandReductions pass. Fixes #53419. Fixes #61923. Differential Revision: https://reviews.llvm.org/D147452
llvm · Apr 4, 2023 · 00e3ae4 · 00e3ae4
1 parent 1b16c70
commit 00e3ae4
Show file tree

Hide file tree

Showing 5 changed files with 1,039 additions and 2,331 deletions.
diff --git a/llvm/lib/CodeGen/ExpandReductions.cpp b/llvm/lib/CodeGen/ExpandReductions.cpp
@@ -133,10 +133,38 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
       }
       break;
     }
+    case Intrinsic::vector_reduce_and:
+    case Intrinsic::vector_reduce_or: {
+      // Canonicalize logical or/and reductions:
+      // Or reduction for i1 is represented as:
+      // %val = bitcast <ReduxWidth x i1> to iReduxWidth
+      // %res = cmp ne iReduxWidth %val, 0
+      // And reduction for i1 is represented as:
+      // %val = bitcast <ReduxWidth x i1> to iReduxWidth
+      // %res = cmp eq iReduxWidth %val, 11111
+      Value *Vec = II->getArgOperand(0);
+      auto *FTy = cast<FixedVectorType>(Vec->getType());
+      unsigned NumElts = FTy->getNumElements();
+      if (!isPowerOf2_32(NumElts))
+        continue;
+
+      if (FTy->getElementType() == Builder.getInt1Ty()) {
+        Rdx = Builder.CreateBitCast(Vec, Builder.getIntNTy(NumElts));
+        if (ID == Intrinsic::vector_reduce_and) {
+          Rdx = Builder.CreateICmpEQ(
+              Rdx, ConstantInt::getAllOnesValue(Rdx->getType()));
+        } else {
+          assert(ID == Intrinsic::vector_reduce_or && "Expected or reduction.");
+          Rdx = Builder.CreateIsNotNull(Rdx);
+        }
+        break;
+      }
+
+      Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), RK);
+      break;
+    }
     case Intrinsic::vector_reduce_add:
     case Intrinsic::vector_reduce_mul:
-    case Intrinsic::vector_reduce_and:
-    case Intrinsic::vector_reduce_or:
     case Intrinsic::vector_reduce_xor:
     case Intrinsic::vector_reduce_smax:
     case Intrinsic::vector_reduce_smin:

diff --git a/llvm/test/CodeGen/X86/pr53419.ll b/llvm/test/CodeGen/X86/pr53419.ll
@@ -10,22 +10,22 @@ declare i1 @llvm.vector.reduce.and.v2i1(<2 x i1>)
 declare i1 @llvm.vector.reduce.and.v4i1(<4 x i1>)
 declare i1 @llvm.vector.reduce.and.v8i1(<8 x i1>)
 
-; FIXME: All four versions are semantically equivalent and should produce same asm as scalar version.
+; All four versions are semantically equivalent and should produce same asm as scalar version.
 
 define i1 @intrinsic_v2i8(ptr align 1 %arg, ptr align 1 %arg1) {
 ; X64-LABEL: intrinsic_v2i8:
 ; X64:       # %bb.0: # %bb
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    cmpw %ax, (%rsi)
+; X64-NEXT:    movzwl (%rsi), %eax
+; X64-NEXT:    cmpw (%rdi), %ax
 ; X64-NEXT:    sete %al
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: intrinsic_v2i8:
 ; X86:       # %bb.0: # %bb
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    cmpw %ax, (%ecx)
+; X86-NEXT:    movzwl (%ecx), %ecx
+; X86-NEXT:    cmpw (%eax), %cx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 bb:
@@ -39,17 +39,17 @@ bb:
 define i1 @intrinsic_v4i8(ptr align 1 %arg, ptr align 1 %arg1) {
 ; X64-LABEL: intrinsic_v4i8:
 ; X64:       # %bb.0: # %bb
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    cmpl %eax, (%rsi)
+; X64-NEXT:    movl (%rsi), %eax
+; X64-NEXT:    cmpl (%rdi), %eax
 ; X64-NEXT:    sete %al
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: intrinsic_v4i8:
 ; X86:       # %bb.0: # %bb
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%eax), %eax
-; X86-NEXT:    cmpl %eax, (%ecx)
+; X86-NEXT:    movl (%ecx), %ecx
+; X86-NEXT:    cmpl (%eax), %ecx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 bb:
@@ -63,20 +63,20 @@ bb:
 define i1 @intrinsic_v8i8(ptr align 1 %arg, ptr align 1 %arg1) {
 ; X64-LABEL: intrinsic_v8i8:
 ; X64:       # %bb.0: # %bb
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    cmpq %rax, (%rsi)
+; X64-NEXT:    movq (%rsi), %rax
+; X64-NEXT:    cmpq (%rdi), %rax
 ; X64-NEXT:    sete %al
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: intrinsic_v8i8:
 ; X86:       # %bb.0: # %bb
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X86-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
-; X86-NEXT:    vpmovmskb %xmm0, %eax
-; X86-NEXT:    cmpb $-1, %al
+; X86-NEXT:    movl (%ecx), %edx
+; X86-NEXT:    movl 4(%ecx), %ecx
+; X86-NEXT:    xorl 4(%eax), %ecx
+; X86-NEXT:    xorl (%eax), %edx
+; X86-NEXT:    orl %ecx, %edx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    retl
 bb:

diff --git a/llvm/test/CodeGen/X86/pr61923.ll b/llvm/test/CodeGen/X86/pr61923.ll
@@ -4,7 +4,7 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-; FIXME: Regressed test from https://github.com/llvm/llvm-project/issues/61923
+; Regressed test from https://github.com/llvm/llvm-project/issues/61923
 define void @test_loop(ptr align 1 %src, ptr align 1 %dest, i32 %len) {
 ; CHECK-LABEL: test_loop:
 ; CHECK:       # %bb.0: # %entry
@@ -17,22 +17,17 @@ define void @test_loop(ptr align 1 %src, ptr align 1 %dest, i32 %len) {
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_2: # %memcmp.loop
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovdqu (%rsi,%rcx), %xmm0
-; CHECK-NEXT:    vmovdqu 16(%rsi,%rcx), %xmm1
-; CHECK-NEXT:    vpcmpeqb (%rdi,%rcx), %xmm0, %xmm0
-; CHECK-NEXT:    vpmovmskb %xmm0, %edx
-; CHECK-NEXT:    vpcmpeqb 16(%rdi,%rcx), %xmm1, %xmm0
-; CHECK-NEXT:    vpmovmskb %xmm0, %r8d
-; CHECK-NEXT:    shll $16, %r8d
-; CHECK-NEXT:    orl %edx, %r8d
-; CHECK-NEXT:    cmpl $-1, %r8d
+; CHECK-NEXT:    vmovups (%rsi,%rcx), %ymm0
+; CHECK-NEXT:    vxorps (%rdi,%rcx), %ymm0, %ymm0
+; CHECK-NEXT:    vptest %ymm0, %ymm0
 ; CHECK-NEXT:    jne .LBB0_4
 ; CHECK-NEXT:  # %bb.3: # %memcmp.loop.latch
 ; CHECK-NEXT:    # in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    addq $32, %rcx
 ; CHECK-NEXT:    cmpq %rax, %rcx
 ; CHECK-NEXT:    jb .LBB0_2
 ; CHECK-NEXT:  .LBB0_4: # %done
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
 entry:
   %len.wide = zext i32 %len to i64