[DAG] narrowExtractedVectorBinOp - ensure we limit late node creation to LegalOperations only #72130
Conversation
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-llvm-selectiondag

Author: Simon Pilgrim (RKSimon)

Changes: Avoids infinite issues in some upcoming patches to help D152928

Full diff: https://github.com/llvm/llvm-project/pull/72130.diff

6 Files Affected:
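The "infinite issues" the description alludes to are a combine/legalize ping-pong: after operation legalization, the combiner may fold extract(binop) into a narrow binop the target cannot handle, and the legalizer then widens it straight back into the pattern the combiner just rewrote. The following is a toy Python model of that oscillation, not LLVM code; the node shapes, widths, and rule names are invented for illustration, and the added `LegalOperations` guard corresponds to `legal_operations=True` below.

```python
LEGAL_BINOP_WIDTHS = {256}  # pretend the target only supports 256-bit binops

def combine(node, legal_operations):
    """Combiner rule: extract(wide binop) -> narrow binop on the extracted half."""
    kind, width = node
    if kind == "extract_binop":
        narrow_width = width // 2
        # The fix in this PR: once operations have been legalized, bail out
        # instead of creating a binop the target can't handle.
        if legal_operations and narrow_width not in LEGAL_BINOP_WIDTHS:
            return None
        return ("binop", narrow_width)
    return None

def legalize(node):
    """Legalizer rule: widen an illegal narrow binop back to extract(wide binop)."""
    kind, width = node
    if kind == "binop" and width not in LEGAL_BINOP_WIDTHS:
        return ("extract_binop", width * 2)
    return None

def run(node, legal_operations, max_steps=20):
    """Apply both rules to a fixed point, or give up after max_steps rewrites."""
    steps = 0
    changed = True
    while changed and steps < max_steps:
        changed = False
        for rule in (lambda n: combine(n, legal_operations), legalize):
            new = rule(node)
            if new is not None:
                node, changed, steps = new, True, steps + 1
    return node, steps

# Without the guard, the two rules undo each other until the step cap is hit;
# with it, the combine refuses to fire and the DAG is already at a fixed point.
_, steps_bad = run(("extract_binop", 256), legal_operations=False)
final, steps_ok = run(("extract_binop", 256), legal_operations=True)
```

In the real patch the guard is simply the extra `LegalOperations` argument threaded into `TLI.isOperationLegalOrCustomOrPromote`, visible in the DAGCombiner hunk below.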
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a867d88f76c0cf6..f54d45d883d1927 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23960,7 +23960,8 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG,
// Bail out if the target does not support a narrower version of the binop.
EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(),
WideNumElts / NarrowingRatio);
- if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT))
+ if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT,
+ LegalOperations))
return SDValue();
// If extraction is cheap, we don't need to look at the binop operands
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3d44c50b44e6234..99f225a7d702981 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41371,8 +41371,18 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
case X86ISD::UNPCKH:
case X86ISD::BLENDI:
// Integer ops.
+ case X86ISD::PCMPEQ:
+ case X86ISD::PCMPGT:
case X86ISD::PACKSS:
case X86ISD::PACKUS:
+ case X86ISD::VSHLV:
+ case X86ISD::VSRLV:
+ case X86ISD::VSRAV:
+ // Float Ops.
+ case X86ISD::FMAX:
+ case X86ISD::FMIN:
+ case X86ISD::FMAXC:
+ case X86ISD::FMINC:
// Horizontal Ops.
case X86ISD::HADD:
case X86ISD::HSUB:
diff --git a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll
index df35b4ecb3d6623..5e477e8947d1b86 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll
@@ -1860,15 +1860,14 @@ define i64 @umaxv_v3i64(<3 x i64> %a) {
; CHECK-LABEL: umaxv_v3i64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: mov v3.16b, v2.16b
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov v3.16b, v0.16b
-; CHECK-NEXT: mov v4.16b, v2.16b
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v3.d[1], v1.d[0]
-; CHECK-NEXT: mov v4.d[1], xzr
-; CHECK-NEXT: cmhi v3.2d, v3.2d, v4.2d
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: mov v3.d[1], xzr
+; CHECK-NEXT: cmhi v3.2d, v0.2d, v3.2d
; CHECK-NEXT: ext v4.16b, v3.16b, v3.16b, #8
-; CHECK-NEXT: bif v0.8b, v2.8b, v3.8b
+; CHECK-NEXT: bif v0.16b, v2.16b, v3.16b
; CHECK-NEXT: and v1.8b, v1.8b, v4.8b
; CHECK-NEXT: cmhi d2, d0, d1
; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
@@ -1930,4 +1929,4 @@ define i128 @umaxv_v2i128(<2 x i128> %a) {
entry:
%arg1 = call i128 @llvm.vector.reduce.umax.v2i128(<2 x i128> %a)
ret i128 %arg1
-}
\ No newline at end of file
+}
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index 40cbbfe2c14e5d4..4a2dd7673f4e767 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -1077,11 +1077,10 @@ define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) nounwin
; KNL-LABEL: test_extractelement_v64i1:
; KNL: ## %bb.0:
; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL-NEXT: vpminub %ymm1, %ymm0, %ymm1
+; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpminub %xmm1, %xmm0, %xmm1
-; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -1113,11 +1112,10 @@ define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) nounwind
; KNL-LABEL: extractelement_v64i1_alt:
; KNL: ## %bb.0:
; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL-NEXT: vpminub %ymm1, %ymm0, %ymm1
+; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpminub %xmm1, %xmm0, %xmm1
-; KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index 080ad3a7b0b463e..69f893ac829d603 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -1073,18 +1073,21 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
;
; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: subq $40, %rsp
-; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2]
-; CHECK-AVX2-NEXT: vpsllvd %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-AVX2-NEXT: subq $56, %rsp
+; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,2,0,0,2,2,0,0]
+; CHECK-AVX2-NEXT: # ymm1 = mem[0,1,0,1]
+; CHECK-AVX2-NEXT: vpsllvd %ymm0, %ymm1, %ymm0
+; CHECK-AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-AVX2-NEXT: vpextrw $2, %xmm0, %eax
; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0
+; CHECK-AVX2-NEXT: vzeroupper
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; CHECK-AVX2-NEXT: vpextrw $0, %xmm0, %eax
; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0
+; CHECK-AVX2-NEXT: vzeroupper
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
; CHECK-AVX2-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -1097,7 +1100,7 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; CHECK-AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; CHECK-AVX2-NEXT: addq $40, %rsp
+; CHECK-AVX2-NEXT: addq $56, %rsp
; CHECK-AVX2-NEXT: retq
;
; CHECK-NO-FASTFMA-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
diff --git a/llvm/test/CodeGen/X86/kshift.ll b/llvm/test/CodeGen/X86/kshift.ll
index f4efacc1946cff9..0acf82f5a144a2e 100644
--- a/llvm/test/CodeGen/X86/kshift.ll
+++ b/llvm/test/CodeGen/X86/kshift.ll
@@ -270,10 +270,11 @@ define i64 @kshiftl_v64i1_63(<64 x i8> %x, <64 x i8> %y) {
; KNL-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $15, %k0, %k1
-; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm0
+; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; KNL-NEXT: vpcmpeqb %ymm0, %ymm1, %ymm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
@@ -563,13 +564,14 @@ define i64 @kshiftr_v64i1_63(<64 x i8> %x, <64 x i8> %y) {
; KNL-LABEL: kshiftr_v64i1_63:
; KNL: # %bb.0:
; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; KNL-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
+; KNL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k1
-; KNL-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm0
+; KNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; KNL-NEXT: vpcmpeqb %xmm0, %xmm1, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
✅ With the latest revision this PR passed the C/C++ code formatter.
It looks like there might be a costmodel test that needs updating? It includes some codegen tests it seems.
@@ -23960,7 +23960,8 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG,
   // Bail out if the target does not support a narrower version of the binop.
   EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(),
                                    WideNumElts / NarrowingRatio);
-  if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT))
+  if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT,
+                                             LegalOperations))
This part seems obvious
@@ -41371,8 +41371,18 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
   case X86ISD::UNPCKH:
   case X86ISD::BLENDI:
   // Integer ops.
+  case X86ISD::PCMPEQ:
Don't know enough about x86 but seems like there would need to be more tests hitting these cases?
I split this into 2 commits so the review can see the x86 regressions and the follow-up that addresses them - I intend to squash+merge these - do you think that's enough?
Force-pushed a5e25b1 to 57559c9.
Sorry this was my first attempt at a patch series and I screwed it up - converting to draft until I've fixed it up
Force-pushed 296a017 to 113ce52.
@@ -270,10 +270,11 @@ define i64 @kshiftl_v64i1_63(<64 x i8> %x, <64 x i8> %y) {
 ; KNL-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
This is an existing (very old) known issue where 128/256/512-bit zero vectors don't get merged properly: #26392
I can't speak for the X86/kshift change but this looks OK to me.
… to LegalOperations only

Avoids infinite issues in some upcoming patches to help D152928 - x86 sees a number of regressions that will be addressed in follow-ups
…t/512-bit X86 binops that can be narrowed to smaller vector widths
Force-pushed 113ce52 to 92061a6.
… to LegalOperations only (llvm#72130)

Avoids infinite issues in some upcoming patches to help D152928 - x86 sees a number of regressions that are addressed by extending SimplifyDemandedVectorEltsForTargetNode to cover more binop opcodes