Skip to content

Conversation

alexey-bataev
Copy link
Member

Enables Shl matching for the nodes, where copyable can be modelled as
shl %v, 0

Created using spr 1.3.7
@llvmbot
Copy link
Member

llvmbot commented Sep 3, 2025

@llvm/pr-subscribers-vectorizers
@llvm/pr-subscribers-backend-risc-v

@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes

Enables Shl matching for the nodes, where copyable can be modelled as
shl %v, 0


Full diff: https://github.com/llvm/llvm-project/pull/156766.diff

5 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+5-1)
  • (modified) llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-indices.ll (+4-8)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/ext-used-scalar-different-bitwidth.ll (+2-2)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll (+6-26)
  • (modified) llvm/test/Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll (+37-18)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 9c6099c2da307..bd5da774154d6 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10573,7 +10573,8 @@ class InstructionsCompatibilityAnalysis {
   /// Checks if the opcode is supported as the main opcode for copyable
   /// elements.
   static bool isSupportedOpcode(const unsigned Opcode) {
-    return Opcode == Instruction::Add || Opcode == Instruction::LShr;
+    return Opcode == Instruction::Add || Opcode == Instruction::LShr ||
+           Opcode == Instruction::Shl;
   }
 
   /// Identifies the best candidate value, which represents main opcode
@@ -10890,6 +10891,7 @@ class InstructionsCompatibilityAnalysis {
       switch (MainOpcode) {
       case Instruction::Add:
       case Instruction::LShr:
+      case Instruction::Shl:
         VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
         break;
       default:
@@ -21902,6 +21904,8 @@ bool BoUpSLP::collectValuesToDemote(
       return all_of(E.Scalars, [&](Value *V) {
         if (isa<PoisonValue>(V))
           return true;
+        if (E.isCopyableElement(V))
+          return true;
         auto *I = cast<Instruction>(V);
         KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
         return AmtKnownBits.getMaxValue().ult(BitWidth);
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-indices.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-indices.ll
index 655db54af98ac..a079203696cdd 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-indices.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-indices.ll
@@ -10,14 +10,10 @@ define void @test() {
 ; CHECK-NEXT:    [[SUB4_I_I65_US:%.*]] = or i64 0, 1
 ; CHECK-NEXT:    br label [[BODY:%.*]]
 ; CHECK:       body:
-; CHECK-NEXT:    [[ADD_I_I62_US:%.*]] = shl i64 0, 0
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i64> <i64 poison, i64 1>, i64 [[ADD_I_I62_US]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = or <2 x i64> zeroinitializer, [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr [[CLASS_A:%.*]], <2 x ptr> zeroinitializer, <2 x i64> [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> [[TMP2]], i32 4, <2 x i1> splat (i1 true), <2 x i32> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
-; CHECK-NEXT:    [[CMP_I_I_I_I67_US:%.*]] = icmp slt i32 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> getelementptr ([[CLASS_A:%.*]], <2 x ptr> zeroinitializer, <2 x i64> <i64 0, i64 1>), i32 4, <2 x i1> splat (i1 true), <2 x i32> poison)
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+; CHECK-NEXT:    [[CMP_I_I_I_I67_US:%.*]] = icmp slt i32 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[SPEC_SELECT_I_I68_US:%.*]] = select i1 false, i64 [[SUB4_I_I65_US]], i64 0
 ; CHECK-NEXT:    br label [[BODY]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ext-used-scalar-different-bitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/ext-used-scalar-different-bitwidth.ll
index 77585965d68e9..87f2ccaf76497 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/ext-used-scalar-different-bitwidth.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/ext-used-scalar-different-bitwidth.ll
@@ -8,8 +8,8 @@ define i32 @test() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    store i32 152, ptr @f, align 4
 ; CHECK-NEXT:    [[AGG_TMP_SROA_0_0_COPYLOAD_I:%.*]] = load i32, ptr @f, align 4
-; CHECK-NEXT:    [[ADD_I_I:%.*]] = shl i32 [[AGG_TMP_SROA_0_0_COPYLOAD_I]], 24
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> <i32 poison, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080>, i32 [[ADD_I_I]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> <i32 poison, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080>, i32 [[AGG_TMP_SROA_0_0_COPYLOAD_I]], i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = shl <8 x i32> [[TMP3]], <i32 24, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
 ; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i32> <i32 83886080, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = ashr <8 x i32> [[TMP1]], splat (i32 24)
 ; CHECK-NEXT:    [[TMP5:%.*]] = and <8 x i32> [[TMP2]], <i32 66440127, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
index 75aec45d3e2fc..3e0a3741d6bbc 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -247,32 +247,12 @@ entry:
 }
 
 define void @shl0(ptr noalias %dst, ptr noalias %src) {
-; NON-POW2-LABEL: @shl0(
-; NON-POW2-NEXT:  entry:
-; NON-POW2-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
-; NON-POW2-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; NON-POW2-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
-; NON-POW2-NEXT:    store i32 [[TMP0]], ptr [[DST]], align 4
-; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x i32>, ptr [[INCDEC_PTR]], align 4
-; NON-POW2-NEXT:    [[TMP2:%.*]] = shl <3 x i32> [[TMP1]], <i32 1, i32 2, i32 3>
-; NON-POW2-NEXT:    store <3 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
-; NON-POW2-NEXT:    ret void
-;
-; POW2-ONLY-LABEL: @shl0(
-; POW2-ONLY-NEXT:  entry:
-; POW2-ONLY-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
-; POW2-ONLY-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
-; POW2-ONLY-NEXT:    store i32 [[TMP0]], ptr [[DST]], align 4
-; POW2-ONLY-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
-; POW2-ONLY-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
-; POW2-ONLY-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR]], align 4
-; POW2-ONLY-NEXT:    [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 2>
-; POW2-ONLY-NEXT:    store <2 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
-; POW2-ONLY-NEXT:    [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
-; POW2-ONLY-NEXT:    [[SHL8:%.*]] = shl i32 [[TMP3]], 3
-; POW2-ONLY-NEXT:    store i32 [[SHL8]], ptr [[INCDEC_PTR6]], align 4
-; POW2-ONLY-NEXT:    ret void
+; CHECK-LABEL: @shl0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
+; CHECK-NEXT:    ret void
 ;
 entry:
   %incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll b/llvm/test/Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll
index a5b1e9b4575f0..769b3604d41c5 100644
--- a/llvm/test/Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll
@@ -1,25 +1,44 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X86 %}
+; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefix=AARCH64 %}
 
 
 define i1 @test(i32 %0, i32 %1, i32 %p) {
-; CHECK-LABEL: define i1 @test(
-; CHECK-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[P:%.*]]) {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = shl <4 x i32> zeroinitializer, [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[CMP6:%.*]] = icmp slt i32 0, [[P]]
-; CHECK-NEXT:    [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 true, i1 [[CMP6]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = freeze i1 [[OP_RDX]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = select i1 [[TMP8]], i1 true, i1 [[OP_RDX1]]
-; CHECK-NEXT:    ret i1 [[OP_RDX2]]
+; X86-LABEL: define i1 @test(
+; X86-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[P:%.*]]) {
+; X86-NEXT:  entry:
+; X86-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0
+; X86-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
+; X86-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
+; X86-NEXT:    [[TMP4:%.*]] = shl <4 x i32> zeroinitializer, [[TMP3]]
+; X86-NEXT:    [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer
+; X86-NEXT:    [[CMP6:%.*]] = icmp slt i32 0, [[P]]
+; X86-NEXT:    [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]]
+; X86-NEXT:    [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
+; X86-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 true, i1 [[CMP6]]
+; X86-NEXT:    [[OP_RDX1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP1]]
+; X86-NEXT:    [[TMP8:%.*]] = freeze i1 [[OP_RDX]]
+; X86-NEXT:    [[OP_RDX2:%.*]] = select i1 [[TMP8]], i1 true, i1 [[OP_RDX1]]
+; X86-NEXT:    ret i1 [[OP_RDX2]]
+;
+; AARCH64-LABEL: define i1 @test(
+; AARCH64-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[P:%.*]]) {
+; AARCH64-NEXT:  entry:
+; AARCH64-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0
+; AARCH64-NEXT:    [[SHL4:%.*]] = shl i32 0, [[TMP1]]
+; AARCH64-NEXT:    [[CMP5:%.*]] = icmp slt i32 [[SHL4]], 0
+; AARCH64-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>, i32 [[TMP1]], i32 1
+; AARCH64-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
+; AARCH64-NEXT:    [[TMP4:%.*]] = shl <4 x i32> zeroinitializer, [[TMP3]]
+; AARCH64-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, i32 [[P]], i32 0
+; AARCH64-NEXT:    [[TMP6:%.*]] = icmp slt <4 x i32> [[TMP4]], [[TMP5]]
+; AARCH64-NEXT:    [[TMP7:%.*]] = freeze <4 x i1> [[TMP6]]
+; AARCH64-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]])
+; AARCH64-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP8]], i1 true, i1 [[CMP5]]
+; AARCH64-NEXT:    [[OP_RDX1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP1]]
+; AARCH64-NEXT:    [[TMP9:%.*]] = freeze i1 [[OP_RDX]]
+; AARCH64-NEXT:    [[OP_RDX2:%.*]] = select i1 [[TMP9]], i1 true, i1 [[OP_RDX1]]
+; AARCH64-NEXT:    ret i1 [[OP_RDX2]]
 ;
 entry:
   %cmp1 = icmp sgt i32 %0, 0

; X86-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP8]], i1 true, i1 [[OP_RDX1]]
; X86-NEXT: ret i1 [[OP_RDX2]]
;
; AARCH64-LABEL: define i1 @test(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is surprising given aarch64 has better non-uniform vector shift handling than x86/SSE

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@alexey-bataev were you able to get to the bottom of this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, this test shows it. X86 fails to vectorize initial reduction

    %cmp6 = icmp slt i32 0, %p
    %cmp2 = icmp slt i32 %shl1, 0
    %cmp3 = icmp slt i32 %shl2, 0
    %cmp4 = icmp slt i32 %shl3, 0

while AArch64 is able to vectorize it.
X86 instead vectorizes next candidate:

    %cmp2 = icmp slt i32 %shl1, 0
    %cmp3 = icmp slt i32 %shl2, 0
    %cmp4 = icmp slt i32 %shl3, 0
    %cmp5 = icmp slt i32 %shl4, 0

AArch64 should do it better with enabled non-power-of-2

Created using spr 1.3.7
Copy link
Collaborator

@RKSimon RKSimon left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@alexey-bataev alexey-bataev merged commit 5d7f324 into main Oct 6, 2025
9 checks passed
@alexey-bataev alexey-bataev deleted the users/alexey-bataev/spr/slpenable-shl-as-a-base-opcode-in-copyables branch October 6, 2025 11:07
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants