From 6d381e5510d37ca43a68e123f2f2d4d4ac9398ac Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 3 Sep 2025 15:24:24 -0700 Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?= =?UTF-8?q?l=20version?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.7 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 6 +- .../strided-loads-with-external-indices.ll | 12 ++-- .../X86/ext-used-scalar-different-bitwidth.ll | 4 +- .../X86/vect_copyable_in_binops.ll | 32 ++--------- .../bool-logical-op-reduction-with-poison.ll | 55 +++++++++++++------ 5 files changed, 54 insertions(+), 55 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 9c6099c2da307..bd5da774154d6 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -10573,7 +10573,8 @@ class InstructionsCompatibilityAnalysis { /// Checks if the opcode is supported as the main opcode for copyable /// elements. static bool isSupportedOpcode(const unsigned Opcode) { - return Opcode == Instruction::Add || Opcode == Instruction::LShr; + return Opcode == Instruction::Add || Opcode == Instruction::LShr || + Opcode == Instruction::Shl; } /// Identifies the best candidate value, which represents main opcode @@ -10890,6 +10891,7 @@ class InstructionsCompatibilityAnalysis { switch (MainOpcode) { case Instruction::Add: case Instruction::LShr: + case Instruction::Shl: VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind); break; default: @@ -21902,6 +21904,8 @@ bool BoUpSLP::collectValuesToDemote( return all_of(E.Scalars, [&](Value *V) { if (isa(V)) return true; + if (E.isCopyableElement(V)) + return true; auto *I = cast(V); KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL); return AmtKnownBits.getMaxValue().ult(BitWidth); diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-indices.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-indices.ll index 655db54af98ac..a079203696cdd 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-indices.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-indices.ll @@ -10,14 +10,10 @@ define void @test() { ; CHECK-NEXT: [[SUB4_I_I65_US:%.*]] = or i64 0, 1 ; CHECK-NEXT: br label [[BODY:%.*]] ; CHECK: body: -; CHECK-NEXT: [[ADD_I_I62_US:%.*]] = shl i64 0, 0 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> , i64 [[ADD_I_I62_US]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i64> zeroinitializer, [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [[CLASS_A:%.*]], <2 x ptr> zeroinitializer, <2 x i64> [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> [[TMP2]], i32 4, <2 x i1> splat (i1 true), <2 x i32> poison) -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1 -; CHECK-NEXT: [[CMP_I_I_I_I67_US:%.*]] = icmp slt i32 [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> getelementptr ([[CLASS_A:%.*]], <2 x ptr> zeroinitializer, <2 x i64> ), i32 4, <2 x i1> splat (i1 true), <2 x i32> poison) +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1 +; CHECK-NEXT: [[CMP_I_I_I_I67_US:%.*]] = icmp slt i32 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[SPEC_SELECT_I_I68_US:%.*]] = select i1 false, i64 [[SUB4_I_I65_US]], i64 0 ; CHECK-NEXT: br label [[BODY]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ext-used-scalar-different-bitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/ext-used-scalar-different-bitwidth.ll index 77585965d68e9..87f2ccaf76497 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/ext-used-scalar-different-bitwidth.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/ext-used-scalar-different-bitwidth.ll @@ -8,8 +8,8 @@ define i32 @test() { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: store i32 152, ptr @f, align 4 ; CHECK-NEXT: [[AGG_TMP_SROA_0_0_COPYLOAD_I:%.*]] = load i32, ptr @f, align 4 -; CHECK-NEXT: [[ADD_I_I:%.*]] = shl i32 [[AGG_TMP_SROA_0_0_COPYLOAD_I]], 24 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> , i32 [[ADD_I_I]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> , i32 [[AGG_TMP_SROA_0_0_COPYLOAD_I]], i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = shl <8 x i32> [[TMP3]], ; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> , [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = ashr <8 x i32> [[TMP1]], splat (i32 24) ; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i32> [[TMP2]], diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll index 75aec45d3e2fc..3e0a3741d6bbc 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll @@ -247,32 +247,12 @@ entry: } define void @shl0(ptr noalias %dst, ptr noalias %src) { -; NON-POW2-LABEL: @shl0( -; NON-POW2-NEXT: entry: -; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1 -; NON-POW2-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4 -; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1 -; NON-POW2-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4 -; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[INCDEC_PTR]], align 4 -; NON-POW2-NEXT: [[TMP2:%.*]] = shl <3 x i32> [[TMP1]], -; NON-POW2-NEXT: store <3 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4 -; NON-POW2-NEXT: ret void -; -; POW2-ONLY-LABEL: @shl0( -; POW2-ONLY-NEXT: entry: -; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1 -; POW2-ONLY-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4 -; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1 -; POW2-ONLY-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4 -; POW2-ONLY-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3 -; POW2-ONLY-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3 -; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR]], align 4 -; POW2-ONLY-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], -; POW2-ONLY-NEXT: store <2 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4 -; POW2-ONLY-NEXT: [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4 -; POW2-ONLY-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3 -; POW2-ONLY-NEXT: store i32 [[SHL8]], ptr [[INCDEC_PTR6]], align 4 -; POW2-ONLY-NEXT: ret void +; CHECK-LABEL: @shl0( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], +; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4 +; CHECK-NEXT: ret void ; entry: %incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll b/llvm/test/Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll index a5b1e9b4575f0..769b3604d41c5 100644 --- a/llvm/test/Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll +++ b/llvm/test/Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll @@ -1,25 +1,44 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s %} +; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X86 %} +; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefix=AARCH64 %} define i1 @test(i32 %0, i32 %1, i32 %p) { -; CHECK-LABEL: define i1 @test( -; CHECK-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[P:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = shl <4 x i32> zeroinitializer, [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[CMP6:%.*]] = icmp slt i32 0, [[P]] -; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 true, i1 [[CMP6]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = freeze i1 [[OP_RDX]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP8]], i1 true, i1 [[OP_RDX1]] -; CHECK-NEXT: ret i1 [[OP_RDX2]] +; X86-LABEL: define i1 @test( +; X86-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[P:%.*]]) { +; X86-NEXT: entry: +; X86-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0 +; X86-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0 +; X86-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer +; X86-NEXT: [[TMP4:%.*]] = shl <4 x i32> zeroinitializer, [[TMP3]] +; X86-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer +; X86-NEXT: [[CMP6:%.*]] = icmp slt i32 0, [[P]] +; X86-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]] +; X86-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; X86-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 true, i1 [[CMP6]] +; X86-NEXT: [[OP_RDX1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP1]] +; X86-NEXT: [[TMP8:%.*]] = freeze i1 [[OP_RDX]] +; X86-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP8]], i1 true, i1 [[OP_RDX1]] +; X86-NEXT: ret i1 [[OP_RDX2]] +; +; AARCH64-LABEL: define i1 @test( +; AARCH64-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[P:%.*]]) { +; AARCH64-NEXT: entry: +; AARCH64-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0 +; AARCH64-NEXT: [[SHL4:%.*]] = shl i32 0, [[TMP1]] +; AARCH64-NEXT: [[CMP5:%.*]] = icmp slt i32 [[SHL4]], 0 +; AARCH64-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[TMP1]], i32 1 +; AARCH64-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> +; AARCH64-NEXT: [[TMP4:%.*]] = shl <4 x i32> zeroinitializer, [[TMP3]] +; AARCH64-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> , i32 [[P]], i32 0 +; AARCH64-NEXT: [[TMP6:%.*]] = icmp slt <4 x i32> [[TMP4]], [[TMP5]] +; AARCH64-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP6]] +; AARCH64-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]]) +; AARCH64-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP8]], i1 true, i1 [[CMP5]] +; AARCH64-NEXT: [[OP_RDX1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP1]] +; AARCH64-NEXT: [[TMP9:%.*]] = freeze i1 [[OP_RDX]] +; AARCH64-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP9]], i1 true, i1 [[OP_RDX1]] +; AARCH64-NEXT: ret i1 [[OP_RDX2]] ; entry: %cmp1 = icmp sgt i32 %0, 0