Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10620,7 +10620,8 @@ class InstructionsCompatibilityAnalysis {
/// Checks if the opcode is supported as the main opcode for copyable
/// elements.
static bool isSupportedOpcode(const unsigned Opcode) {
return Opcode == Instruction::Add || Opcode == Instruction::LShr;
return Opcode == Instruction::Add || Opcode == Instruction::LShr ||
Opcode == Instruction::Shl;
}

/// Identifies the best candidate value, which represents main opcode
Expand Down Expand Up @@ -10937,6 +10938,7 @@ class InstructionsCompatibilityAnalysis {
switch (MainOpcode) {
case Instruction::Add:
case Instruction::LShr:
case Instruction::Shl:
VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
break;
default:
Expand Down Expand Up @@ -22006,6 +22008,8 @@ bool BoUpSLP::collectValuesToDemote(
return all_of(E.Scalars, [&](Value *V) {
if (isa<PoisonValue>(V))
return true;
if (E.isCopyableElement(V))
return true;
auto *I = cast<Instruction>(V);
KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
return AmtKnownBits.getMaxValue().ult(BitWidth);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,10 @@ define void @test() {
; CHECK-NEXT: [[SUB4_I_I65_US:%.*]] = or i64 0, 1
; CHECK-NEXT: br label [[BODY:%.*]]
; CHECK: body:
; CHECK-NEXT: [[ADD_I_I62_US:%.*]] = shl i64 0, 0
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> <i64 poison, i64 1>, i64 [[ADD_I_I62_US]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i64> zeroinitializer, [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [[CLASS_A:%.*]], <2 x ptr> zeroinitializer, <2 x i64> [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> [[TMP2]], i32 4, <2 x i1> splat (i1 true), <2 x i32> poison)
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
; CHECK-NEXT: [[CMP_I_I_I_I67_US:%.*]] = icmp slt i32 [[TMP4]], [[TMP5]]
; CHECK-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> getelementptr ([[CLASS_A:%.*]], <2 x ptr> zeroinitializer, <2 x i64> <i64 0, i64 1>), i32 4, <2 x i1> splat (i1 true), <2 x i32> poison)
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
; CHECK-NEXT: [[CMP_I_I_I_I67_US:%.*]] = icmp slt i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[SPEC_SELECT_I_I68_US:%.*]] = select i1 false, i64 [[SUB4_I_I65_US]], i64 0
; CHECK-NEXT: br label [[BODY]]
;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ define i32 @test() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: store i32 152, ptr @f, align 4
; CHECK-NEXT: [[AGG_TMP_SROA_0_0_COPYLOAD_I:%.*]] = load i32, ptr @f, align 4
; CHECK-NEXT: [[ADD_I_I:%.*]] = shl i32 [[AGG_TMP_SROA_0_0_COPYLOAD_I]], 24
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> <i32 poison, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080>, i32 [[ADD_I_I]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> <i32 poison, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080, i32 83886080>, i32 [[AGG_TMP_SROA_0_0_COPYLOAD_I]], i32 0
; CHECK-NEXT: [[TMP0:%.*]] = shl <8 x i32> [[TMP3]], <i32 24, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> <i32 83886080, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = ashr <8 x i32> [[TMP1]], splat (i32 24)
; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i32> [[TMP2]], <i32 66440127, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
Expand Down
32 changes: 6 additions & 26 deletions llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
Original file line number Diff line number Diff line change
Expand Up @@ -247,32 +247,12 @@ entry:
}

define void @shl0(ptr noalias %dst, ptr noalias %src) {
; NON-POW2-LABEL: @shl0(
; NON-POW2-NEXT: entry:
; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
; NON-POW2-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
; NON-POW2-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4
; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[INCDEC_PTR]], align 4
; NON-POW2-NEXT: [[TMP2:%.*]] = shl <3 x i32> [[TMP1]], <i32 1, i32 2, i32 3>
; NON-POW2-NEXT: store <3 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
; NON-POW2-NEXT: ret void
;
; POW2-ONLY-LABEL: @shl0(
; POW2-ONLY-NEXT: entry:
; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1
; POW2-ONLY-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1
; POW2-ONLY-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4
; POW2-ONLY-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3
; POW2-ONLY-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3
; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR]], align 4
; POW2-ONLY-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 2>
; POW2-ONLY-NEXT: store <2 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4
; POW2-ONLY-NEXT: [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4
; POW2-ONLY-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3
; POW2-ONLY-NEXT: store i32 [[SHL8]], ptr [[INCDEC_PTR6]], align 4
; POW2-ONLY-NEXT: ret void
; CHECK-LABEL: @shl0(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i32> [[TMP0]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4
; CHECK-NEXT: ret void
;
entry:
%incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1
Expand Down
Original file line number Diff line number Diff line change
@@ -1,25 +1,44 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s %}
; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s %}
; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X86 %}
; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefix=AARCH64 %}


define i1 @test(i32 %0, i32 %1, i32 %p) {
; CHECK-LABEL: define i1 @test(
; CHECK-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[P:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = shl <4 x i32> zeroinitializer, [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer
; CHECK-NEXT: [[CMP6:%.*]] = icmp slt i32 0, [[P]]
; CHECK-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 true, i1 [[CMP6]]
; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP1]]
; CHECK-NEXT: [[TMP8:%.*]] = freeze i1 [[OP_RDX]]
; CHECK-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP8]], i1 true, i1 [[OP_RDX1]]
; CHECK-NEXT: ret i1 [[OP_RDX2]]
; X86-LABEL: define i1 @test(
; X86-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[P:%.*]]) {
; X86-NEXT: entry:
; X86-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0
; X86-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
; X86-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
; X86-NEXT: [[TMP4:%.*]] = shl <4 x i32> zeroinitializer, [[TMP3]]
; X86-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer
; X86-NEXT: [[CMP6:%.*]] = icmp slt i32 0, [[P]]
; X86-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]]
; X86-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
; X86-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP7]], i1 true, i1 [[CMP6]]
; X86-NEXT: [[OP_RDX1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP1]]
; X86-NEXT: [[TMP8:%.*]] = freeze i1 [[OP_RDX]]
; X86-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP8]], i1 true, i1 [[OP_RDX1]]
; X86-NEXT: ret i1 [[OP_RDX2]]
;
; AARCH64-LABEL: define i1 @test(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is surprising given aarch64 has better non-uniform vector shift handling than x86/SSE

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@alexey-bataev were you able to get to the bottom of this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, this test shows it. X86 fails to vectorize initial reduction

    %cmp6 = icmp slt i32 0, %p
    %cmp2 = icmp slt i32 %shl1, 0
    %cmp3 = icmp slt i32 %shl2, 0
    %cmp4 = icmp slt i32 %shl3, 0

while AArch64 is able to vectorize it.
X86 instead vectorizes next candidate:

    %cmp2 = icmp slt i32 %shl1, 0
    %cmp3 = icmp slt i32 %shl2, 0
    %cmp4 = icmp slt i32 %shl3, 0
    %cmp5 = icmp slt i32 %shl4, 0

AArch64 should do it better with enabled non-power-of-2

; AARCH64-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[P:%.*]]) {
; AARCH64-NEXT: entry:
; AARCH64-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP0]], 0
; AARCH64-NEXT: [[SHL4:%.*]] = shl i32 0, [[TMP1]]
; AARCH64-NEXT: [[CMP5:%.*]] = icmp slt i32 [[SHL4]], 0
; AARCH64-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>, i32 [[TMP1]], i32 1
; AARCH64-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
; AARCH64-NEXT: [[TMP4:%.*]] = shl <4 x i32> zeroinitializer, [[TMP3]]
; AARCH64-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, i32 [[P]], i32 0
; AARCH64-NEXT: [[TMP6:%.*]] = icmp slt <4 x i32> [[TMP4]], [[TMP5]]
; AARCH64-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP6]]
; AARCH64-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP7]])
; AARCH64-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP8]], i1 true, i1 [[CMP5]]
; AARCH64-NEXT: [[OP_RDX1:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP1]]
; AARCH64-NEXT: [[TMP9:%.*]] = freeze i1 [[OP_RDX]]
; AARCH64-NEXT: [[OP_RDX2:%.*]] = select i1 [[TMP9]], i1 true, i1 [[OP_RDX1]]
; AARCH64-NEXT: ret i1 [[OP_RDX2]]
;
entry:
%cmp1 = icmp sgt i32 %0, 0
Expand Down