diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 7928d29d6dfa7..9f8bc5527a6a4 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -658,6 +658,29 @@ static InstructionsState getSameOpcode(ArrayRef VL, unsigned AltOpcode = Opcode; unsigned AltIndex = BaseIndex; + bool SwappedPredsCompatible = [&]() { + if (!IsCmpOp) + return false; + SetVector UniquePreds, UniqueNonSwappedPreds; + UniquePreds.insert(BasePred); + UniqueNonSwappedPreds.insert(BasePred); + for (Value *V : VL) { + auto *I = dyn_cast(V); + if (!I) + return false; + CmpInst::Predicate CurrentPred = I->getPredicate(); + CmpInst::Predicate SwappedCurrentPred = + CmpInst::getSwappedPredicate(CurrentPred); + UniqueNonSwappedPreds.insert(CurrentPred); + if (!UniquePreds.contains(CurrentPred) && + !UniquePreds.contains(SwappedCurrentPred)) + UniquePreds.insert(CurrentPred); + } + // More than two distinct predicates are present, but only two remain once + // predicates that are swaps of each other are merged: treat such swapped + // predicates as the same opcode instead of as an alternate opcode. + return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2; + }(); // Check for one alternate opcode from another BinaryOperator. // TODO - generalize to support all operators (types, calls etc.). 
auto *IBase = cast(VL[BaseIndex]); @@ -710,7 +733,7 @@ static InstructionsState getSameOpcode(ArrayRef VL, CmpInst::Predicate SwappedCurrentPred = CmpInst::getSwappedPredicate(CurrentPred); - if (E == 2 && + if ((E == 2 || SwappedPredsCompatible) && (BasePred == CurrentPred || BasePred == SwappedCurrentPred)) continue; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll b/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll new file mode 100644 index 0000000000000..6b270150985ef --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define i32 @test(ptr %sptr, i64 %0) { +; CHECK-LABEL: define i32 @test( +; CHECK-SAME: ptr [[SPTR:%.*]], i64 [[TMP0:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV_I:%.*]] = trunc i64 [[TMP0]] to i32 +; CHECK-NEXT: [[IV2:%.*]] = getelementptr i8, ptr [[SPTR]], i64 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IV2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CONV_I]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> , <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = icmp sle <4 x i32> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp slt <4 x i32> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> [[TMP7]], <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i1> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP10]]) +; 
CHECK-NEXT: [[AND33:%.*]] = zext i1 [[TMP11]] to i32 +; CHECK-NEXT: ret i32 [[AND33]] +; +entry: + %conv.i = trunc i64 %0 to i32 + %iv2 = getelementptr i8, ptr %sptr, i64 4 + %1 = load i32, ptr %iv2, align 4 + %cmp11 = icmp slt i32 %1, %conv.i + %cmp.i57 = icmp eq i32 %1, 0 + %or.i5977 = or i1 %cmp.i57, %cmp11 + %iv4 = getelementptr i8, ptr %sptr, i64 12 + %2 = load i32, ptr %iv4, align 4 + %cmp16 = icmp sle i32 %2, %conv.i + %cmp.i62 = icmp eq i32 %2, 0 + %or.i6478 = or i1 %cmp.i62, %cmp16 + %iv3 = getelementptr i8, ptr %sptr, i64 8 + %3 = load i32, ptr %iv3, align 8 + %cmp21 = icmp sgt i32 %3, %conv.i + %cmp.i67 = icmp eq i32 %3, 0 + %or.i6979 = or i1 %cmp.i67, %cmp21 + %iv5 = getelementptr i8, ptr %sptr, i64 16 + %4 = load i32, ptr %iv5, align 8 + %cmp26 = icmp slt i32 %conv.i, 0 + %cmp.i72 = icmp eq i32 %4, 0 + %or.i7480 = or i1 %cmp.i72, %cmp26 + %and3183 = and i1 %or.i5977, %or.i6478 + %and3284 = and i1 %and3183, %or.i6979 + %and3385 = and i1 %and3284, %or.i7480 + %and33 = zext i1 %and3385 to i32 + ret i32 %and33 +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll index b5a3c57414e78..acc04bece598a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -94,17 +94,13 @@ define i1 @logical_or_fcmp(<4 x float> %x) { define i1 @logical_and_icmp_diff_preds(<4 x i32> %x) { ; SSE-LABEL: @logical_and_icmp_diff_preds( -; SSE-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0 -; SSE-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2 -; SSE-NEXT: [[C0:%.*]] = icmp ult i32 [[X0]], 0 -; SSE-NEXT: [[C2:%.*]] = icmp sgt i32 [[X2]], 0 -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = icmp slt <2 x i32> [[TMP1]], zeroinitializer -; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 -; SSE-NEXT: [[S1:%.*]] = 
select i1 [[C0]], i1 [[TMP3]], i1 false -; SSE-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false -; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 -; SSE-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP4]], i1 false +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> , <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> , <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[TMP1]], [[TMP2]] +; SSE-NEXT: [[TMP4:%.*]] = icmp ult <4 x i32> [[TMP1]], [[TMP2]] +; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> +; SSE-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]] +; SSE-NEXT: [[S3:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]]) ; SSE-NEXT: ret i1 [[S3]] ; ; AVX-LABEL: @logical_and_icmp_diff_preds( @@ -391,17 +387,28 @@ define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) { } define i1 @logical_and_icmp_clamp_pred_diff(<4 x i32> %x) { -; CHECK-LABEL: @logical_and_icmp_clamp_pred_diff( -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[X]], -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], -; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]]) -; CHECK-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP7]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP6]], i1 [[TMP8]], i1 false -; CHECK-NEXT: ret i1 [[OP_RDX]] +; SSE-LABEL: @logical_and_icmp_clamp_pred_diff( +; SSE-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], +; SSE-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[X]], +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32> +; SSE-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], +; SSE-NEXT: 
[[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] +; SSE-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]]) +; SSE-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP3]] +; SSE-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP7]]) +; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP6]], i1 [[TMP8]], i1 false +; SSE-NEXT: ret i1 [[OP_RDX]] +; +; AVX-LABEL: @logical_and_icmp_clamp_pred_diff( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> +; AVX-NEXT: [[TMP4:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[TMP3]] +; AVX-NEXT: [[TMP5:%.*]] = icmp ult <8 x i32> [[TMP2]], [[TMP3]] +; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP5]], <8 x i32> +; AVX-NEXT: [[TMP7:%.*]] = freeze <8 x i1> [[TMP6]] +; AVX-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP7]]) +; AVX-NEXT: ret i1 [[TMP8]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1