diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 3e75b5ed48577..f49ca17a67cb4 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -8985,6 +8985,20 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { if (isFullyVectorizableTinyTree(ForReduction)) return false; + // Check if any of the gather node forms an insertelement buildvector + // somewhere. + if (any_of(VectorizableTree, [](const std::unique_ptr &TE) { + return TE->State == TreeEntry::NeedToGather && + all_of(TE->Scalars, [](Value *V) { + return isa(V) || + (!V->hasNUsesOrMore(UsesLimit) && + any_of(V->users(), [](User *U) { + return isa(U); + })); + }); + })) + return false; + assert(VectorizableTree.empty() ? ExternalUses.empty() : true && "We shouldn't have any external users"); diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll index 97008ac678596..ed73f7b134465 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll @@ -1,375 +1,362 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-40 | FileCheck %s +; RUN: opt -S -mtriple riscv64-unknown-linux-gnu < %s --passes=slp-vectorizer -mattr=+v -slp-threshold=-20 | FileCheck %s define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.ptr, ptr %add.ptr64) { ; CHECK-LABEL: define i32 @test( ; CHECK-SAME: ptr [[PIX1:%.*]], ptr [[PIX2:%.*]], i64 [[IDX_EXT:%.*]], i64 [[IDX_EXT63:%.*]], ptr [[ADD_PTR:%.*]], ptr [[ADD_PTR64:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[PIX1]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x ptr> [[TMP0]], <2 x ptr> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, <2 x ptr> [[TMP1]], <2 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x ptr> poison, ptr [[PIX2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x ptr> [[TMP3]], <2 x ptr> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, <2 x ptr> [[TMP4]], <2 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, <2 x ptr> [[TMP4]], <2 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, <2 x ptr> [[TMP1]], <2 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, <2 x ptr> [[TMP4]], <2 x i64> -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr i8, ptr [[PIX1]], i64 2 +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[PIX1]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[PIX1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x ptr> poison, ptr [[PIX2]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x ptr> [[TMP4]], <2 x ptr> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, <2 x ptr> [[TMP5]], <2 x i64> +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[PIX1]], i64 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, <2 x ptr> [[TMP5]], <2 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, <2 x ptr> [[TMP5]], <2 x i64> ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i8, ptr [[PIX2]], i64 2 +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr i8, ptr [[PIX1]], i64 3 +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1 +; CHECK-NEXT: [[CONV33:%.*]] = zext i8 [[TMP10]] to i32 ; CHECK-NEXT: [[ADD_PTR3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR644:%.*]] = getelementptr i8, ptr [[PIX2]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[ADD_PTR3]], align 1 -; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP9]] to i32 -; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ADD_PTR644]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ADD_PTR3]], align 1 +; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP11]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ADD_PTR644]], align 1 ; CHECK-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1 ; CHECK-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 2 -; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX22_1]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX22_1]], align 1 ; CHECK-NEXT: [[ARRAYIDX32_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3 -; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1 -; CHECK-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP12]] to i32 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1 +; CHECK-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP14]] to i32 ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ADD_PTR64_1]], align 1 +; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4 +; CHECK-NEXT: [[TMP15:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = zext <2 x i8> [[TMP15]] to <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i8>, ptr [[ADD_PTR64_1]], align 1 +; CHECK-NEXT: [[TMP18:%.*]] = zext <2 x i8> [[TMP17]] to <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = sub <2 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_2]], align 1 +; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP20]] to <2 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1 +; CHECK-NEXT: [[TMP23:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = sub <2 x i32> [[TMP21]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = shl <2 x i32> [[TMP24]], +; CHECK-NEXT: [[TMP26:%.*]] = add <2 x i32> [[TMP25]], [[TMP19]] ; CHECK-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 2 -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX20_2]], align 1 ; CHECK-NEXT: [[ARRAYIDX22_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 2 -; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX22_2]], align 1 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i8> poison, i8 [[TMP13]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i8> [[TMP17]], i8 [[TMP15]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = zext <2 x i8> [[TMP18]] to <2 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x i8> poison, i8 [[TMP14]], i32 0 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i8> [[TMP20]], i8 [[TMP16]], i32 1 -; CHECK-NEXT: [[TMP22:%.*]] = zext <2 x i8> [[TMP21]] to <2 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = sub <2 x i32> [[TMP19]], [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR_1]], i32 0 -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <2 x ptr> [[TMP24]], <2 x ptr> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, <2 x ptr> [[TMP25]], <2 x i64> -; CHECK-NEXT: [[TMP27:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP26]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[ARRAYIDX25_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 6 +; CHECK-NEXT: [[ARRAYIDX27_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 6 +; CHECK-NEXT: [[TMP27:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_2]], align 1 ; CHECK-NEXT: [[TMP28:%.*]] = zext <2 x i8> [[TMP27]] to <2 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR64_1]], i32 0 -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <2 x ptr> [[TMP29]], <2 x ptr> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i8, <2 x ptr> [[TMP30]], <2 x i64> -; CHECK-NEXT: [[TMP32:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP31]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP29:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_2]], align 1 +; CHECK-NEXT: [[TMP30:%.*]] = zext <2 x i8> [[TMP29]] to <2 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = sub <2 x i32> [[TMP28]], [[TMP30]] +; CHECK-NEXT: [[TMP32:%.*]] = load <2 x i8>, ptr [[ARRAYIDX25_2]], align 1 ; CHECK-NEXT: [[TMP33:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32> -; CHECK-NEXT: [[TMP34:%.*]] = sub <2 x i32> [[TMP28]], [[TMP33]] -; CHECK-NEXT: [[TMP35:%.*]] = shl <2 x i32> [[TMP34]], -; CHECK-NEXT: [[TMP36:%.*]] = add <2 x i32> [[TMP35]], [[TMP23]] -; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i8, <2 x ptr> [[TMP25]], <2 x i64> -; CHECK-NEXT: [[TMP38:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP37]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i8, <2 x ptr> [[TMP30]], <2 x i64> -; CHECK-NEXT: [[TMP41:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP40]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP42:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32> -; CHECK-NEXT: [[TMP43:%.*]] = sub <2 x i32> [[TMP39]], [[TMP42]] -; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i8, <2 x ptr> [[TMP25]], <2 x i64> -; CHECK-NEXT: [[TMP45:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP44]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP46:%.*]] = zext <2 x i8> [[TMP45]] to <2 x i32> -; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i8, <2 x ptr> [[TMP30]], <2 x i64> -; CHECK-NEXT: [[TMP48:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP47]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP48]] to <2 x i32> -; CHECK-NEXT: [[TMP50:%.*]] = sub <2 x i32> [[TMP46]], [[TMP49]] -; CHECK-NEXT: [[TMP51:%.*]] = shl <2 x i32> [[TMP50]], -; CHECK-NEXT: [[TMP52:%.*]] = add <2 x i32> [[TMP51]], [[TMP43]] -; CHECK-NEXT: [[TMP53:%.*]] = sub <2 x i32> [[TMP36]], [[TMP52]] -; CHECK-NEXT: [[TMP54:%.*]] = extractelement <2 x i32> [[TMP53]], i32 0 -; CHECK-NEXT: [[TMP55:%.*]] = extractelement <2 x i32> [[TMP53]], i32 1 -; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[TMP54]], [[TMP55]] -; CHECK-NEXT: [[TMP56:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[TMP34:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_2]], align 1 +; CHECK-NEXT: [[TMP35:%.*]] = zext <2 x i8> [[TMP34]] to <2 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP33]], [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], +; CHECK-NEXT: [[TMP38:%.*]] = add <2 x i32> [[TMP37]], [[TMP31]] +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0 +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1 +; CHECK-NEXT: [[ADD44_2:%.*]] = add i32 [[TMP40]], [[TMP39]] +; CHECK-NEXT: [[SUB45_2:%.*]] = sub i32 [[TMP39]], [[TMP40]] +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0 +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[TMP38]], i32 1 +; CHECK-NEXT: [[ADD46_2:%.*]] = add i32 [[TMP42]], [[TMP41]] +; CHECK-NEXT: [[SUB47_2:%.*]] = sub i32 [[TMP41]], [[TMP42]] +; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[ADD46_2]], [[ADD44_2]] +; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr null, align 1 ; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2 ; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2 -; CHECK-NEXT: [[TMP57:%.*]] = load i8, ptr null, align 1 -; CHECK-NEXT: [[TMP58:%.*]] = insertelement <2 x ptr> , ptr [[ARRAYIDX20_3]], i32 0 -; CHECK-NEXT: [[TMP59:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP58]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP44:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <2 x ptr> , ptr [[ARRAYIDX20_3]], i32 0 +; CHECK-NEXT: [[TMP46:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP45]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP47:%.*]] = zext <2 x i8> [[TMP46]] to <2 x i32> +; CHECK-NEXT: [[TMP48:%.*]] = insertelement <2 x ptr> , ptr [[ARRAYIDX22_3]], i32 0 +; CHECK-NEXT: [[TMP49:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP48]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP50:%.*]] = zext <2 x i8> [[TMP49]] to <2 x i32> +; CHECK-NEXT: [[TMP51:%.*]] = sub <2 x i32> [[TMP47]], [[TMP50]] +; CHECK-NEXT: [[TMP52:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP53:%.*]] = zext <2 x i8> [[TMP52]] to <2 x i32> +; CHECK-NEXT: [[TMP54:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> ), i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP55:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32> +; CHECK-NEXT: [[TMP56:%.*]] = sub <2 x i32> [[TMP53]], [[TMP55]] +; CHECK-NEXT: [[TMP57:%.*]] = shl <2 x i32> [[TMP56]], +; CHECK-NEXT: [[TMP58:%.*]] = add <2 x i32> [[TMP57]], [[TMP51]] +; CHECK-NEXT: [[TMP59:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> ), i32 1, <2 x i1> , <2 x i8> poison) ; CHECK-NEXT: [[TMP60:%.*]] = zext <2 x i8> [[TMP59]] to <2 x i32> -; CHECK-NEXT: [[TMP61:%.*]] = insertelement <2 x ptr> , ptr [[ARRAYIDX22_3]], i32 0 -; CHECK-NEXT: [[TMP62:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP61]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP63:%.*]] = zext <2 x i8> [[TMP62]] to <2 x i32> -; CHECK-NEXT: [[TMP64:%.*]] = sub <2 x i32> [[TMP60]], [[TMP63]] -; CHECK-NEXT: [[TMP65:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP61:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> ), i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP62:%.*]] = zext <2 x i8> [[TMP61]] to <2 x i32> +; CHECK-NEXT: [[TMP63:%.*]] = sub <2 x i32> [[TMP60]], [[TMP62]] +; CHECK-NEXT: [[TMP64:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0 +; CHECK-NEXT: [[TMP65:%.*]] = insertelement <2 x i8> [[TMP64]], i8 [[TMP43]], i32 1 ; CHECK-NEXT: [[TMP66:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32> -; CHECK-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> ), i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> ), i32 1, <2 x i1> , <2 x i8> poison) ; CHECK-NEXT: [[TMP68:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32> ; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP66]], [[TMP68]] ; CHECK-NEXT: [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], -; CHECK-NEXT: [[TMP71:%.*]] = add <2 x i32> [[TMP70]], [[TMP64]] -; CHECK-NEXT: [[TMP72:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> ), i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP73:%.*]] = zext <2 x i8> [[TMP72]] to <2 x i32> -; CHECK-NEXT: [[TMP74:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> ), i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP75:%.*]] = zext <2 x i8> [[TMP74]] to <2 x i32> -; CHECK-NEXT: [[TMP76:%.*]] = sub <2 x i32> [[TMP73]], [[TMP75]] -; CHECK-NEXT: [[TMP77:%.*]] = insertelement <2 x i8> poison, i8 [[TMP57]], i32 0 -; CHECK-NEXT: [[TMP78:%.*]] = insertelement <2 x i8> [[TMP77]], i8 [[TMP56]], i32 1 -; CHECK-NEXT: [[TMP79:%.*]] = zext <2 x i8> [[TMP78]] to <2 x i32> -; CHECK-NEXT: [[TMP80:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> ), i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP81:%.*]] = zext <2 x i8> [[TMP80]] to <2 x i32> -; CHECK-NEXT: [[TMP82:%.*]] = sub <2 x i32> [[TMP79]], [[TMP81]] -; CHECK-NEXT: [[TMP83:%.*]] = shl <2 x i32> [[TMP82]], -; CHECK-NEXT: [[TMP84:%.*]] = add <2 x i32> [[TMP83]], [[TMP76]] -; CHECK-NEXT: [[TMP85:%.*]] = sub <2 x i32> [[TMP71]], [[TMP84]] -; CHECK-NEXT: [[TMP86:%.*]] = shufflevector <2 x i32> [[TMP84]], <2 x i32> [[TMP52]], <2 x i32> -; CHECK-NEXT: [[TMP87:%.*]] = shufflevector <2 x i32> [[TMP71]], <2 x i32> [[TMP36]], <2 x i32> -; CHECK-NEXT: [[TMP88:%.*]] = add <2 x i32> [[TMP86]], [[TMP87]] -; CHECK-NEXT: [[TMP89:%.*]] = shufflevector <2 x i32> [[TMP84]], <2 x i32> [[TMP52]], <2 x i32> -; CHECK-NEXT: [[TMP90:%.*]] = shufflevector <2 x i32> [[TMP71]], <2 x i32> [[TMP36]], <2 x i32> -; CHECK-NEXT: [[TMP91:%.*]] = add <2 x i32> [[TMP89]], [[TMP90]] -; CHECK-NEXT: [[TMP92:%.*]] = add <2 x i32> [[TMP91]], [[TMP88]] -; CHECK-NEXT: [[TMP93:%.*]] = sub <2 x i32> [[TMP88]], [[TMP91]] -; CHECK-NEXT: [[TMP94:%.*]] = extractelement <2 x i32> [[TMP85]], i32 0 -; CHECK-NEXT: [[TMP95:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1 -; CHECK-NEXT: [[SUB59_3:%.*]] = sub i32 [[TMP95]], [[TMP94]] -; CHECK-NEXT: [[TMP96:%.*]] = extractelement <2 x i32> [[TMP92]], i32 0 -; CHECK-NEXT: [[TMP97:%.*]] = extractelement <2 x i32> [[TMP92]], i32 1 -; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[TMP96]], [[TMP97]] -; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[TMP97]], [[TMP96]] -; CHECK-NEXT: [[TMP98:%.*]] = extractelement <2 x i32> [[TMP60]], i32 1 -; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP98]], 15 +; CHECK-NEXT: [[TMP71:%.*]] = add <2 x i32> [[TMP70]], [[TMP63]] +; CHECK-NEXT: [[TMP72:%.*]] = add <2 x i32> [[TMP71]], [[TMP58]] +; CHECK-NEXT: [[TMP73:%.*]] = sub <2 x i32> [[TMP58]], [[TMP71]] +; CHECK-NEXT: [[TMP74:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0 +; CHECK-NEXT: [[TMP75:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1 +; CHECK-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP74]], [[TMP75]] +; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]] +; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]] +; CHECK-NEXT: [[TMP76:%.*]] = extractelement <2 x i32> [[TMP47]], i32 1 +; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP76]], 15 ; CHECK-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537 ; CHECK-NEXT: [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535 -; CHECK-NEXT: [[TMP99:%.*]] = extractelement <2 x i32> [[TMP91]], i32 1 -; CHECK-NEXT: [[SHR_I49:%.*]] = lshr i32 [[TMP99]], 15 +; CHECK-NEXT: [[SHR_I49:%.*]] = lshr i32 [[ADD46_2]], 15 ; CHECK-NEXT: [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537 ; CHECK-NEXT: [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535 -; CHECK-NEXT: [[TMP100:%.*]] = extractelement <2 x i32> [[TMP93]], i32 0 -; CHECK-NEXT: [[TMP101:%.*]] = extractelement <2 x i32> [[TMP93]], i32 1 -; CHECK-NEXT: [[ADD94_2:%.*]] = add i32 [[TMP100]], [[TMP101]] -; CHECK-NEXT: [[TMP102:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20]], align 1 -; CHECK-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP102]] to <2 x i32> -; CHECK-NEXT: [[TMP104:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_2]], i32 0 -; CHECK-NEXT: [[TMP105:%.*]] = shufflevector <2 x i32> [[TMP104]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP106:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_3]], i32 0 -; CHECK-NEXT: [[TMP107:%.*]] = shufflevector <2 x i32> [[TMP106]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP108:%.*]] = add <2 x i32> [[TMP105]], [[TMP107]] -; CHECK-NEXT: [[TMP109:%.*]] = sub <2 x i32> [[TMP105]], [[TMP107]] -; CHECK-NEXT: [[TMP110:%.*]] = shufflevector <2 x i32> [[TMP108]], <2 x i32> [[TMP109]], <2 x i32> -; CHECK-NEXT: [[TMP111:%.*]] = load <2 x i8>, ptr [[PIX1]], align 1 -; CHECK-NEXT: [[TMP112:%.*]] = zext <2 x i8> [[TMP111]] to <2 x i32> -; CHECK-NEXT: [[TMP113:%.*]] = shufflevector <2 x i32> [[TMP112]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP114:%.*]] = insertelement <2 x ptr> [[TMP4]], ptr [[ARRAYIDX22]], i32 1 -; CHECK-NEXT: [[TMP115:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP114]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP116:%.*]] = zext <2 x i8> [[TMP115]] to <2 x i32> -; CHECK-NEXT: [[TMP117:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP2]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP77:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0 +; CHECK-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[TMP77]], 15 +; CHECK-NEXT: [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537 +; CHECK-NEXT: [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535 +; CHECK-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15 +; CHECK-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537 +; CHECK-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535 +; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[CONV]], 15 +; CHECK-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537 +; CHECK-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535 +; CHECK-NEXT: [[TMP78:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1 +; CHECK-NEXT: [[TMP79:%.*]] = zext <2 x i8> [[TMP78]] to <2 x i32> +; CHECK-NEXT: [[TMP80:%.*]] = insertelement <2 x ptr> [[TMP5]], ptr [[ARRAYIDX22]], i32 1 +; CHECK-NEXT: [[TMP81:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP80]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP82:%.*]] = zext <2 x i8> [[TMP81]] to <2 x i32> +; CHECK-NEXT: [[TMP83:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP3]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP84:%.*]] = zext <2 x i8> [[TMP83]] to <2 x i32> +; CHECK-NEXT: [[TMP85:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP6]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP86:%.*]] = zext <2 x i8> [[TMP85]] to <2 x i32> +; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP84]], [[TMP86]] +; CHECK-NEXT: [[TMP88:%.*]] = shl <2 x i32> [[TMP87]], +; CHECK-NEXT: [[TMP89:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP7]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP90:%.*]] = zext <2 x i8> [[TMP89]] to <2 x i32> +; CHECK-NEXT: [[TMP91:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP8]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP92:%.*]] = zext <2 x i8> [[TMP91]] to <2 x i32> +; CHECK-NEXT: [[TMP93:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP9]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP94:%.*]] = zext <2 x i8> [[TMP93]] to <2 x i32> +; CHECK-NEXT: [[TMP95:%.*]] = sub <2 x i32> [[TMP92]], [[TMP94]] +; CHECK-NEXT: [[TMP96:%.*]] = shl <2 x i32> [[TMP95]], +; CHECK-NEXT: [[TMP97:%.*]] = insertelement <2 x i32> [[TMP79]], i32 [[CONV33]], i32 1 +; CHECK-NEXT: [[TMP98:%.*]] = sub <2 x i32> [[TMP97]], [[TMP90]] +; CHECK-NEXT: [[TMP99:%.*]] = add <2 x i32> [[TMP96]], [[TMP98]] +; CHECK-NEXT: [[TMP100:%.*]] = insertelement <2 x i32> [[TMP79]], i32 [[CONV]], i32 0 +; CHECK-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP100]], [[TMP82]] +; CHECK-NEXT: [[TMP102:%.*]] = add <2 x i32> [[TMP88]], [[TMP101]] +; CHECK-NEXT: [[TMP103:%.*]] = shufflevector <2 x i32> [[TMP99]], <2 x i32> [[TMP102]], <2 x i32> +; CHECK-NEXT: [[TMP104:%.*]] = add <2 x i32> [[TMP99]], [[TMP102]] +; CHECK-NEXT: [[TMP105:%.*]] = sub <2 x i32> [[TMP102]], [[TMP99]] +; CHECK-NEXT: [[TMP106:%.*]] = extractelement <2 x i32> [[TMP104]], i32 0 +; CHECK-NEXT: [[TMP107:%.*]] = extractelement <2 x i32> [[TMP104]], i32 1 +; CHECK-NEXT: [[ADD48:%.*]] = add i32 [[TMP107]], [[TMP106]] +; CHECK-NEXT: [[TMP108:%.*]] = extractelement <2 x i32> [[TMP105]], i32 1 +; CHECK-NEXT: [[SHR_I59:%.*]] = lshr i32 [[TMP107]], 15 +; CHECK-NEXT: [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537 +; CHECK-NEXT: [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535 +; CHECK-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[TMP108]], 15 +; CHECK-NEXT: [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537 +; CHECK-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535 +; CHECK-NEXT: [[TMP109:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 +; CHECK-NEXT: [[TMP110:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32> +; CHECK-NEXT: [[TMP111:%.*]] = insertelement <2 x i8> poison, i8 [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP112:%.*]] = insertelement <2 x i8> [[TMP111]], i8 [[TMP13]], i32 1 +; CHECK-NEXT: [[TMP113:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32> +; CHECK-NEXT: [[TMP114:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR3]], i32 0 +; CHECK-NEXT: [[TMP115:%.*]] = shufflevector <2 x ptr> [[TMP114]], <2 x ptr> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP116:%.*]] = getelementptr i8, <2 x ptr> [[TMP115]], <2 x i64> +; CHECK-NEXT: [[TMP117:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP116]], i32 1, <2 x i1> , <2 x i8> poison) ; CHECK-NEXT: [[TMP118:%.*]] = zext <2 x i8> [[TMP117]] to <2 x i32> -; CHECK-NEXT: [[TMP119:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP5]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP120:%.*]] = zext <2 x i8> [[TMP119]] to <2 x i32> -; CHECK-NEXT: [[TMP121:%.*]] = sub <2 x i32> [[TMP118]], [[TMP120]] -; CHECK-NEXT: [[TMP122:%.*]] = shl <2 x i32> [[TMP121]], -; CHECK-NEXT: [[TMP123:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP6]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP124:%.*]] = zext <2 x i8> [[TMP123]] to <2 x i32> -; CHECK-NEXT: [[TMP125:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP7]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP126:%.*]] = zext <2 x i8> [[TMP125]] to <2 x i32> -; CHECK-NEXT: [[TMP127:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP8]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP119:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR644]], i32 0 +; CHECK-NEXT: [[TMP120:%.*]] = shufflevector <2 x ptr> [[TMP119]], <2 x ptr> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP121:%.*]] = getelementptr i8, <2 x ptr> [[TMP120]], <2 x i64> +; CHECK-NEXT: [[TMP122:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP121]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP123:%.*]] = zext <2 x i8> [[TMP122]] to <2 x i32> +; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP118]], [[TMP123]] +; CHECK-NEXT: [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], +; CHECK-NEXT: [[TMP126:%.*]] = getelementptr i8, <2 x ptr> [[TMP120]], <2 x i64> +; CHECK-NEXT: [[TMP127:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP126]], i32 1, <2 x i1> , <2 x i8> poison) ; CHECK-NEXT: [[TMP128:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32> -; CHECK-NEXT: [[TMP129:%.*]] = sub <2 x i32> [[TMP126]], [[TMP128]] -; CHECK-NEXT: [[TMP130:%.*]] = shl <2 x i32> [[TMP129]], -; CHECK-NEXT: [[TMP131:%.*]] = shufflevector <2 x i32> [[TMP112]], <2 x i32> [[TMP103]], <2 x i32> -; CHECK-NEXT: [[TMP132:%.*]] = sub <2 x i32> [[TMP131]], [[TMP116]] -; CHECK-NEXT: [[TMP133:%.*]] = add <2 x i32> [[TMP122]], [[TMP132]] -; CHECK-NEXT: [[TMP134:%.*]] = shufflevector <2 x i32> [[TMP113]], <2 x i32> [[TMP103]], <2 x i32> -; CHECK-NEXT: [[TMP135:%.*]] = sub <2 x i32> [[TMP134]], [[TMP124]] -; CHECK-NEXT: [[TMP136:%.*]] = add <2 x i32> [[TMP130]], [[TMP135]] -; CHECK-NEXT: [[TMP137:%.*]] = extractelement <2 x i32> [[TMP133]], i32 1 -; CHECK-NEXT: [[TMP138:%.*]] = extractelement <2 x i32> [[TMP136]], i32 1 -; CHECK-NEXT: [[ADD46:%.*]] = add i32 [[TMP138]], [[TMP137]] -; CHECK-NEXT: [[TMP139:%.*]] = sub <2 x i32> [[TMP133]], [[TMP136]] -; CHECK-NEXT: [[TMP140:%.*]] = extractelement <2 x i32> [[TMP133]], i32 0 -; CHECK-NEXT: [[TMP141:%.*]] = extractelement <2 x i32> [[TMP136]], i32 0 -; CHECK-NEXT: [[ADD44:%.*]] = add i32 [[TMP141]], [[TMP140]] -; CHECK-NEXT: [[TMP142:%.*]] = lshr <2 x i32> [[TMP113]], -; CHECK-NEXT: [[TMP143:%.*]] = and <2 x i32> [[TMP142]], -; CHECK-NEXT: [[TMP144:%.*]] = mul <2 x i32> [[TMP143]], -; CHECK-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP139]], i32 0 -; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP139]], i32 1 -; CHECK-NEXT: [[SUB59:%.*]] = sub i32 [[TMP145]], [[TMP146]] -; CHECK-NEXT: [[TMP147:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 -; CHECK-NEXT: [[TMP148:%.*]] = zext <2 x i8> [[TMP147]] to <2 x i32> -; CHECK-NEXT: [[TMP149:%.*]] = insertelement <2 x i8> poison, i8 [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP150:%.*]] = insertelement <2 x i8> [[TMP149]], i8 [[TMP11]], i32 1 -; CHECK-NEXT: [[TMP151:%.*]] = zext <2 x i8> [[TMP150]] to <2 x i32> -; CHECK-NEXT: [[TMP152:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR3]], i32 0 -; CHECK-NEXT: [[TMP153:%.*]] = shufflevector <2 x ptr> [[TMP152]], <2 x ptr> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP154:%.*]] = getelementptr i8, <2 x ptr> [[TMP153]], <2 x i64> -; CHECK-NEXT: [[TMP155:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP154]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP156:%.*]] = zext <2 x i8> [[TMP155]] to <2 x i32> -; CHECK-NEXT: [[TMP157:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR644]], i32 0 -; CHECK-NEXT: [[TMP158:%.*]] = shufflevector <2 x ptr> [[TMP157]], <2 x ptr> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP159:%.*]] = getelementptr i8, <2 x ptr> [[TMP158]], <2 x i64> -; CHECK-NEXT: [[TMP160:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP159]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP161:%.*]] = zext <2 x i8> [[TMP160]] to <2 x i32> -; CHECK-NEXT: [[TMP162:%.*]] = sub <2 x i32> [[TMP156]], [[TMP161]] -; CHECK-NEXT: [[TMP163:%.*]] = shl <2 x i32> [[TMP162]], -; CHECK-NEXT: [[TMP164:%.*]] = getelementptr i8, <2 x ptr> [[TMP158]], <2 x i64> -; CHECK-NEXT: [[TMP165:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP164]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP166:%.*]] = zext <2 x i8> [[TMP165]] to <2 x i32> -; CHECK-NEXT: [[TMP167:%.*]] = getelementptr i8, <2 x ptr> [[TMP153]], <2 x i64> -; CHECK-NEXT: [[TMP168:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP167]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP169:%.*]] = zext <2 x i8> [[TMP168]] to <2 x i32> -; CHECK-NEXT: [[TMP170:%.*]] = getelementptr i8, <2 x ptr> [[TMP158]], <2 x i64> -; CHECK-NEXT: [[TMP171:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP170]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP172:%.*]] = zext <2 x i8> [[TMP171]] to <2 x i32> -; CHECK-NEXT: [[TMP173:%.*]] = sub <2 x i32> [[TMP169]], [[TMP172]] -; CHECK-NEXT: [[TMP174:%.*]] = shl <2 x i32> [[TMP173]], -; CHECK-NEXT: [[TMP175:%.*]] = insertelement <2 x i32> [[TMP148]], i32 [[CONV33_1]], i32 1 -; CHECK-NEXT: [[TMP176:%.*]] = sub <2 x i32> [[TMP175]], [[TMP166]] -; CHECK-NEXT: [[TMP177:%.*]] = add <2 x i32> [[TMP174]], [[TMP176]] -; CHECK-NEXT: [[TMP178:%.*]] = insertelement <2 x i32> [[TMP148]], i32 [[CONV_1]], i32 0 -; CHECK-NEXT: [[TMP179:%.*]] = sub <2 x i32> [[TMP178]], [[TMP151]] -; CHECK-NEXT: [[TMP180:%.*]] = add <2 x i32> [[TMP163]], [[TMP179]] -; CHECK-NEXT: [[TMP181:%.*]] = add <2 x i32> [[TMP177]], [[TMP180]] -; CHECK-NEXT: [[TMP182:%.*]] = sub <2 x i32> [[TMP180]], [[TMP177]] -; CHECK-NEXT: [[TMP183:%.*]] = extractelement <2 x i32> [[TMP181]], i32 0 -; CHECK-NEXT: [[TMP184:%.*]] = extractelement <2 x i32> [[TMP181]], i32 1 -; CHECK-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP183]], [[TMP184]] -; CHECK-NEXT: [[TMP185:%.*]] = shufflevector <2 x i32> [[TMP182]], <2 x i32> [[TMP139]], <2 x i32> -; CHECK-NEXT: [[TMP186:%.*]] = shufflevector <2 x i32> [[TMP182]], <2 x i32> [[TMP139]], <2 x i32> -; CHECK-NEXT: [[TMP187:%.*]] = add <2 x i32> [[TMP185]], [[TMP186]] -; CHECK-NEXT: [[TMP188:%.*]] = extractelement <2 x i32> [[TMP182]], i32 0 -; CHECK-NEXT: [[TMP189:%.*]] = extractelement <2 x i32> [[TMP182]], i32 1 -; CHECK-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP188]], [[TMP189]] -; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP184]], 15 +; CHECK-NEXT: [[TMP129:%.*]] = getelementptr i8, <2 x ptr> [[TMP115]], <2 x i64> +; CHECK-NEXT: [[TMP130:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP129]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP131:%.*]] = zext <2 x i8> [[TMP130]] to <2 x i32> +; CHECK-NEXT: [[TMP132:%.*]] = getelementptr i8, <2 x ptr> [[TMP120]], <2 x i64> +; CHECK-NEXT: [[TMP133:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP132]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP134:%.*]] = zext <2 x i8> [[TMP133]] to <2 x i32> +; CHECK-NEXT: [[TMP135:%.*]] = sub <2 x i32> [[TMP131]], [[TMP134]] +; CHECK-NEXT: [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], +; CHECK-NEXT: [[TMP137:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV33_1]], i32 1 +; CHECK-NEXT: [[TMP138:%.*]] = sub <2 x i32> [[TMP137]], [[TMP128]] +; CHECK-NEXT: [[TMP139:%.*]] = add <2 x i32> [[TMP136]], [[TMP138]] +; CHECK-NEXT: [[TMP140:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV_1]], i32 0 +; CHECK-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP113]] +; CHECK-NEXT: [[TMP142:%.*]] = add <2 x i32> [[TMP125]], [[TMP141]] +; CHECK-NEXT: [[TMP143:%.*]] = add <2 x i32> [[TMP139]], [[TMP142]] +; CHECK-NEXT: [[TMP144:%.*]] = sub <2 x i32> [[TMP142]], [[TMP139]] +; CHECK-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0 +; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP143]], i32 1 +; CHECK-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP146]], [[TMP145]] +; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP146]], 15 ; CHECK-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537 ; CHECK-NEXT: [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535 -; CHECK-NEXT: [[TMP190:%.*]] = lshr <2 x i32> [[TMP148]], -; CHECK-NEXT: [[TMP191:%.*]] = and <2 x i32> [[TMP190]], -; CHECK-NEXT: [[TMP192:%.*]] = mul <2 x i32> [[TMP191]], -; CHECK-NEXT: [[TMP193:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_1]], i32 0 -; CHECK-NEXT: [[TMP194:%.*]] = shufflevector <2 x i32> [[TMP193]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP195:%.*]] = extractelement <2 x i32> [[TMP187]], i32 0 -; CHECK-NEXT: [[TMP196:%.*]] = extractelement <2 x i32> [[TMP187]], i32 1 -; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[TMP195]], [[TMP196]] -; CHECK-NEXT: [[TMP197:%.*]] = shufflevector <2 x i32> [[TMP39]], <2 x i32> [[TMP182]], <2 x i32> -; CHECK-NEXT: [[TMP198:%.*]] = lshr <2 x i32> [[TMP197]], -; CHECK-NEXT: [[TMP199:%.*]] = and <2 x i32> [[TMP198]], -; CHECK-NEXT: [[TMP200:%.*]] = mul <2 x i32> [[TMP199]], -; CHECK-NEXT: [[TMP201:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_1]], i32 0 -; CHECK-NEXT: [[TMP202:%.*]] = shufflevector <2 x i32> [[TMP201]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP203:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0 -; CHECK-NEXT: [[TMP204:%.*]] = shufflevector <2 x i32> [[TMP203]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP205:%.*]] = insertelement <2 x i32> poison, i32 [[ADD44]], i32 0 -; CHECK-NEXT: [[TMP206:%.*]] = shufflevector <2 x i32> [[TMP205]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP207:%.*]] = insertelement <2 x i32> , i32 [[ADD46]], i32 1 -; CHECK-NEXT: [[TMP208:%.*]] = lshr <2 x i32> [[TMP206]], [[TMP207]] -; CHECK-NEXT: [[TMP209:%.*]] = sub <2 x i32> [[TMP206]], [[TMP207]] -; CHECK-NEXT: [[TMP210:%.*]] = shufflevector <2 x i32> [[TMP208]], <2 x i32> [[TMP209]], <2 x i32> -; CHECK-NEXT: [[TMP211:%.*]] = extractelement <2 x i32> [[TMP210]], i32 1 -; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[SUB51_1]], [[TMP211]] -; CHECK-NEXT: [[TMP212:%.*]] = insertelement <2 x i32> , i32 [[SUB51_1]], i32 1 -; CHECK-NEXT: [[TMP213:%.*]] = and <2 x i32> [[TMP210]], [[TMP212]] -; CHECK-NEXT: [[TMP214:%.*]] = sub <2 x i32> [[TMP210]], [[TMP212]] -; CHECK-NEXT: [[TMP215:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> [[TMP214]], <2 x i32> -; CHECK-NEXT: [[TMP216:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0 -; CHECK-NEXT: [[TMP217:%.*]] = shufflevector <2 x i32> [[TMP216]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP218:%.*]] = add <2 x i32> [[TMP217]], [[TMP204]] -; CHECK-NEXT: [[TMP219:%.*]] = sub <2 x i32> [[TMP217]], [[TMP204]] -; CHECK-NEXT: [[TMP220:%.*]] = shufflevector <2 x i32> [[TMP218]], <2 x i32> [[TMP219]], <2 x i32> -; CHECK-NEXT: [[TMP221:%.*]] = insertelement <2 x i32> [[TMP139]], i32 [[CONV_1]], i32 0 -; CHECK-NEXT: [[TMP222:%.*]] = lshr <2 x i32> [[TMP221]], -; CHECK-NEXT: [[TMP223:%.*]] = and <2 x i32> [[TMP222]], -; CHECK-NEXT: [[TMP224:%.*]] = mul <2 x i32> [[TMP223]], -; CHECK-NEXT: [[TMP225:%.*]] = shufflevector <2 x i32> [[TMP93]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP226:%.*]] = shufflevector <2 x i32> [[TMP225]], <2 x i32> [[TMP187]], <2 x i32> -; CHECK-NEXT: [[TMP227:%.*]] = shufflevector <2 x i32> [[TMP93]], <2 x i32> [[TMP187]], <2 x i32> -; CHECK-NEXT: [[TMP228:%.*]] = sub <2 x i32> [[TMP226]], [[TMP227]] -; CHECK-NEXT: [[TMP229:%.*]] = shufflevector <2 x i32> [[TMP53]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP230:%.*]] = insertelement <2 x i32> [[TMP229]], i32 [[ADD46]], i32 1 -; CHECK-NEXT: [[TMP231:%.*]] = insertelement <2 x i32> [[TMP53]], i32 [[ADD44]], i32 1 -; CHECK-NEXT: [[TMP232:%.*]] = add <2 x i32> [[TMP230]], [[TMP231]] -; CHECK-NEXT: [[TMP233:%.*]] = shufflevector <2 x i32> [[TMP85]], <2 x i32> [[TMP181]], <2 x i32> -; CHECK-NEXT: [[TMP234:%.*]] = shufflevector <2 x i32> [[TMP85]], <2 x i32> [[TMP181]], <2 x i32> -; CHECK-NEXT: [[TMP235:%.*]] = add <2 x i32> [[TMP233]], [[TMP234]] -; CHECK-NEXT: [[TMP236:%.*]] = extractelement <2 x i32> [[TMP232]], i32 0 -; CHECK-NEXT: [[TMP237:%.*]] = extractelement <2 x i32> [[TMP235]], i32 0 -; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[TMP237]], [[TMP236]] -; CHECK-NEXT: [[TMP238:%.*]] = insertelement <2 x i32> [[TMP19]], i32 [[ADD46]], i32 1 -; CHECK-NEXT: [[TMP239:%.*]] = lshr <2 x i32> [[TMP238]], -; CHECK-NEXT: [[TMP240:%.*]] = and <2 x i32> [[TMP239]], -; CHECK-NEXT: [[TMP241:%.*]] = mul <2 x i32> [[TMP240]], -; CHECK-NEXT: [[TMP242:%.*]] = extractelement <2 x i32> [[TMP232]], i32 1 -; CHECK-NEXT: [[TMP243:%.*]] = extractelement <2 x i32> [[TMP235]], i32 1 -; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[TMP243]], [[TMP242]] -; CHECK-NEXT: [[TMP244:%.*]] = sub <2 x i32> [[TMP232]], [[TMP235]] +; CHECK-NEXT: [[TMP147:%.*]] = lshr <2 x i32> [[TMP110]], +; CHECK-NEXT: [[TMP148:%.*]] = and <2 x i32> [[TMP147]], +; CHECK-NEXT: [[TMP149:%.*]] = mul <2 x i32> [[TMP148]], +; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD48]] +; CHECK-NEXT: [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_1]] ; CHECK-NEXT: [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]] ; CHECK-NEXT: [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]] -; CHECK-NEXT: [[TMP245:%.*]] = extractelement <2 x i32> [[TMP244]], i32 1 -; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[TMP245]] +; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB86]] +; CHECK-NEXT: [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB102]] ; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]] -; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP98]] +; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP76]] ; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]] -; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP99]] +; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[ADD46_2]] ; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]] -; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP184]] +; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP146]] +; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]] +; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP107]] ; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]] ; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]] -; CHECK-NEXT: [[TMP246:%.*]] = shufflevector <2 x i32> [[TMP228]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP247:%.*]] = insertelement <2 x i32> [[TMP246]], i32 [[SUB102]], i32 1 -; CHECK-NEXT: [[TMP248:%.*]] = add <2 x i32> [[TMP244]], [[TMP247]] -; CHECK-NEXT: [[TMP249:%.*]] = sub <2 x i32> [[TMP244]], [[TMP247]] -; CHECK-NEXT: [[TMP250:%.*]] = shufflevector <2 x i32> [[TMP248]], <2 x i32> [[TMP249]], <2 x i32> -; CHECK-NEXT: [[TMP251:%.*]] = add <2 x i32> [[TMP241]], [[TMP250]] -; CHECK-NEXT: [[TMP252:%.*]] = xor <2 x i32> [[TMP251]], [[TMP238]] -; CHECK-NEXT: [[TMP253:%.*]] = extractelement <2 x i32> [[TMP252]], i32 1 -; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[TMP253]] -; CHECK-NEXT: [[TMP254:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_1]], i32 0 -; CHECK-NEXT: [[TMP255:%.*]] = shufflevector <2 x i32> [[TMP254]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP256:%.*]] = add <2 x i32> [[TMP202]], [[TMP255]] -; CHECK-NEXT: [[TMP257:%.*]] = sub <2 x i32> [[TMP202]], [[TMP255]] -; CHECK-NEXT: [[TMP258:%.*]] = shufflevector <2 x i32> [[TMP256]], <2 x i32> [[TMP257]], <2 x i32> -; CHECK-NEXT: [[TMP259:%.*]] = add <2 x i32> [[TMP200]], [[TMP258]] -; CHECK-NEXT: [[TMP260:%.*]] = xor <2 x i32> [[TMP259]], [[TMP197]] -; CHECK-NEXT: [[TMP261:%.*]] = extractelement <2 x i32> [[TMP252]], i32 0 -; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[TMP261]], [[ADD113]] -; CHECK-NEXT: [[TMP262:%.*]] = extractelement <2 x i32> [[TMP260]], i32 0 -; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP262]] -; CHECK-NEXT: [[TMP263:%.*]] = extractelement <2 x i32> [[TMP260]], i32 1 -; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP263]] -; CHECK-NEXT: [[TMP264:%.*]] = shufflevector <2 x i32> [[TMP215]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP265:%.*]] = shufflevector <2 x i32> [[TMP264]], <2 x i32> [[TMP244]], <2 x i32> -; CHECK-NEXT: [[TMP266:%.*]] = add <2 x i32> [[TMP228]], [[TMP265]] -; CHECK-NEXT: [[TMP267:%.*]] = sub <2 x i32> [[TMP228]], [[TMP265]] -; CHECK-NEXT: [[TMP268:%.*]] = shufflevector <2 x i32> [[TMP266]], <2 x i32> [[TMP267]], <2 x i32> -; CHECK-NEXT: [[TMP269:%.*]] = add <2 x i32> [[TMP224]], [[TMP268]] -; CHECK-NEXT: [[TMP270:%.*]] = xor <2 x i32> [[TMP269]], [[TMP221]] -; CHECK-NEXT: [[TMP271:%.*]] = extractelement <2 x i32> [[TMP270]], i32 1 -; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[TMP271]] -; CHECK-NEXT: [[TMP272:%.*]] = shufflevector <2 x i32> , <2 x i32> [[TMP228]], <2 x i32> -; CHECK-NEXT: [[TMP273:%.*]] = mul <2 x i32> [[TMP215]], [[TMP272]] -; CHECK-NEXT: [[TMP274:%.*]] = sub <2 x i32> [[TMP215]], [[TMP272]] -; CHECK-NEXT: [[TMP275:%.*]] = shufflevector <2 x i32> [[TMP273]], <2 x i32> [[TMP274]], <2 x i32> -; CHECK-NEXT: [[TMP276:%.*]] = add <2 x i32> [[TMP192]], [[TMP220]] -; CHECK-NEXT: [[TMP277:%.*]] = xor <2 x i32> [[TMP276]], [[TMP148]] -; CHECK-NEXT: [[TMP278:%.*]] = extractelement <2 x i32> [[TMP275]], i32 0 -; CHECK-NEXT: [[TMP279:%.*]] = extractelement <2 x i32> [[TMP275]], i32 1 -; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[TMP278]], [[TMP279]] -; CHECK-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[ADD44]] -; CHECK-NEXT: [[TMP280:%.*]] = extractelement <2 x i32> [[TMP270]], i32 0 -; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[TMP280]], [[ADD113_1]] -; CHECK-NEXT: [[TMP281:%.*]] = extractelement <2 x i32> [[TMP277]], i32 0 -; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP281]] -; CHECK-NEXT: [[TMP282:%.*]] = extractelement <2 x i32> [[TMP277]], i32 1 -; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP282]] +; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]] +; CHECK-NEXT: [[TMP150:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP151:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB47_2]], i32 1 +; CHECK-NEXT: [[TMP152:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB45_2]], i32 1 +; CHECK-NEXT: [[TMP153:%.*]] = add <2 x i32> [[TMP151]], [[TMP152]] +; CHECK-NEXT: [[TMP154:%.*]] = shufflevector <2 x i32> [[TMP144]], <2 x i32> [[TMP73]], <2 x i32> +; CHECK-NEXT: [[TMP155:%.*]] = shufflevector <2 x i32> [[TMP144]], <2 x i32> [[TMP73]], <2 x i32> +; CHECK-NEXT: [[TMP156:%.*]] = add <2 x i32> [[TMP154]], [[TMP155]] +; CHECK-NEXT: [[TMP157:%.*]] = extractelement <2 x i32> [[TMP153]], i32 1 +; CHECK-NEXT: [[TMP158:%.*]] = extractelement <2 x i32> [[TMP156]], i32 1 +; CHECK-NEXT: [[TMP159:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP153]], <2 x i32> +; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[TMP158]], [[TMP157]] +; CHECK-NEXT: [[TMP160:%.*]] = extractelement <2 x i32> [[TMP153]], i32 0 +; CHECK-NEXT: [[TMP161:%.*]] = extractelement <2 x i32> [[TMP156]], i32 0 +; CHECK-NEXT: [[TMP162:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP153]], <2 x i32> +; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[TMP161]], [[TMP160]] +; CHECK-NEXT: [[TMP163:%.*]] = sub <2 x i32> [[TMP153]], [[TMP156]] +; CHECK-NEXT: [[TMP164:%.*]] = extractelement <2 x i32> [[TMP163]], i32 0 +; CHECK-NEXT: [[TMP165:%.*]] = extractelement <2 x i32> [[TMP163]], i32 1 +; CHECK-NEXT: [[ADD105_1:%.*]] = add i32 [[TMP165]], [[TMP164]] +; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 [[TMP164]], [[TMP165]] +; CHECK-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]] +; CHECK-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP77]] +; CHECK-NEXT: [[TMP166:%.*]] = shufflevector <2 x i32> [[TMP16]], <2 x i32> [[TMP144]], <2 x i32> +; CHECK-NEXT: [[TMP167:%.*]] = lshr <2 x i32> [[TMP166]], +; CHECK-NEXT: [[TMP168:%.*]] = and <2 x i32> [[TMP167]], +; CHECK-NEXT: [[TMP169:%.*]] = mul <2 x i32> [[TMP168]], +; CHECK-NEXT: [[TMP170:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_1]], i32 0 +; CHECK-NEXT: [[TMP171:%.*]] = shufflevector <2 x i32> [[TMP170]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP172:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_1]], i32 0 +; CHECK-NEXT: [[TMP173:%.*]] = shufflevector <2 x i32> [[TMP172]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP174:%.*]] = add <2 x i32> [[TMP171]], [[TMP173]] +; CHECK-NEXT: [[TMP175:%.*]] = sub <2 x i32> [[TMP171]], [[TMP173]] +; CHECK-NEXT: [[TMP176:%.*]] = shufflevector <2 x i32> [[TMP174]], <2 x i32> [[TMP175]], <2 x i32> +; CHECK-NEXT: [[TMP177:%.*]] = add <2 x i32> [[TMP169]], [[TMP176]] +; CHECK-NEXT: [[TMP178:%.*]] = xor <2 x i32> [[TMP177]], [[TMP166]] +; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]] +; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP108]] +; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]] +; CHECK-NEXT: [[TMP179:%.*]] = extractelement <2 x i32> [[TMP178]], i32 0 +; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP179]] +; CHECK-NEXT: [[TMP180:%.*]] = extractelement <2 x i32> [[TMP178]], i32 1 +; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP180]] +; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]] +; CHECK-NEXT: [[TMP181:%.*]] = shufflevector <2 x i32> [[TMP104]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP182:%.*]] = insertelement <2 x i32> [[TMP181]], i32 [[ADD44_2]], i32 0 +; CHECK-NEXT: [[TMP183:%.*]] = insertelement <2 x i32> [[TMP104]], i32 [[ADD46_2]], i32 0 +; CHECK-NEXT: [[TMP184:%.*]] = sub <2 x i32> [[TMP182]], [[TMP183]] +; CHECK-NEXT: [[TMP185:%.*]] = shufflevector <2 x i32> [[TMP72]], <2 x i32> [[TMP143]], <2 x i32> +; CHECK-NEXT: [[TMP186:%.*]] = shufflevector <2 x i32> [[TMP72]], <2 x i32> [[TMP143]], <2 x i32> +; CHECK-NEXT: [[TMP187:%.*]] = sub <2 x i32> [[TMP185]], [[TMP186]] +; CHECK-NEXT: [[TMP188:%.*]] = extractelement <2 x i32> [[TMP184]], i32 0 +; CHECK-NEXT: [[TMP189:%.*]] = extractelement <2 x i32> [[TMP187]], i32 0 +; CHECK-NEXT: [[TMP190:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> [[TMP184]], <2 x i32> +; CHECK-NEXT: [[ADD94_2:%.*]] = add i32 [[TMP189]], [[TMP188]] +; CHECK-NEXT: [[TMP191:%.*]] = extractelement <2 x i32> [[TMP184]], i32 1 +; CHECK-NEXT: [[TMP192:%.*]] = extractelement <2 x i32> [[TMP187]], i32 1 +; CHECK-NEXT: [[TMP193:%.*]] = shufflevector <2 x i32> [[TMP187]], <2 x i32> [[TMP184]], <2 x i32> +; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[TMP192]], [[TMP191]] +; CHECK-NEXT: [[TMP194:%.*]] = sub <2 x i32> [[TMP184]], [[TMP187]] +; CHECK-NEXT: [[TMP195:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0 +; CHECK-NEXT: [[TMP196:%.*]] = shufflevector <2 x i32> [[TMP195]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP197:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0 +; CHECK-NEXT: [[TMP198:%.*]] = shufflevector <2 x i32> [[TMP197]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP199:%.*]] = add <2 x i32> [[TMP196]], [[TMP198]] +; CHECK-NEXT: [[TMP200:%.*]] = sub <2 x i32> [[TMP196]], [[TMP198]] +; CHECK-NEXT: [[TMP201:%.*]] = shufflevector <2 x i32> [[TMP199]], <2 x i32> [[TMP200]], <2 x i32> +; CHECK-NEXT: [[TMP202:%.*]] = extractelement <2 x i32> [[TMP194]], i32 0 +; CHECK-NEXT: [[TMP203:%.*]] = extractelement <2 x i32> [[TMP194]], i32 1 +; CHECK-NEXT: [[ADD105_2:%.*]] = add i32 [[TMP202]], [[TMP203]] +; CHECK-NEXT: [[SUB106_2:%.*]] = sub i32 [[TMP203]], [[TMP202]] +; CHECK-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]] +; CHECK-NEXT: [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]] +; CHECK-NEXT: [[TMP204:%.*]] = add <2 x i32> [[TMP149]], [[TMP201]] +; CHECK-NEXT: [[TMP205:%.*]] = xor <2 x i32> [[TMP204]], [[TMP110]] +; CHECK-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[TMP106]], 15 +; CHECK-NEXT: [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537 +; CHECK-NEXT: [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535 +; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]] +; CHECK-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP106]] +; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]] +; CHECK-NEXT: [[TMP206:%.*]] = extractelement <2 x i32> [[TMP205]], i32 0 +; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP206]] +; CHECK-NEXT: [[TMP207:%.*]] = extractelement <2 x i32> [[TMP205]], i32 1 +; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP207]] ; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]] -; CHECK-NEXT: [[TMP283:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59]], i32 0 -; CHECK-NEXT: [[TMP284:%.*]] = shufflevector <2 x i32> [[TMP283]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP285:%.*]] = add <2 x i32> [[TMP284]], [[TMP194]] -; CHECK-NEXT: [[TMP286:%.*]] = sub <2 x i32> [[TMP284]], [[TMP194]] -; CHECK-NEXT: [[TMP287:%.*]] = shufflevector <2 x i32> [[TMP285]], <2 x i32> [[TMP286]], <2 x i32> -; CHECK-NEXT: [[TMP288:%.*]] = add <2 x i32> [[TMP110]], [[TMP287]] -; CHECK-NEXT: [[TMP289:%.*]] = sub <2 x i32> [[TMP287]], [[TMP110]] -; CHECK-NEXT: [[TMP290:%.*]] = add <2 x i32> [[TMP144]], [[TMP288]] -; CHECK-NEXT: [[TMP291:%.*]] = xor <2 x i32> [[TMP290]], [[TMP113]] -; CHECK-NEXT: [[TMP292:%.*]] = lshr <2 x i32> [[TMP103]], -; CHECK-NEXT: [[TMP293:%.*]] = and <2 x i32> [[TMP292]], -; CHECK-NEXT: [[TMP294:%.*]] = mul <2 x i32> [[TMP293]], -; CHECK-NEXT: [[TMP295:%.*]] = add <2 x i32> [[TMP294]], [[TMP289]] -; CHECK-NEXT: [[TMP296:%.*]] = xor <2 x i32> [[TMP295]], [[TMP103]] -; CHECK-NEXT: [[TMP297:%.*]] = extractelement <2 x i32> [[TMP291]], i32 1 -; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[TMP297]], [[ADD113_2]] -; CHECK-NEXT: [[TMP298:%.*]] = extractelement <2 x i32> [[TMP291]], i32 0 -; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP298]] -; CHECK-NEXT: [[TMP299:%.*]] = extractelement <2 x i32> [[TMP296]], i32 0 -; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP299]] -; CHECK-NEXT: [[TMP300:%.*]] = extractelement <2 x i32> [[TMP296]], i32 1 -; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[TMP300]] +; CHECK-NEXT: [[TMP208:%.*]] = insertelement <2 x i32> [[TMP150]], i32 [[SUB45_2]], i32 0 +; CHECK-NEXT: [[TMP209:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB47_2]], i32 0 +; CHECK-NEXT: [[TMP210:%.*]] = sub <2 x i32> [[TMP208]], [[TMP209]] +; CHECK-NEXT: [[TMP211:%.*]] = shufflevector <2 x i32> [[TMP73]], <2 x i32> [[TMP144]], <2 x i32> +; CHECK-NEXT: [[TMP212:%.*]] = shufflevector <2 x i32> [[TMP73]], <2 x i32> [[TMP144]], <2 x i32> +; CHECK-NEXT: [[TMP213:%.*]] = sub <2 x i32> [[TMP211]], [[TMP212]] +; CHECK-NEXT: [[TMP214:%.*]] = extractelement <2 x i32> [[TMP210]], i32 0 +; CHECK-NEXT: [[TMP215:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0 +; CHECK-NEXT: [[TMP216:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> [[TMP210]], <2 x i32> +; CHECK-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP215]], [[TMP214]] +; CHECK-NEXT: [[TMP217:%.*]] = extractelement <2 x i32> [[TMP210]], i32 1 +; CHECK-NEXT: [[TMP218:%.*]] = extractelement <2 x i32> [[TMP213]], i32 1 +; CHECK-NEXT: [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> [[TMP210]], <2 x i32> +; CHECK-NEXT: [[ADD78_3:%.*]] = add i32 [[TMP218]], [[TMP217]] +; CHECK-NEXT: [[TMP220:%.*]] = sub <2 x i32> [[TMP210]], [[TMP213]] +; CHECK-NEXT: [[TMP221:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0 +; CHECK-NEXT: [[TMP222:%.*]] = shufflevector <2 x i32> [[TMP221]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP223:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0 +; CHECK-NEXT: [[TMP224:%.*]] = shufflevector <2 x i32> [[TMP223]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP225:%.*]] = add <2 x i32> [[TMP222]], [[TMP224]] +; CHECK-NEXT: [[TMP226:%.*]] = sub <2 x i32> [[TMP222]], [[TMP224]] +; CHECK-NEXT: [[TMP227:%.*]] = shufflevector <2 x i32> [[TMP225]], <2 x i32> [[TMP226]], <2 x i32> +; CHECK-NEXT: [[TMP228:%.*]] = extractelement <2 x i32> [[TMP220]], i32 0 +; CHECK-NEXT: [[TMP229:%.*]] = extractelement <2 x i32> [[TMP220]], i32 1 +; CHECK-NEXT: [[ADD105_3:%.*]] = add i32 [[TMP228]], [[TMP229]] +; CHECK-NEXT: [[SUB106_3:%.*]] = sub i32 [[TMP229]], [[TMP228]] +; CHECK-NEXT: [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_3]], [[ADD105_3]] +; CHECK-NEXT: [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_3]], [[CONV]] +; CHECK-NEXT: [[TMP230:%.*]] = lshr <2 x i32> [[TMP79]], +; CHECK-NEXT: [[TMP231:%.*]] = and <2 x i32> [[TMP230]], +; CHECK-NEXT: [[TMP232:%.*]] = mul <2 x i32> [[TMP231]], +; CHECK-NEXT: [[TMP233:%.*]] = add <2 x i32> [[TMP232]], [[TMP227]] +; CHECK-NEXT: [[TMP234:%.*]] = xor <2 x i32> [[TMP233]], [[TMP79]] +; CHECK-NEXT: [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15 +; CHECK-NEXT: [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537 +; CHECK-NEXT: [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535 +; CHECK-NEXT: [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]] +; CHECK-NEXT: [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]] +; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]] +; CHECK-NEXT: [[TMP235:%.*]] = extractelement <2 x i32> [[TMP234]], i32 0 +; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP235]] +; CHECK-NEXT: [[TMP236:%.*]] = extractelement <2 x i32> [[TMP234]], i32 1 +; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP236]] +; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]] ; CHECK-NEXT: ret i32 [[ADD113_3]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll index abf1d7abdc122..5c261d69cd53e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -23,16 +23,16 @@ define void @Test(i32) { ; FORCE_REDUCTION-NEXT: entry: ; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]] ; FORCE_REDUCTION: loop: -; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP7:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP9:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] ; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> -; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP2]], i32 1 -; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = add <8 x i32> [[TMP2]], -; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP4]]) -; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP5]] -; FORCE_REDUCTION-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP3]] -; FORCE_REDUCTION-NEXT: [[VAL_43:%.*]] = add i32 [[TMP3]], 14910 -; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP7]] = insertelement <2 x i32> [[TMP6]], i32 [[VAL_43]], i32 1 +; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], +; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) +; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]] +; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> , i32 [[OP_RDX]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> +; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = and <2 x i32> [[TMP5]], [[TMP6]] +; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP5]], [[TMP6]] +; FORCE_REDUCTION-NEXT: [[TMP9]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], <2 x i32> ; FORCE_REDUCTION-NEXT: br label [[LOOP]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll index 10369e3aa270e..fd9528aa8df3a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll @@ -70,8 +70,10 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> ; THRESHOLD-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3 ; THRESHOLD-NEXT: [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0 ; THRESHOLD-NEXT: [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1 +; THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> ; THRESHOLD-NEXT: [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2 ; THRESHOLD-NEXT: [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3 +; THRESHOLD-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> ; THRESHOLD-NEXT: [[Q4:%.*]] = fadd float [[Q0]], [[Q1]] ; THRESHOLD-NEXT: [[Q5:%.*]] = fadd float [[Q2]], [[Q3]] ; THRESHOLD-NEXT: [[Q6:%.*]] = fadd float [[Q4]], [[Q5]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll index 9376bcd220a2c..18d5b09001762 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -104,8 +104,10 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32> ; THRESHOLD-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3 ; THRESHOLD-NEXT: [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0 ; THRESHOLD-NEXT: [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1 +; THRESHOLD-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> ; THRESHOLD-NEXT: [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2 ; THRESHOLD-NEXT: [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3 +; THRESHOLD-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> ; THRESHOLD-NEXT: [[Q4:%.*]] = fadd float [[Q0]], [[Q1]] ; THRESHOLD-NEXT: [[Q5:%.*]] = fadd float [[Q2]], [[Q3]] ; THRESHOLD-NEXT: [[Q6:%.*]] = fadd float [[Q4]], [[Q5]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll index ec90ca9bc674d..2cdbd5cff4468 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64 -S | FileCheck %s --check-prefixes=SSE2 ; RUN: opt < %s -passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64-v2 -S | FileCheck %s --check-prefixes=SSE42 -; RUN: opt < %s -passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64-v3 -S | FileCheck %s --check-prefixes=AVX -; RUN: opt < %s -passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64-v4 -S | FileCheck %s --check-prefixes=AVX +; RUN: opt < %s -passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64-v3 -S | FileCheck %s --check-prefixes=AVX2 +; RUN: opt < %s -passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64-v4 -S | FileCheck %s --check-prefixes=AVX512 ; PR51746 ; typedef int v4si __attribute__ ((vector_size (16))); @@ -18,33 +18,44 @@ define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) { ; SSE2-LABEL: @reduce_and4( ; SSE2-NEXT: entry: -; SSE2-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> -; SSE2-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]]) -; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> -; SSE2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]]) -; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]] -; SSE2-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] -; SSE2-NEXT: ret i32 [[OP_RDX1]] +; SSE2-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <16 x i32> +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> poison, <16 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> [[TMP1]], <16 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> poison, <16 x i32> +; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[TMP3]], <16 x i32> +; SSE2-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP4]]) +; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP5]], [[ACC:%.*]] +; SSE2-NEXT: ret i32 [[OP_RDX]] ; ; SSE42-LABEL: @reduce_and4( ; SSE42-NEXT: entry: -; SSE42-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> -; SSE42-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]]) -; SSE42-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> -; SSE42-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]]) -; SSE42-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]] -; SSE42-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] -; SSE42-NEXT: ret i32 [[OP_RDX1]] +; SSE42-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <16 x i32> +; SSE42-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> poison, <16 x i32> +; SSE42-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> [[TMP1]], <16 x i32> +; SSE42-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> poison, <16 x i32> +; SSE42-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[TMP3]], <16 x i32> +; SSE42-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP4]]) +; SSE42-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP5]], [[ACC:%.*]] +; SSE42-NEXT: ret i32 [[OP_RDX]] ; -; AVX-LABEL: @reduce_and4( -; AVX-NEXT: entry: -; AVX-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> -; AVX-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]]) -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> -; AVX-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]]) -; AVX-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]] -; AVX-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] -; AVX-NEXT: ret i32 [[OP_RDX1]] +; AVX2-LABEL: @reduce_and4( +; AVX2-NEXT: entry: +; AVX2-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> [[TMP1]], <16 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP2]]) +; AVX2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP3]], [[ACC:%.*]] +; AVX2-NEXT: ret i32 [[OP_RDX]] +; +; AVX512-LABEL: @reduce_and4( +; AVX512-NEXT: entry: +; AVX512-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; AVX512-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]]) +; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; AVX512-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]]) +; AVX512-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]] +; AVX512-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] +; AVX512-NEXT: ret i32 [[OP_RDX1]] ; entry: %vecext = extractelement <4 x i32> %v1, i64 0 @@ -92,31 +103,41 @@ entry: define i32 @reduce_and4_transpose(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) { ; SSE2-LABEL: @reduce_and4_transpose( -; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> -; SSE2-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) -; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) -; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]] -; SSE2-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] -; SSE2-NEXT: ret i32 [[OP_RDX1]] +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <16 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> poison, <16 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> +; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> poison, <16 x i32> +; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> [[TMP4]], <16 x i32> +; SSE2-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP5]]) +; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP6]], [[ACC:%.*]] +; SSE2-NEXT: ret i32 [[OP_RDX]] ; ; SSE42-LABEL: @reduce_and4_transpose( -; SSE42-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> -; SSE42-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) -; SSE42-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> -; SSE42-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) -; SSE42-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]] -; SSE42-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] -; SSE42-NEXT: ret i32 [[OP_RDX1]] +; SSE42-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <16 x i32> +; SSE42-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> poison, <16 x i32> +; SSE42-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> +; SSE42-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> poison, <16 x i32> +; SSE42-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> [[TMP4]], <16 x i32> +; SSE42-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP5]]) +; SSE42-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP6]], [[ACC:%.*]] +; SSE42-NEXT: ret i32 [[OP_RDX]] +; +; AVX2-LABEL: @reduce_and4_transpose( +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <16 x i32> +; AVX2-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP3]]) +; AVX2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP4]], [[ACC:%.*]] +; AVX2-NEXT: ret i32 [[OP_RDX]] ; -; AVX-LABEL: @reduce_and4_transpose( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> -; AVX-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> -; AVX-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) -; AVX-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]] -; AVX-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] -; AVX-NEXT: ret i32 [[OP_RDX1]] +; AVX512-LABEL: @reduce_and4_transpose( +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; AVX512-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) +; AVX512-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]] +; AVX512-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] +; AVX512-NEXT: ret i32 [[OP_RDX1]] ; %vecext = extractelement <4 x i32> %v1, i64 0 %vecext1 = extractelement <4 x i32> %v2, i64 0