diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 1fe39ca1a485b..c0b7298f78005 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -7089,6 +7089,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { }))) && !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) && !isSplat(Gathers)) { + InstructionCost BaseCost = R.getGatherCost(Gathers, !Root); SetVector VectorizedLoads; SmallVector VectorizedStarts; SmallVector ScatterVectorized; @@ -7170,14 +7171,46 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, LI->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo(), LI); + // Estimate GEP cost. + SmallVector PointerOps(VF); + for (auto [I, V] : enumerate(VL.slice(P, VF))) + PointerOps[I] = cast(V)->getPointerOperand(); + auto [ScalarGEPCost, VectorGEPCost] = + getGEPCosts(TTI, PointerOps, LI->getPointerOperand(), + Instruction::Load, CostKind, LI->getType(), LoadTy); + GatherCost += VectorGEPCost - ScalarGEPCost; } for (unsigned P : ScatterVectorized) { auto *LI0 = cast(VL[P]); - Align CommonAlignment = - computeCommonAlignment(VL.slice(P, VF)); + ArrayRef Slice = VL.slice(P, VF); + Align CommonAlignment = computeCommonAlignment(Slice); GatherCost += TTI.getGatherScatterOpCost( Instruction::Load, LoadTy, LI0->getPointerOperand(), /*VariableMask=*/false, CommonAlignment, CostKind, LI0); + // Estimate GEP cost. + SmallVector PointerOps(VF); + for (auto [I, V] : enumerate(Slice)) + PointerOps[I] = cast(V)->getPointerOperand(); + OrdersType Order; + if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE, + Order)) { + // TODO: improve checks if GEPs can be vectorized. + Value *Ptr0 = PointerOps.front(); + Type *ScalarTy = Ptr0->getType(); + auto *VecTy = FixedVectorType::get(ScalarTy, VF); + auto [ScalarGEPCost, VectorGEPCost] = + getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr, + CostKind, ScalarTy, VecTy); + GatherCost += VectorGEPCost - ScalarGEPCost; + if (!Order.empty()) { + SmallVector Mask; + inversePermutation(Order, Mask); + GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, + VecTy, Mask, CostKind); + } + } else { + GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true); + } } if (NeedInsertSubvectorAnalysis) { // Add the cost for the subvectors insert. @@ -7187,6 +7220,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } GatherCost -= ScalarsCost; } + GatherCost = std::min(BaseCost, GatherCost); } else if (!Root && isSplat(VL)) { // Found the broadcasting of the single scalar, calculate the cost as // the broadcast. diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll index ccc31193c7215..dc5fb91788634 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll @@ -5,99 +5,368 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-LABEL: define i32 @test( ; CHECK-SAME: ptr [[PIX1:%.*]], ptr [[PIX2:%.*]], i64 [[IDX_EXT:%.*]], i64 [[IDX_EXT63:%.*]], ptr [[ADD_PTR:%.*]], ptr [[ADD_PTR64:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[PIX1]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x ptr> [[TMP0]], <2 x ptr> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, <2 x ptr> [[TMP1]], <2 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x ptr> poison, ptr [[PIX2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x ptr> [[TMP3]], <2 x ptr> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, <2 x ptr> [[TMP4]], <2 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, <2 x ptr> [[TMP4]], <2 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, <2 x ptr> [[TMP1]], <2 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, <2 x ptr> [[TMP4]], <2 x i64> +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr i8, ptr [[PIX1]], i64 2 +; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i8, ptr [[PIX2]], i64 2 ; CHECK-NEXT: [[ADD_PTR3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR644:%.*]] = getelementptr i8, ptr [[PIX2]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ADD_PTR3]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[PIX1]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x ptr> [[TMP4]], ptr [[ADD_PTR3]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x ptr> [[TMP5]], <8 x ptr> poison, <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, <8 x ptr> [[TMP6]], <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> [[TMP7]], i32 1, <8 x i1> , <8 x i8> poison) -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x ptr> poison, ptr [[PIX2]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x ptr> [[TMP9]], ptr [[ADD_PTR644]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x ptr> [[TMP10]], <8 x ptr> poison, <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, <8 x ptr> [[TMP11]], <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> [[TMP12]], i32 1, <8 x i1> , <8 x i8> poison) +; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[ADD_PTR3]], align 1 +; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP9]] to i32 +; CHECK-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1 +; CHECK-NEXT: [[ARRAYIDX32_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3 +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1 +; CHECK-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP10]] to i32 ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4 -; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 -; CHECK-NEXT: [[TMP16:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 -; CHECK-NEXT: [[TMP17:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 +; CHECK-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR_1]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x ptr> [[TMP11]], ptr [[ARRAYIDX20_2]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP12]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP14:%.*]] = zext <2 x i8> [[TMP13]] to <2 x i32> +; CHECK-NEXT: [[ARRAYIDX22_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 2 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR64_1]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x ptr> [[TMP15]], ptr [[ARRAYIDX22_2]], i32 1 +; CHECK-NEXT: [[TMP17:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP16]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP18:%.*]] = zext <2 x i8> [[TMP17]] to <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = sub <2 x i32> [[TMP14]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x ptr> [[TMP12]], <2 x ptr> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, <2 x ptr> [[TMP20]], <2 x i64> +; CHECK-NEXT: [[TMP22:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP21]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP23:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x ptr> [[TMP16]], <2 x ptr> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, <2 x ptr> [[TMP24]], <2 x i64> +; CHECK-NEXT: [[TMP26:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP25]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP27:%.*]] = zext <2 x i8> [[TMP26]] to <2 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = sub <2 x i32> [[TMP23]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = shl <2 x i32> [[TMP28]], +; CHECK-NEXT: [[TMP30:%.*]] = add <2 x i32> [[TMP29]], [[TMP19]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i8, <2 x ptr> [[TMP20]], <2 x i64> +; CHECK-NEXT: [[TMP32:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP31]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP33:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i8, <2 x ptr> [[TMP24]], <2 x i64> +; CHECK-NEXT: [[TMP35:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP34]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP36:%.*]] = zext <2 x i8> [[TMP35]] to <2 x i32> +; CHECK-NEXT: [[TMP37:%.*]] = sub <2 x i32> [[TMP33]], [[TMP36]] +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i8, <2 x ptr> [[TMP20]], <2 x i64> +; CHECK-NEXT: [[TMP39:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP38]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP40:%.*]] = zext <2 x i8> [[TMP39]] to <2 x i32> +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr i8, <2 x ptr> [[TMP24]], <2 x i64> +; CHECK-NEXT: [[TMP42:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP41]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP43:%.*]] = zext <2 x i8> [[TMP42]] to <2 x i32> +; CHECK-NEXT: [[TMP44:%.*]] = sub <2 x i32> [[TMP40]], [[TMP43]] +; CHECK-NEXT: [[TMP45:%.*]] = shl <2 x i32> [[TMP44]], +; CHECK-NEXT: [[TMP46:%.*]] = add <2 x i32> [[TMP45]], [[TMP37]] +; CHECK-NEXT: [[TMP47:%.*]] = sub <2 x i32> [[TMP30]], [[TMP46]] +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP47]], i32 0 +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <2 x i32> [[TMP47]], i32 1 +; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[TMP48]], [[TMP49]] ; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4 -; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x ptr> , ptr [[ARRAYIDX3_3]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP18]], i32 1, <2 x i1> , <2 x i8> poison) -; CHECK-NEXT: [[TMP20:%.*]] = load i8, ptr null, align 1 -; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i8>, ptr null, align 1 -; CHECK-NEXT: [[TMP22:%.*]] = load <4 x i8>, ptr null, align 1 -; CHECK-NEXT: [[TMP23:%.*]] = load i8, ptr null, align 1 -; CHECK-NEXT: [[TMP24:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP21]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <16 x i8> [[TMP25]], <16 x i8> [[TMP26]], <16 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <16 x i8> [[TMP27]], <16 x i8> [[TMP28]], <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32> -; CHECK-NEXT: [[TMP32:%.*]] = zext <16 x i8> [[TMP31]] to <16 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <4 x i8> [[TMP22]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <16 x i8> [[TMP33]], <16 x i8> [[TMP34]], <16 x i32> -; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> -; CHECK-NEXT: [[TMP41:%.*]] = sub <16 x i32> [[TMP32]], [[TMP40]] -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x i8> poison, i8 [[TMP23]], i32 0 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <16 x i8> [[TMP42]], i8 [[TMP20]], i32 1 -; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <2 x i8> [[TMP19]], <2 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i8> [[TMP43]], <16 x i8> [[TMP44]], <16 x i32> -; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <16 x i8> [[TMP45]], <16 x i8> [[TMP46]], <16 x i32> -; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP49:%.*]] = shufflevector <16 x i8> [[TMP47]], <16 x i8> [[TMP48]], <16 x i32> -; CHECK-NEXT: [[TMP50:%.*]] = zext <16 x i8> [[TMP49]] to <16 x i32> -; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP50]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <4 x i8> [[TMP24]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <4 x i8> [[TMP17]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <16 x i8> [[TMP52]], <16 x i8> [[TMP53]], <16 x i32> -; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <8 x i8> [[TMP13]], <8 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i8> [[TMP54]], <16 x i8> [[TMP55]], <16 x i32> -; CHECK-NEXT: [[TMP57:%.*]] = zext <16 x i8> [[TMP56]] to <16 x i32> -; CHECK-NEXT: [[TMP58:%.*]] = sub <16 x i32> [[TMP51]], [[TMP57]] -; CHECK-NEXT: [[TMP59:%.*]] = shl <16 x i32> [[TMP58]], -; CHECK-NEXT: [[TMP60:%.*]] = add <16 x i32> [[TMP59]], [[TMP41]] -; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP62:%.*]] = add <16 x i32> [[TMP60]], [[TMP61]] -; CHECK-NEXT: [[TMP63:%.*]] = sub <16 x i32> [[TMP60]], [[TMP61]] -; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP62]], <16 x i32> [[TMP63]], <16 x i32> -; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP66:%.*]] = add <16 x i32> [[TMP64]], [[TMP65]] -; CHECK-NEXT: [[TMP67:%.*]] = sub <16 x i32> [[TMP64]], [[TMP65]] -; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> -; CHECK-NEXT: [[TMP69:%.*]] = shufflevector <16 x i32> [[TMP68]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP70:%.*]] = add <16 x i32> [[TMP68]], [[TMP69]] -; CHECK-NEXT: [[TMP71:%.*]] = sub <16 x i32> [[TMP68]], [[TMP69]] -; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <16 x i32> [[TMP70]], <16 x i32> [[TMP71]], <16 x i32> -; CHECK-NEXT: [[TMP73:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP74:%.*]] = add <16 x i32> [[TMP72]], [[TMP73]] -; CHECK-NEXT: [[TMP75:%.*]] = sub <16 x i32> [[TMP72]], [[TMP73]] -; CHECK-NEXT: [[TMP76:%.*]] = shufflevector <16 x i32> [[TMP74]], <16 x i32> [[TMP75]], <16 x i32> -; CHECK-NEXT: [[TMP77:%.*]] = shufflevector <16 x i32> [[TMP32]], <16 x i32> [[TMP64]], <16 x i32> -; CHECK-NEXT: [[TMP78:%.*]] = lshr <16 x i32> [[TMP77]], -; CHECK-NEXT: [[TMP79:%.*]] = and <16 x i32> [[TMP78]], -; CHECK-NEXT: [[TMP80:%.*]] = mul <16 x i32> [[TMP79]], -; CHECK-NEXT: [[TMP81:%.*]] = add <16 x i32> [[TMP80]], [[TMP76]] -; CHECK-NEXT: [[TMP82:%.*]] = xor <16 x i32> [[TMP81]], [[TMP77]] -; CHECK-NEXT: [[TMP83:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP82]]) -; CHECK-NEXT: ret i32 [[TMP83]] +; CHECK-NEXT: [[TMP50:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2 +; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2 +; CHECK-NEXT: [[TMP51:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <2 x ptr> , ptr [[ARRAYIDX20_3]], i32 1 +; CHECK-NEXT: [[TMP53:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP52]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP54:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32> +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <2 x ptr> , ptr [[ARRAYIDX22_3]], i32 1 +; CHECK-NEXT: [[TMP56:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP55]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32> +; CHECK-NEXT: [[TMP58:%.*]] = sub <2 x i32> [[TMP54]], [[TMP57]] +; CHECK-NEXT: [[TMP59:%.*]] = insertelement <2 x ptr> , ptr [[ARRAYIDX3_3]], i32 0 +; CHECK-NEXT: [[TMP60:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP59]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[TMP60]] to <2 x i32> +; CHECK-NEXT: [[TMP62:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> ), i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP63:%.*]] = zext <2 x i8> [[TMP62]] to <2 x i32> +; CHECK-NEXT: [[TMP64:%.*]] = sub <2 x i32> [[TMP61]], [[TMP63]] +; CHECK-NEXT: [[TMP65:%.*]] = shl <2 x i32> [[TMP64]], +; CHECK-NEXT: [[TMP66:%.*]] = add <2 x i32> [[TMP65]], [[TMP58]] +; CHECK-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> ), i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP68:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32> +; CHECK-NEXT: [[TMP69:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> ), i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP70:%.*]] = zext <2 x i8> [[TMP69]] to <2 x i32> +; CHECK-NEXT: [[TMP71:%.*]] = sub <2 x i32> [[TMP68]], [[TMP70]] +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <2 x i8> poison, i8 [[TMP50]], i32 0 +; CHECK-NEXT: [[TMP73:%.*]] = insertelement <2 x i8> [[TMP72]], i8 [[TMP51]], i32 1 +; CHECK-NEXT: [[TMP74:%.*]] = zext <2 x i8> [[TMP73]] to <2 x i32> +; CHECK-NEXT: [[TMP75:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i64> ), i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32> +; CHECK-NEXT: [[TMP77:%.*]] = sub <2 x i32> [[TMP74]], [[TMP76]] +; CHECK-NEXT: [[TMP78:%.*]] = shl <2 x i32> [[TMP77]], +; CHECK-NEXT: [[TMP79:%.*]] = add <2 x i32> [[TMP78]], [[TMP71]] +; CHECK-NEXT: [[TMP80:%.*]] = sub <2 x i32> [[TMP66]], [[TMP79]] +; CHECK-NEXT: [[TMP81:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP46]], <2 x i32> +; CHECK-NEXT: [[TMP82:%.*]] = shufflevector <2 x i32> [[TMP66]], <2 x i32> [[TMP30]], <2 x i32> +; CHECK-NEXT: [[TMP83:%.*]] = add <2 x i32> [[TMP81]], [[TMP82]] +; CHECK-NEXT: [[TMP84:%.*]] = shufflevector <2 x i32> [[TMP79]], <2 x i32> [[TMP46]], <2 x i32> +; CHECK-NEXT: [[TMP85:%.*]] = shufflevector <2 x i32> [[TMP66]], <2 x i32> [[TMP30]], <2 x i32> +; CHECK-NEXT: [[TMP86:%.*]] = add <2 x i32> [[TMP84]], [[TMP85]] +; CHECK-NEXT: [[TMP87:%.*]] = add <2 x i32> [[TMP86]], [[TMP83]] +; CHECK-NEXT: [[TMP88:%.*]] = sub <2 x i32> [[TMP83]], [[TMP86]] +; CHECK-NEXT: [[TMP89:%.*]] = extractelement <2 x i32> [[TMP80]], i32 0 +; CHECK-NEXT: [[TMP90:%.*]] = extractelement <2 x i32> [[TMP80]], i32 1 +; CHECK-NEXT: [[SUB59_3:%.*]] = sub i32 [[TMP89]], [[TMP90]] +; CHECK-NEXT: [[TMP91:%.*]] = extractelement <2 x i32> [[TMP87]], i32 0 +; CHECK-NEXT: [[TMP92:%.*]] = extractelement <2 x i32> [[TMP87]], i32 1 +; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[TMP91]], [[TMP92]] +; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[TMP92]], [[TMP91]] +; CHECK-NEXT: [[TMP93:%.*]] = extractelement <2 x i32> [[TMP54]], i32 0 +; CHECK-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP93]], 15 +; CHECK-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537 +; CHECK-NEXT: [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535 +; CHECK-NEXT: [[TMP94:%.*]] = extractelement <2 x i32> [[TMP86]], i32 1 +; CHECK-NEXT: [[SHR_I49:%.*]] = lshr i32 [[TMP94]], 15 +; CHECK-NEXT: [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537 +; CHECK-NEXT: [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535 +; CHECK-NEXT: [[TMP95:%.*]] = extractelement <2 x i32> [[TMP88]], i32 0 +; CHECK-NEXT: [[TMP96:%.*]] = extractelement <2 x i32> [[TMP88]], i32 1 +; CHECK-NEXT: [[ADD94_2:%.*]] = add i32 [[TMP95]], [[TMP96]] +; CHECK-NEXT: [[TMP97:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20]], align 1 +; CHECK-NEXT: [[TMP98:%.*]] = zext <2 x i8> [[TMP97]] to <2 x i32> +; CHECK-NEXT: [[TMP99:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_2]], i32 0 +; CHECK-NEXT: [[TMP100:%.*]] = shufflevector <2 x i32> [[TMP99]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP101:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_3]], i32 0 +; CHECK-NEXT: [[TMP102:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP103:%.*]] = add <2 x i32> [[TMP100]], [[TMP102]] +; CHECK-NEXT: [[TMP104:%.*]] = sub <2 x i32> [[TMP100]], [[TMP102]] +; CHECK-NEXT: [[TMP105:%.*]] = shufflevector <2 x i32> [[TMP103]], <2 x i32> [[TMP104]], <2 x i32> +; CHECK-NEXT: [[TMP106:%.*]] = load <2 x i8>, ptr [[PIX1]], align 1 +; CHECK-NEXT: [[TMP107:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32> +; CHECK-NEXT: [[TMP108:%.*]] = shufflevector <2 x i32> [[TMP107]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP109:%.*]] = insertelement <2 x ptr> [[TMP4]], ptr [[ARRAYIDX22]], i32 1 +; CHECK-NEXT: [[TMP110:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP109]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP110]] to <2 x i32> +; CHECK-NEXT: [[TMP112:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP2]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP113:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32> +; CHECK-NEXT: [[TMP114:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP5]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP115:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32> +; CHECK-NEXT: [[TMP116:%.*]] = sub <2 x i32> [[TMP113]], [[TMP115]] +; CHECK-NEXT: [[TMP117:%.*]] = shl <2 x i32> [[TMP116]], +; CHECK-NEXT: [[TMP118:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP6]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP119:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32> +; CHECK-NEXT: [[TMP120:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP7]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP121:%.*]] = zext <2 x i8> [[TMP120]] to <2 x i32> +; CHECK-NEXT: [[TMP122:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP8]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP123:%.*]] = zext <2 x i8> [[TMP122]] to <2 x i32> +; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP121]], [[TMP123]] +; CHECK-NEXT: [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], +; CHECK-NEXT: [[TMP126:%.*]] = shufflevector <2 x i32> [[TMP107]], <2 x i32> [[TMP98]], <2 x i32> +; CHECK-NEXT: [[TMP127:%.*]] = sub <2 x i32> [[TMP126]], [[TMP111]] +; CHECK-NEXT: [[TMP128:%.*]] = add <2 x i32> [[TMP117]], [[TMP127]] +; CHECK-NEXT: [[TMP129:%.*]] = shufflevector <2 x i32> [[TMP108]], <2 x i32> [[TMP98]], <2 x i32> +; CHECK-NEXT: [[TMP130:%.*]] = sub <2 x i32> [[TMP129]], [[TMP119]] +; CHECK-NEXT: [[TMP131:%.*]] = add <2 x i32> [[TMP125]], [[TMP130]] +; CHECK-NEXT: [[TMP132:%.*]] = extractelement <2 x i32> [[TMP128]], i32 1 +; CHECK-NEXT: [[TMP133:%.*]] = extractelement <2 x i32> [[TMP131]], i32 1 +; CHECK-NEXT: [[ADD46:%.*]] = add i32 [[TMP133]], [[TMP132]] +; CHECK-NEXT: [[TMP134:%.*]] = sub <2 x i32> [[TMP128]], [[TMP131]] +; CHECK-NEXT: [[TMP135:%.*]] = extractelement <2 x i32> [[TMP128]], i32 0 +; CHECK-NEXT: [[TMP136:%.*]] = extractelement <2 x i32> [[TMP131]], i32 0 +; CHECK-NEXT: [[ADD44:%.*]] = add i32 [[TMP136]], [[TMP135]] +; CHECK-NEXT: [[TMP137:%.*]] = lshr <2 x i32> [[TMP108]], +; CHECK-NEXT: [[TMP138:%.*]] = and <2 x i32> [[TMP137]], +; CHECK-NEXT: [[TMP139:%.*]] = mul <2 x i32> [[TMP138]], +; CHECK-NEXT: [[TMP140:%.*]] = extractelement <2 x i32> [[TMP134]], i32 0 +; CHECK-NEXT: [[TMP141:%.*]] = extractelement <2 x i32> [[TMP134]], i32 1 +; CHECK-NEXT: [[SUB59:%.*]] = sub i32 [[TMP140]], [[TMP141]] +; CHECK-NEXT: [[TMP142:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 +; CHECK-NEXT: [[TMP143:%.*]] = zext <2 x i8> [[TMP142]] to <2 x i32> +; CHECK-NEXT: [[ADD_PTR644:%.*]] = getelementptr i8, ptr [[PIX2]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 2 +; CHECK-NEXT: [[TMP144:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR644]], i32 0 +; CHECK-NEXT: [[TMP145:%.*]] = insertelement <2 x ptr> [[TMP144]], ptr [[ARRAYIDX22_1]], i32 1 +; CHECK-NEXT: [[TMP146:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP145]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP147:%.*]] = zext <2 x i8> [[TMP146]] to <2 x i32> +; CHECK-NEXT: [[TMP148:%.*]] = insertelement <2 x ptr> poison, ptr [[ADD_PTR3]], i32 0 +; CHECK-NEXT: [[TMP149:%.*]] = shufflevector <2 x ptr> [[TMP148]], <2 x ptr> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP150:%.*]] = getelementptr i8, <2 x ptr> [[TMP149]], <2 x i64> +; CHECK-NEXT: [[TMP151:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP150]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP152:%.*]] = zext <2 x i8> [[TMP151]] to <2 x i32> +; CHECK-NEXT: [[TMP153:%.*]] = shufflevector <2 x ptr> [[TMP145]], <2 x ptr> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP154:%.*]] = getelementptr i8, <2 x ptr> [[TMP153]], <2 x i64> +; CHECK-NEXT: [[TMP155:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP154]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP156:%.*]] = zext <2 x i8> [[TMP155]] to <2 x i32> +; CHECK-NEXT: [[TMP157:%.*]] = sub <2 x i32> [[TMP152]], [[TMP156]] +; CHECK-NEXT: [[TMP158:%.*]] = shl <2 x i32> [[TMP157]], +; CHECK-NEXT: [[TMP159:%.*]] = getelementptr i8, <2 x ptr> [[TMP153]], <2 x i64> +; CHECK-NEXT: [[TMP160:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP159]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP161:%.*]] = zext <2 x i8> [[TMP160]] to <2 x i32> +; CHECK-NEXT: [[TMP162:%.*]] = getelementptr i8, <2 x ptr> [[TMP149]], <2 x i64> +; CHECK-NEXT: [[TMP163:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP162]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP164:%.*]] = zext <2 x i8> [[TMP163]] to <2 x i32> +; CHECK-NEXT: [[TMP165:%.*]] = getelementptr i8, <2 x ptr> [[TMP153]], <2 x i64> +; CHECK-NEXT: [[TMP166:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> [[TMP165]], i32 1, <2 x i1> , <2 x i8> poison) +; CHECK-NEXT: [[TMP167:%.*]] = zext <2 x i8> [[TMP166]] to <2 x i32> +; CHECK-NEXT: [[TMP168:%.*]] = sub <2 x i32> [[TMP164]], [[TMP167]] +; CHECK-NEXT: [[TMP169:%.*]] = shl <2 x i32> [[TMP168]], +; CHECK-NEXT: [[TMP170:%.*]] = insertelement <2 x i32> [[TMP143]], i32 [[CONV33_1]], i32 1 +; CHECK-NEXT: [[TMP171:%.*]] = sub <2 x i32> [[TMP170]], [[TMP161]] +; CHECK-NEXT: [[TMP172:%.*]] = add <2 x i32> [[TMP169]], [[TMP171]] +; CHECK-NEXT: [[TMP173:%.*]] = insertelement <2 x i32> [[TMP143]], i32 [[CONV_1]], i32 0 +; CHECK-NEXT: [[TMP174:%.*]] = sub <2 x i32> [[TMP173]], [[TMP147]] +; CHECK-NEXT: [[TMP175:%.*]] = add <2 x i32> [[TMP158]], [[TMP174]] +; CHECK-NEXT: [[TMP176:%.*]] = add <2 x i32> [[TMP172]], [[TMP175]] +; CHECK-NEXT: [[TMP177:%.*]] = sub <2 x i32> [[TMP175]], [[TMP172]] +; CHECK-NEXT: [[TMP178:%.*]] = extractelement <2 x i32> [[TMP176]], i32 0 +; CHECK-NEXT: [[TMP179:%.*]] = extractelement <2 x i32> [[TMP176]], i32 1 +; CHECK-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP178]], [[TMP179]] +; CHECK-NEXT: [[TMP180:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> [[TMP134]], <2 x i32> +; CHECK-NEXT: [[TMP181:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> [[TMP134]], <2 x i32> +; CHECK-NEXT: [[TMP182:%.*]] = add <2 x i32> [[TMP180]], [[TMP181]] +; CHECK-NEXT: [[TMP183:%.*]] = extractelement <2 x i32> [[TMP177]], i32 0 +; CHECK-NEXT: [[TMP184:%.*]] = extractelement <2 x i32> [[TMP177]], i32 1 +; CHECK-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP183]], [[TMP184]] +; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP179]], 15 +; CHECK-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537 +; CHECK-NEXT: [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535 +; CHECK-NEXT: [[TMP185:%.*]] = lshr <2 x i32> [[TMP143]], +; CHECK-NEXT: [[TMP186:%.*]] = and <2 x i32> [[TMP185]], +; CHECK-NEXT: [[TMP187:%.*]] = mul <2 x i32> [[TMP186]], +; CHECK-NEXT: [[TMP188:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59_1]], i32 0 +; CHECK-NEXT: [[TMP189:%.*]] = shufflevector <2 x i32> [[TMP188]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP190:%.*]] = extractelement <2 x i32> [[TMP182]], i32 0 +; CHECK-NEXT: [[TMP191:%.*]] = extractelement <2 x i32> [[TMP182]], i32 1 +; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[TMP190]], [[TMP191]] +; CHECK-NEXT: [[TMP192:%.*]] = shufflevector <2 x i32> [[TMP33]], <2 x i32> [[TMP177]], <2 x i32> +; CHECK-NEXT: [[TMP193:%.*]] = lshr <2 x i32> [[TMP192]], +; CHECK-NEXT: [[TMP194:%.*]] = and <2 x i32> [[TMP193]], +; CHECK-NEXT: [[TMP195:%.*]] = mul <2 x i32> [[TMP194]], +; CHECK-NEXT: [[TMP196:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_1]], i32 0 +; CHECK-NEXT: [[TMP197:%.*]] = shufflevector <2 x i32> [[TMP196]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP198:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0 +; CHECK-NEXT: [[TMP199:%.*]] = shufflevector <2 x i32> [[TMP198]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP200:%.*]] = insertelement <2 x i32> poison, i32 [[ADD44]], i32 0 +; CHECK-NEXT: [[TMP201:%.*]] = shufflevector <2 x i32> [[TMP200]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP202:%.*]] = insertelement <2 x i32> , i32 [[ADD46]], i32 1 +; CHECK-NEXT: [[TMP203:%.*]] = lshr <2 x i32> [[TMP201]], [[TMP202]] +; CHECK-NEXT: [[TMP204:%.*]] = sub <2 x i32> [[TMP201]], [[TMP202]] +; CHECK-NEXT: [[TMP205:%.*]] = shufflevector <2 x i32> [[TMP203]], <2 x i32> [[TMP204]], <2 x i32> +; CHECK-NEXT: [[TMP206:%.*]] = extractelement <2 x i32> [[TMP205]], i32 1 +; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[SUB51_1]], [[TMP206]] +; CHECK-NEXT: [[TMP207:%.*]] = insertelement <2 x i32> , i32 [[SUB51_1]], i32 1 +; CHECK-NEXT: [[TMP208:%.*]] = and <2 x i32> [[TMP205]], [[TMP207]] +; CHECK-NEXT: [[TMP209:%.*]] = sub <2 x i32> [[TMP205]], [[TMP207]] +; CHECK-NEXT: [[TMP210:%.*]] = shufflevector <2 x i32> [[TMP208]], <2 x i32> [[TMP209]], <2 x i32> +; CHECK-NEXT: [[TMP211:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0 +; CHECK-NEXT: [[TMP212:%.*]] = shufflevector <2 x i32> [[TMP211]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP213:%.*]] = add <2 x i32> [[TMP212]], [[TMP199]] +; CHECK-NEXT: [[TMP214:%.*]] = sub <2 x i32> [[TMP212]], [[TMP199]] +; CHECK-NEXT: [[TMP215:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> [[TMP214]], <2 x i32> +; CHECK-NEXT: [[TMP216:%.*]] = insertelement <2 x i32> [[TMP134]], i32 [[CONV_1]], i32 0 +; CHECK-NEXT: [[TMP217:%.*]] = lshr <2 x i32> [[TMP216]], +; CHECK-NEXT: [[TMP218:%.*]] = and <2 x i32> [[TMP217]], +; CHECK-NEXT: [[TMP219:%.*]] = mul <2 x i32> [[TMP218]], +; CHECK-NEXT: [[TMP220:%.*]] = shufflevector <2 x i32> [[TMP88]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP221:%.*]] = shufflevector <2 x i32> [[TMP220]], <2 x i32> [[TMP182]], <2 x i32> +; CHECK-NEXT: [[TMP222:%.*]] = shufflevector <2 x i32> [[TMP88]], <2 x i32> [[TMP182]], <2 x i32> +; CHECK-NEXT: [[TMP223:%.*]] = sub <2 x i32> [[TMP221]], [[TMP222]] +; CHECK-NEXT: [[TMP224:%.*]] = shufflevector <2 x i32> [[TMP47]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP225:%.*]] = insertelement <2 x i32> [[TMP224]], i32 [[ADD46]], i32 1 +; CHECK-NEXT: [[TMP226:%.*]] = insertelement <2 x i32> [[TMP47]], i32 [[ADD44]], i32 1 +; CHECK-NEXT: [[TMP227:%.*]] = add <2 x i32> [[TMP225]], [[TMP226]] +; CHECK-NEXT: [[TMP228:%.*]] = shufflevector <2 x i32> [[TMP80]], <2 x i32> [[TMP176]], <2 x i32> +; CHECK-NEXT: [[TMP229:%.*]] = shufflevector <2 x i32> [[TMP80]], <2 x i32> [[TMP176]], <2 x i32> +; CHECK-NEXT: [[TMP230:%.*]] = add <2 x i32> [[TMP228]], [[TMP229]] +; CHECK-NEXT: [[TMP231:%.*]] = extractelement <2 x i32> [[TMP227]], i32 0 +; CHECK-NEXT: [[TMP232:%.*]] = extractelement <2 x i32> [[TMP230]], i32 0 +; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[TMP232]], [[TMP231]] +; CHECK-NEXT: [[TMP233:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[ADD46]], i32 1 +; CHECK-NEXT: [[TMP234:%.*]] = lshr <2 x i32> [[TMP233]], +; CHECK-NEXT: [[TMP235:%.*]] = and <2 x i32> [[TMP234]], +; CHECK-NEXT: [[TMP236:%.*]] = mul <2 x i32> [[TMP235]], +; CHECK-NEXT: [[TMP237:%.*]] = extractelement <2 x i32> [[TMP227]], i32 1 +; CHECK-NEXT: [[TMP238:%.*]] = extractelement <2 x i32> [[TMP230]], i32 1 +; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[TMP238]], [[TMP237]] +; CHECK-NEXT: [[TMP239:%.*]] = sub <2 x i32> [[TMP227]], [[TMP230]] +; CHECK-NEXT: [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]] +; CHECK-NEXT: [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]] +; CHECK-NEXT: [[TMP240:%.*]] = extractelement <2 x i32> [[TMP239]], i32 1 +; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[TMP240]] +; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]] +; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP93]] +; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]] +; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP94]] +; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]] +; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP179]] +; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]] +; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]] +; CHECK-NEXT: [[TMP241:%.*]] = shufflevector <2 x i32> [[TMP223]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP242:%.*]] = insertelement <2 x i32> [[TMP241]], i32 [[SUB102]], i32 1 +; CHECK-NEXT: [[TMP243:%.*]] = add <2 x i32> [[TMP239]], [[TMP242]] +; CHECK-NEXT: [[TMP244:%.*]] = sub <2 x i32> [[TMP239]], [[TMP242]] +; CHECK-NEXT: [[TMP245:%.*]] = shufflevector <2 x i32> [[TMP243]], <2 x i32> [[TMP244]], <2 x i32> +; CHECK-NEXT: [[TMP246:%.*]] = add <2 x i32> [[TMP236]], [[TMP245]] +; CHECK-NEXT: [[TMP247:%.*]] = xor <2 x i32> [[TMP246]], [[TMP233]] +; CHECK-NEXT: [[TMP248:%.*]] = extractelement <2 x i32> [[TMP247]], i32 1 +; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[TMP248]] +; CHECK-NEXT: [[TMP249:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_1]], i32 0 +; CHECK-NEXT: [[TMP250:%.*]] = shufflevector <2 x i32> [[TMP249]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP251:%.*]] = add <2 x i32> [[TMP197]], [[TMP250]] +; CHECK-NEXT: [[TMP252:%.*]] = sub <2 x i32> [[TMP197]], [[TMP250]] +; CHECK-NEXT: [[TMP253:%.*]] = shufflevector <2 x i32> [[TMP251]], <2 x i32> [[TMP252]], <2 x i32> +; CHECK-NEXT: [[TMP254:%.*]] = add <2 x i32> [[TMP195]], [[TMP253]] +; CHECK-NEXT: [[TMP255:%.*]] = xor <2 x i32> [[TMP254]], [[TMP192]] +; CHECK-NEXT: [[TMP256:%.*]] = extractelement <2 x i32> [[TMP247]], i32 0 +; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[TMP256]], [[ADD113]] +; CHECK-NEXT: [[TMP257:%.*]] = extractelement <2 x i32> [[TMP255]], i32 0 +; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP257]] +; CHECK-NEXT: [[TMP258:%.*]] = extractelement <2 x i32> [[TMP255]], i32 1 +; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP258]] +; CHECK-NEXT: [[TMP259:%.*]] = shufflevector <2 x i32> [[TMP210]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP260:%.*]] = shufflevector <2 x i32> [[TMP259]], <2 x i32> [[TMP239]], <2 x i32> +; CHECK-NEXT: [[TMP261:%.*]] = add <2 x i32> [[TMP223]], [[TMP260]] +; CHECK-NEXT: [[TMP262:%.*]] = sub <2 x i32> [[TMP223]], [[TMP260]] +; CHECK-NEXT: [[TMP263:%.*]] = shufflevector <2 x i32> [[TMP261]], <2 x i32> [[TMP262]], <2 x i32> +; CHECK-NEXT: [[TMP264:%.*]] = add <2 x i32> [[TMP219]], [[TMP263]] +; CHECK-NEXT: [[TMP265:%.*]] = xor <2 x i32> [[TMP264]], [[TMP216]] +; CHECK-NEXT: [[TMP266:%.*]] = extractelement <2 x i32> [[TMP265]], i32 1 +; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[TMP266]] +; CHECK-NEXT: [[TMP267:%.*]] = shufflevector <2 x i32> , <2 x i32> [[TMP223]], <2 x i32> +; CHECK-NEXT: [[TMP268:%.*]] = mul <2 x i32> [[TMP210]], [[TMP267]] +; CHECK-NEXT: [[TMP269:%.*]] = sub <2 x i32> [[TMP210]], [[TMP267]] +; CHECK-NEXT: [[TMP270:%.*]] = shufflevector <2 x i32> [[TMP268]], <2 x i32> [[TMP269]], <2 x i32> +; CHECK-NEXT: [[TMP271:%.*]] = add <2 x i32> [[TMP187]], [[TMP215]] +; CHECK-NEXT: [[TMP272:%.*]] = xor <2 x i32> [[TMP271]], [[TMP143]] +; CHECK-NEXT: [[TMP273:%.*]] = extractelement <2 x i32> [[TMP270]], i32 0 +; CHECK-NEXT: [[TMP274:%.*]] = extractelement <2 x i32> [[TMP270]], i32 1 +; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[TMP273]], [[TMP274]] +; CHECK-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[ADD44]] +; CHECK-NEXT: [[TMP275:%.*]] = extractelement <2 x i32> [[TMP265]], i32 0 +; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[TMP275]], [[ADD113_1]] +; CHECK-NEXT: [[TMP276:%.*]] = extractelement <2 x i32> [[TMP272]], i32 0 +; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP276]] +; CHECK-NEXT: [[TMP277:%.*]] = extractelement <2 x i32> [[TMP272]], i32 1 +; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP277]] +; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]] +; CHECK-NEXT: [[TMP278:%.*]] = insertelement <2 x i32> poison, i32 [[SUB59]], i32 0 +; CHECK-NEXT: [[TMP279:%.*]] = shufflevector <2 x i32> [[TMP278]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP280:%.*]] = add <2 x i32> [[TMP279]], [[TMP189]] +; CHECK-NEXT: [[TMP281:%.*]] = sub <2 x i32> [[TMP279]], [[TMP189]] +; CHECK-NEXT: [[TMP282:%.*]] = shufflevector <2 x i32> [[TMP280]], <2 x i32> [[TMP281]], <2 x i32> +; CHECK-NEXT: [[TMP283:%.*]] = add <2 x i32> [[TMP105]], [[TMP282]] +; CHECK-NEXT: [[TMP284:%.*]] = sub <2 x i32> [[TMP282]], [[TMP105]] +; CHECK-NEXT: [[TMP285:%.*]] = add <2 x i32> [[TMP139]], [[TMP283]] +; CHECK-NEXT: [[TMP286:%.*]] = xor <2 x i32> [[TMP285]], [[TMP108]] +; CHECK-NEXT: [[TMP287:%.*]] = lshr <2 x i32> [[TMP98]], +; CHECK-NEXT: [[TMP288:%.*]] = and <2 x i32> [[TMP287]], +; CHECK-NEXT: [[TMP289:%.*]] = mul <2 x i32> [[TMP288]], +; CHECK-NEXT: [[TMP290:%.*]] = add <2 x i32> [[TMP289]], [[TMP284]] +; CHECK-NEXT: [[TMP291:%.*]] = xor <2 x i32> [[TMP290]], [[TMP98]] +; CHECK-NEXT: [[TMP292:%.*]] = extractelement <2 x i32> [[TMP286]], i32 1 +; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[TMP292]], [[ADD113_2]] +; CHECK-NEXT: [[TMP293:%.*]] = extractelement <2 x i32> [[TMP286]], i32 0 +; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP293]] +; CHECK-NEXT: [[TMP294:%.*]] = extractelement <2 x i32> [[TMP291]], i32 0 +; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP294]] +; CHECK-NEXT: [[TMP295:%.*]] = extractelement <2 x i32> [[TMP291]], i32 1 +; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[TMP295]] +; CHECK-NEXT: ret i32 [[ADD113_3]] ; entry: %0 = load i8, ptr %pix1, align 1