diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index a0be6677d04446..db39fa44c0da7d 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -216,6 +216,8 @@ namespace {
     bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
     bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                  unsigned Depth);
+    bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
+                                       unsigned Depth);
     bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
     bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                     SDValue &Scale, SDValue &Index, SDValue &Disp,
@@ -2468,10 +2470,18 @@ bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
   return false;
 }
 
-/// Helper for selectVectorAddr. Handles things that can be folded into a
-/// gather scatter address. The index register and scale should have already
-/// been handled.
-bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
+bool X86DAGToDAGISel::matchVectorAddressRecursively(SDValue N,
+                                                    X86ISelAddressMode &AM,
+                                                    unsigned Depth) {
+  SDLoc dl(N);
+  LLVM_DEBUG({
+    dbgs() << "MatchVectorAddress: ";
+    AM.dump(CurDAG);
+  });
+  // Limit recursion.
+  if (Depth > 5)
+    return matchAddressBase(N, AM);
+
   // TODO: Support other operations.
   switch (N.getOpcode()) {
   case ISD::Constant: {
@@ -2484,11 +2494,40 @@ bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
     if (!matchWrapper(N, AM))
       return false;
     break;
+  case ISD::ADD: {
+    // Add an artificial use to this node so that we can keep track of
+    // it if it gets CSE'd with a different node.
+    HandleSDNode Handle(N);
+
+    X86ISelAddressMode Backup = AM;
+    if (!matchVectorAddressRecursively(N.getOperand(0), AM, Depth + 1) &&
+        !matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
+                                       Depth + 1))
+      return false;
+    AM = Backup;
+
+    // Try again after commuting the operands.
+    if (!matchVectorAddressRecursively(Handle.getValue().getOperand(1), AM,
+                                       Depth + 1) &&
+        !matchVectorAddressRecursively(Handle.getValue().getOperand(0), AM,
+                                       Depth + 1))
+      return false;
+    AM = Backup;
+
+    N = Handle.getValue();
+  }
   }
 
   return matchAddressBase(N, AM);
 }
 
+/// Helper for selectVectorAddr. Handles things that can be folded into a
+/// gather/scatter address. The index register and scale should have already
+/// been handled.
+bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
+  return matchVectorAddressRecursively(N, AM, 0);
+}
+
 bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
                                        SDValue IndexOp, SDValue ScaleOp,
                                        SDValue &Base, SDValue &Scale,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a82c200bcacfcd..0ae7bad2bceecb 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -50283,6 +50283,48 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+  // Try to move splat constant adders from the index operand to the base
+  // pointer operand. Taking care to multiply by the scale. We can only do
+  // this when index element type is the same as the pointer type.
+  // Otherwise we need to be sure the math doesn't wrap before the scale.
+  if (Index.getOpcode() == ISD::ADD &&
+      Index.getValueType().getVectorElementType() == PtrVT &&
+      isa<ConstantSDNode>(Scale)) {
+    uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
+    if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
+      BitVector UndefElts;
+      if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
+        // FIXME: Allow non-constant?
+        if (UndefElts.none()) {
+          // Apply the scale.
+          APInt Adder = C->getAPIntValue() * ScaleAmt;
+          // Add it to the existing base.
+          Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
+                             DAG.getConstant(Adder, DL, PtrVT));
+          Index = Index.getOperand(0);
+          return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
+        }
+      }
+
+      // It's also possible base is just a constant. In that case, just
+      // replace it with 0 and move the displacement into the index.
+      if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
+          isOneConstant(Scale)) {
+        SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
+        // Combine the constant build_vector and the constant base.
+        Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
+                            Index.getOperand(1), Splat);
+        // Add to the LHS of the original Index add.
+        Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
+                            Index.getOperand(0), Splat);
+        Base = DAG.getConstant(0, DL, Base.getValueType());
+        return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
+      }
+    }
+  }
+
   if (DCI.isBeforeLegalizeOps()) {
     unsigned IndexWidth = Index.getScalarValueSizeInBits();
 
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index c54fd4b4e736e8..12d545099d2162 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -476,10 +476,9 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
 ; KNL_64-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
 ; KNL_64-NEXT:    vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
 ; KNL_64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
-; KNL_64-NEXT:    vpaddq %zmm0, %zmm4, %zmm0
-; KNL_64-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
+; KNL_64-NEXT:    vpaddq %zmm0, %zmm4, %zmm1
 ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
+; KNL_64-NEXT:    vpgatherqd 72(,%zmm1), %ymm0 {%k1}
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test9:
@@ -491,12 +490,10 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
 ; KNL_32-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
 ; KNL_32-NEXT:    vpmulld %ymm3, %ymm0, %ymm0
 ; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; KNL_32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [68,68,68,68,68,68,68,68]
-; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; KNL_32-NEXT:    vpaddd %ymm0, %ymm2, %ymm1
 ; KNL_32-NEXT:    movw $255, %ax
 ; KNL_32-NEXT:    kmovw %eax, %k1
-; KNL_32-NEXT:    vpgatherdd (,%zmm1), %zmm0 {%k1}
+; KNL_32-NEXT:    vpgatherdd 68(,%zmm1), %zmm0 {%k1}
 ; KNL_32-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; KNL_32-NEXT:    retl
 ;
@@ -507,10 +504,9 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
 ; SKX_SMALL-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
 ; SKX_SMALL-NEXT:    vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
 ; SKX_SMALL-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
-; SKX_SMALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
-; SKX_SMALL-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
+; SKX_SMALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm1
 ; SKX_SMALL-NEXT:    kxnorw %k0, %k0, %k1
-; SKX_SMALL-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
+; SKX_SMALL-NEXT:    vpgatherqd 72(,%zmm1), %ymm0 {%k1}
 ; SKX_SMALL-NEXT:    retq
 ;
 ; SKX_LARGE-LABEL: test9:
@@ -522,11 +518,9 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
 ; SKX_LARGE-NEXT:    movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
 ; SKX_LARGE-NEXT:    vpmullq (%rax){1to8}, %zmm0, %zmm0
 ; SKX_LARGE-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
-; SKX_LARGE-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
-; SKX_LARGE-NEXT:    movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
-; SKX_LARGE-NEXT:    vpaddq (%rax){1to8}, %zmm0, %zmm1
+; SKX_LARGE-NEXT:    vpaddq %zmm0, %zmm2, %zmm1
 ; SKX_LARGE-NEXT:    kxnorw %k0, %k0, %k1
-; SKX_LARGE-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
+; SKX_LARGE-NEXT:    vpgatherqd 72(,%zmm1), %ymm0 {%k1}
 ; SKX_LARGE-NEXT:    retq
 ;
 ; SKX_32-LABEL: test9:
@@ -535,10 +529,9 @@ define <8 x i32> @test9(%struct.ST* %base, <8 x i64> %ind1, <8 x i32>%ind5) {
 ; SKX_32-NEXT:    vpmovqd %zmm0, %ymm0
 ; SKX_32-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
 ; SKX_32-NEXT:    vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
-; SKX_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; SKX_32-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm1
+; SKX_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
 ; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
-; SKX_32-NEXT:    vpgatherdd (,%ymm1), %ymm0 {%k1}
+; SKX_32-NEXT:    vpgatherdd 68(,%ymm1), %ymm0 {%k1}
 ; SKX_32-NEXT:    retl
 entry:
   %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
@@ -562,10 +555,9 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
 ; KNL_64-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
 ; KNL_64-NEXT:    vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
 ; KNL_64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
-; KNL_64-NEXT:    vpaddq %zmm0, %zmm4, %zmm0
-; KNL_64-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
+; KNL_64-NEXT:    vpaddq %zmm0, %zmm4, %zmm1
 ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
+; KNL_64-NEXT:    vpgatherqd 72(,%zmm1), %ymm0 {%k1}
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test10:
@@ -577,12 +569,10 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
 ; KNL_32-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820]
 ; KNL_32-NEXT:    vpmulld %ymm3, %ymm0, %ymm0
 ; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; KNL_32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [68,68,68,68,68,68,68,68]
-; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; KNL_32-NEXT:    vpaddd %ymm0, %ymm2, %ymm1
 ; KNL_32-NEXT:    movw $255, %ax
 ; KNL_32-NEXT:    kmovw %eax, %k1
-; KNL_32-NEXT:    vpgatherdd (,%zmm1), %zmm0 {%k1}
+; KNL_32-NEXT:    vpgatherdd 68(,%zmm1), %zmm0 {%k1}
 ; KNL_32-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
 ; KNL_32-NEXT:    retl
 ;
@@ -593,10 +583,9 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
 ; SKX_SMALL-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
 ; SKX_SMALL-NEXT:    vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
 ; SKX_SMALL-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
-; SKX_SMALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
-; SKX_SMALL-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
+; SKX_SMALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm1
 ; SKX_SMALL-NEXT:    kxnorw %k0, %k0, %k1
-; SKX_SMALL-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
+; SKX_SMALL-NEXT:    vpgatherqd 72(,%zmm1), %ymm0 {%k1}
 ; SKX_SMALL-NEXT:    retq
 ;
 ; SKX_LARGE-LABEL: test10:
@@ -608,11 +597,9 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
 ; SKX_LARGE-NEXT:    movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
 ; SKX_LARGE-NEXT:    vpmullq (%rax){1to8}, %zmm0, %zmm0
 ; SKX_LARGE-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
-; SKX_LARGE-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
-; SKX_LARGE-NEXT:    movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
-; SKX_LARGE-NEXT:    vpaddq (%rax){1to8}, %zmm0, %zmm1
+; SKX_LARGE-NEXT:    vpaddq %zmm0, %zmm2, %zmm1
 ; SKX_LARGE-NEXT:    kxnorw %k0, %k0, %k1
-; SKX_LARGE-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
+; SKX_LARGE-NEXT:    vpgatherqd 72(,%zmm1), %ymm0 {%k1}
 ; SKX_LARGE-NEXT:    retq
 ;
 ; SKX_32-LABEL: test10:
@@ -621,10 +608,9 @@ define <8 x i32> @test10(%struct.ST* %base, <8 x i64> %i1, <8 x i32>%ind5) {
 ; SKX_32-NEXT:    vpmovqd %zmm0, %ymm0
 ; SKX_32-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
 ; SKX_32-NEXT:    vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
-; SKX_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; SKX_32-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm1
+; SKX_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
 ; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
-; SKX_32-NEXT:    vpgatherdd (,%ymm1), %ymm0 {%k1}
+; SKX_32-NEXT:    vpgatherdd 68(,%ymm1), %ymm0 {%k1}
 ; SKX_32-NEXT:    retl
 entry:
   %broadcast.splatinsert = insertelement <8 x %struct.ST*> undef, %struct.ST* %base, i32 0
@@ -5125,39 +5111,30 @@ declare void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float>, <8 x float*>, i32 i
 define <8 x i64> @pr45906(<8 x %struct.foo*> %ptr) {
 ; KNL_64-LABEL: pr45906:
 ; KNL_64:       # %bb.0: # %bb
-; KNL_64-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
 ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_64-NEXT:    vpgatherqq (,%zmm1), %zmm0 {%k1}
+; KNL_64-NEXT:    vpgatherqq 8(,%zmm0), %zmm1 {%k1}
+; KNL_64-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: pr45906:
 ; KNL_32:       # %bb.0: # %bb
-; KNL_32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4]
-; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
 ; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_32-NEXT:    vpgatherdq (,%ymm1), %zmm0 {%k1}
+; KNL_32-NEXT:    vpgatherdq 4(,%ymm0), %zmm1 {%k1}
+; KNL_32-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; KNL_32-NEXT:    retl
 ;
-; SKX_SMALL-LABEL: pr45906:
-; SKX_SMALL:       # %bb.0: # %bb
-; SKX_SMALL-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm1
-; SKX_SMALL-NEXT:    kxnorw %k0, %k0, %k1
-; SKX_SMALL-NEXT:    vpgatherqq (,%zmm1), %zmm0 {%k1}
-; SKX_SMALL-NEXT:    retq
-;
-; SKX_LARGE-LABEL: pr45906:
-; SKX_LARGE:       # %bb.0: # %bb
-; SKX_LARGE-NEXT:    movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
-; SKX_LARGE-NEXT:    vpaddq (%rax){1to8}, %zmm0, %zmm1
-; SKX_LARGE-NEXT:    kxnorw %k0, %k0, %k1
-; SKX_LARGE-NEXT:    vpgatherqq (,%zmm1), %zmm0 {%k1}
-; SKX_LARGE-NEXT:    retq
+; SKX-LABEL: pr45906:
+; SKX:       # %bb.0: # %bb
+; SKX-NEXT:    kxnorw %k0, %k0, %k1
+; SKX-NEXT:    vpgatherqq 8(,%zmm0), %zmm1 {%k1}
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: pr45906:
 ; SKX_32:       # %bb.0: # %bb
-; SKX_32-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm1
 ; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
-; SKX_32-NEXT:    vpgatherdq (,%ymm1), %zmm0 {%k1}
+; SKX_32-NEXT:    vpgatherdq 4(,%ymm0), %zmm1 {%k1}
+; SKX_32-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; SKX_32-NEXT:    retl
 bb:
   %tmp = getelementptr inbounds %struct.foo, <8 x %struct.foo*> %ptr, i64 0, i32 1
@@ -5165,3 +5142,69 @@ bb:
   ret <8 x i64> %tmp1
 }
 declare <8 x i64> @llvm.masked.gather.v8i64.v8p0i64(<8 x i64*>, i32, <8 x i1>, <8 x i64>)
+
+%struct.ST2 = type { i32, i32 }
+
+; Make sure we don't use a displacement on the gather. The constant from the
+; struct offset should be folded into the constant pool load for the vector
+; add.
+define <8 x i32> @test_const_fold(%struct.ST2* %base, <8 x i64> %i1) {
+; KNL_64-LABEL: test_const_fold:
+; KNL_64:       # %bb.0: # %entry
+; KNL_64-NEXT:    vpsllq $3, %zmm0, %zmm0
+; KNL_64-NEXT:    vpbroadcastq %rdi, %zmm1
+; KNL_64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; KNL_64-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
+; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
+; KNL_64-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
+; KNL_64-NEXT:    retq
+;
+; KNL_32-LABEL: test_const_fold:
+; KNL_32:       # %bb.0: # %entry
+; KNL_32-NEXT:    vpmovqd %zmm0, %ymm0
+; KNL_32-NEXT:    vpslld $3, %ymm0, %ymm0
+; KNL_32-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %ymm1
+; KNL_32-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
+; KNL_32-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1
+; KNL_32-NEXT:    movw $255, %ax
+; KNL_32-NEXT:    kmovw %eax, %k1
+; KNL_32-NEXT:    vpgatherdd (,%zmm1), %zmm0 {%k1}
+; KNL_32-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; KNL_32-NEXT:    retl
+;
+; SKX_SMALL-LABEL: test_const_fold:
+; SKX_SMALL:       # %bb.0: # %entry
+; SKX_SMALL-NEXT:    vpsllq $3, %zmm0, %zmm0
+; SKX_SMALL-NEXT:    vpbroadcastq %rdi, %zmm1
+; SKX_SMALL-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; SKX_SMALL-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
+; SKX_SMALL-NEXT:    kxnorw %k0, %k0, %k1
+; SKX_SMALL-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
+; SKX_SMALL-NEXT:    retq
+;
+; SKX_LARGE-LABEL: test_const_fold:
+; SKX_LARGE:       # %bb.0: # %entry
+; SKX_LARGE-NEXT:    vpsllq $3, %zmm0, %zmm0
+; SKX_LARGE-NEXT:    vpbroadcastq %rdi, %zmm1
+; SKX_LARGE-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; SKX_LARGE-NEXT:    movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
+; SKX_LARGE-NEXT:    vpaddq (%rax), %zmm0, %zmm1
+; SKX_LARGE-NEXT:    kxnorw %k0, %k0, %k1
+; SKX_LARGE-NEXT:    vpgatherqd (,%zmm1), %ymm0 {%k1}
+; SKX_LARGE-NEXT:    retq
+;
+; SKX_32-LABEL: test_const_fold:
+; SKX_32:       # %bb.0: # %entry
+; SKX_32-NEXT:    vpmovqd %zmm0, %ymm0
+; SKX_32-NEXT:    vpslld $3, %ymm0, %ymm0
+; SKX_32-NEXT:    vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
+; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
+; SKX_32-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1
+; SKX_32-NEXT:    vpgatherdd (,%ymm1), %ymm0 {%k1}
+; SKX_32-NEXT:    retl
+entry:
+  %add = add <8 x i64> %i1,
+  %arrayidx = getelementptr %struct.ST2, %struct.ST2* %base, <8 x i64> %add, i32 1
+  %res = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+  ret <8 x i32> %res
+}
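
Illustrative note (not part of the patch): the shape the new combine and the recursive ADD matching target is a gather whose vector index is `add(index, splat(C))` with the index element type equal to the pointer width. The splat adder can then be moved onto the scalar base, scaled by the element size, and isel folds the resulting constant addition into the instruction's displacement, as in the `72(,%zmm1)` / `68(,%zmm1)` operands in the updated test9/test10 checks. A minimal stand-alone IR sketch of that pattern follows; the struct and function names are hypothetical and the comment describes the expected, not guaranteed, codegen:

%struct.pair = type { i32, i32 }

define <8 x i32> @splat_adder_sketch(%struct.pair* %base, <8 x i64> %idx) {
entry:
  ; The index is %idx + splat(3); with this patch the splat 3 is expected to
  ; end up in the scalar part of the gather address (3 * 8 bytes of struct
  ; stride plus 4 bytes for field 1) instead of a vector add feeding the index.
  %add = add <8 x i64> %idx, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
  %gep = getelementptr %struct.pair, %struct.pair* %base, <8 x i64> %add, i32 1
  %res = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %gep, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  ret <8 x i32> %res
}

declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>, i32, <8 x i1>, <8 x i32>)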