diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e65a4f9eb1ec..0f71dbda120f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -16432,21 +16432,12 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
   bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
   if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
       X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
+    MVT MemVT = VT.getHalfNumVectorElementsVT();
+    unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
     auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
-    if (!Ld->isNonTemporal()) {
-      MVT MemVT = VT.getHalfNumVectorElementsVT();
-      unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
-      SDVTList Tys = DAG.getVTList(VT, MVT::Other);
-      SDValue Ptr = DAG.getMemBasePlusOffset(Ld->getBasePtr(),
-                                             TypeSize::Fixed(Ofs), DL);
-      SDValue Ops[] = {Ld->getChain(), Ptr};
-      SDValue BcastLd = DAG.getMemIntrinsicNode(
-          X86ISD::SUBV_BROADCAST_LOAD, DL, Tys, Ops, MemVT,
-          DAG.getMachineFunction().getMachineMemOperand(
-              Ld->getMemOperand(), Ofs, MemVT.getStoreSize()));
-      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), BcastLd.getValue(1));
-      return BcastLd;
-    }
+    if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
+                                           VT, MemVT, Ld, Ofs, DAG))
+      return BcstLd;
   }
 
   // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
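
The diff replaces an open-coded SUBV_BROADCAST_LOAD expansion with a call to a getBROADCAST_LOAD helper. A minimal sketch of what that helper plausibly does, reconstructed from the deleted inline code above; the in-tree helper in X86ISelLowering.cpp may differ in details (e.g. in which loads it rejects or how it updates the chain):

// Sketch of the getBROADCAST_LOAD helper, reconstructed from the deleted
// inline code; not necessarily the exact in-tree implementation.
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
                                 EVT MemVT, MemSDNode *Mem, unsigned Offset,
                                 SelectionDAG &DAG) {
  // The old inline code guarded on !Ld->isNonTemporal(); the helper instead
  // returns an empty SDValue so callers can fall through to other lowerings.
  if (Mem->isNonTemporal())
    return SDValue();

  // Build a memory-intrinsic node that loads MemVT at BasePtr + Offset and
  // broadcasts it to the full vector type VT.
  SDVTList Tys = DAG.getVTList(VT, MVT::Other);
  SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
                                         TypeSize::Fixed(Offset), DL);
  SDValue Ops[] = {Mem->getChain(), Ptr};
  SDValue BcstLd = DAG.getMemIntrinsicNode(
      Opcode, DL, Tys, Ops, MemVT,
      DAG.getMachineFunction().getMachineMemOperand(
          Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
  // Redirect users of the original load's chain to the broadcast load, as the
  // removed code did with ReplaceAllUsesOfValueWith.
  DAG.ReplaceAllUsesOfValueWith(SDValue(Mem, 1), BcstLd.getValue(1));
  return BcstLd;
}

Folding the non-temporal check into the helper is what lets the call site hoist the MemVT and Ofs computations out of the guard: on a non-temporal load the helper simply returns an empty SDValue and lowerV2X128Shuffle falls through to the VPERMQ/VPERMPD path below, matching the old control flow.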