diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index dce423fc1b18b..49a64e3d66f41 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1531,7 +1531,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { if (Opcode == Instruction::Store) LA = getTLI()->getTruncStoreAction(LT.second, MemVT); else - LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT); + LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT, + AddressSpace); if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) { // This is a vector load/store for some illegal type that is scalarized. diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 2ba8b29e775e0..2f9f6f8cec3c8 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1479,27 +1479,38 @@ class LLVM_ABI TargetLoweringBase { /// Return how this load with extension should be treated: either it is legal, /// needs to be promoted to a larger size, needs to be expanded to some other /// code sequence, or the target has a custom expander for it. 
- LegalizeAction getLoadExtAction(unsigned ExtType, EVT ValVT, - EVT MemVT) const { + LegalizeAction getLoadExtAction(unsigned ExtType, EVT ValVT, EVT MemVT, + unsigned AddrSpace = 0) const { if (ValVT.isExtended() || MemVT.isExtended()) return Expand; unsigned ValI = (unsigned) ValVT.getSimpleVT().SimpleTy; unsigned MemI = (unsigned) MemVT.getSimpleVT().SimpleTy; assert(ExtType < ISD::LAST_LOADEXT_TYPE && ValI < MVT::VALUETYPE_SIZE && MemI < MVT::VALUETYPE_SIZE && "Table isn't big enough!"); unsigned Shift = 4 * ExtType; - return (LegalizeAction)((LoadExtActions[ValI][MemI] >> Shift) & 0xf); + // This getter sits on a hot legality-query path (DAGCombiner and the + // legalizers call it constantly), so resolve the per-address-space + // table with a single map traversal instead of count()-then-at(), + // falling back to the default (addrspace 0) table when absent. + auto It = LoadExtActions.find(AddrSpace); + if (It == LoadExtActions.end()) { + assert(AddrSpace != 0 && "addrspace zero should be initialized"); + It = LoadExtActions.find(0); + } + return (LegalizeAction)((It->second[ValI][MemI] >> Shift) & 0xf); } /// Return true if the specified load with extension is legal on this target. - bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const { - return getLoadExtAction(ExtType, ValVT, MemVT) == Legal; + bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT, + unsigned AddrSpace = 0) const { + return getLoadExtAction(ExtType, ValVT, MemVT, AddrSpace) == Legal; } /// Return true if the specified load with extension is legal or custom /// on this target. - bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const { - return getLoadExtAction(ExtType, ValVT, MemVT) == Legal || - getLoadExtAction(ExtType, ValVT, MemVT) == Custom; + bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT, + unsigned AddrSpace = 0) const { + return getLoadExtAction(ExtType, ValVT, MemVT, AddrSpace) == Legal || + getLoadExtAction(ExtType, ValVT, MemVT, AddrSpace) == Custom; } /// Same as getLoadExtAction, but for atomic loads. 
@@ -2641,23 +2652,27 @@ class LLVM_ABI TargetLoweringBase { /// Indicate that the specified load with extension does not work with the /// specified type and indicate what to do about it. void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, - LegalizeAction Action) { + LegalizeAction Action, unsigned AddrSpace = 0) { assert(ExtType < ISD::LAST_LOADEXT_TYPE && ValVT.isValid() && MemVT.isValid() && "Table isn't big enough!"); assert((unsigned)Action < 0x10 && "too many bits for bitfield array"); + if (AddrSpace && !LoadExtActions.count(AddrSpace)) LoadExtActions[AddrSpace] = LoadExtActions.at(0); // Seed a new per-AS table from the default (AS 0) table so combinations not explicitly overridden keep their defaults instead of reading as all-Legal. unsigned Shift = 4 * ExtType; - LoadExtActions[ValVT.SimpleTy][MemVT.SimpleTy] &= ~((uint16_t)0xF << Shift); - LoadExtActions[ValVT.SimpleTy][MemVT.SimpleTy] |= (uint16_t)Action << Shift; + LoadExtActions[AddrSpace][ValVT.SimpleTy][MemVT.SimpleTy] &= + ~((uint16_t)0xF << Shift); + LoadExtActions[AddrSpace][ValVT.SimpleTy][MemVT.SimpleTy] |= + (uint16_t)Action << Shift; } void setLoadExtAction(ArrayRef ExtTypes, MVT ValVT, MVT MemVT, - LegalizeAction Action) { + LegalizeAction Action, unsigned AddrSpace = 0) { for (auto ExtType : ExtTypes) - setLoadExtAction(ExtType, ValVT, MemVT, Action); + setLoadExtAction(ExtType, ValVT, MemVT, Action, AddrSpace); } void setLoadExtAction(ArrayRef ExtTypes, MVT ValVT, - ArrayRef MemVTs, LegalizeAction Action) { + ArrayRef MemVTs, LegalizeAction Action, + unsigned AddrSpace = 0) { for (auto MemVT : MemVTs) - setLoadExtAction(ExtTypes, ValVT, MemVT, Action); + setLoadExtAction(ExtTypes, ValVT, MemVT, Action, AddrSpace); } /// Let target indicate that an extending atomic load of the specified type @@ -3133,7 +3148,7 @@ class LLVM_ABI TargetLoweringBase { LType = ISD::SEXTLOAD; } - return isLoadExtLegal(LType, VT, LoadVT); + return isLoadExtLegal(LType, VT, LoadVT, Load->getPointerAddressSpace()); } /// Return true if any actual instruction that defines a value of type FromTy @@ -3748,8 +3763,12 @@ class LLVM_ABI TargetLoweringBase { /// For each load extension type and each value 
type, keep a LegalizeAction /// that indicates how instruction selection should deal with a load of a /// specific value type and extension type. Uses 4-bits to store the action - /// for each of the 4 load ext types. - uint16_t LoadExtActions[MVT::VALUETYPE_SIZE][MVT::VALUETYPE_SIZE]; + /// for each of the 4 load ext types. These actions can be specified for each + /// address space. + using LoadExtActionMapTy = + std::array, MVT::VALUETYPE_SIZE>; + using LoadExtActionMap = std::map; + LoadExtActionMap LoadExtActions; /// Similar to LoadExtActions, but for atomic loads. Only Legal or Expand /// (default) values are supported. diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 9db4c9e5e2807..e6d6c5f2aa738 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -7328,7 +7328,8 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) { // Reject cases that won't be matched as extloads. if (!LoadResultVT.bitsGT(TruncVT) || !TruncVT.isRound() || - !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT)) + !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT, + Load->getPointerAddressSpace())) return false; IRBuilder<> Builder(Load->getNextNode()); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index d130efe96b56b..a31c2ea5ca905 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -6834,7 +6834,8 @@ bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN, if (ExtVT == LoadedVT && (!LegalOperations || - TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))) { + TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT, + LoadN->getAddressSpace()))) { // ZEXTLOAD will match without needing to change the size of the value being // loaded. 
return true; @@ -6850,7 +6851,8 @@ bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN, return false; if (LegalOperations && - !TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT)) + !TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT, + LoadN->getAddressSpace())) return false; if (!TLI.shouldReduceLoadWidth(LoadN, ISD::ZEXTLOAD, ExtVT, /*ByteOffset=*/0)) @@ -6913,7 +6915,8 @@ bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST, return false; if (LegalOperations && - !TLI.isLoadExtLegal(ExtType, Load->getValueType(0), MemVT)) + !TLI.isLoadExtLegal(ExtType, Load->getValueType(0), MemVT, + Load->getAddressSpace())) return false; // For the transform to be legal, the load must produce only two values @@ -7425,7 +7428,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (MLoad && MLoad->getExtensionType() == ISD::EXTLOAD && Splat) { EVT LoadVT = MLoad->getMemoryVT(); EVT ExtVT = VT; - if (TLI.isLoadExtLegal(ISD::ZEXTLOAD, ExtVT, LoadVT)) { + if (TLI.isLoadExtLegal(ISD::ZEXTLOAD, ExtVT, LoadVT, + MLoad->getAddressSpace())) { // For this AND to be a zero extension of the masked load the elements // of the BuildVec must mask the bottom bits of the extended element // type @@ -7576,9 +7580,9 @@ SDValue DAGCombiner::visitAND(SDNode *N) { // If we want to change an EXTLOAD to a ZEXTLOAD, ensure a ZEXTLOAD is // actually legal and isn't going to get expanded, else this is a false // optimisation. - bool CanZextLoadProfitably = TLI.isLoadExtLegal(ISD::ZEXTLOAD, - Load->getValueType(0), - Load->getMemoryVT()); + bool CanZextLoadProfitably = + TLI.isLoadExtLegal(ISD::ZEXTLOAD, Load->getValueType(0), + Load->getMemoryVT(), Load->getAddressSpace()); // Resize the constant to the same size as the original memory access before // extension. 
If it is still the AllOnesValue then this AND is completely @@ -7770,7 +7774,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) { APInt ExtBits = APInt::getHighBitsSet(ExtBitSize, ExtBitSize - MemBitSize); if (DAG.MaskedValueIsZero(N1, ExtBits) && ((!LegalOperations && LN0->isSimple()) || - TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) { + TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT, + LN0->getAddressSpace()))) { SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); @@ -9692,10 +9697,13 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { // Before legalize we can introduce too wide illegal loads which will be later // split into legal sized loads. This enables us to combine i64 load by i8 // patterns to a couple of i32 loads on 32 bit targets. - if (LegalOperations && - !TLI.isLoadExtLegal(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, VT, - MemVT)) - return SDValue(); + if (LegalOperations) { + for (auto L : Loads) { + if (!TLI.isLoadExtLegal(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, VT, + MemVT, L->getAddressSpace())) + return SDValue(); + } + } // Check if the bytes of the OR we are looking at match with either big or // little endian value load @@ -13307,7 +13315,9 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { auto LoadExtOpcode = IsSigned ? ISD::SEXTLOAD : ISD::ZEXTLOAD; if (LHS.getOpcode() == ISD::LOAD && LHS.hasOneUse() && SetCCWidth != 1 && SetCCWidth < WideWidth && - TLI.isLoadExtLegalOrCustom(LoadExtOpcode, WideVT, NarrowVT) && + TLI.isLoadExtLegalOrCustom( + LoadExtOpcode, WideVT, NarrowVT, + cast(LHS)->getAddressSpace()) && TLI.isOperationLegalOrCustom(ISD::SETCC, WideVT)) { // Both compare operands can be widened for free. The LHS can use an // extended load, and the RHS is a constant: @@ -13754,8 +13764,10 @@ static SDValue tryToFoldExtendSelectLoad(SDNode *N, const TargetLowering &TLI, // Combine2), so we should conservatively check the OperationAction. 
LoadSDNode *Load1 = cast(Op1); LoadSDNode *Load2 = cast(Op2); - if (!TLI.isLoadExtLegal(ExtLoadOpcode, VT, Load1->getMemoryVT()) || - !TLI.isLoadExtLegal(ExtLoadOpcode, VT, Load2->getMemoryVT()) || + if (!TLI.isLoadExtLegal(ExtLoadOpcode, VT, Load1->getMemoryVT(), + Load1->getAddressSpace()) || + !TLI.isLoadExtLegal(ExtLoadOpcode, VT, Load2->getMemoryVT(), + Load2->getAddressSpace()) || (N0->getOpcode() == ISD::VSELECT && Level >= AfterLegalizeTypes && TLI.getOperationAction(ISD::VSELECT, VT) != TargetLowering::Legal)) return SDValue(); @@ -13979,13 +13991,15 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) { // Try to split the vector types to get down to legal types. EVT SplitSrcVT = SrcVT; EVT SplitDstVT = DstVT; - while (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT) && + while (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT, + LN0->getAddressSpace()) && SplitSrcVT.getVectorNumElements() > 1) { SplitDstVT = DAG.GetSplitDestVTs(SplitDstVT).first; SplitSrcVT = DAG.GetSplitDestVTs(SplitSrcVT).first; } - if (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT)) + if (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT, + LN0->getAddressSpace())) return SDValue(); assert(!DstVT.isScalableVector() && "Unexpected scalable vector type"); @@ -14058,7 +14072,7 @@ SDValue DAGCombiner::CombineZExtLogicopShiftLoad(SDNode *N) { return SDValue(); LoadSDNode *Load = cast(N1.getOperand(0)); EVT MemVT = Load->getMemoryVT(); - if (!TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) || + if (!TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT, Load->getAddressSpace()) || Load->getExtensionType() == ISD::SEXTLOAD || Load->isIndexed()) return SDValue(); @@ -14168,7 +14182,7 @@ static SDValue tryToFoldExtOfExtload(SelectionDAG &DAG, DAGCombiner &Combiner, EVT MemVT = LN0->getMemoryVT(); if ((LegalOperations || !LN0->isSimple() || VT.isVector()) && - !TLI.isLoadExtLegal(ExtLoadType, VT, MemVT)) + !TLI.isLoadExtLegal(ExtLoadType, VT, MemVT, 
LN0->getAddressSpace())) return SDValue(); SDValue ExtLoad = @@ -14210,12 +14224,14 @@ static SDValue tryToFoldExtOfLoad(SelectionDAG &DAG, DAGCombiner &Combiner, } } + LoadSDNode *LN0 = cast(N0); // TODO: isFixedLengthVector() should be removed and any negative effects on // code generation being the result of that target's implementation of // isVectorLoadExtDesirable(). if ((LegalOperations || VT.isFixedLengthVector() || - !cast(N0)->isSimple()) && - !TLI.isLoadExtLegal(ExtLoadType, VT, N0.getValueType()) + !LN0->isSimple()) && + !TLI.isLoadExtLegal(ExtLoadType, VT, N0.getValueType(), + LN0->getAddressSpace()) return {}; bool DoXform = true; @@ -14227,7 +14243,6 @@ static SDValue tryToFoldExtOfLoad(SelectionDAG &DAG, DAGCombiner &Combiner, if (!DoXform) return {}; - LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ExtLoadType, SDLoc(LN0), VT, LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); @@ -14258,7 +14273,8 @@ tryToFoldExtOfMaskedLoad(SelectionDAG &DAG, const TargetLowering &TLI, EVT VT, return SDValue(); if ((LegalOperations || !cast(N0)->isSimple()) && - !TLI.isLoadExtLegalOrCustom(ExtLoadType, VT, Ld->getValueType(0)) + !TLI.isLoadExtLegalOrCustom(ExtLoadType, VT, Ld->getValueType(0), + Ld->getAddressSpace()) return SDValue(); if (!TLI.isVectorLoadExtDesirable(SDValue(N, 0))) @@ -14402,7 +14418,8 @@ SDValue DAGCombiner::foldSextSetcc(SDNode *N) { if (!(ISD::isNON_EXTLoad(V.getNode()) && ISD::isUNINDEXEDLoad(V.getNode()) && cast(V)->isSimple() && - TLI.isLoadExtLegal(LoadOpcode, VT, V.getValueType()) + TLI.isLoadExtLegal(LoadOpcode, VT, V.getValueType(), + cast(V)->getAddressSpace()))) return false; // Non-chain users of this value must either be the setcc in this @@ -14599,7 +14616,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) { LoadSDNode *LN00 = cast(N0.getOperand(0)); EVT MemVT = LN00->getMemoryVT(); - if 
(TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT) && + if (TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT, LN00->getAddressSpace()) && LN00->getExtensionType() != ISD::ZEXTLOAD && LN00->isUnindexed()) { SmallVector SetCCs; bool DoXform = ExtendUsesToFormExtLoad(VT, N0.getNode(), N0.getOperand(0), @@ -14917,7 +14934,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) { LoadSDNode *LN00 = cast(N0.getOperand(0)); EVT MemVT = LN00->getMemoryVT(); - if (TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) && + if (TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT, LN00->getAddressSpace()) && LN00->getExtensionType() != ISD::SEXTLOAD && LN00->isUnindexed()) { bool DoXform = true; SmallVector SetCCs; @@ -15148,7 +15165,8 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { return foldedExt; } else if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && - TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, VT, N0.getValueType())) { + TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, VT, N0.getValueType(), + cast(N0)->getAddressSpace())) { bool DoXform = true; SmallVector SetCCs; if (!N0.hasOneUse()) @@ -15183,7 +15201,8 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { LoadSDNode *LN0 = cast(N0); ISD::LoadExtType ExtType = LN0->getExtensionType(); EVT MemVT = LN0->getMemoryVT(); - if (!LegalOperations || TLI.isLoadExtLegal(ExtType, VT, MemVT)) { + if (!LegalOperations || + TLI.isLoadExtLegal(ExtType, VT, MemVT, LN0->getAddressSpace())) { SDValue ExtLoad = DAG.getExtLoad(ExtType, DL, VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); @@ -15497,7 +15516,8 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) { EVT::getIntegerVT(*DAG.getContext(), ShiftMask.countr_one()); // If the mask is smaller, recompute the type. 
if ((ExtVT.getScalarSizeInBits() > MaskedVT.getScalarSizeInBits()) && - TLI.isLoadExtLegal(ExtType, SRL.getValueType(), MaskedVT)) + TLI.isLoadExtLegal(ExtType, SRL.getValueType(), MaskedVT, + LN->getAddressSpace())) ExtVT = MaskedVT; } else if (ExtType == ISD::ZEXTLOAD && ShiftMask.isShiftedMask(Offset, ActiveBits) && @@ -15506,7 +15526,8 @@ SDValue DAGCombiner::reduceLoadWidth(SDNode *N) { // If the mask is shifted we can use a narrower load and a shl to insert // the trailing zeros. if (((Offset + ActiveBits) <= ExtVT.getScalarSizeInBits()) && - TLI.isLoadExtLegal(ExtType, SRL.getValueType(), MaskedVT)) { + TLI.isLoadExtLegal(ExtType, SRL.getValueType(), MaskedVT, + LN->getAddressSpace())) { ExtVT = MaskedVT; ShAmt = Offset + ShAmt; ShiftedOffset = Offset; @@ -15732,7 +15753,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { ExtVT == cast(N0)->getMemoryVT() && ((!LegalOperations && cast(N0)->isSimple() && N0.hasOneUse()) || - TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) { + TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT, + cast(N0)->getAddressSpace()))) { auto *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(), @@ -15747,7 +15769,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse() && ExtVT == cast(N0)->getMemoryVT() && ((!LegalOperations && cast(N0)->isSimple()) && - TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) { + TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT, + cast(N0)->getAddressSpace()))) { auto *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(), @@ -15762,7 +15785,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { if (MaskedLoadSDNode *Ld = dyn_cast(N0)) { if (ExtVT == Ld->getMemoryVT() && N0.hasOneUse() && Ld->getExtensionType() != ISD::LoadExtType::NON_EXTLOAD && - TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT)) { + TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT, + 
Ld->getAddressSpace())) { SDValue ExtMaskedLoad = DAG.getMaskedLoad( VT, DL, Ld->getChain(), Ld->getBasePtr(), Ld->getOffset(), Ld->getMask(), Ld->getPassThru(), ExtVT, Ld->getMemOperand(), @@ -19109,7 +19133,8 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) { // fold (fpext (load x)) -> (fpext (fptrunc (extload x))) if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && - TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, VT, N0.getValueType())) { + TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, VT, N0.getValueType(), + cast(N0)->getAddressSpace())) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, LN0->getChain(), @@ -22161,12 +22186,16 @@ bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl &StoreNodes, } else if (TLI.getTypeAction(Context, StoreTy) == TargetLowering::TypePromoteInteger) { EVT LegalizedStoredValTy = TLI.getTypeToTransformTo(Context, StoreTy); + unsigned AS = LoadNodes[i].MemNode->getAddressSpace(); if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) && TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG.getMachineFunction()) && - TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValTy, StoreTy) && - TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValTy, StoreTy) && - TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValTy, StoreTy) && + TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValTy, StoreTy, + AS) && + TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValTy, StoreTy, + AS) && + TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValTy, StoreTy, + AS) && TLI.allowsMemoryAccess(Context, DL, StoreTy, *FirstInChain->getMemOperand(), &IsFastSt) && IsFastSt && diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index bcfc2c5dc9f83..f66ab797fea83 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -742,8 +742,8 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { // nice to have an 
effective generic way of getting these benefits... // Until such a way is found, don't insist on promoting i1 here. (SrcVT != MVT::i1 || - TLI.getLoadExtAction(ExtType, Node->getValueType(0), MVT::i1) == - TargetLowering::Promote)) { + TLI.getLoadExtAction(ExtType, Node->getValueType(0), MVT::i1, + LD->getAddressSpace()) == TargetLowering::Promote)) { // Promote to a byte-sized load if not loading an integral number of // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24. unsigned NewWidth = SrcVT.getStoreSizeInBits(); @@ -856,7 +856,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { } else { bool isCustom = false; switch (TLI.getLoadExtAction(ExtType, Node->getValueType(0), - SrcVT.getSimpleVT())) { + SrcVT.getSimpleVT(), LD->getAddressSpace())) { default: llvm_unreachable("This action is not supported yet!"); case TargetLowering::Custom: isCustom = true; @@ -884,13 +884,15 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { case TargetLowering::Expand: { EVT DestVT = Node->getValueType(0); - if (!TLI.isLoadExtLegal(ISD::EXTLOAD, DestVT, SrcVT)) { + if (!TLI.isLoadExtLegal(ISD::EXTLOAD, DestVT, SrcVT, + LD->getAddressSpace())) { // If the source type is not legal, see if there is a legal extload to // an intermediate type that we can then extend further. EVT LoadVT = TLI.getRegisterType(SrcVT.getSimpleVT()); if ((LoadVT.isFloatingPoint() == SrcVT.isFloatingPoint()) && (TLI.isTypeLegal(SrcVT) || // Same as SrcVT == LoadVT? - TLI.isLoadExtLegal(ExtType, LoadVT, SrcVT))) { + TLI.isLoadExtLegal(ExtType, LoadVT, SrcVT, + LD->getAddressSpace()))) { // If we are loading a legal type, this is a non-extload followed by a // full extend. 
ISD::LoadExtType MidExtType = diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 8e423c4f83b38..be8e780a6f55d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -301,7 +301,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { ISD::LoadExtType ExtType = LD->getExtensionType(); EVT LoadedVT = LD->getMemoryVT(); if (LoadedVT.isVector() && ExtType != ISD::NON_EXTLOAD) - Action = TLI.getLoadExtAction(ExtType, LD->getValueType(0), LoadedVT); + Action = TLI.getLoadExtAction(ExtType, LD->getValueType(0), LoadedVT, + LD->getAddressSpace()); break; } case ISD::STORE: { diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index fd6d20e146bb2..78d8353a5901e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -12364,7 +12364,8 @@ SDValue TargetLowering::scalarizeExtractedVectorLoad(EVT ResultVT, if (ResultVT.bitsGT(VecEltVT)) { // If the result type of vextract is wider than the load, then issue an // extending load instead. - ISD::LoadExtType ExtType = isLoadExtLegal(ISD::ZEXTLOAD, ResultVT, VecEltVT) + ISD::LoadExtType ExtType = isLoadExtLegal(ISD::ZEXTLOAD, ResultVT, VecEltVT, + OriginalLoad->getAddressSpace()) ? ISD::ZEXTLOAD : ISD::EXTLOAD; Load = DAG.getExtLoad(ExtType, DL, ResultVT, OriginalLoad->getChain(), diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index c23281a820b2b..e9cdb72dd7eb2 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -728,10 +728,11 @@ TargetLoweringBase::~TargetLoweringBase() = default; void TargetLoweringBase::initActions() { // All operations default to being supported. 
memset(OpActions, 0, sizeof(OpActions)); - memset(LoadExtActions, 0, sizeof(LoadExtActions)); memset(TruncStoreActions, 0, sizeof(TruncStoreActions)); memset(IndexedModeActions, 0, sizeof(IndexedModeActions)); memset(CondCodeActions, 0, sizeof(CondCodeActions)); + LoadExtActions[0].fill({}); + llvm::fill(RegClassForVT, nullptr); llvm::fill(TargetDAGCombineArray, 0); diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index b8dd377377dab..0d1c452db019b 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -279,7 +279,6 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_mov_b32 s32, 0 -; VI-NEXT: v_and_b32_e32 v0, 1, v0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -302,7 +301,6 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: v_and_b32_e32 v0, 1, v0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -325,7 +323,6 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -340,7 +337,6 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1_zeroext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1_zeroext@rel32@hi+12 ; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_swappc_b64 s[30:31], 
s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -360,7 +356,6 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i1_zeroext@rel32@hi+12 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: v_and_b32_e32 v0, 1, v0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %var = load volatile i1, ptr addrspace(1) poison diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll index 38003f6075c35..0514b1cb38e1c 100644 --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -36,6 +36,7 @@ define zeroext i1 @i1_zeroext_func_void() #0 { ; GFX789-NEXT: s_mov_b32 s6, -1 ; GFX789-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 ; GFX789-NEXT: s_waitcnt vmcnt(0) +; GFX789-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX789-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: i1_zeroext_func_void: @@ -45,6 +46,7 @@ define zeroext i1 @i1_zeroext_func_void() #0 { ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %val = load i1, ptr addrspace(1) poison ret i1 %val diff --git a/llvm/test/CodeGen/AMDGPU/global-extload-gfx11plus.ll b/llvm/test/CodeGen/AMDGPU/global-extload-gfx11plus.ll index f92ba7a8978b9..a0a760133e5c0 100644 --- a/llvm/test/CodeGen/AMDGPU/global-extload-gfx11plus.ll +++ b/llvm/test/CodeGen/AMDGPU/global-extload-gfx11plus.ll @@ -55,27 +55,15 @@ define amdgpu_kernel void @sextload_global_i8_to_i16(ptr addrspace(1) %out, ptr } define amdgpu_kernel void @zextload_global_i8_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind { -; GFX11-REAL16-LABEL: zextload_global_i8_to_i64: -; GFX11-REAL16: ; %bb.0: -; GFX11-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-REAL16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-REAL16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-REAL16-NEXT: 
global_load_d16_u8 v0, v1, s[2:3] -; GFX11-REAL16-NEXT: s_waitcnt vmcnt(0) -; GFX11-REAL16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-REAL16-NEXT: global_store_b64 v1, v[0:1], s[0:1] -; GFX11-REAL16-NEXT: s_endpgm -; -; GFX11-FAKE16-LABEL: zextload_global_i8_to_i64: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: global_load_u8 v0, v1, s[2:3] -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-FAKE16-NEXT: global_store_b64 v1, v[0:1], s[0:1] -; GFX11-FAKE16-NEXT: s_endpgm +; GFX11-LABEL: zextload_global_i8_to_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_u8 v0, v1, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b64 v1, v[0:1], s[0:1] +; GFX11-NEXT: s_endpgm %a = load i8, ptr addrspace(1) %in %ext = zext i8 %a to i64 store i64 %ext, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll index 59dfd713ef4fd..a06692dce9f0d 100644 --- a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll @@ -4,11 +4,11 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) { ; CHECK-LABEL: InferNothing: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s6, s[4:5], 0x24 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x24 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s7, s6, 31 ; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: s_ashr_i32 s7, s6, 31 ; CHECK-NEXT: v_mov_b32_e32 v3, s3 ; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 ; CHECK-NEXT: s_add_u32 s0, s2, s0 @@ -31,11 +31,11 @@ entry: define protected 
amdgpu_kernel void @InferFadd(i32 %a, ptr addrspace(1) %b, double %c) { ; CHECK-LABEL: InferFadd: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_load_dword s6, s[4:5], 0x24 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; CHECK-NEXT: s_load_dword s6, s[4:5], 0x24 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_ashr_i32 s7, s6, 31 ; CHECK-NEXT: v_mov_b32_e32 v2, s2 +; CHECK-NEXT: s_ashr_i32 s7, s6, 31 ; CHECK-NEXT: v_mov_b32_e32 v3, s3 ; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 ; CHECK-NEXT: s_add_u32 s0, s0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 5b2213592f495..770f7e06c383f 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -645,6 +645,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -658,6 +659,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -683,6 +685,8 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_and_b32 s2, s2, 1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm @@ -693,6 +697,8 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX1250-NEXT: 
s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_and_b32 s2, s2, 1 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm @@ -796,6 +802,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -809,6 +816,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -834,6 +842,8 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_and_b32 s2, s2, 1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm @@ -844,6 +854,8 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_and_b32 s2, s2, 1 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index b534c2c267fad..d935d0f28e442 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -5285,16 +5285,15 @@ define amdgpu_kernel void 
@constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt ; GFX8-NOHSA-LABEL: constant_zextload_i8_to_i64: ; GFX8-NOHSA: ; %bb.0: ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NOHSA-NEXT: flat_load_ubyte v2, v[0:1] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NOHSA-NEXT: flat_load_ubyte v0, v[0:1] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0) -; GFX8-NOHSA-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NOHSA-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_i8_to_i64: @@ -5314,27 +5313,15 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX12-TRUE16-LABEL: constant_zextload_i8_to_i64: -; GFX12-TRUE16: ; %bb.0: -; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_d16_u8 v0, v1, s[2:3] -; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-TRUE16-NEXT: global_store_b64 v1, v[0:1], s[0:1] -; GFX12-TRUE16-NEXT: s_endpgm -; -; GFX12-FAKE16-LABEL: constant_zextload_i8_to_i64: -; GFX12-FAKE16: ; %bb.0: -; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: global_load_u8 v0, v1, s[2:3] -; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-FAKE16-NEXT: global_store_b64 v1, v[0:1], s[0:1] -; GFX12-FAKE16-NEXT: 
s_endpgm +; GFX12-LABEL: constant_zextload_i8_to_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm %a = load i8, ptr addrspace(4) %in %ext = zext i8 %a to i64 store i64 %ext, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll index f879dc660203f..a5be0a312898b 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll @@ -5116,7 +5116,6 @@ define amdgpu_kernel void @global_zextload_i8_to_i64(ptr addrspace(1) %out, ptr ; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll index 04a5cac116d78..d4b9f33e961d5 100644 --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -542,7 +542,7 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) % ; SI-NEXT: s_load_dword s2, s[0:1], 0x0 ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s2, s2, 0xff +; SI-NEXT: s_and_b32 s2, s2, 1 ; SI-NEXT: s_addk_i32 s2, 0x3e7 ; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: s_mov_b32 s2, -1 @@ -558,7 +558,7 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) % ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s0, s[0:1], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s0, s0, 0xff +; VI-NEXT: s_and_b32 s0, s0, 1 ; VI-NEXT: s_addk_i32 s0, 0x3e7 ; VI-NEXT: 
v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -571,7 +571,7 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) % ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s0, s0, 0xff +; GFX11-NEXT: s_and_b32 s0, s0, 1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_addk_i32 s0, 0x3e7 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0