7 changes: 2 additions & 5 deletions llvm/lib/CodeGen/LowerEmuTLS.cpp
@@ -139,8 +139,7 @@ bool addEmuTlsVar(Module &M, const GlobalVariable *GV) {
IntegerType *WordType = DL.getIntPtrType(C);
PointerType *InitPtrType = PointerType::getUnqual(C);
Type *ElementTypes[4] = {WordType, WordType, VoidPtrType, InitPtrType};
ArrayRef<Type*> ElementTypeArray(ElementTypes, 4);
StructType *EmuTlsVarType = StructType::create(ElementTypeArray);
StructType *EmuTlsVarType = StructType::create(ElementTypes);
EmuTlsVar = cast<GlobalVariable>(
M.getOrInsertGlobal(EmuTlsVarName, EmuTlsVarType));
copyLinkageVisibility(M, GV, EmuTlsVar);
@@ -170,9 +169,7 @@ bool addEmuTlsVar(Module &M, const GlobalVariable *GV) {
ConstantInt::get(WordType, DL.getTypeStoreSize(GVType)),
ConstantInt::get(WordType, GVAlignment.value()), NullPtr,
EmuTlsTmplVar ? EmuTlsTmplVar : NullPtr};
ArrayRef<Constant*> ElementValueArray(ElementValues, 4);
EmuTlsVar->setInitializer(
ConstantStruct::get(EmuTlsVarType, ElementValueArray));
EmuTlsVar->setInitializer(ConstantStruct::get(EmuTlsVarType, ElementValues));
Align MaxAlignment =
std::max(DL.getABITypeAlign(WordType), DL.getABITypeAlign(VoidPtrType));
EmuTlsVar->setAlignment(MaxAlignment);
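Both hunks above lean on ArrayRef's implicit conversion from C arrays, which captures the pointer and element count automatically, so the named ArrayRef temporaries were redundant. A minimal sketch of the idiom (the sum helper is hypothetical, not part of the patch):

#include "llvm/ADT/ArrayRef.h"

// Any API taking ArrayRef<T> accepts a C array of T directly.
static int sum(llvm::ArrayRef<int> Vals) {
  int Total = 0;
  for (int V : Vals)
    Total += V;
  return Total;
}

static int example() {
  int Elems[4] = {1, 2, 3, 4};
  return sum(Elems); // implicitly ArrayRef<int>(Elems, 4)
}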
28 changes: 7 additions & 21 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5577,9 +5577,12 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
return RMINMAX;

// If sign bits are zero, flip between UMIN/UMAX and SMIN/SMAX.
// Only do this if the current op isn't legal and the flipped is.
if (!TLI.isOperationLegal(Opcode, VT) &&
(N0.isUndef() || DAG.SignBitIsZero(N0)) &&
// Only do this if:
// 1. The current op isn't legal and the flipped is.
// 2. The saturation pattern is broken by canonicalization in InstCombine.
bool IsOpIllegal = !TLI.isOperationLegal(Opcode, VT);
bool IsSatBroken = Opcode == ISD::UMIN && N0.getOpcode() == ISD::SMAX;
if ((IsSatBroken || IsOpIllegal) && (N0.isUndef() || DAG.SignBitIsZero(N0)) &&
(N1.isUndef() || DAG.SignBitIsZero(N1))) {
unsigned AltOpcode;
switch (Opcode) {
@@ -5589,7 +5592,7 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
case ISD::UMAX: AltOpcode = ISD::SMAX; break;
default: llvm_unreachable("Unknown MINMAX opcode");
}
if (TLI.isOperationLegal(AltOpcode, VT))
if ((IsSatBroken && IsOpIllegal) || TLI.isOperationLegal(AltOpcode, VT))
return DAG.getNode(AltOpcode, DL, VT, N0, N1);
}
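For context on the hunk above: the "broken" saturation pattern comes from ordinary clamping code. InstCombine canonicalizes the compares below to umin(smax(x, 0), 255), a UMIN fed by an SMAX, while instruction selection wants the all-signed smin(smax(x, 0), 255) form to match saturating instructions such as ARM's usat; since the smax guarantees the sign bit is zero, flipping the UMIN to SMIN is safe. A hedged illustration, not taken from the patch:

#include <cstdint>

// InstCombine rewrites the two branches into umin(smax(X, 0), 255).
uint8_t clamp_to_u8(int32_t X) {
  if (X < 0)
    X = 0;
  if (X > 255)
    X = 255;
  return static_cast<uint8_t>(X);
}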

@@ -24464,23 +24467,6 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
if (!LegalOperations || TLI.isOperationLegal(ISD::SPLAT_VECTOR, NVT))
return DAG.getSplatVector(NVT, DL, V.getOperand(0));

// extract_subvector(insert_subvector(x,y,c1),c2)
// --> extract_subvector(y,c2-c1)
// iff we're just extracting from the inserted subvector.
if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
SDValue InsSub = V.getOperand(1);
EVT InsSubVT = InsSub.getValueType();
unsigned NumInsElts = InsSubVT.getVectorMinNumElements();
unsigned InsIdx = V.getConstantOperandVal(2);
unsigned NumSubElts = NVT.getVectorMinNumElements();
if (InsIdx <= ExtIdx && (ExtIdx + NumSubElts) <= (InsIdx + NumInsElts) &&
TLI.isExtractSubvectorCheap(NVT, InsSubVT, ExtIdx - InsIdx)) {
SDLoc DL(N);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, InsSub,
DAG.getVectorIdxConstant(ExtIdx - InsIdx, DL));
}
}

// Try to move vector bitcast after extract_subv by scaling extraction index:
// extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
if (V.getOpcode() == ISD::BITCAST &&
6 changes: 3 additions & 3 deletions llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
@@ -572,12 +572,12 @@ dwarf::findDebugNamesOffsets(uint64_t EndOfHeaderOffset,

Error DWARFDebugNames::NameIndex::extract() {
const DWARFDataExtractor &AS = Section.AccelSection;
uint64_t hdrSize = Base;
if (Error E = Hdr.extract(AS, &hdrSize))
uint64_t EndOfHeaderOffset = Base;
if (Error E = Hdr.extract(AS, &EndOfHeaderOffset))
return E;

const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format);
Offsets = dwarf::findDebugNamesOffsets(hdrSize, Hdr);
Offsets = dwarf::findDebugNamesOffsets(EndOfHeaderOffset, Hdr);

uint64_t Offset =
Offsets.EntryOffsetsBase + (Hdr.NameCount * SectionOffsetSize);
4 changes: 1 addition & 3 deletions llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4786,11 +4786,9 @@ OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {

void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
int32_t LB, int32_t UB) {
if (T.isNVPTX()) {
if (T.isNVPTX())
if (UB > 0)
updateNVPTXMetadata(Kernel, "maxclusterrank", UB, true);
updateNVPTXMetadata(Kernel, "minctasm", LB, false);
}
if (T.isAMDGPU())
Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");

8 changes: 8 additions & 0 deletions llvm/lib/IR/Core.cpp
@@ -3769,6 +3769,10 @@ static AtomicRMWInst::BinOp mapFromLLVMRMWBinOp(LLVMAtomicRMWBinOp BinOp) {
case LLVMAtomicRMWBinOpFSub: return AtomicRMWInst::FSub;
case LLVMAtomicRMWBinOpFMax: return AtomicRMWInst::FMax;
case LLVMAtomicRMWBinOpFMin: return AtomicRMWInst::FMin;
case LLVMAtomicRMWBinOpUIncWrap:
return AtomicRMWInst::UIncWrap;
case LLVMAtomicRMWBinOpUDecWrap:
return AtomicRMWInst::UDecWrap;
}

llvm_unreachable("Invalid LLVMAtomicRMWBinOp value!");
@@ -3791,6 +3795,10 @@ static LLVMAtomicRMWBinOp mapToLLVMRMWBinOp(AtomicRMWInst::BinOp BinOp) {
case AtomicRMWInst::FSub: return LLVMAtomicRMWBinOpFSub;
case AtomicRMWInst::FMax: return LLVMAtomicRMWBinOpFMax;
case AtomicRMWInst::FMin: return LLVMAtomicRMWBinOpFMin;
case AtomicRMWInst::UIncWrap:
return LLVMAtomicRMWBinOpUIncWrap;
case AtomicRMWInst::UDecWrap:
return LLVMAtomicRMWBinOpUDecWrap;
default: break;
}

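The new enumerators are exercised through the C API's builder. A minimal usage sketch (Builder, P, and V are assumed to exist and are illustrative names):

#include "llvm-c/Core.h"

// Builds "atomicrmw uinc_wrap ptr %p, i32 %v acq_rel" via the C API.
static LLVMValueRef buildUIncWrap(LLVMBuilderRef Builder, LLVMValueRef P,
                                  LLVMValueRef V) {
  return LLVMBuildAtomicRMW(Builder, LLVMAtomicRMWBinOpUIncWrap, P, V,
                            LLVMAtomicOrderingAcquireRelease,
                            /*singleThread=*/0);
}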
6 changes: 4 additions & 2 deletions llvm/lib/ProfileData/MemProf.cpp
@@ -42,7 +42,8 @@ size_t IndexedAllocationInfo::serializedSize(IndexedVersion Version) const {
}

static size_t serializedSizeV0(const IndexedMemProfRecord &Record) {
size_t Result = sizeof(GlobalValue::GUID);
// The number of alloc sites to serialize.
size_t Result = sizeof(uint64_t);
for (const IndexedAllocationInfo &N : Record.AllocSites)
Result += N.serializedSize(Version0);

@@ -57,7 +58,8 @@ static size_t serializedSizeV0(const IndexedMemProfRecord &Record) {
}

static size_t serializedSizeV2(const IndexedMemProfRecord &Record) {
size_t Result = sizeof(GlobalValue::GUID);
// The number of alloc sites to serialize.
size_t Result = sizeof(uint64_t);
for (const IndexedAllocationInfo &N : Record.AllocSites)
Result += N.serializedSize(Version2);

8 changes: 3 additions & 5 deletions llvm/lib/Support/RISCVISAInfo.cpp
@@ -935,7 +935,6 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension,

// The canonical order specified in ISA manual.
// Ref: Table 22.1 in RISC-V User-Level ISA V2.2
StringRef StdExts = AllStdExts;
char Baseline = Arch[4];

// First letter should be 'e', 'i' or 'g'.
@@ -951,7 +950,6 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension,
if (Arch.size() > 5 && isDigit(Arch[5]))
return createStringError(errc::invalid_argument,
"version not supported for 'g'");
StdExts = StdExts.drop_front(4);
break;
}

@@ -1001,11 +999,11 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension,
Exts = Exts.drop_front(ConsumeLength);
Exts.consume_front("_");

std::vector<std::string> SplittedExts;
if (auto E = splitExtsByUnderscore(Exts, SplittedExts))
std::vector<std::string> SplitExts;
if (auto E = splitExtsByUnderscore(Exts, SplitExts))
return std::move(E);

for (auto &Ext : SplittedExts) {
for (auto &Ext : SplitExts) {
StringRef CurrExt = Ext;
while (!CurrExt.empty()) {
if (AllStdExts.contains(CurrExt.front())) {
5 changes: 1 addition & 4 deletions llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -4325,10 +4325,7 @@ AArch64FrameLowering::inlineStackProbeLoopExactMultiple(
ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
MBB.addSuccessor(LoopMBB);
// Update liveins.
bool anyChange = false;
do {
anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*LoopMBB);
} while (anyChange);
fullyRecomputeLiveIns({ExitMBB, LoopMBB});

return ExitMBB->begin();
}
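This hunk and the analogous ones below replace an open-coded fixed-point loop with the shared fullyRecomputeLiveIns helper. A sketch of the convergence loop it encapsulates (illustrative; the actual helper is declared alongside recomputeLiveIns in LivePhysRegs.h):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"

// Re-run recomputeLiveIns over all blocks until no live-in set changes;
// one pass is not enough because the blocks feed one another's live-ins.
static void fullyRecomputeLiveInsSketch(
    llvm::ArrayRef<llvm::MachineBasicBlock *> MBBs) {
  bool AnyChange;
  do {
    AnyChange = false;
    for (llvm::MachineBasicBlock *MBB : MBBs)
      AnyChange |= llvm::recomputeLiveIns(*MBB);
  } while (AnyChange);
}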
2 changes: 1 addition & 1 deletion llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15888,7 +15888,7 @@ unsigned AArch64TargetLowering::getNumInterleavedAccesses(
unsigned VecSize = 128;
unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
if (UseScalable)
if (UseScalable && isa<FixedVectorType>(VecTy))
VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
}
11 changes: 2 additions & 9 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -9556,15 +9556,8 @@ AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI,
MBB.addSuccessor(LoopTestMBB);

// Update liveins.
if (MF.getRegInfo().reservedRegsFrozen()) {
bool anyChange = false;
do {
anyChange = recomputeLiveIns(*ExitMBB) ||
recomputeLiveIns(*LoopBodyMBB) ||
recomputeLiveIns(*LoopTestMBB);
} while (anyChange);
;
}
if (MF.getRegInfo().reservedRegsFrozen())
fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB});

return ExitMBB->begin();
}
33 changes: 15 additions & 18 deletions llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -1052,43 +1052,40 @@ class AMDGPULowerModuleLDS {
void removeNoLdsKernelIdFromReachable(CallGraph &CG, Function *KernelRoot) {
KernelRoot->removeFnAttr("amdgpu-no-lds-kernel-id");

SmallVector<Function *> Tmp({CG[KernelRoot]->getFunction()});
if (!Tmp.back())
return;

SmallVector<Function *> WorkList({CG[KernelRoot]->getFunction()});
SmallPtrSet<Function *, 8> Visited;
bool SeenUnknownCall = false;

do {
Function *F = Tmp.pop_back_val();
while (!WorkList.empty()) {
Function *F = WorkList.pop_back_val();

for (auto &N : *CG[F]) {
if (!N.second)
for (auto &CallRecord : *CG[F]) {
if (!CallRecord.second)
continue;

Function *Callee = N.second->getFunction();
Function *Callee = CallRecord.second->getFunction();
if (!Callee) {
if (!SeenUnknownCall) {
SeenUnknownCall = true;

// If we see any indirect calls, assume nothing about potential
// targets.
// TODO: This could be refined to possible LDS global users.
for (auto &N : *CG.getExternalCallingNode()) {
Function *PotentialCallee = N.second->getFunction();
for (auto &ExternalCallRecord : *CG.getExternalCallingNode()) {
Function *PotentialCallee =
ExternalCallRecord.second->getFunction();
assert(PotentialCallee);
if (!isKernelLDS(PotentialCallee))
PotentialCallee->removeFnAttr("amdgpu-no-lds-kernel-id");
}

continue;
}
} else {
Callee->removeFnAttr("amdgpu-no-lds-kernel-id");
if (Visited.insert(Callee).second)
WorkList.push_back(Callee);
}

Callee->removeFnAttr("amdgpu-no-lds-kernel-id");
if (Visited.insert(Callee).second)
Tmp.push_back(Callee);
}
} while (!Tmp.empty());
}
}

DenseMap<Function *, GlobalVariable *> lowerDynamicLDSVariables(
8 changes: 1 addition & 7 deletions llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -1806,13 +1806,7 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
PostOrderLoopTraversal DFS(LoLoop.ML, *MLI);
DFS.ProcessLoop();
const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder();
bool anyChange = false;
do {
anyChange = false;
for (auto *MBB : PostOrder) {
anyChange = recomputeLiveIns(*MBB) || anyChange;
}
} while (anyChange);
fullyRecomputeLiveIns(PostOrder);

for (auto *MBB : reverse(PostOrder))
recomputeLivenessFlags(*MBB);
11 changes: 2 additions & 9 deletions llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp
@@ -208,10 +208,7 @@ bool PPCExpandAtomicPseudo::expandAtomicRMW128(
.addMBB(LoopMBB);
CurrentMBB->addSuccessor(LoopMBB);
CurrentMBB->addSuccessor(ExitMBB);
bool anyChange = false;
do {
anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*LoopMBB);
} while (anyChange);
fullyRecomputeLiveIns({ExitMBB, LoopMBB});
NMBBI = MBB.end();
MI.eraseFromParent();
return true;
@@ -288,11 +285,7 @@ bool PPCExpandAtomicPseudo::expandAtomicCmpSwap128(
CurrentMBB->addSuccessor(LoopCmpMBB);
CurrentMBB->addSuccessor(ExitMBB);

bool anyChange = false;
do {
anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*CmpSuccMBB) ||
recomputeLiveIns(*LoopCmpMBB);
} while (anyChange);
fullyRecomputeLiveIns({ExitMBB, CmpSuccMBB, LoopCmpMBB});
NMBBI = MBB.end();
MI.eraseFromParent();
return true;
11 changes: 2 additions & 9 deletions llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -1435,11 +1435,7 @@ void PPCFrameLowering::inlineStackProbe(MachineFunction &MF,
ProbeLoopBodyMBB->addSuccessor(ProbeLoopBodyMBB);
}
// Update liveins.
bool anyChange = false;
do {
anyChange = recomputeLiveIns(*ProbeExitMBB) ||
recomputeLiveIns(*ProbeLoopBodyMBB);
} while (anyChange);
fullyRecomputeLiveIns({ProbeExitMBB, ProbeLoopBodyMBB});
return ProbeExitMBB;
};
// For case HasBP && MaxAlign > 1, we have to realign the SP by performing
@@ -1531,10 +1527,7 @@ void PPCFrameLowering::inlineStackProbe(MachineFunction &MF,
buildDefCFAReg(*ExitMBB, ExitMBB->begin(), SPReg);
}
// Update liveins.
bool anyChange = false;
do {
anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*LoopMBB);
} while (anyChange);
fullyRecomputeLiveIns({ExitMBB, LoopMBB});
}
}
++NumPrologProbed;
107 changes: 90 additions & 17 deletions llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1484,6 +1484,11 @@ bool RISCVTargetLowering::shouldExpandGetVectorLength(EVT TripCountVT,
return VF > MaxVF || !isPowerOf2_32(VF);
}

bool RISCVTargetLowering::shouldExpandCttzElements(EVT VT) const {
return !Subtarget.hasVInstructions() ||
VT.getVectorElementType() != MVT::i1 || !isTypeLegal(VT);
}

bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
MachineFunction &MF,
@@ -8718,6 +8723,29 @@ static SDValue lowerGetVectorLength(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), Res);
}

static SDValue lowerCttzElts(SDNode *N, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
SDValue Op0 = N->getOperand(1);
MVT OpVT = Op0.getSimpleValueType();
MVT ContainerVT = OpVT;
if (OpVT.isFixedLengthVector()) {
ContainerVT = getContainerForFixedLengthVector(DAG, OpVT, Subtarget);
Op0 = convertToScalableVector(ContainerVT, Op0, DAG, Subtarget);
}
MVT XLenVT = Subtarget.getXLenVT();
SDLoc DL(N);
auto [Mask, VL] = getDefaultVLOps(OpVT, ContainerVT, DL, DAG, Subtarget);
SDValue Res = DAG.getNode(RISCVISD::VFIRST_VL, DL, XLenVT, Op0, Mask, VL);
if (isOneConstant(N->getOperand(2)))
return Res;

// Convert -1 to VL.
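// Worked example (zero-is-poison clear): for a v4i1 mask <0,0,1,0>, VFIRST
// yields 2 and the select below keeps it; for an all-false mask VFIRST
// yields -1, so the setcc+select substitute VL (here 4) instead.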
SDValue Setcc =
DAG.getSetCC(DL, XLenVT, Res, DAG.getConstant(0, DL, XLenVT), ISD::SETLT);
VL = DAG.getElementCount(DL, XLenVT, OpVT.getVectorElementCount());
return DAG.getSelect(DL, XLenVT, Setcc, VL, Res);
}

static inline void promoteVCIXScalar(const SDValue &Op,
SmallVectorImpl<SDValue> &Operands,
SelectionDAG &DAG) {
@@ -8913,6 +8941,8 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
case Intrinsic::experimental_get_vector_length:
return lowerGetVectorLength(Op.getNode(), DAG, Subtarget);
case Intrinsic::experimental_cttz_elts:
return lowerCttzElts(Op.getNode(), DAG, Subtarget);
case Intrinsic::riscv_vmv_x_s: {
SDValue Res = DAG.getNode(RISCVISD::VMV_X_S, DL, XLenVT, Op.getOperand(1));
return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Res);
@@ -10403,14 +10433,10 @@ RISCVTargetLowering::lowerFixedLengthVectorLoadToRVV(SDValue Op,
if (MinVLMAX == MaxVLMAX && MinVLMAX == VT.getVectorNumElements() &&
getLMUL1VT(ContainerVT).bitsLE(ContainerVT)) {
MachineMemOperand *MMO = Load->getMemOperand();
MachineFunction &MF = DAG.getMachineFunction();
MMO = MF.getMachineMemOperand(
MMO, MMO->getPointerInfo(),
MMO->getMemoryType().isValid()
? LLT::scalable_vector(1, MMO->getMemoryType().getSizeInBits())
: MMO->getMemoryType());
SDValue NewLoad =
DAG.getLoad(ContainerVT, DL, Load->getChain(), Load->getBasePtr(), MMO);
DAG.getLoad(ContainerVT, DL, Load->getChain(), Load->getBasePtr(),
MMO->getPointerInfo(), MMO->getBaseAlign(), MMO->getFlags(),
MMO->getAAInfo(), MMO->getRanges());
SDValue Result = convertFromScalableVector(VT, NewLoad, DAG, Subtarget);
return DAG.getMergeValues({Result, NewLoad.getValue(1)}, DL);
}
@@ -10470,14 +10496,9 @@ RISCVTargetLowering::lowerFixedLengthVectorStoreToRVV(SDValue Op,
if (MinVLMAX == MaxVLMAX && MinVLMAX == VT.getVectorNumElements() &&
getLMUL1VT(ContainerVT).bitsLE(ContainerVT)) {
MachineMemOperand *MMO = Store->getMemOperand();
MachineFunction &MF = DAG.getMachineFunction();
MMO = MF.getMachineMemOperand(
MMO, MMO->getPointerInfo(),
MMO->getMemoryType().isValid()
? LLT::scalable_vector(1, MMO->getMemoryType().getSizeInBits())
: MMO->getMemoryType());
return DAG.getStore(Store->getChain(), DL, NewValue, Store->getBasePtr(),
MMO);
MMO->getPointerInfo(), MMO->getBaseAlign(),
MMO->getFlags(), MMO->getAAInfo());
}

SDValue VL = getVLOp(VT.getVectorNumElements(), ContainerVT, DL, DAG,
@@ -12336,6 +12357,12 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
return;
}
case Intrinsic::experimental_cttz_elts: {
SDValue Res = lowerCttzElts(N, DAG, Subtarget);
Results.push_back(
DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), Res));
return;
}
case Intrinsic::riscv_orc_b:
case Intrinsic::riscv_brev8:
case Intrinsic::riscv_sha256sig0:
@@ -13363,10 +13390,56 @@ static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG,
return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false, Subtarget);
}

static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG) {
// Try to expand a scalar multiply to a faster sequence.
static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const RISCVSubtarget &Subtarget) {

EVT VT = N->getValueType(0);
if (!VT.isVector())

// LI + MUL is usually smaller than the alternative sequence.
if (DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();

if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();

if (VT != Subtarget.getXLenVT())
return SDValue();

if (!Subtarget.hasStdExtZba())
return SDValue();

ConstantSDNode *CNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!CNode)
return SDValue();
uint64_t MulAmt = CNode->getZExtValue();

// If this is a power of 2 + 2/4/8, we can use a shift followed by a single
// shXadd. First check that this is a sum of two powers of 2 because that's
// easy. Then count the trailing zeros to get the smaller shift amount.
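// Worked example: MulAmt = 20 = 0b10100. MulAmt & (MulAmt - 1) == 16, a
// power of 2, so 20 == 16 + 4. ScaleShift = countr_zero(20) == 2 and
// ShiftAmt = Log2_64(16) == 4, giving x * 20 => (x << 4) + (x << 2); the
// (x << 2) plus the add can later be selected as a Zba sh2add.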
if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
unsigned ScaleShift = llvm::countr_zero(MulAmt);
if (ScaleShift >= 1 && ScaleShift < 4) {
unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
SDLoc DL(N);
SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(ShiftAmt, DL, VT));
SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
DAG.getConstant(ScaleShift, DL, VT));
return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
}
}
return SDValue();
}


static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const RISCVSubtarget &Subtarget) {
EVT VT = N->getValueType(0);
if (!VT.isVector())
return expandMul(N, DAG, DCI, Subtarget);

SDLoc DL(N);
SDValue N0 = N->getOperand(0);
@@ -15913,7 +15986,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::MUL:
if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget))
return V;
return performMULCombine(N, DAG);
return performMULCombine(N, DAG, DCI, Subtarget);
case ISD::SDIV:
case ISD::UDIV:
case ISD::SREM:
2 changes: 2 additions & 0 deletions llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -986,6 +986,8 @@ class RISCVTargetLowering : public TargetLowering {
bool shouldExpandGetVectorLength(EVT TripCountVT, unsigned VF,
bool IsScalable) const override;

bool shouldExpandCttzElements(EVT VT) const override;

/// RVV code generation for fixed length vectors does not lower all
/// BUILD_VECTORs. This makes BUILD_VECTOR legalisation a source of stores to
/// merge. However, merging them creates a BUILD_VECTOR that is just as
2 changes: 1 addition & 1 deletion llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -1986,7 +1986,7 @@
MRI.getUniqueVRegDef(AddMI->getOperand(AddOpIdx).getReg());

unsigned InnerShiftAmt = ShiftMI->getOperand(2).getImm();
assert(InnerShiftAmt > OuterShiftAmt && "Unexpected shift amount");
assert(InnerShiftAmt >= OuterShiftAmt && "Unexpected shift amount");

unsigned InnerOpc;
switch (InnerShiftAmt - OuterShiftAmt) {
48 changes: 36 additions & 12 deletions llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
@@ -307,44 +307,68 @@ multiclass VPseudoVC_X<LMULInfo m, DAGOperand RS1Class,
Operand OpClass = payload2> {
let VLMul = m.value in {
let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in {
def "PseudoVC_" # NAME # "_SE_" # m.MX : VPseudoVC_X<OpClass, RS1Class>;
def "PseudoVC_V_" # NAME # "_SE_" # m.MX : VPseudoVC_V_X<OpClass, m.vrclass, RS1Class>;
def "PseudoVC_" # NAME # "_SE_" # m.MX
: VPseudoVC_X<OpClass, RS1Class>,
Sched<[!cast<SchedWrite>("WriteVC_" # NAME # "_" # m.MX)]>;
def "PseudoVC_V_" # NAME # "_SE_" # m.MX
: VPseudoVC_V_X<OpClass, m.vrclass, RS1Class>,
Sched<[!cast<SchedWrite>("WriteVC_V_" # NAME # "_" # m.MX)]>;
}
def "PseudoVC_V_" # NAME # "_" # m.MX : VPseudoVC_V_X<OpClass, m.vrclass, RS1Class>;
def "PseudoVC_V_" # NAME # "_" # m.MX
: VPseudoVC_V_X<OpClass, m.vrclass, RS1Class>,
Sched<[!cast<SchedWrite>("WriteVC_V_" # NAME # "_" # m.MX)]>;
}
}

multiclass VPseudoVC_XV<LMULInfo m, DAGOperand RS1Class,
Operand OpClass = payload2> {
let VLMul = m.value in {
let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in {
def "PseudoVC_" # NAME # "_SE_" # m.MX : VPseudoVC_XV<OpClass, m.vrclass, RS1Class>;
def "PseudoVC_V_" # NAME # "_SE_" # m.MX : VPseudoVC_V_XV<OpClass, m.vrclass, m.vrclass, RS1Class>;
def "PseudoVC_" # NAME # "_SE_" # m.MX
: VPseudoVC_XV<OpClass, m.vrclass, RS1Class>,
Sched<[!cast<SchedWrite>("WriteVC_" # NAME # "_" # m.MX)]>;
def "PseudoVC_V_" # NAME # "_SE_" # m.MX
: VPseudoVC_V_XV<OpClass, m.vrclass, m.vrclass, RS1Class>,
Sched<[!cast<SchedWrite>("WriteVC_V_" # NAME # "_" # m.MX)]>;
}
def "PseudoVC_V_" # NAME # "_" # m.MX : VPseudoVC_V_XV<OpClass, m.vrclass, m.vrclass, RS1Class>;
def "PseudoVC_V_" # NAME # "_" # m.MX
: VPseudoVC_V_XV<OpClass, m.vrclass, m.vrclass, RS1Class>,
Sched<[!cast<SchedWrite>("WriteVC_V_" # NAME # "_" # m.MX)]>;
}
}

multiclass VPseudoVC_XVV<LMULInfo m, DAGOperand RS1Class,
Operand OpClass = payload2> {
let VLMul = m.value in {
let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in {
def "PseudoVC_" # NAME # "_SE_" # m.MX : VPseudoVC_XVV<OpClass, m.vrclass, m.vrclass, RS1Class>;
def "PseudoVC_V_" # NAME # "_SE_" # m.MX : VPseudoVC_V_XVV<OpClass, m.vrclass, m.vrclass, RS1Class>;
def "PseudoVC_" # NAME # "_SE_" # m.MX
: VPseudoVC_XVV<OpClass, m.vrclass, m.vrclass, RS1Class>,
Sched<[!cast<SchedWrite>("WriteVC_" # NAME # "_" # m.MX)]>;
def "PseudoVC_V_" # NAME # "_SE_" # m.MX
: VPseudoVC_V_XVV<OpClass, m.vrclass, m.vrclass, RS1Class>,
Sched<[!cast<SchedWrite>("WriteVC_V_" # NAME # "_" # m.MX)]>;
}
def "PseudoVC_V_" # NAME # "_" # m.MX : VPseudoVC_V_XVV<OpClass, m.vrclass, m.vrclass, RS1Class>;
def "PseudoVC_V_" # NAME # "_" # m.MX
: VPseudoVC_V_XVV<OpClass, m.vrclass, m.vrclass, RS1Class>,
Sched<[!cast<SchedWrite>("WriteVC_V_" # NAME # "_" # m.MX)]>;
}
}

multiclass VPseudoVC_XVW<LMULInfo m, DAGOperand RS1Class,
Operand OpClass = payload2> {
let VLMul = m.value in {
let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in
def "PseudoVC_" # NAME # "_SE_" # m.MX : VPseudoVC_XVV<OpClass, m.wvrclass, m.vrclass, RS1Class>;
def "PseudoVC_" # NAME # "_SE_" # m.MX
: VPseudoVC_XVV<OpClass, m.wvrclass, m.vrclass, RS1Class>,
Sched<[!cast<SchedWrite>("WriteVC_" # NAME # "_" # m.MX)]>;
let Constraints = "@earlyclobber $rd, $rd = $rs3" in {
let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in
def "PseudoVC_V_" # NAME # "_SE_" # m.MX : VPseudoVC_V_XVV<OpClass, m.wvrclass, m.vrclass, RS1Class>;
def "PseudoVC_V_" # NAME # "_" # m.MX : VPseudoVC_V_XVV<OpClass, m.wvrclass, m.vrclass, RS1Class>;
def "PseudoVC_V_" # NAME # "_SE_" # m.MX
: VPseudoVC_V_XVV<OpClass, m.wvrclass, m.vrclass, RS1Class>,
Sched<[!cast<SchedWrite>("WriteVC_V_" # NAME # "_" # m.MX)]>;
def "PseudoVC_V_" # NAME # "_" # m.MX
: VPseudoVC_V_XVV<OpClass, m.wvrclass, m.vrclass, RS1Class>,
Sched<[!cast<SchedWrite>("WriteVC_V_" # NAME # "_" # m.MX)]>;
}
}
}
10 changes: 2 additions & 8 deletions llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td
@@ -20,13 +20,7 @@ class CMOPInst<bits<3> imm3, string opcodestr>
let Inst{12-11} = 0;
}

// CMOP1, CMOP5 is used by Zicfiss.
let Predicates = [HasStdExtZcmop, NoHasStdExtZicfiss] in {
def CMOP1 : CMOPInst<0, "cmop.1">, Sched<[]>;
def CMOP5 : CMOPInst<2, "cmop.5">, Sched<[]>;
}

foreach n = [3, 7, 9, 11, 13, 15] in {
foreach n = [1, 3, 5, 7, 9, 11, 13, 15] in {
let Predicates = [HasStdExtZcmop] in
def CMOP # n : CMOPInst<!srl(n, 1), "cmop." # n>, Sched<[]>;
def C_MOP # n : CMOPInst<!srl(n, 1), "c.mop." # n>, Sched<[]>;
}
2 changes: 1 addition & 1 deletion llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
@@ -152,7 +152,7 @@ let Predicates = [HasStdExtZvknhaOrZvknhb], RVVConstraint = Sha2Constraint in {
def VSHA2MS_VV : PALUVVNoVmTernary<0b101101, OPMVV, "vsha2ms.vv">;
} // Predicates = [HasStdExtZvknhaOrZvknhb]

let Predicates = [HasStdExtZvkned]in {
let Predicates = [HasStdExtZvkned] in {
defm VAESDF : VAES_MV_V_S<0b101000, 0b101001, 0b00001, OPMVV, "vaesdf">;
defm VAESDM : VAES_MV_V_S<0b101000, 0b101001, 0b00000, OPMVV, "vaesdm">;
defm VAESEF : VAES_MV_V_S<0b101000, 0b101001, 0b00011, OPMVV, "vaesef">;
27 changes: 25 additions & 2 deletions llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -56,11 +56,13 @@ class RISCVTuneProcessorModel<string n,

def GENERIC_RV32 : RISCVProcessorModel<"generic-rv32",
NoSchedModel,
[Feature32Bit]>,
[Feature32Bit,
FeatureStdExtI]>,
GenericTuneInfo;
def GENERIC_RV64 : RISCVProcessorModel<"generic-rv64",
NoSchedModel,
[Feature64Bit]>,
[Feature64Bit,
FeatureStdExtI]>,
GenericTuneInfo;
// Support generic for compatibility with other targets. The triple will be used
// to change to the appropriate rv32/rv64 version.
@@ -69,11 +71,13 @@ def : ProcessorModel<"generic", NoSchedModel, []>, GenericTuneInfo;
def ROCKET_RV32 : RISCVProcessorModel<"rocket-rv32",
RocketModel,
[Feature32Bit,
FeatureStdExtI,
FeatureStdExtZifencei,
FeatureStdExtZicsr]>;
def ROCKET_RV64 : RISCVProcessorModel<"rocket-rv64",
RocketModel,
[Feature64Bit,
FeatureStdExtI,
FeatureStdExtZifencei,
FeatureStdExtZicsr]>;
def ROCKET : RISCVTuneProcessorModel<"rocket",
@@ -86,6 +90,7 @@ def SIFIVE_7 : RISCVTuneProcessorModel<"sifive-7-series",
def SIFIVE_E20 : RISCVProcessorModel<"sifive-e20",
RocketModel,
[Feature32Bit,
FeatureStdExtI,
FeatureStdExtZicsr,
FeatureStdExtZifencei,
FeatureStdExtM,
@@ -94,6 +99,7 @@ def SIFIVE_E20 : RISCVProcessorModel<"sifive-e20",
def SIFIVE_E21 : RISCVProcessorModel<"sifive-e21",
RocketModel,
[Feature32Bit,
FeatureStdExtI,
FeatureStdExtZicsr,
FeatureStdExtZifencei,
FeatureStdExtM,
@@ -103,6 +109,7 @@ def SIFIVE_E21 : RISCVProcessorModel<"sifive-e21",
def SIFIVE_E24 : RISCVProcessorModel<"sifive-e24",
RocketModel,
[Feature32Bit,
FeatureStdExtI,
FeatureStdExtZifencei,
FeatureStdExtM,
FeatureStdExtA,
@@ -112,6 +119,7 @@ def SIFIVE_E24 : RISCVProcessorModel<"sifive-e24",
def SIFIVE_E31 : RISCVProcessorModel<"sifive-e31",
RocketModel,
[Feature32Bit,
FeatureStdExtI,
FeatureStdExtZifencei,
FeatureStdExtZicsr,
FeatureStdExtM,
@@ -121,6 +129,7 @@ def SIFIVE_E31 : RISCVProcessorModel<"sifive-e31",
def SIFIVE_E34 : RISCVProcessorModel<"sifive-e34",
RocketModel,
[Feature32Bit,
FeatureStdExtI,
FeatureStdExtZifencei,
FeatureStdExtM,
FeatureStdExtA,
@@ -130,6 +139,7 @@ def SIFIVE_E34 : RISCVProcessorModel<"sifive-e34",
def SIFIVE_E76 : RISCVProcessorModel<"sifive-e76",
SiFive7Model,
[Feature32Bit,
FeatureStdExtI,
FeatureStdExtZifencei,
FeatureStdExtM,
FeatureStdExtA,
@@ -140,6 +150,7 @@ def SIFIVE_E76 : RISCVProcessorModel<"sifive-e76",
def SIFIVE_S21 : RISCVProcessorModel<"sifive-s21",
RocketModel,
[Feature64Bit,
FeatureStdExtI,
FeatureStdExtZicsr,
FeatureStdExtZifencei,
FeatureStdExtM,
@@ -149,6 +160,7 @@ def SIFIVE_S21 : RISCVProcessorModel<"sifive-s21",
def SIFIVE_S51 : RISCVProcessorModel<"sifive-s51",
RocketModel,
[Feature64Bit,
FeatureStdExtI,
FeatureStdExtZicsr,
FeatureStdExtZifencei,
FeatureStdExtM,
@@ -158,6 +170,7 @@ def SIFIVE_S51 : RISCVProcessorModel<"sifive-s51",
def SIFIVE_S54 : RISCVProcessorModel<"sifive-s54",
RocketModel,
[Feature64Bit,
FeatureStdExtI,
FeatureStdExtZifencei,
FeatureStdExtM,
FeatureStdExtA,
@@ -168,6 +181,7 @@ def SIFIVE_S54 : RISCVProcessorModel<"sifive-s54",
def SIFIVE_S76 : RISCVProcessorModel<"sifive-s76",
SiFive7Model,
[Feature64Bit,
FeatureStdExtI,
FeatureStdExtZifencei,
FeatureStdExtM,
FeatureStdExtA,
@@ -180,6 +194,7 @@ def SIFIVE_S76 : RISCVProcessorModel<"sifive-s76",
def SIFIVE_U54 : RISCVProcessorModel<"sifive-u54",
RocketModel,
[Feature64Bit,
FeatureStdExtI,
FeatureStdExtZifencei,
FeatureStdExtM,
FeatureStdExtA,
@@ -190,6 +205,7 @@ def SIFIVE_U54 : RISCVProcessorModel<"sifive-u54",
def SIFIVE_U74 : RISCVProcessorModel<"sifive-u74",
SiFive7Model,
[Feature64Bit,
FeatureStdExtI,
FeatureStdExtZifencei,
FeatureStdExtM,
FeatureStdExtA,
@@ -200,6 +216,7 @@ def SIFIVE_U74 : RISCVProcessorModel<"sifive-u74",

def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model,
[Feature64Bit,
FeatureStdExtI,
FeatureStdExtZifencei,
FeatureStdExtM,
FeatureStdExtA,
@@ -217,6 +234,7 @@ def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model,

def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model,
[Feature64Bit,
FeatureStdExtI,
FeatureStdExtZifencei,
FeatureStdExtM,
FeatureStdExtA,
@@ -247,6 +265,7 @@ def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model,

def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model,
[Feature64Bit,
FeatureStdExtI,
FeatureStdExtZifencei,
FeatureStdExtM,
FeatureStdExtA,
@@ -286,6 +305,7 @@ def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model,
def SYNTACORE_SCR1_BASE : RISCVProcessorModel<"syntacore-scr1-base",
SyntacoreSCR1Model,
[Feature32Bit,
FeatureStdExtI,
FeatureStdExtZicsr,
FeatureStdExtZifencei,
FeatureStdExtC],
@@ -294,6 +314,7 @@ def SYNTACORE_SCR1_BASE : RISCVProcessorModel<"syntacore-scr1-base",
def SYNTACORE_SCR1_MAX : RISCVProcessorModel<"syntacore-scr1-max",
SyntacoreSCR1Model,
[Feature32Bit,
FeatureStdExtI,
FeatureStdExtZicsr,
FeatureStdExtZifencei,
FeatureStdExtM,
@@ -303,6 +324,7 @@ def SYNTACORE_SCR1_MAX : RISCVProcessorModel<"syntacore-scr1-max",
def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1",
NoSchedModel,
[Feature64Bit,
FeatureStdExtI,
FeatureStdExtZifencei,
FeatureStdExtZicsr,
FeatureStdExtZicntr,
@@ -332,6 +354,7 @@ def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1",
def XIANGSHAN_NANHU : RISCVProcessorModel<"xiangshan-nanhu",
XiangShanNanHuModel,
[Feature64Bit,
FeatureStdExtI,
FeatureStdExtZicsr,
FeatureStdExtZifencei,
FeatureStdExtM,
1 change: 1 addition & 0 deletions llvm/lib/Target/RISCV/RISCVSchedRocket.td
@@ -261,4 +261,5 @@ defm : UnsupportedSchedZbkx;
defm : UnsupportedSchedZfa;
defm : UnsupportedSchedZfh;
defm : UnsupportedSchedSFB;
defm : UnsupportedSchedXsfvcp;
}
48 changes: 48 additions & 0 deletions llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -962,6 +962,54 @@ let Latency = 3 in

def : InstRW<[WriteIALU], (instrs COPY)>;

// VCIX
//
// In principle we don't know the latency of any VCIX instructions. But instead
// of taking the default of 1, which can lead to issues [1], we assume that they
// have a fairly high latency.
//
// [1] https://github.com/llvm/llvm-project/issues/83391
foreach mx = SchedMxList in {
defvar Cycles = SiFive7GetCyclesDefault<mx>.c;
defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
let Latency = !mul(Cycles, 10),
AcquireAtCycles = [0, 1],
ReleaseAtCycles = [1, !add(1, Cycles)] in {
defm "" : LMULWriteResMX<"WriteVC_V_I", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVC_V_X", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVC_V_IV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVC_V_VV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVC_V_XV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVC_V_IVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVC_V_IVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVC_V_VVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVC_V_VVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVC_V_XVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVC_V_XVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
foreach f = ["FPR16", "FPR32", "FPR64"] in {
defm "" : LMULWriteResMX<"WriteVC_V_" # f # "V", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVC_V_" # f # "VV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVC_V_" # f # "VW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
}
defm "" : LMULWriteResMX<"WriteVC_I", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVC_X", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVC_IV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVC_VV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVC_XV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVC_IVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVC_IVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVC_VVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVC_VVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVC_XVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVC_XVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
foreach f = ["FPR16", "FPR32", "FPR64"] in {
defm "" : LMULWriteResMX<"WriteVC_" # f # "V", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVC_" # f # "VV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVC_" # f # "VW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>;
}
}
}

//===----------------------------------------------------------------------===//

// Bypass and advance
1 change: 1 addition & 0 deletions llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td
@@ -366,4 +366,5 @@ defm : UnsupportedSchedZbkx;
defm : UnsupportedSchedSFB;
defm : UnsupportedSchedZfa;
defm : UnsupportedSchedV;
defm : UnsupportedSchedXsfvcp;
}
1 change: 1 addition & 0 deletions llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
@@ -1040,4 +1040,5 @@ defm : UnsupportedSchedZbkb;
defm : UnsupportedSchedZbkx;
defm : UnsupportedSchedSFB;
defm : UnsupportedSchedZfa;
defm : UnsupportedSchedXsfvcp;
}
1 change: 1 addition & 0 deletions llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td
@@ -212,4 +212,5 @@ defm : UnsupportedSchedZbkb;
defm : UnsupportedSchedZbkx;
defm : UnsupportedSchedZfa;
defm : UnsupportedSchedZfh;
defm : UnsupportedSchedXsfvcp;
}
1 change: 1 addition & 0 deletions llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td
@@ -311,4 +311,5 @@ defm : UnsupportedSchedZfa;
defm : UnsupportedSchedZfh;
defm : UnsupportedSchedSFB;
defm : UnsupportedSchedZabha;
defm : UnsupportedSchedXsfvcp;
}
1 change: 1 addition & 0 deletions llvm/lib/Target/RISCV/RISCVSchedule.td
@@ -296,3 +296,4 @@ def : ReadAdvance<ReadAtomicHD, 0>;
// Include the scheduler resources for other instruction extensions.
include "RISCVScheduleZb.td"
include "RISCVScheduleV.td"
include "RISCVScheduleXSf.td"
59 changes: 59 additions & 0 deletions llvm/lib/Target/RISCV/RISCVScheduleXSf.td
@@ -0,0 +1,59 @@
//===-- RISCVScheduleXSf.td - Scheduling Definitions XSf ---*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the scheduling information for SiFive extensions.
//
//===----------------------------------------------------------------------===//

multiclass LMULSchedWritesVCIX<string id>{
defm "" : LMULSchedWrites<"WriteVC_" # id>;
defm "" : LMULSchedWrites<"WriteVC_V_" # id>;
}

defm "" : LMULSchedWritesVCIX<"I">;
defm "" : LMULSchedWritesVCIX<"X">;
defm "" : LMULSchedWritesVCIX<"IV">;
defm "" : LMULSchedWritesVCIX<"VV">;
defm "" : LMULSchedWritesVCIX<"XV">;
defm "" : LMULSchedWritesVCIX<"IVV">;
defm "" : LMULSchedWritesVCIX<"IVW">;
defm "" : LMULSchedWritesVCIX<"VVV">;
defm "" : LMULSchedWritesVCIX<"VVW">;
defm "" : LMULSchedWritesVCIX<"XVV">;
defm "" : LMULSchedWritesVCIX<"XVW">;
foreach f = ["FPR16", "FPR32", "FPR64"] in {
defm "" : LMULSchedWritesVCIX<f # "V">;
defm "" : LMULSchedWritesVCIX<f # "VV">;
defm "" : LMULSchedWritesVCIX<f # "VW">;
}

multiclass LMULWriteResVCIX<string id, list<ProcResourceKind> resources>{
defm : LMULWriteRes<"WriteVC_" # id, resources>;
defm : LMULWriteRes<"WriteVC_V_" # id, resources>;
}

multiclass UnsupportedSchedXsfvcp {
let Unsupported = true in {
defm : LMULWriteResVCIX<"I", []>;
defm : LMULWriteResVCIX<"X", []>;
defm : LMULWriteResVCIX<"IV", []>;
defm : LMULWriteResVCIX<"VV", []>;
defm : LMULWriteResVCIX<"XV", []>;
defm : LMULWriteResVCIX<"IVV", []>;
defm : LMULWriteResVCIX<"IVW", []>;
defm : LMULWriteResVCIX<"VVV", []>;
defm : LMULWriteResVCIX<"VVW", []>;
defm : LMULWriteResVCIX<"XVV", []>;
defm : LMULWriteResVCIX<"XVW", []>;
foreach f = ["FPR16", "FPR32", "FPR64"] in {
defm : LMULWriteResVCIX<f # "V", []>;
defm : LMULWriteResVCIX<f # "VV", []>;
defm : LMULWriteResVCIX<f # "VW", []>;
}
}
}
17 changes: 9 additions & 8 deletions llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1335,8 +1335,8 @@ InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
I);

std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
if (CondTy->isVectorTy()) {
if (ValTy->getScalarSizeInBits() == 1) {
// vmandn.mm v8, v8, v9
@@ -1375,14 +1375,15 @@ InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
LT.second, CostKind);
}

if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
ValTy->isVectorTy()) {
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);

// Support natively.
if (CmpInst::isIntPredicate(VecPred))
return LT.first * 1;
if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
CmpInst::isIntPredicate(VecPred)) {
// Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
// provided they incur the same cost across all implementations
return LT.first *
getRISCVInstructionCost(RISCV::VMSLT_VV, LT.second, CostKind);
}

if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy()) {
// If we do not support the input floating point vector type, use the base
// one which will calculate as:
// ScalarizeCost + Num * Cost for fixed vector,
10 changes: 2 additions & 8 deletions llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -824,10 +824,7 @@ void SystemZELFFrameLowering::inlineStackProbe(
StackAllocMI->eraseFromParent();
if (DoneMBB != nullptr) {
// Compute the live-in lists for the new blocks.
bool anyChange = false;
do {
anyChange = recomputeLiveIns(*DoneMBB) || recomputeLiveIns(*LoopMBB);
} while (anyChange);
fullyRecomputeLiveIns({DoneMBB, LoopMBB});
}
}

@@ -1425,10 +1422,7 @@ void SystemZXPLINKFrameLowering::inlineStackProbe(
StackAllocMI->eraseFromParent();

// Compute the live-in lists for the new blocks.
bool anyChange = false;
do {
anyChange = recomputeLiveIns(*StackExtMBB) || recomputeLiveIns(*NextMBB);
} while (anyChange);
fullyRecomputeLiveIns({StackExtMBB, NextMBB});
}

bool SystemZXPLINKFrameLowering::hasFP(const MachineFunction &MF) const {
11 changes: 2 additions & 9 deletions llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -885,10 +885,7 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop(
}

// Update Live In information
bool anyChange = false;
do {
anyChange = recomputeLiveIns(*tailMBB) || recomputeLiveIns(*testMBB);
} while (anyChange);
fullyRecomputeLiveIns({tailMBB, testMBB});
}

void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64(
@@ -1380,11 +1377,7 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
footMBB->addSuccessor(&MBB);
}

bool anyChange = false;
do {
anyChange = recomputeLiveIns(*footMBB) || recomputeLiveIns(*bodyMBB) ||
recomputeLiveIns(*headMBB) || recomputeLiveIns(MBB);
} while (anyChange);
fullyRecomputeLiveIns({footMBB, bodyMBB, headMBB, &MBB});
}
} else {
MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg)
9 changes: 9 additions & 0 deletions llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3408,6 +3408,15 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
return I;
break;
}
case Intrinsic::threadlocal_address: {
Align MinAlign = getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT);
MaybeAlign Align = II->getRetAlign();
if (MinAlign > Align.valueOrOne()) {
II->addRetAttr(Attribute::getWithAlignment(II->getContext(), MinAlign));
return II;
}
break;
}
default: {
// Handle target specific intrinsics
std::optional<Instruction *> V = targetInstCombineIntrinsic(*II);
8 changes: 8 additions & 0 deletions llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -8097,6 +8097,14 @@ Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) {
return new FCmpInst(I.getSwappedPredicate(), X, NegC, "", &I);
}

// fcmp (fadd X, 0.0), Y --> fcmp X, Y
if (match(Op0, m_FAdd(m_Value(X), m_AnyZeroFP())))
return new FCmpInst(Pred, X, Op1, "", &I);

// fcmp X, (fadd Y, 0.0) --> fcmp X, Y
if (match(Op1, m_FAdd(m_Value(Y), m_AnyZeroFP())))
return new FCmpInst(Pred, Op0, Y, "", &I);

if (match(Op0, m_FPExt(m_Value(X)))) {
// fcmp (fpext X), (fpext Y) -> fcmp X, Y
if (match(Op1, m_FPExt(m_Value(Y))) && X->getType() == Y->getType())
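The new fcmp folds hold even without fast-math flags: fadd X, -0.0 is exactly X, and fadd X, +0.0 differs from X only when X is -0.0, a difference no fcmp predicate can observe, because IEEE comparisons treat the two zeros as equal. A standalone sanity check of that reasoning (illustrative, not part of the patch):

#include <cassert>
#include <cmath>

int main() {
  double X = -0.0;
  // Adding +0.0 flips the sign of a negative zero...
  assert(std::signbit(X) && !std::signbit(X + 0.0));
  // ...but comparisons cannot tell: -0.0 and +0.0 compare equal.
  assert(X == 0.0 && (X + 0.0) == 0.0 && (X < 1.0) == ((X + 0.0) < 1.0));
  return 0;
}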
17 changes: 5 additions & 12 deletions llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -319,19 +319,12 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
}

// abs(X) * abs(X) -> X * X
// nabs(X) * nabs(X) -> X * X
if (Op0 == Op1) {
Value *X, *Y;
SelectPatternFlavor SPF = matchSelectPattern(Op0, X, Y).Flavor;
if (SPF == SPF_ABS || SPF == SPF_NABS)
return BinaryOperator::CreateMul(X, X);

if (match(Op0, m_Intrinsic<Intrinsic::abs>(m_Value(X))))
return BinaryOperator::CreateMul(X, X);
}
Value *X;
if (Op0 == Op1 && match(Op0, m_Intrinsic<Intrinsic::abs>(m_Value(X))))
return BinaryOperator::CreateMul(X, X);

{
Value *X, *Y;
Value *Y;
// abs(X) * abs(Y) -> abs(X * Y)
if (I.hasNoSignedWrap() &&
match(Op0,
@@ -344,7 +337,7 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
}

// -X * C --> X * -C
Value *X, *Y;
Value *Y;
Constant *Op1C;
if (match(Op0, m_Neg(m_Value(X))) && match(Op1, m_Constant(Op1C)))
return BinaryOperator::CreateMul(X, ConstantExpr::getNeg(Op1C));
40 changes: 22 additions & 18 deletions llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -1153,6 +1153,7 @@ class WidenIV {

Instruction *widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter,
PHINode *OrigPhi, PHINode *WidePhi);
void truncateIVUse(NarrowIVDefUse DU);

bool widenLoopCompare(NarrowIVDefUse DU);
bool widenWithVariantUse(NarrowIVDefUse DU);
@@ -1569,15 +1570,18 @@ WidenIV::WidenedRecTy WidenIV::getWideRecurrence(WidenIV::NarrowIVDefUse DU) {

/// This IV user cannot be widened. Replace this use of the original narrow IV
/// with a truncation of the new wide IV to isolate and eliminate the narrow IV.
static void truncateIVUse(WidenIV::NarrowIVDefUse DU, DominatorTree *DT,
LoopInfo *LI) {
void WidenIV::truncateIVUse(NarrowIVDefUse DU) {
auto *InsertPt = getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI);
if (!InsertPt)
return;
LLVM_DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef << " for user "
<< *DU.NarrowUse << "\n");
ExtendKind ExtKind = getExtendKind(DU.NarrowDef);
IRBuilder<> Builder(InsertPt);
Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType());
Value *Trunc =
Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType(), "",
DU.NeverNegative || ExtKind == ExtendKind::Zero,
DU.NeverNegative || ExtKind == ExtendKind::Sign);
DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc);
}

@@ -1826,14 +1830,21 @@ Instruction *WidenIV::widenIVUse(WidenIV::NarrowIVDefUse DU,
assert(ExtendKindMap.count(DU.NarrowDef) &&
"Should already know the kind of extension used to widen NarrowDef");

// This narrow use can be widened by a sext if it's non-negative or its narrow
// def was widened by a sext. Same for zext.
bool CanWidenBySExt =
DU.NeverNegative || getExtendKind(DU.NarrowDef) == ExtendKind::Sign;
bool CanWidenByZExt =
DU.NeverNegative || getExtendKind(DU.NarrowDef) == ExtendKind::Zero;

// Stop traversing the def-use chain at inner-loop phis or post-loop phis.
if (PHINode *UsePhi = dyn_cast<PHINode>(DU.NarrowUse)) {
if (LI->getLoopFor(UsePhi->getParent()) != L) {
// For LCSSA phis, sink the truncate outside the loop.
// After SimplifyCFG most loop exit targets have a single predecessor.
// Otherwise fall back to a truncate within the loop.
if (UsePhi->getNumOperands() != 1)
truncateIVUse(DU, DT, LI);
truncateIVUse(DU);
else {
// Widening the PHI requires us to insert a trunc. The logical place
// for this trunc is in the same BB as the PHI. This is not possible if
Expand All @@ -1847,7 +1858,8 @@ Instruction *WidenIV::widenIVUse(WidenIV::NarrowIVDefUse DU,
WidePhi->addIncoming(DU.WideDef, UsePhi->getIncomingBlock(0));
BasicBlock *WidePhiBB = WidePhi->getParent();
IRBuilder<> Builder(WidePhiBB, WidePhiBB->getFirstInsertionPt());
Value *Trunc = Builder.CreateTrunc(WidePhi, DU.NarrowDef->getType());
Value *Trunc = Builder.CreateTrunc(WidePhi, DU.NarrowDef->getType(), "",
CanWidenByZExt, CanWidenBySExt);
UsePhi->replaceAllUsesWith(Trunc);
DeadInsts.emplace_back(UsePhi);
LLVM_DEBUG(dbgs() << "INDVARS: Widen lcssa phi " << *UsePhi << " to "
@@ -1857,26 +1869,18 @@ Instruction *WidenIV::widenIVUse(WidenIV::NarrowIVDefUse DU,
}
}

// This narrow use can be widened by a sext if it's non-negative or its narrow
// def was widened by a sext. Same for zext.
auto canWidenBySExt = [&]() {
return DU.NeverNegative || getExtendKind(DU.NarrowDef) == ExtendKind::Sign;
};
auto canWidenByZExt = [&]() {
return DU.NeverNegative || getExtendKind(DU.NarrowDef) == ExtendKind::Zero;
};

// Our raison d'etre! Eliminate sign and zero extension.
if ((match(DU.NarrowUse, m_SExtLike(m_Value())) && canWidenBySExt()) ||
(isa<ZExtInst>(DU.NarrowUse) && canWidenByZExt())) {
if ((match(DU.NarrowUse, m_SExtLike(m_Value())) && CanWidenBySExt) ||
(isa<ZExtInst>(DU.NarrowUse) && CanWidenByZExt)) {
Value *NewDef = DU.WideDef;
if (DU.NarrowUse->getType() != WideType) {
unsigned CastWidth = SE->getTypeSizeInBits(DU.NarrowUse->getType());
unsigned IVWidth = SE->getTypeSizeInBits(WideType);
if (CastWidth < IVWidth) {
// The cast isn't as wide as the IV, so insert a Trunc.
IRBuilder<> Builder(DU.NarrowUse);
NewDef = Builder.CreateTrunc(DU.WideDef, DU.NarrowUse->getType());
NewDef = Builder.CreateTrunc(DU.WideDef, DU.NarrowUse->getType(), "",
CanWidenByZExt, CanWidenBySExt);
}
else {
// A wider extend was hidden behind a narrower one. This may induce
Expand Down Expand Up @@ -1975,7 +1979,7 @@ Instruction *WidenIV::widenIVUse(WidenIV::NarrowIVDefUse DU,
// This user does not evaluate to a recurrence after widening, so don't
// follow it. Instead insert a Trunc to kill off the original use,
// eventually isolating the original narrow IV so it can be removed.
truncateIVUse(DU, DT, LI);
truncateIVUse(DU);
return nullptr;
}

27 changes: 23 additions & 4 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -118,6 +118,11 @@ static cl::opt<int>
cl::desc("Only vectorize if you gain more than this "
"number "));

static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
"slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
"heuristics and makes vectorization decision via cost modeling."));

static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
cl::desc("Attempt to vectorize horizontal reductions"));
@@ -5509,6 +5514,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {

void BoUpSLP::buildExternalUses(
const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
DenseMap<Value *, unsigned> ScalarToExtUses;
// Collect the values that we need to extract from the tree.
for (auto &TEPtr : VectorizableTree) {
TreeEntry *Entry = TEPtr.get();
@@ -5522,13 +5528,18 @@ void BoUpSLP::buildExternalUses(
Value *Scalar = Entry->Scalars[Lane];
if (!isa<Instruction>(Scalar))
continue;
int FoundLane = Entry->findLaneForValue(Scalar);
// All uses must be replaced already? No need to do it again.
auto It = ScalarToExtUses.find(Scalar);
if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
continue;

// Check if the scalar is externally used as an extra arg.
const auto *ExtI = ExternallyUsedValues.find(Scalar);
if (ExtI != ExternallyUsedValues.end()) {
int FoundLane = Entry->findLaneForValue(Scalar);
LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
<< Lane << " from " << *Scalar << ".\n");
<< FoundLane << " from " << *Scalar << ".\n");
ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
}
for (User *U : Scalar->users()) {
@@ -5556,12 +5567,20 @@ void BoUpSLP::buildExternalUses(
continue;
}
U = nullptr;
if (It != ScalarToExtUses.end()) {
ExternalUses[It->second].User = nullptr;
break;
}
}

int FoundLane = Entry->findLaneForValue(Scalar);
LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
<< " from lane " << Lane << " from " << *Scalar
<< " from lane " << FoundLane << " from " << *Scalar
<< ".\n");
It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
ExternalUses.emplace_back(Scalar, U, FoundLane);
if (!U)
break;
}
}
}
@@ -6250,7 +6269,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
return TreeEntry::NeedToGather;
}
if (!areAltOperandsProfitable(S, VL)) {
if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
LLVM_DEBUG(
dbgs()
<< "SLP: ShuffleVector not vectorized, operands are buildvector and "
320 changes: 160 additions & 160 deletions llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll


25 changes: 25 additions & 0 deletions llvm/test/Bindings/llvm-c/atomics.ll
@@ -36,6 +36,31 @@ define void @atomic_load_store(ptr %word) {
ret void
}

define void @atomic_rmw_ops(ptr %p, i32 %i, float %f) {
; Test all atomicrmw operations
%a.xchg = atomicrmw xchg ptr %p, i32 %i acq_rel, align 8
%a.add = atomicrmw add ptr %p, i32 %i acq_rel, align 8
%a.sub = atomicrmw sub ptr %p, i32 %i acq_rel, align 8
%a.and = atomicrmw and ptr %p, i32 %i acq_rel, align 8
%a.nand = atomicrmw nand ptr %p, i32 %i acq_rel, align 8
%a.or = atomicrmw or ptr %p, i32 %i acq_rel, align 8
%a.xor = atomicrmw xor ptr %p, i32 %i acq_rel, align 8
%a.max = atomicrmw max ptr %p, i32 %i acq_rel, align 8
%a.min = atomicrmw min ptr %p, i32 %i acq_rel, align 8
%a.umax = atomicrmw umax ptr %p, i32 %i acq_rel, align 8
%a.umin = atomicrmw umin ptr %p, i32 %i acq_rel, align 8

%a.fadd = atomicrmw fadd ptr %p, float %f acq_rel, align 8
%a.fsub = atomicrmw fsub ptr %p, float %f acq_rel, align 8
%a.fmax = atomicrmw fmax ptr %p, float %f acq_rel, align 8
%a.fmin = atomicrmw fmin ptr %p, float %f acq_rel, align 8

%a.uinc_wrap = atomicrmw uinc_wrap ptr %p, i32 %i acq_rel, align 8
%a.udec_wrap = atomicrmw udec_wrap ptr %p, i32 %i acq_rel, align 8

ret void
}

define i32 @main() {
%1 = alloca i32, align 4
%2 = cmpxchg ptr %1, i32 2, i32 3 seq_cst acquire
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/AMDGPU/umed3.ll
@@ -43,8 +43,7 @@ define amdgpu_kernel void @v_test_umed3_multi_use_r_i_i_i32(ptr addrspace(1) %ou
}

; GCN-LABEL: {{^}}v_test_umed3_r_i_i_sign_mismatch_i32:
; GCN: v_max_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
; GCN: v_min_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
define amdgpu_kernel void @v_test_umed3_r_i_i_sign_mismatch_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid
80 changes: 35 additions & 45 deletions llvm/test/CodeGen/ARM/usat.ll
@@ -756,7 +756,7 @@ define i32 @mm_unsigned_sat_upper_lower_1(i32 %x) {
; V4T-NEXT: bic r1, r0, r0, asr #31
; V4T-NEXT: ldr r0, .LCPI20_0
; V4T-NEXT: cmp r1, r0
; V4T-NEXT: movlo r0, r1
; V4T-NEXT: movlt r0, r1
; V4T-NEXT: bx lr
; V4T-NEXT: .p2align 2
; V4T-NEXT: @ %bb.1:
@@ -765,23 +765,12 @@ define i32 @mm_unsigned_sat_upper_lower_1(i32 %x) {
;
; V6-LABEL: mm_unsigned_sat_upper_lower_1:
; V6: @ %bb.0: @ %entry
; V6-NEXT: bic r1, r0, r0, asr #31
; V6-NEXT: ldr r0, .LCPI20_0
; V6-NEXT: cmp r1, r0
; V6-NEXT: movlo r0, r1
; V6-NEXT: usat r0, #23, r0
; V6-NEXT: bx lr
; V6-NEXT: .p2align 2
; V6-NEXT: @ %bb.1:
; V6-NEXT: .LCPI20_0:
; V6-NEXT: .long 8388607 @ 0x7fffff
;
; V6T2-LABEL: mm_unsigned_sat_upper_lower_1:
; V6T2: @ %bb.0: @ %entry
; V6T2-NEXT: bic r1, r0, r0, asr #31
; V6T2-NEXT: movw r0, #65535
; V6T2-NEXT: movt r0, #127
; V6T2-NEXT: cmp r1, r0
; V6T2-NEXT: movlo r0, r1
; V6T2-NEXT: usat r0, #23, r0
; V6T2-NEXT: bx lr
entry:
%0 = call i32 @llvm.smax.i32(i32 %x, i32 0)
@@ -795,7 +784,7 @@ define i32 @mm_unsigned_sat_upper_lower_2(i32 %x) {
; V4T-NEXT: bic r1, r0, r0, asr #31
; V4T-NEXT: ldr r0, .LCPI21_0
; V4T-NEXT: cmp r1, r0
; V4T-NEXT: movlo r0, r1
; V4T-NEXT: movlt r0, r1
; V4T-NEXT: bx lr
; V4T-NEXT: .p2align 2
; V4T-NEXT: @ %bb.1:
@@ -804,23 +793,12 @@ define i32 @mm_unsigned_sat_upper_lower_2(i32 %x) {
;
; V6-LABEL: mm_unsigned_sat_upper_lower_2:
; V6: @ %bb.0: @ %entry
; V6-NEXT: bic r1, r0, r0, asr #31
; V6-NEXT: ldr r0, .LCPI21_0
; V6-NEXT: cmp r1, r0
; V6-NEXT: movlo r0, r1
; V6-NEXT: usat r0, #23, r0
; V6-NEXT: bx lr
; V6-NEXT: .p2align 2
; V6-NEXT: @ %bb.1:
; V6-NEXT: .LCPI21_0:
; V6-NEXT: .long 8388607 @ 0x7fffff
;
; V6T2-LABEL: mm_unsigned_sat_upper_lower_2:
; V6T2: @ %bb.0: @ %entry
; V6T2-NEXT: bic r1, r0, r0, asr #31
; V6T2-NEXT: movw r0, #65535
; V6T2-NEXT: movt r0, #127
; V6T2-NEXT: cmp r1, r0
; V6T2-NEXT: movlo r0, r1
; V6T2-NEXT: usat r0, #23, r0
; V6T2-NEXT: bx lr
entry:
%0 = call i32 @llvm.smax.i32(i32 %x, i32 0)
@@ -834,7 +812,7 @@ define i32 @mm_unsigned_sat_upper_lower_3(i32 %x) {
; V4T-NEXT: bic r1, r0, r0, asr #31
; V4T-NEXT: ldr r0, .LCPI22_0
; V4T-NEXT: cmp r1, r0
; V4T-NEXT: movlo r0, r1
; V4T-NEXT: movlt r0, r1
; V4T-NEXT: bx lr
; V4T-NEXT: .p2align 2
; V4T-NEXT: @ %bb.1:
@@ -843,23 +821,12 @@ define i32 @mm_unsigned_sat_upper_lower_3(i32 %x) {
;
; V6-LABEL: mm_unsigned_sat_upper_lower_3:
; V6: @ %bb.0: @ %entry
; V6-NEXT: bic r1, r0, r0, asr #31
; V6-NEXT: ldr r0, .LCPI22_0
; V6-NEXT: cmp r1, r0
; V6-NEXT: movlo r0, r1
; V6-NEXT: usat r0, #23, r0
; V6-NEXT: bx lr
; V6-NEXT: .p2align 2
; V6-NEXT: @ %bb.1:
; V6-NEXT: .LCPI22_0:
; V6-NEXT: .long 8388607 @ 0x7fffff
;
; V6T2-LABEL: mm_unsigned_sat_upper_lower_3:
; V6T2: @ %bb.0: @ %entry
; V6T2-NEXT: bic r1, r0, r0, asr #31
; V6T2-NEXT: movw r0, #65535
; V6T2-NEXT: movt r0, #127
; V6T2-NEXT: cmp r1, r0
; V6T2-NEXT: movlo r0, r1
; V6T2-NEXT: usat r0, #23, r0
; V6T2-NEXT: bx lr
entry:
%0 = call i32 @llvm.smax.i32(i32 %x, i32 0)
@@ -913,7 +880,7 @@ define i32 @mm_no_unsigned_sat_incorrect_constant2(i32 %x) {
; V4T-NEXT: mov r0, #1
; V4T-NEXT: orr r0, r0, #8388608
; V4T-NEXT: cmp r1, #8388608
; V4T-NEXT: movls r0, r1
; V4T-NEXT: movle r0, r1
; V4T-NEXT: bx lr
;
; V6-LABEL: mm_no_unsigned_sat_incorrect_constant2:
Expand All @@ -922,7 +889,7 @@ define i32 @mm_no_unsigned_sat_incorrect_constant2(i32 %x) {
; V6-NEXT: mov r0, #1
; V6-NEXT: orr r0, r0, #8388608
; V6-NEXT: cmp r1, #8388608
; V6-NEXT: movls r0, r1
; V6-NEXT: movle r0, r1
; V6-NEXT: bx lr
;
; V6T2-LABEL: mm_no_unsigned_sat_incorrect_constant2:
Expand All @@ -931,7 +898,7 @@ define i32 @mm_no_unsigned_sat_incorrect_constant2(i32 %x) {
; V6T2-NEXT: movw r0, #1
; V6T2-NEXT: movt r0, #128
; V6T2-NEXT: cmp r1, #8388608
; V6T2-NEXT: movls r0, r1
; V6T2-NEXT: movle r0, r1
; V6T2-NEXT: bx lr
entry:
%0 = call i32 @llvm.smax.i32(i32 %x, i32 0)
@@ -981,6 +948,29 @@ entry:
ret i32 %1
}

define i32 @test_umin_smax_usat(i32 %x) {
; V4T-LABEL: test_umin_smax_usat:
; V4T: @ %bb.0: @ %entry
; V4T-NEXT: bic r0, r0, r0, asr #31
; V4T-NEXT: cmp r0, #255
; V4T-NEXT: movge r0, #255
; V4T-NEXT: bx lr
;
; V6-LABEL: test_umin_smax_usat:
; V6: @ %bb.0: @ %entry
; V6-NEXT: usat r0, #8, r0
; V6-NEXT: bx lr
;
; V6T2-LABEL: test_umin_smax_usat:
; V6T2: @ %bb.0: @ %entry
; V6T2-NEXT: usat r0, #8, r0
; V6T2-NEXT: bx lr
entry:
%v1 = tail call i32 @llvm.smax.i32(i32 %x, i32 0)
%v2 = tail call i32 @llvm.umin.i32(i32 %v1, i32 255)
ret i32 %v2
}

declare i32 @llvm.smin.i32(i32, i32)
declare i32 @llvm.smax.i32(i32, i32)
declare i16 @llvm.smin.i16(i16, i16)
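The new test pins down the smax-then-umin shape of an unsigned saturate. A hedged C++ view of the source pattern that typically produces it, assuming InstCombine's usual canonicalization of the inner signed min to an unsigned min once the value is known non-negative:

  #include <algorithm>

  // clamp to [0, 255]; expected to select a single `usat r0, #8` on ARMv6+
  unsigned clamp_u8(int x) {
    return static_cast<unsigned>(std::clamp(x, 0, 255));
  }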
9 changes: 5 additions & 4 deletions llvm/test/CodeGen/RISCV/addimm-mulimm.ll
@@ -551,8 +551,9 @@ define i64 @add_mul_combine_infinite_loop(i64 %x) {
; RV32IMB-NEXT: sh3add a1, a1, a2
; RV32IMB-NEXT: sh1add a0, a0, a0
; RV32IMB-NEXT: slli a2, a0, 3
; RV32IMB-NEXT: addi a0, a2, 2047
; RV32IMB-NEXT: addi a0, a0, 1
; RV32IMB-NEXT: li a3, 1
; RV32IMB-NEXT: slli a3, a3, 11
; RV32IMB-NEXT: sh3add a0, a0, a3
; RV32IMB-NEXT: sltu a2, a0, a2
; RV32IMB-NEXT: add a1, a1, a2
; RV32IMB-NEXT: ret
@@ -561,8 +562,8 @@ define i64 @add_mul_combine_infinite_loop(i64 %x) {
; RV64IMB: # %bb.0:
; RV64IMB-NEXT: addi a0, a0, 86
; RV64IMB-NEXT: sh1add a0, a0, a0
; RV64IMB-NEXT: li a1, -16
; RV64IMB-NEXT: sh3add a0, a0, a1
; RV64IMB-NEXT: slli a0, a0, 3
; RV64IMB-NEXT: addi a0, a0, -16
; RV64IMB-NEXT: ret
%tmp0 = mul i64 %x, 24
%tmp1 = add i64 %tmp0, 2048
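A sanity check of the constants above (worked out here, not taken from the patch): both sequences compute 24*x + 2048. On RV64IMB, (x + 86)*3 = 3x + 258, shifting left by 3 gives 24x + 2064, and the trailing addi of -16 lands on 24x + 2048, since 86*24 = 2064 = 2048 + 16. On RV32IMB, a3 is materialized as 1 << 11 = 2048 and sh3add folds it in as 8*(3x) + 2048.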
114 changes: 92 additions & 22 deletions llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
@@ -128,43 +128,113 @@ define i64 @ctz_nxv8i1_no_range(<vscale x 8 x i16> %a) {
define i32 @ctz_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) {
; RV32-LABEL: ctz_nxv16i1:
; RV32: # %bb.0:
; RV32-NEXT: vmv1r.v v0, v8
; RV32-NEXT: vsetvli a0, zero, e8, m2, ta, ma
; RV32-NEXT: vfirst.m a0, v8
; RV32-NEXT: bgez a0, .LBB2_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 1
; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.x v8, a0
; RV32-NEXT: vid.v v16
; RV32-NEXT: li a1, -1
; RV32-NEXT: vmadd.vx v16, a1, v8
; RV32-NEXT: vmv.v.i v8, 0
; RV32-NEXT: vmerge.vvm v8, v8, v16, v0
; RV32-NEXT: vredmaxu.vs v8, v8, v8
; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: sub a0, a0, a1
; RV32-NEXT: .LBB2_2:
; RV32-NEXT: ret
;
; RV64-LABEL: ctz_nxv16i1:
; RV64: # %bb.0:
; RV64-NEXT: vmv1r.v v0, v8
; RV64-NEXT: vsetvli a0, zero, e8, m2, ta, ma
; RV64-NEXT: vfirst.m a0, v8
; RV64-NEXT: bgez a0, .LBB2_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 1
; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, ma
; RV64-NEXT: vmv.v.x v8, a0
; RV64-NEXT: vid.v v16
; RV64-NEXT: li a1, -1
; RV64-NEXT: vmadd.vx v16, a1, v8
; RV64-NEXT: vmv.v.i v8, 0
; RV64-NEXT: vmerge.vvm v8, v8, v16, v0
; RV64-NEXT: vredmaxu.vs v8, v8, v8
; RV64-NEXT: vmv.x.s a1, v8
; RV64-NEXT: subw a0, a0, a1
; RV64-NEXT: .LBB2_2:
; RV64-NEXT: ret
%res = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> %a, i1 0)
ret i32 %res
}

define i32 @ctz_nxv16i1_poison(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) {
; RV32-LABEL: ctz_nxv16i1_poison:
; RV32: # %bb.0:
; RV32-NEXT: vsetvli a0, zero, e8, m2, ta, ma
; RV32-NEXT: vfirst.m a0, v8
; RV32-NEXT: ret
;
; RV64-LABEL: ctz_nxv16i1_poison:
; RV64: # %bb.0:
; RV64-NEXT: vsetvli a0, zero, e8, m2, ta, ma
; RV64-NEXT: vfirst.m a0, v8
; RV64-NEXT: ret
%res = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1> %a, i1 1)
ret i32 %res
}

define i32 @ctz_v16i1(<16 x i1> %pg, <16 x i1> %a) {
; RV32-LABEL: ctz_v16i1:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; RV32-NEXT: vfirst.m a0, v8
; RV32-NEXT: bgez a0, .LBB4_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a0, 16
; RV32-NEXT: .LBB4_2:
; RV32-NEXT: ret
;
; RV64-LABEL: ctz_v16i1:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; RV64-NEXT: vfirst.m a0, v8
; RV64-NEXT: bgez a0, .LBB4_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: li a0, 16
; RV64-NEXT: .LBB4_2:
; RV64-NEXT: ret
%res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 0)
ret i32 %res
}

define i32 @ctz_v16i1_poison(<16 x i1> %pg, <16 x i1> %a) {
; RV32-LABEL: ctz_v16i1_poison:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; RV32-NEXT: vfirst.m a0, v8
; RV32-NEXT: ret
;
; RV64-LABEL: ctz_v16i1_poison:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; RV64-NEXT: vfirst.m a0, v8
; RV64-NEXT: ret
%res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 1)
ret i32 %res
}

define i16 @ctz_v8i1_i16_ret(<8 x i1> %a) {
; RV32-LABEL: ctz_v8i1_i16_ret:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; RV32-NEXT: vfirst.m a0, v0
; RV32-NEXT: bgez a0, .LBB6_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a0, 8
; RV32-NEXT: .LBB6_2:
; RV32-NEXT: ret
;
; RV64-LABEL: ctz_v8i1_i16_ret:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; RV64-NEXT: vfirst.m a0, v0
; RV64-NEXT: bgez a0, .LBB6_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: li a0, 8
; RV64-NEXT: .LBB6_2:
; RV64-NEXT: ret
%res = call i16 @llvm.experimental.cttz.elts.i16.v8i1(<8 x i1> %a, i1 0)
ret i16 %res
}

declare i64 @llvm.experimental.cttz.elts.i64.nxv8i16(<vscale x 8 x i16>, i1)
declare i32 @llvm.experimental.cttz.elts.i32.nxv16i1(<vscale x 16 x i1>, i1)
declare i32 @llvm.experimental.cttz.elts.i32.nxv4i32(<vscale x 4 x i32>, i1)
declare i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1>, i1)
declare i16 @llvm.experimental.cttz.elts.i16.v8i1(<8 x i1>, i1)

attributes #0 = { vscale_range(2,1024) }
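A scalar model of what the updated lowering computes may help read the checks: vfirst.m yields the index of the first active mask element, or -1 when none is set, and the non-poison variants branch (bgez) to substitute the element count. A hedged C++ sketch, illustrative rather than the lowering code itself:

  // cttz.elts(mask, zero_is_poison) as a scalar loop.
  int cttzElts(const bool *Mask, int NumElts, bool ZeroIsPoison) {
    for (int I = 0; I < NumElts; ++I)
      if (Mask[I])
        return I;                       // the value vfirst.m produces
    return ZeroIsPoison ? -1 : NumElts; // the bgez fallback supplies NumElts
  }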
102 changes: 26 additions & 76 deletions llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll
@@ -1,71 +1,39 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -mtriple=riscv32 < %s | FileCheck %s -check-prefix=RV32
; RUN: llc -mtriple=riscv64 < %s | FileCheck %s -check-prefix=RV64
; RUN: llc -mtriple=riscv32 -mattr=+v < %s | FileCheck %s -check-prefix=RV32
; RUN: llc -mtriple=riscv64 -mattr=+v < %s | FileCheck %s -check-prefix=RV64

; FIXED WIDTH

define i16 @ctz_v4i32(<4 x i32> %a) {
; RV32-LABEL: ctz_v4i32:
; RV32: # %bb.0:
; RV32-NEXT: lw a3, 0(a0)
; RV32-NEXT: lw a1, 4(a0)
; RV32-NEXT: lw a2, 12(a0)
; RV32-NEXT: lw a4, 8(a0)
; RV32-NEXT: seqz a0, a3
; RV32-NEXT: addi a0, a0, -1
; RV32-NEXT: andi a0, a0, 4
; RV32-NEXT: seqz a3, a4
; RV32-NEXT: addi a3, a3, -1
; RV32-NEXT: andi a3, a3, 2
; RV32-NEXT: bltu a3, a0, .LBB0_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: mv a0, a3
; RV32-NEXT: .LBB0_2:
; RV32-NEXT: snez a2, a2
; RV32-NEXT: seqz a1, a1
; RV32-NEXT: addi a1, a1, -1
; RV32-NEXT: andi a1, a1, 3
; RV32-NEXT: bltu a2, a1, .LBB0_4
; RV32-NEXT: # %bb.3:
; RV32-NEXT: mv a1, a2
; RV32-NEXT: .LBB0_4:
; RV32-NEXT: bltu a1, a0, .LBB0_6
; RV32-NEXT: # %bb.5:
; RV32-NEXT: mv a0, a1
; RV32-NEXT: .LBB0_6:
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT: vmsne.vi v0, v8, 0
; RV32-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
; RV32-NEXT: vmv.v.i v8, 0
; RV32-NEXT: vmerge.vim v8, v8, -1, v0
; RV32-NEXT: vid.v v9
; RV32-NEXT: vrsub.vi v9, v9, 4
; RV32-NEXT: vand.vv v8, v8, v9
; RV32-NEXT: vredmaxu.vs v8, v8, v8
; RV32-NEXT: vmv.x.s a0, v8
; RV32-NEXT: li a1, 4
; RV32-NEXT: sub a1, a1, a0
; RV32-NEXT: andi a0, a1, 255
; RV32-NEXT: ret
;
; RV64-LABEL: ctz_v4i32:
; RV64: # %bb.0:
; RV64-NEXT: lw a3, 0(a0)
; RV64-NEXT: lw a1, 8(a0)
; RV64-NEXT: lw a2, 24(a0)
; RV64-NEXT: lw a4, 16(a0)
; RV64-NEXT: seqz a0, a3
; RV64-NEXT: addi a0, a0, -1
; RV64-NEXT: andi a0, a0, 4
; RV64-NEXT: seqz a3, a4
; RV64-NEXT: addi a3, a3, -1
; RV64-NEXT: andi a3, a3, 2
; RV64-NEXT: bltu a3, a0, .LBB0_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: mv a0, a3
; RV64-NEXT: .LBB0_2:
; RV64-NEXT: snez a2, a2
; RV64-NEXT: seqz a1, a1
; RV64-NEXT: addi a1, a1, -1
; RV64-NEXT: andi a1, a1, 3
; RV64-NEXT: bltu a2, a1, .LBB0_4
; RV64-NEXT: # %bb.3:
; RV64-NEXT: mv a1, a2
; RV64-NEXT: .LBB0_4:
; RV64-NEXT: bltu a1, a0, .LBB0_6
; RV64-NEXT: # %bb.5:
; RV64-NEXT: mv a0, a1
; RV64-NEXT: .LBB0_6:
; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT: vmsne.vi v0, v8, 0
; RV64-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
; RV64-NEXT: vmv.v.i v8, 0
; RV64-NEXT: vmerge.vim v8, v8, -1, v0
; RV64-NEXT: vid.v v9
; RV64-NEXT: vrsub.vi v9, v9, 4
; RV64-NEXT: vand.vv v8, v8, v9
; RV64-NEXT: vredmaxu.vs v8, v8, v8
; RV64-NEXT: vmv.x.s a0, v8
; RV64-NEXT: li a1, 4
; RV64-NEXT: subw a1, a1, a0
; RV64-NEXT: andi a0, a1, 255
@@ -79,32 +47,14 @@ define i16 @ctz_v4i32(<4 x i32> %a) {
define i32 @ctz_v2i1_poison(<2 x i1> %a) {
; RV32-LABEL: ctz_v2i1_poison:
; RV32: # %bb.0:
; RV32-NEXT: andi a1, a1, 1
; RV32-NEXT: slli a0, a0, 31
; RV32-NEXT: srai a0, a0, 31
; RV32-NEXT: andi a0, a0, 2
; RV32-NEXT: bltu a1, a0, .LBB1_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: mv a0, a1
; RV32-NEXT: .LBB1_2:
; RV32-NEXT: li a1, 2
; RV32-NEXT: sub a1, a1, a0
; RV32-NEXT: andi a0, a1, 255
; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
; RV32-NEXT: vfirst.m a0, v0
; RV32-NEXT: ret
;
; RV64-LABEL: ctz_v2i1_poison:
; RV64: # %bb.0:
; RV64-NEXT: andi a1, a1, 1
; RV64-NEXT: slli a0, a0, 63
; RV64-NEXT: srai a0, a0, 63
; RV64-NEXT: andi a0, a0, 2
; RV64-NEXT: bltu a1, a0, .LBB1_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: mv a0, a1
; RV64-NEXT: .LBB1_2:
; RV64-NEXT: li a1, 2
; RV64-NEXT: subw a1, a1, a0
; RV64-NEXT: andi a0, a1, 255
; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
; RV64-NEXT: vfirst.m a0, v0
; RV64-NEXT: ret
%res = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> %a, i1 1)
ret i32 %res
48 changes: 33 additions & 15 deletions llvm/test/CodeGen/RISCV/rv32zba.ll
@@ -271,31 +271,49 @@ define i32 @mul288(i32 %a) {
}

define i32 @mul258(i32 %a) {
; CHECK-LABEL: mul258:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 258
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: ret
; RV32I-LABEL: mul258:
; RV32I: # %bb.0:
; RV32I-NEXT: li a1, 258
; RV32I-NEXT: mul a0, a0, a1
; RV32I-NEXT: ret
;
; RV32ZBA-LABEL: mul258:
; RV32ZBA: # %bb.0:
; RV32ZBA-NEXT: slli a1, a0, 8
; RV32ZBA-NEXT: sh1add a0, a0, a1
; RV32ZBA-NEXT: ret
%c = mul i32 %a, 258
ret i32 %c
}

define i32 @mul260(i32 %a) {
; CHECK-LABEL: mul260:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 260
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: ret
; RV32I-LABEL: mul260:
; RV32I: # %bb.0:
; RV32I-NEXT: li a1, 260
; RV32I-NEXT: mul a0, a0, a1
; RV32I-NEXT: ret
;
; RV32ZBA-LABEL: mul260:
; RV32ZBA: # %bb.0:
; RV32ZBA-NEXT: slli a1, a0, 8
; RV32ZBA-NEXT: sh2add a0, a0, a1
; RV32ZBA-NEXT: ret
%c = mul i32 %a, 260
ret i32 %c
}

define i32 @mul264(i32 %a) {
; CHECK-LABEL: mul264:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 264
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: ret
; RV32I-LABEL: mul264:
; RV32I: # %bb.0:
; RV32I-NEXT: li a1, 264
; RV32I-NEXT: mul a0, a0, a1
; RV32I-NEXT: ret
;
; RV32ZBA-LABEL: mul264:
; RV32ZBA: # %bb.0:
; RV32ZBA-NEXT: slli a1, a0, 8
; RV32ZBA-NEXT: sh3add a0, a0, a1
; RV32ZBA-NEXT: ret
%c = mul i32 %a, 264
ret i32 %c
}
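The three constants above decompose as (1 << 8) + (1 << n), which is exactly what the new slli + shNadd pairs exploit; the identical change repeats in the two rv64zba files below. A small hedged check of that decomposition:

  #include <cstdint>

  constexpr bool isShlAddImm(uint64_t C, unsigned Shift, unsigned N) {
    return C == (uint64_t(1) << Shift) + (uint64_t(1) << N);
  }
  static_assert(isShlAddImm(258, 8, 1) && isShlAddImm(260, 8, 2) &&
                isShlAddImm(264, 8, 3), "mul constants match slli + shNadd");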
48 changes: 33 additions & 15 deletions llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll
@@ -811,31 +811,49 @@ define i64 @adduw_imm(i32 signext %0) nounwind {
}

define i64 @mul258(i64 %a) {
; CHECK-LABEL: mul258:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 258
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: ret
; RV64I-LABEL: mul258:
; RV64I: # %bb.0:
; RV64I-NEXT: li a1, 258
; RV64I-NEXT: mul a0, a0, a1
; RV64I-NEXT: ret
;
; RV64ZBA-LABEL: mul258:
; RV64ZBA: # %bb.0:
; RV64ZBA-NEXT: slli a1, a0, 8
; RV64ZBA-NEXT: sh1add a0, a0, a1
; RV64ZBA-NEXT: ret
%c = mul i64 %a, 258
ret i64 %c
}

define i64 @mul260(i64 %a) {
; CHECK-LABEL: mul260:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 260
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: ret
; RV64I-LABEL: mul260:
; RV64I: # %bb.0:
; RV64I-NEXT: li a1, 260
; RV64I-NEXT: mul a0, a0, a1
; RV64I-NEXT: ret
;
; RV64ZBA-LABEL: mul260:
; RV64ZBA: # %bb.0:
; RV64ZBA-NEXT: slli a1, a0, 8
; RV64ZBA-NEXT: sh2add a0, a0, a1
; RV64ZBA-NEXT: ret
%c = mul i64 %a, 260
ret i64 %c
}

define i64 @mul264(i64 %a) {
; CHECK-LABEL: mul264:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 264
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: ret
; RV64I-LABEL: mul264:
; RV64I: # %bb.0:
; RV64I-NEXT: li a1, 264
; RV64I-NEXT: mul a0, a0, a1
; RV64I-NEXT: ret
;
; RV64ZBA-LABEL: mul264:
; RV64ZBA: # %bb.0:
; RV64ZBA-NEXT: slli a1, a0, 8
; RV64ZBA-NEXT: sh3add a0, a0, a1
; RV64ZBA-NEXT: ret
%c = mul i64 %a, 264
ret i64 %c
}
71 changes: 56 additions & 15 deletions llvm/test/CodeGen/RISCV/rv64zba.ll
@@ -834,31 +834,49 @@ define i64 @adduw_imm(i32 signext %0) nounwind {
}

define i64 @mul258(i64 %a) {
; CHECK-LABEL: mul258:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 258
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: ret
; RV64I-LABEL: mul258:
; RV64I: # %bb.0:
; RV64I-NEXT: li a1, 258
; RV64I-NEXT: mul a0, a0, a1
; RV64I-NEXT: ret
;
; RV64ZBA-LABEL: mul258:
; RV64ZBA: # %bb.0:
; RV64ZBA-NEXT: slli a1, a0, 8
; RV64ZBA-NEXT: sh1add a0, a0, a1
; RV64ZBA-NEXT: ret
%c = mul i64 %a, 258
ret i64 %c
}

define i64 @mul260(i64 %a) {
; CHECK-LABEL: mul260:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 260
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: ret
; RV64I-LABEL: mul260:
; RV64I: # %bb.0:
; RV64I-NEXT: li a1, 260
; RV64I-NEXT: mul a0, a0, a1
; RV64I-NEXT: ret
;
; RV64ZBA-LABEL: mul260:
; RV64ZBA: # %bb.0:
; RV64ZBA-NEXT: slli a1, a0, 8
; RV64ZBA-NEXT: sh2add a0, a0, a1
; RV64ZBA-NEXT: ret
%c = mul i64 %a, 260
ret i64 %c
}

define i64 @mul264(i64 %a) {
; CHECK-LABEL: mul264:
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 264
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: ret
; RV64I-LABEL: mul264:
; RV64I: # %bb.0:
; RV64I-NEXT: li a1, 264
; RV64I-NEXT: mul a0, a0, a1
; RV64I-NEXT: ret
;
; RV64ZBA-LABEL: mul264:
; RV64ZBA: # %bb.0:
; RV64ZBA-NEXT: slli a1, a0, 8
; RV64ZBA-NEXT: sh3add a0, a0, a1
; RV64ZBA-NEXT: ret
%c = mul i64 %a, 264
ret i64 %c
}
@@ -2389,3 +2407,26 @@ define i64 @array_index_sh4_sh3(ptr %p, i64 %idx1, i64 %idx2) {
%b = load i64, ptr %a, align 8
ret i64 %b
}

define ptr @test_gep_gep_dont_crash(ptr %p, i64 %a1, i64 %a2) {
; RV64I-LABEL: test_gep_gep_dont_crash:
; RV64I: # %bb.0:
; RV64I-NEXT: srliw a2, a2, 6
; RV64I-NEXT: slli a2, a2, 3
; RV64I-NEXT: slli a1, a1, 3
; RV64I-NEXT: add a0, a0, a1
; RV64I-NEXT: add a0, a0, a2
; RV64I-NEXT: ret
;
; RV64ZBA-LABEL: test_gep_gep_dont_crash:
; RV64ZBA: # %bb.0:
; RV64ZBA-NEXT: srliw a2, a2, 6
; RV64ZBA-NEXT: add a1, a2, a1
; RV64ZBA-NEXT: sh3add a0, a1, a0
; RV64ZBA-NEXT: ret
%lshr = lshr i64 %a2, 6
%and = and i64 %lshr, 67108863
%gep1 = getelementptr i64, ptr %p, i64 %and
%gep2 = getelementptr i64, ptr %gep1, i64 %a1
ret ptr %gep2
}
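The folded srliw is exact here (worked out, not from the patch notes): the mask 67108863 = 2^26 - 1 applied after `lshr i64 %a2, 6` keeps bits [6, 31] of %a2, which is precisely what a logical 32-bit word shift by 6 produces, and the 32-bit result is non-negative, so srliw's sign extension is a no-op.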
19 changes: 19 additions & 0 deletions llvm/test/CodeGen/RISCV/rvv/pr88799.ll
@@ -0,0 +1,19 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=riscv64-unknown-linux-gnu -mattr=+v | FileCheck %s

define i32 @main() vscale_range(2,2) {
; CHECK-LABEL: main:
; CHECK: # %bb.0: # %vector.body
; CHECK-NEXT: lui a0, 1040368
; CHECK-NEXT: addiw a0, a0, -144
; CHECK-NEXT: vl2re16.v v8, (a0)
; CHECK-NEXT: vs2r.v v8, (zero)
; CHECK-NEXT: li a0, 0
; CHECK-NEXT: ret
vector.body:
%0 = load <16 x i16>, ptr getelementptr ([3 x [23 x [23 x i16]]], ptr null, i64 -10593, i64 1, i64 22, i64 0), align 16
store <16 x i16> %0, ptr null, align 2
%wide.load = load <vscale x 8 x i16>, ptr getelementptr ([3 x [23 x [23 x i16]]], ptr null, i64 -10593, i64 1, i64 22, i64 0), align 16
store <vscale x 8 x i16> %wide.load, ptr null, align 2
ret i32 0
}
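With vscale_range(2,2), <vscale x 8 x i16> is the same size as the fixed <16 x i16> (two vector registers at VLEN=128), which is presumably why the accesses lower to the whole-register vl2re16.v/vs2r.v forms checked above.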
42 changes: 22 additions & 20 deletions llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
Original file line number Diff line number Diff line change
@@ -314,8 +314,8 @@ define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.v
;
; AVX512F-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -324,8 +324,8 @@ define void @vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2(ptr %in.v
;
; AVX512DQ-LABEL: vec64_i16_widen_to_i32_factor2_broadcast_to_v2i32_factor2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,10,11,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
@@ -981,7 +981,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.
; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpermd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpermd %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
@@ -992,7 +992,7 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
; AVX512DQ-NEXT: vpermd %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpermd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
@@ -3507,12 +3507,13 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
;
; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm2
; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -3522,12 +3523,13 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
;
; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm2
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
@@ -3766,10 +3768,10 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
;
; AVX512F-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastq %xmm0, %ymm2
; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0
@@ -3782,10 +3784,10 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
;
; AVX512DQ-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm2
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
; AVX512DQ-NEXT: vpbroadcastw %xmm0, %ymm0
@@ -4145,9 +4147,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %
;
; AVX512F-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512F-NEXT: vpbroadcastw %xmm0, %xmm0
@@ -4159,9 +4161,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %
;
; AVX512DQ-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpbroadcastw %xmm0, %xmm0
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/dpbusd.ll
@@ -26,7 +26,7 @@ define i32 @no_dpbusd(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/dpbusd_i4.ll
@@ -86,7 +86,7 @@ define i32 @mul_sext_i4i4(<16 x i4> %a, <16 x i4> %b, i32 %c) {
; CHECK-NEXT: vpsraw $12, %ymm0, %ymm0
; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]