19 changes: 11 additions & 8 deletions llvm/lib/Target/ARM/ARMCallingConv.td
@@ -301,14 +301,17 @@ def CSR_ATPCS_SplitPush_SwiftError : CalleeSavedRegs<(sub CSR_ATPCS_SplitPush,
def CSR_ATPCS_SplitPush_SwiftTail : CalleeSavedRegs<(sub CSR_ATPCS_SplitPush,
R10)>;

// When enforcing an AAPCS compliant frame chain, R11 is used as the frame
// pointer even for Thumb targets, where split pushes are necessary.
// This AAPCS alternative makes sure the frame index slots match the push
// order in that case.
def CSR_AAPCS_SplitPush : CalleeSavedRegs<(add LR, R11,
R7, R6, R5, R4,
R10, R9, R8,
(sequence "D%u", 15, 8))>;
// Sometimes we need to split the push of the callee-saved GPRs into two
// regions, to ensure that the frame chain record is set up correctly. These
// list the callee-saved registers in the order they end up on the stack, which
// depends on whether the frame pointer is r7 or r11.
def CSR_AAPCS_SplitPush_R11 : CalleeSavedRegs<(add R10, R9, R8, R7, R6, R5, R4,
LR, R11,
(sequence "D%u", 15, 8))>;
def CSR_AAPCS_SplitPush_R7 : CalleeSavedRegs<(add LR, R11,
R7, R6, R5, R4,
R10, R9, R8,
(sequence "D%u", 15, 8))>;

// Constructors and destructors return 'this' in the ARM C++ ABI; since 'this'
// and the pointer return value are both passed in R0 in these cases, this can
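To make the new lists concrete, here is a small standalone sketch (not part of the patch) that prints the stack order each list describes, highest address first, assuming all of r4-r11 plus lr are saved. In both orders r11 and lr stay adjacent, which is what allows them to form the frame chain record.

    #include <cstdio>

    int main() {
      // Order described by CSR_AAPCS_SplitPush_R11 (frame pointer is r11).
      const char *R11Order[] = {"r10", "r9", "r8", "r7", "r6", "r5",
                                "r4",  "lr", "r11", "d15-d8"};
      // Order described by CSR_AAPCS_SplitPush_R7 (frame pointer is r7).
      const char *R7Order[] = {"lr", "r11", "r7", "r6", "r5",
                               "r4", "r10", "r9", "r8", "d15-d8"};
      puts("CSR_AAPCS_SplitPush_R11, top of the CSR area first:");
      for (const char *R : R11Order)
        printf("  %s\n", R);
      puts("CSR_AAPCS_SplitPush_R7, top of the CSR area first:");
      for (const char *R : R7Order)
        printf("  %s\n", R);
      return 0;
    }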
7 changes: 7 additions & 0 deletions llvm/lib/Target/ARM/ARMFeatures.td
@@ -398,6 +398,13 @@ def FeatureAvoidPartialCPSR : SubtargetFeature<"avoid-partial-cpsr",
"AvoidCPSRPartialUpdate", "true",
"Avoid CPSR partial update for OOO execution">;

/// FeatureAvoidMULS - If true, codegen will avoid using the MULS instruction,
/// preferring the Thumb2 MUL, which doesn't set flags.
def FeatureAvoidMULS : SubtargetFeature<"avoid-muls",
"AvoidMULS", "true",
"Avoid MULS instructions for M class cores">;

/// Disable +1 predication cost for instructions updating CPSR.
/// Enabled for Cortex-A57.
/// True if disable +1 predication cost for instructions updating CPSR. Enabled for Cortex-A57.
154 changes: 104 additions & 50 deletions llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -199,6 +199,11 @@ SpillArea getSpillArea(Register Reg,
// push {r0-r10, r12} GPRCS1
// vpush {d8-d15} DPRCS1
// push {r11, lr} GPRCS2
//
// SplitR11AAPCSSignRA:
// push {r0-r10, r12} GPRCS1
// push {r11, lr} GPRCS2
// vpush {d8-d15} DPRCS1

// If FPCXTNS is spilled (for CMSE secure entry functions), it is always at
// the top of the stack frame.
@@ -246,7 +251,8 @@ SpillArea getSpillArea(Register Reg,
return SpillArea::GPRCS1;

case ARM::LR:
if (Variation == ARMSubtarget::SplitR11WindowsSEH)
if (Variation == ARMSubtarget::SplitR11WindowsSEH ||
Variation == ARMSubtarget::SplitR11AAPCSSignRA)
return SpillArea::GPRCS2;
else
return SpillArea::GPRCS1;
@@ -324,6 +330,10 @@ bool ARMFrameLowering::hasFP(const MachineFunction &MF) const {
const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
const MachineFrameInfo &MFI = MF.getFrameInfo();

// Check to see if the target wants to forcibly keep the frame pointer.
if (keepFramePointer(MF))
return true;

// ABI-required frame pointer.
if (MF.getTarget().Options.DisableFramePointerElim(MF))
return true;
@@ -859,6 +869,9 @@ static int getMaxFPOffset(const ARMSubtarget &STI, const ARMFunctionInfo &AFI,
// This is a conservative estimation: Assume the frame pointer being r7 and
// pc("r15") up to r8 getting spilled before (= 8 registers).
int MaxRegBytes = 8 * 4;
if (PushPopSplit == ARMSubtarget::SplitR11AAPCSSignRA)
// Here, r11 can be stored below all of r4-r15.
MaxRegBytes = 11 * 4;
if (PushPopSplit == ARMSubtarget::SplitR11WindowsSEH) {
// Here, r11 can be stored below all of r4-r15 plus d8-d15.
MaxRegBytes = 11 * 4 + 8 * 8;
@@ -931,17 +944,23 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
}

// Determine spill area sizes, and some important frame indices.
SpillArea FramePtrSpillArea;
bool BeforeFPPush = true;
for (const CalleeSavedInfo &I : CSI) {
Register Reg = I.getReg();
int FI = I.getFrameIdx();

if (Reg == FramePtr)
SpillArea Area = getSpillArea(Reg, PushPopSplit,
AFI->getNumAlignedDPRCS2Regs(), RegInfo);

if (Reg == FramePtr) {
FramePtrSpillFI = FI;
FramePtrSpillArea = Area;
}
if (Reg == ARM::D8)
D8SpillFI = FI;

switch (getSpillArea(Reg, PushPopSplit, AFI->getNumAlignedDPRCS2Regs(),
RegInfo)) {
switch (Area) {
case SpillArea::FPCXT:
FPCXTSaveSize += 4;
break;
@@ -968,21 +987,23 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
// Move past FPCXT area.
if (FPCXTSaveSize > 0) {
LastPush = MBBI++;
DefCFAOffsetCandidates.addInst(LastPush, FPCXTSaveSize, true);
DefCFAOffsetCandidates.addInst(LastPush, FPCXTSaveSize, BeforeFPPush);
}

// Allocate the vararg register save area.
if (ArgRegsSaveSize) {
emitSPUpdate(isARM, MBB, MBBI, dl, TII, -ArgRegsSaveSize,
MachineInstr::FrameSetup);
LastPush = std::prev(MBBI);
DefCFAOffsetCandidates.addInst(LastPush, ArgRegsSaveSize, true);
DefCFAOffsetCandidates.addInst(LastPush, ArgRegsSaveSize, BeforeFPPush);
}

// Move past area 1.
if (GPRCS1Size > 0) {
GPRCS1Push = LastPush = MBBI++;
DefCFAOffsetCandidates.addInst(LastPush, GPRCS1Size, true);
DefCFAOffsetCandidates.addInst(LastPush, GPRCS1Size, BeforeFPPush);
if (FramePtrSpillArea == SpillArea::GPRCS1)
BeforeFPPush = false;
}

// Determine starting offsets of spill areas. These offsets are all positive
@@ -1006,21 +1027,13 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
} else {
DPRCSOffset = GPRCS2Offset - DPRGapSize - DPRCSSize;
}
int FramePtrOffsetInPush = 0;
if (HasFP) {
// Offset from the CFA to the saved frame pointer, will be negative.
int FPOffset = MFI.getObjectOffset(FramePtrSpillFI);
[[maybe_unused]] int FPOffset = MFI.getObjectOffset(FramePtrSpillFI);
LLVM_DEBUG(dbgs() << "FramePtrSpillFI: " << FramePtrSpillFI
<< ", FPOffset: " << FPOffset << "\n");
assert(getMaxFPOffset(STI, *AFI, MF) <= FPOffset &&
"Max FP estimation is wrong");
// Offset from the top of the GPRCS1 area to the saved frame pointer, will
// be negative.
FramePtrOffsetInPush = FPOffset + ArgRegsSaveSize + FPCXTSaveSize;
LLVM_DEBUG(dbgs() << "FramePtrOffsetInPush=" << FramePtrOffsetInPush
<< ", FramePtrSpillOffset="
<< (MFI.getObjectOffset(FramePtrSpillFI) + NumBytes)
<< "\n");
AFI->setFramePtrSpillOffset(MFI.getObjectOffset(FramePtrSpillFI) +
NumBytes);
}
@@ -1032,7 +1045,9 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
// after DPRCS1.
if (GPRCS2Size > 0 && PushPopSplit != ARMSubtarget::SplitR11WindowsSEH) {
GPRCS2Push = LastPush = MBBI++;
DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size);
DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size, BeforeFPPush);
if (FramePtrSpillArea == SpillArea::GPRCS2)
BeforeFPPush = false;
}

// Prolog/epilog inserter assumes we correctly align DPRs on the stack, so our
Expand All @@ -1045,7 +1060,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
else {
emitSPUpdate(isARM, MBB, MBBI, dl, TII, -DPRGapSize,
MachineInstr::FrameSetup);
DefCFAOffsetCandidates.addInst(std::prev(MBBI), DPRGapSize);
DefCFAOffsetCandidates.addInst(std::prev(MBBI), DPRGapSize, BeforeFPPush);
}
}

@@ -1054,7 +1069,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
// Since vpush register list cannot have gaps, there may be multiple vpush
// instructions in the prologue.
while (MBBI != MBB.end() && MBBI->getOpcode() == ARM::VSTMDDB_UPD) {
DefCFAOffsetCandidates.addInst(MBBI, sizeOfSPAdjustment(*MBBI));
DefCFAOffsetCandidates.addInst(MBBI, sizeOfSPAdjustment(*MBBI),
BeforeFPPush);
LastPush = MBBI++;
}
}
@@ -1073,7 +1089,9 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
// Move GPRCS2, if using SplitR11WindowsSEH.
if (GPRCS2Size > 0 && PushPopSplit == ARMSubtarget::SplitR11WindowsSEH) {
GPRCS2Push = LastPush = MBBI++;
DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size);
DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size, BeforeFPPush);
if (FramePtrSpillArea == SpillArea::GPRCS2)
BeforeFPPush = false;
}

bool NeedsWinCFIStackAlloc = NeedsWinCFI;
@@ -1174,28 +1192,51 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
// into spill area 1, including the FP in R11. In either case, it
// is in area one and the adjustment needs to take place just after
// that push.
// FIXME: The above is not necessarily true when PACBTI is enabled.
// AAPCS requires use of R11, and PACBTI gets in the way of regular pushes,
// so FP ends up on area two.
MachineBasicBlock::iterator AfterPush;
if (HasFP) {
AfterPush = std::next(GPRCS1Push);
unsigned PushSize = sizeOfSPAdjustment(*GPRCS1Push);
int FPOffset = PushSize + FramePtrOffsetInPush;
if (PushPopSplit == ARMSubtarget::SplitR11WindowsSEH) {
AfterPush = std::next(GPRCS2Push);
emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, AfterPush, dl, TII,
FramePtr, ARM::SP, 0, MachineInstr::FrameSetup);
} else {
emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, AfterPush, dl, TII,
FramePtr, ARM::SP, FPOffset,
MachineInstr::FrameSetup);
MachineBasicBlock::iterator FPPushInst;
// Offset from SP immediately after the push which saved the FP to the FP
// save slot.
int64_t FPOffsetAfterPush;
switch (FramePtrSpillArea) {
case SpillArea::GPRCS1:
FPPushInst = GPRCS1Push;
FPOffsetAfterPush = MFI.getObjectOffset(FramePtrSpillFI) +
ArgRegsSaveSize + FPCXTSaveSize +
sizeOfSPAdjustment(*FPPushInst);
LLVM_DEBUG(dbgs() << "Frame pointer in GPRCS1, offset "
<< FPOffsetAfterPush << " after that push\n");
break;
case SpillArea::GPRCS2:
FPPushInst = GPRCS2Push;
FPOffsetAfterPush = MFI.getObjectOffset(FramePtrSpillFI) +
ArgRegsSaveSize + FPCXTSaveSize + GPRCS1Size +
sizeOfSPAdjustment(*FPPushInst);
if (PushPopSplit == ARMSubtarget::SplitR11WindowsSEH)
FPOffsetAfterPush += DPRCSSize + DPRGapSize;
LLVM_DEBUG(dbgs() << "Frame pointer in GPRCS2, offset "
<< FPOffsetAfterPush << " after that push\n");
break;
default:
llvm_unreachable("frame pointer in unknown spill area");
break;
}
AfterPush = std::next(FPPushInst);
if (PushPopSplit == ARMSubtarget::SplitR11WindowsSEH)
assert(FPOffsetAfterPush == 0);

// Emit the MOV or ADD to set up the frame pointer register.
emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, AfterPush, dl, TII,
FramePtr, ARM::SP, FPOffsetAfterPush,
MachineInstr::FrameSetup);

if (!NeedsWinCFI) {
if (FramePtrOffsetInPush + PushSize != 0) {
// Emit DWARF info to find the CFA using the frame pointer from this
// point onward.
if (FPOffsetAfterPush != 0) {
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
nullptr, MRI->getDwarfRegNum(FramePtr, true),
FPCXTSaveSize + ArgRegsSaveSize - FramePtrOffsetInPush));
-MFI.getObjectOffset(FramePtrSpillFI)));
BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
@@ -1708,7 +1749,8 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt &&
!isCmseEntry && !isTrap && AFI->getArgumentStackToRestore() == 0 &&
STI.hasV5TOps() && MBB.succ_empty() && !hasPAC &&
PushPopSplit != ARMSubtarget::SplitR11WindowsSEH) {
(PushPopSplit != ARMSubtarget::SplitR11WindowsSEH &&
PushPopSplit != ARMSubtarget::SplitR11AAPCSSignRA)) {
Reg = ARM::PC;
// Fold the return instruction into the LDM.
DeleteRet = true;
@@ -2365,7 +2407,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
// to take advantage the eliminateFrameIndex machinery. This also ensures it
// is spilled in the order specified by getCalleeSavedRegs() to make it easier
// to combine multiple loads / stores.
bool CanEliminateFrame = !(requiresAAPCSFrameRecord(MF) && hasFP(MF));
bool CanEliminateFrame = !(requiresAAPCSFrameRecord(MF) && hasFP(MF)) &&
!MF.getTarget().Options.DisableFramePointerElim(MF);
bool CS1Spilled = false;
bool LRSpilled = false;
unsigned NumGPRSpills = 0;
@@ -2940,18 +2983,29 @@ bool ARMFrameLowering::assignCalleeSavedSpillSlots(
const auto &AFI = *MF.getInfo<ARMFunctionInfo>();
if (AFI.shouldSignReturnAddress()) {
// The order of registers must match the order we push them, because the
// PEI assigns frame indices in that order. When compiling for return
// address signing and authentication, we use split push, therefore the orders
// we want are:
// LR, R7, R6, R5, R4, <R12>, R11, R10, R9, R8, D15-D8
CSI.insert(find_if(CSI,
[=](const auto &CS) {
Register Reg = CS.getReg();
return Reg == ARM::R10 || Reg == ARM::R11 ||
Reg == ARM::R8 || Reg == ARM::R9 ||
ARM::DPRRegClass.contains(Reg);
}),
CalleeSavedInfo(ARM::R12));
// PEI assigns frame indices in that order. That order depends on the
// PushPopSplitVariation; there are only two cases we use with return
// address signing:
switch (STI.getPushPopSplitVariation(MF)) {
case ARMSubtarget::SplitR7:
// LR, R7, R6, R5, R4, <R12>, R11, R10, R9, R8, D15-D8
CSI.insert(find_if(CSI,
[=](const auto &CS) {
Register Reg = CS.getReg();
return Reg == ARM::R10 || Reg == ARM::R11 ||
Reg == ARM::R8 || Reg == ARM::R9 ||
ARM::DPRRegClass.contains(Reg);
}),
CalleeSavedInfo(ARM::R12));
break;
case ARMSubtarget::SplitR11AAPCSSignRA:
// With SplitR11AAPCSSignRA, R12 will always be the highest-addressed CSR
// on the stack.
CSI.insert(CSI.begin(), CalleeSavedInfo(ARM::R12));
break;
default:
llvm_unreachable("Unexpected CSR split with return address signing");
}
}

return false;
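The FPOffsetAfterPush computation above can be sanity-checked in isolation. A toy recomputation of the GPRCS1 case with made-up sizes (illustrative only, not taken from the patch):

    #include <cassert>
    #include <cstdint>

    // ObjectOffset is the (negative) offset of the FP save slot from the
    // incoming stack pointer; PushSize is the SP adjustment made by the push
    // that saved the frame pointer.
    int64_t fpOffsetAfterPush(int64_t ObjectOffset, int64_t ArgRegsSaveSize,
                              int64_t FPCXTSaveSize, int64_t PushSize) {
      return ObjectOffset + ArgRegsSaveSize + FPCXTSaveSize + PushSize;
    }

    int main() {
      // Say the prologue is "push {r4-r7, lr}" with r7 as the frame pointer
      // and no FPCXT or vararg save area: lr is saved at -4, r7 at -8, down
      // to r4 at -20. Immediately after the 20-byte push, the saved r7 sits
      // at SP + 12, which is where the ADD that sets up r7 must point.
      assert(fpOffsetAfterPush(-8, 0, 0, 20) == 12);
      return 0;
    }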
2 changes: 1 addition & 1 deletion llvm/lib/Target/ARM/ARMFrameLowering.h
@@ -41,7 +41,7 @@ class ARMFrameLowering : public TargetFrameLowering {
MutableArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;

bool keepFramePointer(const MachineFunction &MF) const override;
bool keepFramePointer(const MachineFunction &MF) const;

bool enableCalleeSaveSkip(const MachineFunction &MF) const override;

1 change: 1 addition & 0 deletions llvm/lib/Target/ARM/ARMProcessors.td
@@ -360,6 +360,7 @@ def : ProcessorModel<"cortex-m33", CortexM4Model, [ARMv8mMainline,
FeatureHasSlowFPVFMx,
FeatureUseMISched,
FeatureHasNoBranchPredictor,
FeatureAvoidMULS,
FeatureFixCMSE_CVE_2021_35465]>;

def : ProcessorModel<"star-mc1", CortexM4Model, [ARMv8mMainline,
4 changes: 2 additions & 2 deletions llvm/lib/Target/ARM/ARMRegisterInfo.td
@@ -200,9 +200,9 @@ def FPEXC : ARMReg<8, "fpexc">;
def FPINST : ARMReg<9, "fpinst">;
def FPINST2 : ARMReg<10, "fpinst2">;
// These encodings aren't actual instruction encodings, their encoding depends
// on the instruction they are used in and for VPR 32 was chosen such that it
// on the instruction they are used in and for VPR 64 was chosen such that it
// always comes last in spr_reglist_with_vpr.
def VPR : ARMReg<32, "vpr">;
def VPR : ARMReg<64, "vpr">;
def FPSCR_NZCVQC
: ARMReg<2, "fpscr_nzcvqc">;
def P0 : ARMReg<13, "p0">;
7 changes: 7 additions & 0 deletions llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -514,5 +514,12 @@ ARMSubtarget::getPushPopSplitVariation(const MachineFunction &MF) const {
F.needsUnwindTableEntry() &&
(MFI.hasVarSizedObjects() || getRegisterInfo()->hasStackRealignment(MF)))
return SplitR11WindowsSEH;

// Return SplitR11AAPCSSignRA if R11 is the frame pointer and return
// address signing is enabled. In that case R12 holds the authentication
// code and must be saved, so a separate push of {r11, lr} is needed to
// keep R11 and LR adjacent on the stack for the frame chain record.
if (MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress() &&
getFramePointerReg() == ARM::R11)
return SplitR11AAPCSSignRA;
return NoSplit;
}
12 changes: 12 additions & 0 deletions llvm/lib/Target/ARM/ARMSubtarget.h
@@ -105,6 +105,18 @@ class ARMSubtarget : public ARMGenSubtargetInfo {
/// vpush {d8-d15}
/// push {r11, lr}
SplitR11WindowsSEH,

/// When generating AAPCS-compliant frame chains, R11 is the frame pointer,
/// and must be pushed adjacent to the return address (LR). Normally this
/// isn't a problem, because the only register between them is r12, which is
/// the intra-procedure-call scratch register, so doesn't need to be saved.
/// However, when PACBTI is in use, r12 contains the authentication code, so
/// does need to be saved. This means that we need a separate push for R11
/// and LR.
/// push {r0-r10, r12}
/// push {r11, lr}
/// vpush {d8-d15}
SplitR11AAPCSSignRA,
};

protected:
54 changes: 41 additions & 13 deletions llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -446,8 +446,8 @@ class ARMAsmParser : public MCTargetAsmParser {
int tryParseShiftRegister(OperandVector &);
std::optional<ARM_AM::ShiftOpc> tryParseShiftToken();
bool parseRegisterList(OperandVector &, bool EnforceOrder = true,
bool AllowRAAC = false,
bool AllowOutOfBoundReg = false);
bool AllowRAAC = false, bool IsLazyLoadStore = false,
bool IsVSCCLRM = false);
bool parseMemory(OperandVector &);
bool parseOperand(OperandVector &, StringRef Mnemonic);
bool parseImmExpr(int64_t &Out);
@@ -1158,7 +1158,8 @@ class ARMOperand : public MCParsedAsmOperand {
bool isFPImm() const {
if (!isImm()) return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
if (!CE || !isUInt<32>(CE->getValue()))
return false;
int Val = ARM_AM::getFP32Imm(APInt(32, CE->getValue()));
return Val != -1;
}
@@ -3810,6 +3811,10 @@ class ARMOperand : public MCParsedAsmOperand {
Kind = k_FPSRegisterListWithVPR;
else
Kind = k_SPRRegisterList;
} else if (Regs.front().second == ARM::VPR) {
assert(Regs.size() == 1 &&
"Register list starting with VPR expected to only contain VPR");
Kind = k_FPSRegisterListWithVPR;
}

if (Kind == k_RegisterList && Regs.back().second == ARM::APSR)
@@ -4607,7 +4612,8 @@ insertNoDuplicates(SmallVectorImpl<std::pair<unsigned, MCRegister>> &Regs,

/// Parse a register list.
bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder,
bool AllowRAAC, bool AllowOutOfBoundReg) {
bool AllowRAAC, bool IsLazyLoadStore,
bool IsVSCCLRM) {
MCAsmParser &Parser = getParser();
if (Parser.getTok().isNot(AsmToken::LCurly))
return TokError("Token is not a Left Curly Brace");
@@ -4617,15 +4623,23 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder,

// Check the first register in the list to see what register class
// this is a list of.
MCRegister Reg = tryParseRegister();
bool AllowOutOfBoundReg = IsLazyLoadStore || IsVSCCLRM;
MCRegister Reg = tryParseRegister(AllowOutOfBoundReg);
if (!Reg)
return Error(RegLoc, "register expected");
if (!AllowRAAC && Reg == ARM::RA_AUTH_CODE)
return Error(RegLoc, "pseudo-register not allowed");
// The reglist instructions have at most 16 registers, so reserve
// The reglist instructions have at most 32 registers, so reserve
// space for that many.
int EReg = 0;
SmallVector<std::pair<unsigned, MCRegister>, 16> Registers;
SmallVector<std::pair<unsigned, MCRegister>, 32> Registers;

// Single-precision VSCCLRM can have double-precision registers in the
// register list. When VSCCLRMAdjustEncoding is true then we've switched from
// single-precision to double-precision and we pretend that these registers
// are encoded as S32 onwards, which we can do by adding 16 to the encoding
// value.
bool VSCCLRMAdjustEncoding = false;

// Allow Q regs and just interpret them as the two D sub-registers.
if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
Expand All @@ -4644,6 +4658,8 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder,
RC = &ARMMCRegisterClasses[ARM::SPRRegClassID];
else if (ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID].contains(Reg))
RC = &ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID];
else if (Reg == ARM::VPR)
RC = &ARMMCRegisterClasses[ARM::FPWithVPRRegClassID];
else
return Error(RegLoc, "invalid register in register list");

@@ -4684,6 +4700,8 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder,
while (Reg != EndReg) {
Reg = getNextRegister(Reg);
EReg = MRI->getEncodingValue(Reg);
if (VSCCLRMAdjustEncoding)
EReg += 16;
if (!insertNoDuplicates(Registers, EReg, Reg)) {
Warning(AfterMinusLoc, StringRef("duplicated register (") +
ARMInstPrinter::getRegisterName(Reg) +
Expand All @@ -4695,6 +4713,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder,
Parser.Lex(); // Eat the comma.
RegLoc = Parser.getTok().getLoc();
MCRegister OldReg = Reg;
int EOldReg = EReg;
const AsmToken RegTok = Parser.getTok();
Reg = tryParseRegister(AllowOutOfBoundReg);
if (!Reg)
@@ -4726,6 +4745,12 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder,
}
continue;
}
// VSCCLRM can switch from single-precision to double-precision only when
// S31 is followed by D16.
if (IsVSCCLRM && OldReg == ARM::S31 && Reg == ARM::D16) {
VSCCLRMAdjustEncoding = true;
RC = &ARMMCRegisterClasses[ARM::FPWithVPRRegClassID];
}
// The register must be in the same register class as the first.
if ((Reg == ARM::RA_AUTH_CODE &&
RC != &ARMMCRegisterClasses[ARM::GPRRegClassID]) ||
@@ -4735,8 +4760,10 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder,
// exception is CLRM, which is order-independent anyway, so
// there's no potential for confusion if you write clrm {r2,r1}
// instead of clrm {r1,r2}.
if (EnforceOrder &&
MRI->getEncodingValue(Reg) < MRI->getEncodingValue(OldReg)) {
EReg = MRI->getEncodingValue(Reg);
if (VSCCLRMAdjustEncoding)
EReg += 16;
if (EnforceOrder && EReg < EOldReg) {
if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
Warning(RegLoc, "register list not in ascending order");
else if (!ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID].contains(Reg))
@@ -4745,9 +4772,9 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder,
// VFP register lists must also be contiguous.
if (RC != &ARMMCRegisterClasses[ARM::GPRRegClassID] &&
RC != &ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID] &&
Reg != OldReg + 1)
EReg != EOldReg + 1)
return Error(RegLoc, "non-contiguous register range");
EReg = MRI->getEncodingValue(Reg);

if (!insertNoDuplicates(Registers, EReg, Reg)) {
Warning(RegLoc, "duplicated register (" + RegTok.getString() +
") in register list");
@@ -6335,9 +6362,10 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
case AsmToken::LBrac:
return parseMemory(Operands);
case AsmToken::LCurly: {
bool AllowOutOfBoundReg = Mnemonic == "vlldm" || Mnemonic == "vlstm";
bool IsLazyLoadStore = Mnemonic == "vlldm" || Mnemonic == "vlstm";
bool IsVSCCLRM = Mnemonic == "vscclrm";
return parseRegisterList(Operands, !Mnemonic.starts_with("clr"), false,
AllowOutOfBoundReg);
IsLazyLoadStore, IsVSCCLRM);
}
case AsmToken::Dollar:
case AsmToken::Hash: {
52 changes: 34 additions & 18 deletions llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -1529,15 +1529,19 @@ static const uint16_t DPRDecoderTable[] = {
ARM::D28, ARM::D29, ARM::D30, ARM::D31
};

static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const MCDisassembler *Decoder) {
// Does this instruction/subtarget permit use of registers d16-d31?
static bool PermitsD32(const MCInst &Inst, const MCDisassembler *Decoder) {
if (Inst.getOpcode() == ARM::VSCCLRMD || Inst.getOpcode() == ARM::VSCCLRMS)
return true;
const FeatureBitset &featureBits =
((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits();
return featureBits[ARM::FeatureD32];
}

bool hasD32 = featureBits[ARM::FeatureD32];

if (RegNo > 31 || (!hasD32 && RegNo > 15))
static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const MCDisassembler *Decoder) {
if (RegNo > (PermitsD32(Inst, Decoder) ? 31u : 15u))
return MCDisassembler::Fail;

unsigned Register = DPRDecoderTable[RegNo];
@@ -1816,10 +1820,11 @@ static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val,
unsigned regs = fieldFromInstruction(Val, 1, 7);

// In case of unpredictable encoding, tweak the operands.
if (regs == 0 || regs > 16 || (Vd + regs) > 32) {
regs = Vd + regs > 32 ? 32 - Vd : regs;
unsigned MaxReg = PermitsD32(Inst, Decoder) ? 32 : 16;
if (regs == 0 || (Vd + regs) > MaxReg) {
regs = Vd + regs > MaxReg ? MaxReg - Vd : regs;
regs = std::max( 1u, regs);
regs = std::min(16u, regs);
regs = std::min(MaxReg, regs);
S = MCDisassembler::SoftFail;
}

@@ -6447,20 +6452,31 @@ static DecodeStatus DecodeVSCCLRM(MCInst &Inst, unsigned Insn, uint64_t Address,

Inst.addOperand(MCOperand::createImm(ARMCC::AL));
Inst.addOperand(MCOperand::createReg(0));
if (Inst.getOpcode() == ARM::VSCCLRMD) {
unsigned reglist = (fieldFromInstruction(Insn, 1, 7) << 1) |
(fieldFromInstruction(Insn, 12, 4) << 8) |
unsigned regs = fieldFromInstruction(Insn, 0, 8);
if (regs == 0) {
// Register list contains only VPR
} else if (Inst.getOpcode() == ARM::VSCCLRMD) {
unsigned reglist = regs | (fieldFromInstruction(Insn, 12, 4) << 8) |
(fieldFromInstruction(Insn, 22, 1) << 12);
if (!Check(S, DecodeDPRRegListOperand(Inst, reglist, Address, Decoder))) {
return MCDisassembler::Fail;
}
} else {
unsigned reglist = fieldFromInstruction(Insn, 0, 8) |
(fieldFromInstruction(Insn, 22, 1) << 8) |
(fieldFromInstruction(Insn, 12, 4) << 9);
if (!Check(S, DecodeSPRRegListOperand(Inst, reglist, Address, Decoder))) {
return MCDisassembler::Fail;
}
unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 1) |
fieldFromInstruction(Insn, 22, 1);
// Registers past s31 are permitted and treated as being half of a d
// register, though both halves of each d register must be present.
unsigned max_reg = Vd + regs;
if (max_reg > 64 || (max_reg > 32 && (max_reg & 1)))
S = MCDisassembler::SoftFail;
unsigned max_sreg = std::min(32u, max_reg);
unsigned max_dreg = std::min(32u, max_reg / 2);
for (unsigned i = Vd; i < max_sreg; ++i)
if (!Check(S, DecodeSPRRegisterClass(Inst, i, Address, Decoder)))
return MCDisassembler::Fail;
for (unsigned i = 16; i < max_dreg; ++i)
if (!Check(S, DecodeDPRRegisterClass(Inst, i, Address, Decoder)))
return MCDisassembler::Fail;
}
Inst.addOperand(MCOperand::createReg(ARM::VPR));

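A worked example of the decode loops above for a VSCCLRMS range that crosses s31 (values illustrative, not from the patch): with Vd = 30 and regs = 6, the six 32-bit slots decode to s30, s31, d16 and d17.

    #include <algorithm>
    #include <cstdio>

    int main() {
      unsigned Vd = 30, regs = 6;   // first register and 32-bit slot count
      unsigned max_reg = Vd + regs; // 36: even and <= 64, so no SoftFail
      unsigned max_sreg = std::min(32u, max_reg);     // S registers: s30, s31
      unsigned max_dreg = std::min(32u, max_reg / 2); // D registers: d16, d17
      for (unsigned i = Vd; i < max_sreg; ++i)
        printf("s%u ", i);
      for (unsigned i = 16; i < max_dreg; ++i)
        printf("d%u ", i);
      printf("\n"); // prints: s30 s31 d16 d17
      return 0;
    }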
2 changes: 1 addition & 1 deletion llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
@@ -851,7 +851,7 @@ void ARMInstPrinter::printPKHASRShiftImm(const MCInst *MI, unsigned OpNum,
void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
if (MI->getOpcode() != ARM::t2CLRM) {
if (MI->getOpcode() != ARM::t2CLRM && MI->getOpcode() != ARM::VSCCLRMS) {
assert(is_sorted(drop_begin(*MI, OpNum),
[&](const MCOperand &LHS, const MCOperand &RHS) {
return MRI.getEncodingValue(LHS.getReg()) <
19 changes: 16 additions & 3 deletions llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -1743,15 +1743,28 @@ getRegisterListOpValue(const MCInst &MI, unsigned Op,

unsigned Binary = 0;

if (SPRRegs || DPRRegs) {
if (SPRRegs || DPRRegs || Reg == ARM::VPR) {
// VLDM/VSTM/VSCCLRM
unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg);
unsigned NumRegs = (MI.getNumOperands() - Op) & 0xff;
Binary |= (RegNo & 0x1f) << 8;

// Ignore VPR
if (MI.getOpcode() == ARM::VSCCLRMD || MI.getOpcode() == ARM::VSCCLRMS)
if (MI.getOpcode() == ARM::VSCCLRMD)
// Ignore VPR
--NumRegs;
else if (MI.getOpcode() == ARM::VSCCLRMS) {
// The register list can contain both S registers and D registers, with D
// registers counting as two registers. VPR doesn't count towards the
// number of registers.
NumRegs = 0;
for (unsigned I = Op, E = MI.getNumOperands(); I < E; ++I) {
Reg = MI.getOperand(I).getReg();
if (ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg))
NumRegs += 1;
else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg))
NumRegs += 2;
}
}
if (SPRRegs)
Binary |= NumRegs;
else
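A quick check of the VSCCLRMS register-count rule above (a sketch, not from the patch): each S register contributes 1 to the count field, each D register 2, and VPR nothing, so a list standing in for {s30, s31, d16, d17, vpr} encodes a count of 6.

    #include <cassert>

    enum class RegKind { S, D, VPR };

    int main() {
      // Stands in for the operand list {s30, s31, d16, d17, vpr}.
      RegKind List[] = {RegKind::S, RegKind::S, RegKind::D, RegKind::D,
                        RegKind::VPR};
      unsigned NumRegs = 0;
      for (RegKind K : List)
        NumRegs += (K == RegKind::S) ? 1 : (K == RegKind::D) ? 2 : 0;
      assert(NumRegs == 6); // six 32-bit words are cleared
      return 0;
    }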
3 changes: 3 additions & 0 deletions llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -755,6 +755,9 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
Register Reg1 = MI->getOperand(1).getReg();
// t2MUL is "special". The tied source operand is second, not first.
if (MI->getOpcode() == ARM::t2MUL) {
// MULS can be slower than MUL
if (!MinimizeSize && STI->avoidMULS())
return false;
Register Reg2 = MI->getOperand(2).getReg();
// Early exit if the regs aren't all low regs.
if (!isARMLowRegister(Reg0) || !isARMLowRegister(Reg1)
3 changes: 2 additions & 1 deletion llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
@@ -2503,7 +2503,8 @@ APInt HexagonConstEvaluator::getCmpImm(unsigned Opc, unsigned OpX,
}

uint64_t Val = MO.getImm();
return APInt(32, Val, Signed);
// TODO: Is implicitTrunc correct here?
return APInt(32, Val, Signed, /*implicitTrunc=*/true);
}

void HexagonConstEvaluator::replaceWithNop(MachineInstr &MI) {
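For context on the extra arguments showing up at these APInt call sites: a minimal sketch, assuming the four-argument APInt(numBits, value, isSigned, implicitTrunc) constructor (check your revision for the current defaults). isSigned sign-extends the 64-bit input into the target width; implicitTrunc silently keeps only the low bits of an out-of-range value instead of asserting.

    #include "llvm/ADT/APInt.h"
    using llvm::APInt;

    int main() {
      // ~0ULL read as signed is -1, which sign-extends to 32 one-bits.
      APInt A(32, ~0ULL, /*isSigned=*/true);
      // 0x100000001 does not fit in 32 unsigned bits; implicitTrunc keeps the
      // low 32 bits (here, 1) rather than tripping an assertion.
      APInt B(32, 0x100000001ULL, /*isSigned=*/false, /*implicitTrunc=*/true);
      return (A.isAllOnes() && B == 1) ? 0 : 1;
    }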
2 changes: 1 addition & 1 deletion llvm/lib/Target/Hexagon/HexagonGenExtract.cpp
@@ -171,7 +171,7 @@ bool HexagonGenExtract::convert(Instruction *In) {
// this value.
if (!LogicalSR && (SR > SL))
return false;
APInt A = APInt(BW, ~0ULL).lshr(SR).shl(SL);
APInt A = APInt(BW, ~0ULL, true).lshr(SR).shl(SL);
CM = ConstantInt::get(Ctx, A);
}

4 changes: 3 additions & 1 deletion llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3446,7 +3446,9 @@ static std::optional<VIDSequence> isSimpleVIDSequence(SDValue Op,
if (!Elt)
continue;
APInt ExpectedVal =
(APInt(EltSizeInBits, Idx) * *SeqStepNum).sdiv(*SeqStepDenom);
(APInt(EltSizeInBits, Idx, /*isSigned=*/false, /*implicitTrunc=*/true) *
*SeqStepNum)
.sdiv(*SeqStepDenom);

APInt Addend = *Elt - ExpectedVal;
if (!SeqAddend)
4 changes: 2 additions & 2 deletions llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -756,8 +756,8 @@ bool RISCVVLOptimizer::checkUsers(const MachineOperand *&CommonVL,
const MachineOperand &VLOp = UserMI.getOperand(VLOpNum);

// Looking for an immediate or a register VL that isn't X0.
assert(!VLOp.isReg() ||
VLOp.getReg() != RISCV::X0 && "Did not expect X0 VL");
assert((!VLOp.isReg() || VLOp.getReg() != RISCV::X0) &&
"Did not expect X0 VL");

if (!CommonVL) {
CommonVL = &VLOp;
6 changes: 3 additions & 3 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -52745,8 +52745,8 @@ static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
if (XORRHS.isConstant()) {
APInt ConjugationInt32 = APInt(32, 0x80000000, true);
APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
APInt ConjugationInt32 = APInt(32, 0x80000000);
APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL);
if ((XORRHS.getBitWidth() == 32 &&
XORRHS.getConstant() == ConjugationInt32) ||
(XORRHS.getBitWidth() == 64 &&
@@ -52785,7 +52785,7 @@ static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
Flags.hasNoSignedZeros();
};
auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
APInt AI = APInt(32, 0x80008000, true);
APInt AI = APInt(32, 0x80008000);
KnownBits Bits = DAG.computeKnownBits(Op);
return Bits.getBitWidth() == 32 && Bits.isConstant() &&
Bits.getConstant() == AI;
3 changes: 2 additions & 1 deletion llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -101,7 +101,8 @@ using OffsetAndArgPart = std::pair<int64_t, ArgPart>;
static Value *createByteGEP(IRBuilderBase &IRB, const DataLayout &DL,
Value *Ptr, Type *ResElemTy, int64_t Offset) {
if (Offset != 0) {
APInt APOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), Offset);
APInt APOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), Offset,
/*isSigned=*/true);
Ptr = IRB.CreatePtrAdd(Ptr, IRB.getInt(APOffset));
}
return Ptr;
47 changes: 31 additions & 16 deletions llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -2722,13 +2722,15 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
return BinaryOperator::CreateAnd(Builder.CreateNot(A), B);
}

if (Value *Res =
foldBooleanAndOr(Op0, Op1, I, /*IsAnd=*/true, /*IsLogical=*/false))
return replaceInstUsesWith(I, Res);

{
ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
if (LHS && RHS)
if (Value *Res = foldAndOrOfICmps(LHS, RHS, I, /* IsAnd */ true))
return replaceInstUsesWith(I, Res);

// TODO: Base this on foldBooleanAndOr instead?
// TODO: Make this recursive; it's a little tricky because an arbitrary
// number of 'and' instructions might have to be created.
if (LHS && match(Op1, m_OneUse(m_LogicalAnd(m_Value(X), m_Value(Y))))) {
@@ -2767,11 +2769,6 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
}
}

if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
if (Value *Res = foldLogicOfFCmps(LHS, RHS, /*IsAnd*/ true))
return replaceInstUsesWith(I, Res);

if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder))
return FoldedFCmps;

@@ -3523,6 +3520,27 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
return foldAndOrOfICmpsUsingRanges(LHS, RHS, IsAnd);
}

/// If IsLogical is true, then the and/or is in select form and the transform
/// must be poison-safe.
Value *InstCombinerImpl::foldBooleanAndOr(Value *LHS, Value *RHS,
Instruction &I, bool IsAnd,
bool IsLogical) {
if (!LHS->getType()->isIntOrIntVectorTy(1))
return nullptr;

if (auto *LHSCmp = dyn_cast<ICmpInst>(LHS))
if (auto *RHSCmp = dyn_cast<ICmpInst>(RHS))
if (Value *Res = foldAndOrOfICmps(LHSCmp, RHSCmp, I, IsAnd, IsLogical))
return Res;

if (auto *LHSCmp = dyn_cast<FCmpInst>(LHS))
if (auto *RHSCmp = dyn_cast<FCmpInst>(RHS))
if (Value *Res = foldLogicOfFCmps(LHSCmp, RHSCmp, IsAnd, IsLogical))
return Res;

return nullptr;
}

static Value *foldOrOfInversions(BinaryOperator &I,
InstCombiner::BuilderTy &Builder) {
assert(I.getOpcode() == Instruction::Or &&
@@ -3804,13 +3822,15 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
if (SwappedForXor)
std::swap(Op0, Op1);

if (Value *Res =
foldBooleanAndOr(Op0, Op1, I, /*IsAnd=*/false, /*IsLogical=*/false))
return replaceInstUsesWith(I, Res);

{
ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
if (LHS && RHS)
if (Value *Res = foldAndOrOfICmps(LHS, RHS, I, /* IsAnd */ false))
return replaceInstUsesWith(I, Res);

// TODO: Base this on foldBooleanAndOr instead?
// TODO: Make this recursive; it's a little tricky because an arbitrary
// number of 'or' instructions might have to be created.
Value *X, *Y;
Expand Down Expand Up @@ -3850,11 +3870,6 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
}
}

if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
if (Value *Res = foldLogicOfFCmps(LHS, RHS, /*IsAnd*/ false))
return replaceInstUsesWith(I, Res);

if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder))
return FoldedFCmps;

12 changes: 8 additions & 4 deletions llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -6771,11 +6771,15 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
}

// Turn a signed comparison into an unsigned one if both operands are known to
// have the same sign.
if (I.isSigned() &&
// have the same sign. Set samesign if possible (except for equality
// predicates).
if ((I.isSigned() || (I.isUnsigned() && !I.hasSameSign())) &&
((Op0Known.Zero.isNegative() && Op1Known.Zero.isNegative()) ||
(Op0Known.One.isNegative() && Op1Known.One.isNegative())))
return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1);
(Op0Known.One.isNegative() && Op1Known.One.isNegative()))) {
I.setPredicate(I.getUnsignedPredicate());
I.setSameSign();
return &I;
}

return nullptr;
}
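The rewrite is sound because operands whose sign bits agree compare the same way under signed and unsigned order; the new samesign flag records exactly that fact. A self-contained exhaustive check over 8-bit values (illustrative, not from the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned a = 0; a < 256; ++a)
        for (unsigned b = 0; b < 256; ++b) {
          if (((a ^ b) & 0x80) != 0)
            continue; // sign bits differ; the fold does not apply
          bool SLT = (int8_t)a < (int8_t)b;   // signed comparison
          bool ULT = (uint8_t)a < (uint8_t)b; // unsigned comparison
          assert(SLT == ULT);
        }
      return 0;
    }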
3 changes: 3 additions & 0 deletions llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -425,6 +425,9 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
Instruction *foldLogicOfIsFPClass(BinaryOperator &Operator, Value *LHS,
Value *RHS);

Value *foldBooleanAndOr(Value *LHS, Value *RHS, Instruction &I, bool IsAnd,
bool IsLogical);

Instruction *
canonicalizeConditionalNegationViaMathToSelect(BinaryOperator &i);

20 changes: 3 additions & 17 deletions llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -3143,12 +3143,6 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) {
SI, Builder.CreateLogicalOr(A, Builder.CreateOr(B, FalseVal)));
}

if (auto *LHS = dyn_cast<FCmpInst>(CondVal))
if (auto *RHS = dyn_cast<FCmpInst>(FalseVal))
if (Value *V = foldLogicOfFCmps(LHS, RHS, /*IsAnd*/ false,
/*IsSelectLogical*/ true))
return replaceInstUsesWith(SI, V);

// (A && B) || (C && B) --> (A || C) && B
if (match(CondVal, m_LogicalAnd(m_Value(A), m_Value(B))) &&
match(FalseVal, m_LogicalAnd(m_Value(C), m_Value(D))) &&
@@ -3191,12 +3185,6 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) {
SI, Builder.CreateLogicalAnd(A, Builder.CreateAnd(B, TrueVal)));
}

if (auto *LHS = dyn_cast<FCmpInst>(CondVal))
if (auto *RHS = dyn_cast<FCmpInst>(TrueVal))
if (Value *V = foldLogicOfFCmps(LHS, RHS, /*IsAnd*/ true,
/*IsSelectLogical*/ true))
return replaceInstUsesWith(SI, V);

// (A || B) && (C || B) --> (A && C) || B
if (match(CondVal, m_LogicalOr(m_Value(A), m_Value(B))) &&
match(TrueVal, m_LogicalOr(m_Value(C), m_Value(D))) &&
@@ -3305,11 +3293,9 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) {
return replaceInstUsesWith(SI, Op1);
}

if (auto *ICmp0 = dyn_cast<ICmpInst>(CondVal))
if (auto *ICmp1 = dyn_cast<ICmpInst>(Op1))
if (auto *V = foldAndOrOfICmps(ICmp0, ICmp1, SI, IsAnd,
/* IsLogical */ true))
return replaceInstUsesWith(SI, V);
if (auto *V = foldBooleanAndOr(CondVal, Op1, SI, IsAnd,
/*IsLogical=*/true))
return replaceInstUsesWith(SI, V);
}

// select (a || b), c, false -> select a, c, false
63 changes: 60 additions & 3 deletions llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -16,6 +16,7 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/EHPersonalities.h"
@@ -28,6 +29,8 @@
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/SpecialCaseList.h"
#include "llvm/Support/VirtualFileSystem.h"
@@ -82,8 +85,10 @@ const char SanCovCountersSectionName[] = "sancov_cntrs";
const char SanCovBoolFlagSectionName[] = "sancov_bools";
const char SanCovPCsSectionName[] = "sancov_pcs";
const char SanCovCFsSectionName[] = "sancov_cfs";
const char SanCovCallbackGateSectionName[] = "sancov_gate";

const char SanCovLowestStackName[] = "__sancov_lowest_stack";
const char SanCovCallbackGateName[] = "__sancov_should_track";

static cl::opt<int> ClCoverageLevel(
"sanitizer-coverage-level",
@@ -152,6 +157,12 @@ static cl::opt<bool>
ClCollectCF("sanitizer-coverage-control-flow",
cl::desc("collect control flow for each function"), cl::Hidden);

static cl::opt<bool> ClGatedCallbacks(
"sanitizer-coverage-gated-trace-callbacks",
cl::desc("Gate the invocation of the tracing callbacks on a global "
"variable. Currently only supported for trace-pc-guard."),
cl::Hidden, cl::init(false));

namespace {

SanitizerCoverageOptions getOptions(int LegacyCoverageLevel) {
@@ -194,6 +205,7 @@ SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) {
Options.StackDepth |= ClStackDepth;
Options.TraceLoads |= ClLoadTracing;
Options.TraceStores |= ClStoreTracing;
Options.GatedCallbacks |= ClGatedCallbacks;
if (!Options.TracePCGuard && !Options.TracePC &&
!Options.Inline8bitCounters && !Options.StackDepth &&
!Options.InlineBoolFlag && !Options.TraceLoads && !Options.TraceStores)
@@ -239,8 +251,9 @@ class ModuleSanitizerCoverage {
const char *Section);
GlobalVariable *CreatePCArray(Function &F, ArrayRef<BasicBlock *> AllBlocks);
void CreateFunctionLocalArrays(Function &F, ArrayRef<BasicBlock *> AllBlocks);
Value *CreateFunctionLocalGateCmp(IRBuilder<> &IRB);
void InjectCoverageAtBlock(Function &F, BasicBlock &BB, size_t Idx,
bool IsLeafFunc = true);
Value *&FunctionGateCmp, bool IsLeafFunc = true);
Function *CreateInitCallsForSections(Module &M, const char *CtorName,
const char *InitFunctionName, Type *Ty,
const char *Section);
@@ -265,6 +278,7 @@ class ModuleSanitizerCoverage {
FunctionCallee SanCovTraceGepFunction;
FunctionCallee SanCovTraceSwitchFunction;
GlobalVariable *SanCovLowestStack;
GlobalVariable *SanCovCallbackGate;
Type *PtrTy, *IntptrTy, *Int64Ty, *Int32Ty, *Int16Ty, *Int8Ty, *Int1Ty;
Module *CurModule;
std::string CurModuleUniqueId;
@@ -478,6 +492,23 @@ bool ModuleSanitizerCoverage::instrumentModule() {
if (Options.StackDepth && !SanCovLowestStack->isDeclaration())
SanCovLowestStack->setInitializer(Constant::getAllOnesValue(IntptrTy));

if (Options.GatedCallbacks) {
if (!Options.TracePCGuard) {
C->emitError(StringRef("'") + ClGatedCallbacks.ArgStr +
"' is only supported with trace-pc-guard");
return true;
}

SanCovCallbackGate = cast<GlobalVariable>(
M.getOrInsertGlobal(SanCovCallbackGateName, Int64Ty));
SanCovCallbackGate->setSection(
getSectionName(SanCovCallbackGateSectionName));
SanCovCallbackGate->setInitializer(Constant::getNullValue(Int64Ty));
SanCovCallbackGate->setLinkage(GlobalVariable::LinkOnceAnyLinkage);
SanCovCallbackGate->setVisibility(GlobalVariable::HiddenVisibility);
appendToCompilerUsed(M, SanCovCallbackGate);
}

SanCovTracePC = M.getOrInsertFunction(SanCovTracePCName, VoidTy);
SanCovTracePCGuard =
M.getOrInsertFunction(SanCovTracePCGuardName, VoidTy, PtrTy);
@@ -777,13 +808,22 @@ void ModuleSanitizerCoverage::CreateFunctionLocalArrays(
FunctionPCsArray = CreatePCArray(F, AllBlocks);
}

Value *ModuleSanitizerCoverage::CreateFunctionLocalGateCmp(IRBuilder<> &IRB) {
auto Load = IRB.CreateLoad(Int64Ty, SanCovCallbackGate);
Load->setNoSanitizeMetadata();
auto Cmp = IRB.CreateIsNotNull(Load);
Cmp->setName("sancov gate cmp");
return Cmp;
}

bool ModuleSanitizerCoverage::InjectCoverage(Function &F,
ArrayRef<BasicBlock *> AllBlocks,
bool IsLeafFunc) {
if (AllBlocks.empty()) return false;
CreateFunctionLocalArrays(F, AllBlocks);
Value *FunctionGateCmp = nullptr;
for (size_t i = 0, N = AllBlocks.size(); i < N; i++)
InjectCoverageAtBlock(F, *AllBlocks[i], i, IsLeafFunc);
InjectCoverageAtBlock(F, *AllBlocks[i], i, FunctionGateCmp, IsLeafFunc);
return true;
}

@@ -946,6 +986,7 @@ void ModuleSanitizerCoverage::InjectTraceForCmp(

void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
size_t Idx,
Value *&FunctionGateCmp,
bool IsLeafFunc) {
BasicBlock::iterator IP = BB.getFirstInsertionPt();
bool IsEntryBB = &BB == &F.getEntryBlock();
@@ -971,7 +1012,23 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
IRB.CreateAdd(IRB.CreatePointerCast(FunctionGuardArray, IntptrTy),
ConstantInt::get(IntptrTy, Idx * 4)),
PtrTy);
IRB.CreateCall(SanCovTracePCGuard, GuardPtr)->setCannotMerge();
if (Options.GatedCallbacks) {
if (!FunctionGateCmp) {
// Create this in the entry block
assert(IsEntryBB);
FunctionGateCmp = CreateFunctionLocalGateCmp(IRB);
}
// Set the branch weights to minimize the cost paid when the gate is
// turned off, so that this instrumentation can be enabled by default
// with as little performance cost as possible.
auto Weights = MDBuilder(*C).createBranchWeights(1, 100000);
auto ThenTerm =
SplitBlockAndInsertIfThen(FunctionGateCmp, &*IP, false, Weights);
IRBuilder<> ThenIRB(ThenTerm);
ThenIRB.CreateCall(SanCovTracePCGuard, GuardPtr)->setCannotMerge();
} else {
IRB.CreateCall(SanCovTracePCGuard, GuardPtr)->setCannotMerge();
}
}
if (Options.Inline8bitCounters) {
auto CounterPtr = IRB.CreateGEP(
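A sketch of how a runtime might drive the new gate. The section name and the __sancov_should_track / __sanitizer_cov_trace_pc_guard symbols come from the patch and the existing sancov interface; everything else here is illustrative and assumes the runtime is linked into the instrumented image.

    #include <cstdint>

    // Emitted by the instrumentation into the "sancov_gate" section with
    // link-once linkage and hidden visibility; the runtime may reference it.
    extern uint64_t __sancov_should_track;

    extern "C" void __sanitizer_cov_trace_pc_guard(uint32_t *Guard) {
      // Reached only while the gate is non-zero; when tracing is off, each
      // guard site costs just one well-predicted branch.
      (void)Guard;
    }

    void startCoverageTracking() { __sancov_should_track = 1; }
    void stopCoverageTracking() { __sancov_should_track = 0; }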
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -7243,7 +7243,7 @@ static bool reduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder,

for (auto Case : SI->cases()) {
auto *Orig = Case.getCaseValue();
auto Sub = Orig->getValue() - APInt(Ty->getBitWidth(), Base);
auto Sub = Orig->getValue() - APInt(Ty->getBitWidth(), Base, true);
Case.setValue(cast<ConstantInt>(ConstantInt::get(Ty, Sub.lshr(Shift))));
}
return true;
15 changes: 5 additions & 10 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -638,9 +638,6 @@ class InnerLoopVectorizer {
/// there can be multiple exiting edges reaching this block.
BasicBlock *LoopExitBlock;

/// The scalar loop body.
BasicBlock *LoopScalarBody;

/// A list of all bypass blocks. The first block is the entry of the loop.
SmallVector<BasicBlock *, 4> LoopBypassBlocks;

@@ -2530,7 +2527,6 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
}

void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
LoopScalarBody = OrigLoop->getHeader();
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
assert(LoopVectorPreHeader && "Invalid loop structure");
LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
@@ -2944,20 +2940,19 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,

// Set/update profile weights for the vector and remainder loops as original
// loop iterations are now distributed among them. Note that the original loop
// represented by LoopScalarBody becomes remainder loop after vectorization.
// becomes the scalar remainder loop after vectorization.
//
// For cases like foldTailByMasking() and requiresScalarEpilogue() we may
// end up getting a slightly roughened result, but that should be OK since
// profile is not inherently precise anyway. Note also possible bypass of
// vector code caused by legality checks is ignored, assigning all the weight
// to the vector loop, optimistically.
//
// For scalable vectorization we can't know at compile time how many iterations
// of the loop are handled in one vector iteration, so instead assume a pessimistic
// vscale of '1'.
Loop *ScalarLoop = LI->getLoopFor(LoopScalarBody);
// For scalable vectorization we can't know at compile time how many
// iterations of the loop are handled in one vector iteration, so instead
// assume a pessimistic vscale of '1'.
Loop *VectorLoop = LI->getLoopFor(HeaderBB);
setProfileInfoAfterUnrolling(ScalarLoop, VectorLoop, ScalarLoop,
setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop,
VF.getKnownMinValue() * UF);
}

21 changes: 10 additions & 11 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -19949,7 +19949,7 @@ static bool isReductionCandidate(Instruction *I) {
}

bool SLPVectorizerPass::vectorizeHorReduction(
PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI,
PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
if (!ShouldVectorizeHor)
return false;
@@ -19982,7 +19982,7 @@ bool SLPVectorizerPass::vectorizeHorReduction(
Stack.emplace(SelectRoot(), 0);
SmallPtrSet<Value *, 8> VisitedInstrs;
bool Res = false;
auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * {
if (R.isAnalyzedReductionRoot(Inst))
return nullptr;
if (!isReductionCandidate(Inst))
@@ -20049,10 +20049,9 @@ bool SLPVectorizerPass::vectorizeHorReduction(
}

bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
BasicBlock *BB, BoUpSLP &R,
TargetTransformInfo *TTI) {
BasicBlock *BB, BoUpSLP &R) {
SmallVector<WeakTrackingVH> PostponedInsts;
bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
Res |= tryToVectorize(PostponedInsts, R);
return Res;
}
@@ -20317,7 +20316,7 @@ bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
continue;
for (Value *Op : I->operands())
if (auto *RootOp = dyn_cast<Instruction>(Op))
Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
}
// Try to vectorize operands as vector bundles.
for (CmpInst *I : CmpInsts) {
@@ -20384,7 +20383,7 @@ bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
// pass2 - try to vectorize reductions only
if (R.isDeleted(I))
continue;
OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
if (R.isDeleted(I) || isa<CmpInst>(I))
continue;
// pass3 - try to match and vectorize a buildvector sequence.
@@ -20644,7 +20643,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (P->getNumIncomingValues() == 2) {
// Try to match and vectorize a horizontal reduction.
Instruction *Root = getReductionInstr(DT, P, BB, LI);
if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
Changed = true;
It = BB->begin();
E = BB->end();
@@ -20666,8 +20665,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// vectorization.
if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
PI && !IsInPostProcessInstrs(PI)) {
bool Res = vectorizeRootInstruction(nullptr, PI,
P->getIncomingBlock(I), R, TTI);
bool Res =
vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
Changed |= Res;
if (Res && R.isDeleted(P)) {
It = BB->begin();
@@ -20701,7 +20700,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (auto *VI = dyn_cast<Instruction>(V);
VI && !IsInPostProcessInstrs(VI))
// Try to match and vectorize a horizontal reduction.
OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
}
}
// Start vectorization of post-process list of instructions from the
5 changes: 5 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -954,6 +954,11 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
/// Return the cost of this VPSingleDefRecipe.
InstructionCost computeCost(ElementCount VF,
VPCostContext &Ctx) const override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print this VPSingleDefRecipe to dbgs() (for debugging).
LLVM_DUMP_METHOD void dump() const;
#endif
};

/// Class to record LLVM IR flag for a recipe along with it.
4 changes: 4 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -343,6 +343,10 @@ FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
return Res;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPSingleDefRecipe::dump() const { VPDef::dump(); }
#endif

template <unsigned PartOpIdx>
VPValue *
VPUnrollPartAccessor<PartOpIdx>::getUnrollPartOperand(VPUser &U) const {
2 changes: 1 addition & 1 deletion llvm/test/Analysis/ValueTracking/non-negative-phi-bits.ll
@@ -8,7 +8,7 @@ define void @test() #0 {
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ult i64 [[INDVARS_IV]], 39
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp samesign ult i64 [[INDVARS_IV]], 39
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; CHECK: for.end:
; CHECK-NEXT: ret void
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll
@@ -9,7 +9,7 @@

@oStruct = external global %struct.Outer, align 4

define void @main(i8 %val8) nounwind {
define void @main(i8 %val8) nounwind "frame-pointer"="none" {
; CHECK-LABEL: main:
; CHECK: @ %bb.0: @ %for.body.lr.ph
; CHECK-NEXT: movw r0, :lower16:(L_oStruct$non_lazy_ptr-(LPC0_0+4))
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/ARM/2011-12-19-sjlj-clobber.ll
@@ -3,7 +3,7 @@
; Radar 10567930: Make sure that all the caller-saved registers are saved and
; restored in a function with setjmp/longjmp EH. In particular, r6 was not
; being saved here.
; CHECK: push {r4, r5, r6, r7, lr}
; CHECK: push.w {r4, r5, r6, r7, r8, r10, r11, lr}

%0 = type opaque
%struct.NSConstantString = type { ptr, i32, ptr, i32 }
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll
@@ -1732,7 +1732,7 @@ if.end:
; Another infinite loop test, this time with two nested infinite loops.
; infiniteloop3
; bx lr
define void @infiniteloop3() "frame-pointer"="all" {
define void @infiniteloop3() "frame-pointer"="none" {
; ARM-LABEL: infiniteloop3:
; ARM: @ %bb.0: @ %entry
; ARM-NEXT: mov r0, #0
56 changes: 26 additions & 30 deletions llvm/test/CodeGen/ARM/atomic-load-store.ll
@@ -324,18 +324,17 @@ define void @test_old_store_64bit(ptr %p, i64 %v) {
;
; ARMOPTNONE-LABEL: test_old_store_64bit:
; ARMOPTNONE: @ %bb.0:
; ARMOPTNONE-NEXT: push {r4, r5, r7, lr}
; ARMOPTNONE-NEXT: add r7, sp, #8
; ARMOPTNONE-NEXT: push {r8, r10, r11}
; ARMOPTNONE-NEXT: sub sp, sp, #24
; ARMOPTNONE-NEXT: str r0, [sp, #4] @ 4-byte Spill
; ARMOPTNONE-NEXT: str r2, [sp, #8] @ 4-byte Spill
; ARMOPTNONE-NEXT: str r1, [sp, #12] @ 4-byte Spill
; ARMOPTNONE-NEXT: dmb ish
; ARMOPTNONE-NEXT: ldr r1, [r0]
; ARMOPTNONE-NEXT: ldr r0, [r0, #4]
; ARMOPTNONE-NEXT: str r1, [sp, #16] @ 4-byte Spill
; ARMOPTNONE-NEXT: str r0, [sp, #20] @ 4-byte Spill
; ARMOPTNONE-NEXT: push {r4, r5, r7, r8, r10, r11, lr}
; ARMOPTNONE-NEXT: add r7, sp, #20
; ARMOPTNONE-NEXT: sub sp, sp, #24
; ARMOPTNONE-NEXT: str r0, [sp, #4] @ 4-byte Spill
; ARMOPTNONE-NEXT: str r2, [sp, #8] @ 4-byte Spill
; ARMOPTNONE-NEXT: str r1, [sp, #12] @ 4-byte Spill
; ARMOPTNONE-NEXT: dmb ish
; ARMOPTNONE-NEXT: ldr r1, [r0]
; ARMOPTNONE-NEXT: ldr r0, [r0, #4]
; ARMOPTNONE-NEXT: str r1, [sp, #16] @ 4-byte Spill
; ARMOPTNONE-NEXT: str r0, [sp, #20] @ 4-byte Spill
; ARMOPTNONE-NEXT: b LBB5_1
; ARMOPTNONE-NEXT: LBB5_1: @ %atomicrmw.start
; ARMOPTNONE-NEXT: @ =>This Loop Header: Depth=1
@@ -382,8 +381,7 @@ define void @test_old_store_64bit(ptr %p, i64 %v) {
; ARMOPTNONE-NEXT: LBB5_5: @ %atomicrmw.end
; ARMOPTNONE-NEXT: dmb ish
; ARMOPTNONE-NEXT: sub sp, r7, #20
; ARMOPTNONE-NEXT: pop {r8, r10, r11}
; ARMOPTNONE-NEXT: pop {r4, r5, r7, pc}
; ARMOPTNONE-NEXT: pop {r4, r5, r7, r8, r10, r11, pc}
;
; THUMBTWO-LABEL: test_old_store_64bit:
; THUMBTWO: @ %bb.0:
@@ -864,20 +862,19 @@ define void @store_atomic_f64__seq_cst(ptr %ptr, double %val1) {
;
; ARMOPTNONE-LABEL: store_atomic_f64__seq_cst:
; ARMOPTNONE: @ %bb.0:
; ARMOPTNONE-NEXT: push {r4, r5, r7, lr}
; ARMOPTNONE-NEXT: add r7, sp, #8
; ARMOPTNONE-NEXT: push {r8, r10, r11}
; ARMOPTNONE-NEXT: sub sp, sp, #24
; ARMOPTNONE-NEXT: str r0, [sp, #4] @ 4-byte Spill
; ARMOPTNONE-NEXT: vmov d16, r1, r2
; ARMOPTNONE-NEXT: vmov r1, r2, d16
; ARMOPTNONE-NEXT: str r2, [sp, #8] @ 4-byte Spill
; ARMOPTNONE-NEXT: str r1, [sp, #12] @ 4-byte Spill
; ARMOPTNONE-NEXT: dmb ish
; ARMOPTNONE-NEXT: ldr r1, [r0]
; ARMOPTNONE-NEXT: ldr r0, [r0, #4]
; ARMOPTNONE-NEXT: str r1, [sp, #16] @ 4-byte Spill
; ARMOPTNONE-NEXT: str r0, [sp, #20] @ 4-byte Spill
; ARMOPTNONE-NEXT: push {r4, r5, r7, r8, r10, r11, lr}
; ARMOPTNONE-NEXT: add r7, sp, #20
; ARMOPTNONE-NEXT: sub sp, sp, #24
; ARMOPTNONE-NEXT: str r0, [sp, #4] @ 4-byte Spill
; ARMOPTNONE-NEXT: vmov d16, r1, r2
; ARMOPTNONE-NEXT: vmov r1, r2, d16
; ARMOPTNONE-NEXT: str r2, [sp, #8] @ 4-byte Spill
; ARMOPTNONE-NEXT: str r1, [sp, #12] @ 4-byte Spill
; ARMOPTNONE-NEXT: dmb ish
; ARMOPTNONE-NEXT: ldr r1, [r0]
; ARMOPTNONE-NEXT: ldr r0, [r0, #4]
; ARMOPTNONE-NEXT: str r1, [sp, #16] @ 4-byte Spill
; ARMOPTNONE-NEXT: str r0, [sp, #20] @ 4-byte Spill
; ARMOPTNONE-NEXT: b LBB13_1
; ARMOPTNONE-NEXT: LBB13_1: @ %atomicrmw.start
; ARMOPTNONE-NEXT: @ =>This Loop Header: Depth=1
@@ -924,8 +921,7 @@ define void @store_atomic_f64__seq_cst(ptr %ptr, double %val1) {
; ARMOPTNONE-NEXT: LBB13_5: @ %atomicrmw.end
; ARMOPTNONE-NEXT: dmb ish
; ARMOPTNONE-NEXT: sub sp, r7, #20
; ARMOPTNONE-NEXT: pop {r8, r10, r11}
; ARMOPTNONE-NEXT: pop {r4, r5, r7, pc}
; ARMOPTNONE-NEXT: pop {r4, r5, r7, r8, r10, r11, pc}
;
; THUMBTWO-LABEL: store_atomic_f64__seq_cst:
; THUMBTWO: @ %bb.0:
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/ARM/call-tc.ll
@@ -17,7 +17,7 @@ define void @t1() "frame-pointer"="all" {
ret void
}

define void @t2() "frame-pointer"="all" {
define void @t2() "frame-pointer"="none" {
; CHECKV6-LABEL: t2:
; CHECKV6: bx r0
; CHECKT2D-LABEL: t2:
@@ -102,7 +102,7 @@ bb:

; Make sure codegenprep is duplicating ret instructions to enable tail calls.
; rdar://11140249
define i32 @t8(i32 %x) nounwind ssp "frame-pointer"="all" {
define i32 @t8(i32 %x) nounwind ssp "frame-pointer"="none" {
entry:
; CHECKT2D-LABEL: t8:
; CHECKT2D-NOT: push
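
The comment above this hunk mentions CodeGenPrepare's ret duplication. A minimal sketch of the IR shape that optimization targets, with hypothetical callees @f and @g (not from this test):

; Both paths funnel into a single ret through a phi, which hides the
; call-followed-by-ret pattern a machine-level tail call needs.
define i32 @shared_ret(i1 %c, i32 %x) {
entry:
  br i1 %c, label %a, label %b
a:
  %ra = tail call i32 @f(i32 %x)
  br label %exit
b:
  %rb = tail call i32 @g(i32 %x)
  br label %exit
exit:
  %r = phi i32 [ %ra, %a ], [ %rb, %b ]
  ret i32 %r
}
declare i32 @f(i32)
declare i32 @g(i32)
; CodeGenPrepare duplicates the ret into %a and %b, leaving each call
; immediately before a ret so it can be lowered as a tail call.
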
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/ARM/debug-frame.ll
@@ -526,7 +526,7 @@ entry:
; Test 4
;-------------------------------------------------------------------------------

define void @test4() nounwind {
define void @test4() nounwind "frame-pointer"="none" {
entry:
ret void
}
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/ARM/ehabi.ll
@@ -575,7 +575,7 @@ entry:
; Test 4
;-------------------------------------------------------------------------------

define void @test4() nounwind {
define void @test4() nounwind "frame-pointer"="none" {
entry:
ret void
}
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/ARM/fast-isel-frameaddr.ll
@@ -16,7 +16,7 @@ entry:
; DARWIN-THUMB2: mov r0, r7

; LINUX-ARM-LABEL: frameaddr_index0:
; LINUX-ARM: push {r11, lr}
; LINUX-ARM: push {r11}
; LINUX-ARM: mov r11, sp
; LINUX-ARM: mov r0, r11

@@ -42,7 +42,7 @@ entry:
; DARWIN-THUMB2: ldr r0, [r7]

; LINUX-ARM-LABEL: frameaddr_index1:
; LINUX-ARM: push {r11, lr}
; LINUX-ARM: push {r11}
; LINUX-ARM: mov r11, sp
; LINUX-ARM: ldr r0, [r11]

@@ -73,7 +73,7 @@ entry:
; DARWIN-THUMB2: ldr r0, [r0]

; LINUX-ARM-LABEL: frameaddr_index3:
; LINUX-ARM: push {r11, lr}
; LINUX-ARM: push {r11}
; LINUX-ARM: mov r11, sp
; LINUX-ARM: ldr r0, [r11]
; LINUX-ARM: ldr r0, [r0]
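
The expected pushes shrink to {r11}: these functions make no calls, so under the updated frame logic lr apparently no longer needs to be spilled, while @llvm.frameaddress still forces the frame pointer to be set up. A minimal sketch of the intrinsic the tests exercise, assuming the current overloaded signature:

define ptr @frameaddr_current() nounwind {
entry:
  ; Index 0 returns the current frame pointer; higher indices walk
  ; the frame chain through the saved-FP links.
  %fp = call ptr @llvm.frameaddress.p0(i32 0)
  ret ptr %fp
}
declare ptr @llvm.frameaddress.p0(i32 immarg)
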
11 changes: 7 additions & 4 deletions llvm/test/CodeGen/ARM/frame-chain.ll
@@ -10,11 +10,14 @@
define dso_local noundef i32 @leaf(i32 noundef %0) {
; LEAF-FP-LABEL: leaf:
; LEAF-FP: @ %bb.0:
; LEAF-FP-NEXT: .pad #4
; LEAF-FP-NEXT: sub sp, sp, #4
; LEAF-FP-NEXT: str r0, [sp]
; LEAF-FP-NEXT: .save {r11, lr}
; LEAF-FP-NEXT: push {r11, lr}
; LEAF-FP-NEXT: .setfp r11, sp
; LEAF-FP-NEXT: mov r11, sp
; LEAF-FP-NEXT: push {r0}
; LEAF-FP-NEXT: add r0, r0, #4
; LEAF-FP-NEXT: add sp, sp, #4
; LEAF-FP-NEXT: mov sp, r11
; LEAF-FP-NEXT: pop {r11, lr}
; LEAF-FP-NEXT: mov pc, lr
;
; LEAF-FP-AAPCS-LABEL: leaf:
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/ARM/ifcvt5.ll
@@ -5,7 +5,7 @@

@x = external global ptr ; <ptr> [#uses=1]

define void @foo(i32 %a) "frame-pointer"="all" {
define void @foo(i32 %a) "frame-pointer"="none" {
; A8-LABEL: foo:
; A8: @ %bb.0: @ %entry
; A8-NEXT: movw r1, :lower16:(L_x$non_lazy_ptr-(LPC0_0+8))
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/ARM/ldrd.ll
@@ -168,7 +168,7 @@ define void @ldrd_postupdate_inc(ptr %p0) "frame-pointer"="all" {
; NORMAL: strd r1, r2, [r0], #-8
; CONSERVATIVE-NOT: strd
; CHECK: bx lr
define ptr @strd_postupdate_dec(ptr %p0, i32 %v0, i32 %v1) "frame-pointer"="all" {
define ptr @strd_postupdate_dec(ptr %p0, i32 %v0, i32 %v1) "frame-pointer"="none" {
%p0.1 = getelementptr i32, ptr %p0, i32 1
store i32 %v0, ptr %p0
store i32 %v1, ptr %p0.1
@@ -180,7 +180,7 @@ define ptr @strd_postupdate_dec(ptr %p0, i32 %v0, i32 %v1) "frame-pointer"="all"
; NORMAL: strd r1, r2, [r0], #8
; CONSERVATIVE-NOT: strd
; CHECK: bx lr
define ptr @strd_postupdate_inc(ptr %p0, i32 %v0, i32 %v1) "frame-pointer"="all" {
define ptr @strd_postupdate_inc(ptr %p0, i32 %v0, i32 %v1) "frame-pointer"="none" {
%p0.1 = getelementptr i32, ptr %p0, i32 1
store i32 %v0, ptr %p0
store i32 %v1, ptr %p0.1
9 changes: 5 additions & 4 deletions llvm/test/CodeGen/ARM/stack-frame-layout-remarks.ll
@@ -51,7 +51,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) #0
; BOTH: [SP-8]{{.+}}8{{.+}}4
; DEBUG: a @ dot.c:13
; STRIPPED-NOT: a @ dot.c:13
define void @cleanup_array(ptr %0) #1 {
define void @cleanup_array(ptr %0) #3 {
%2 = alloca ptr, align 8
store ptr %0, ptr %2, align 8
call void @llvm.dbg.declare(metadata ptr %2, metadata !41, metadata !DIExpression()), !dbg !46
@@ -62,7 +62,7 @@ define void @cleanup_array(ptr %0) #1 {
; BOTH: [SP-8]{{.+}}8{{.+}}4
; DEBUG: res @ dot.c:21
; STRIPPED-NOT: res @ dot.c:21
define void @cleanup_result(ptr %0) #1 {
define void @cleanup_result(ptr %0) #3 {
%2 = alloca ptr, align 8
store ptr %0, ptr %2, align 8
call void @llvm.dbg.declare(metadata ptr %2, metadata !47, metadata !DIExpression()), !dbg !51
@@ -92,7 +92,7 @@ define void @cleanup_result(ptr %0) #1 {
; BOTH: [SP-40]{{.+}}4{{.+}}4
; DEBUG: i @ dot.c:55
; STRIPPED-NOT: i @ dot.c:55
define i32 @do_work(ptr %0, ptr %1, ptr %2) #1 {
define i32 @do_work(ptr %0, ptr %1, ptr %2) #3 {
%4 = alloca i32, align 4
%5 = alloca ptr, align 8
%6 = alloca ptr, align 8
@@ -144,7 +144,7 @@ define i32 @do_work(ptr %0, ptr %1, ptr %2) #1 {
; BOTH: [SP-20]{{.+}}4{{.*}}4
; DEBUG: i @ dot.c:69
; STRIPPED-NOT: i @ dot.c:69
define ptr @gen_array(i32 %0) #1 {
define ptr @gen_array(i32 %0) #3 {
%2 = alloca ptr, align 8
%3 = alloca i32, align 4
%4 = alloca ptr, align 8
@@ -227,6 +227,7 @@ uselistorder ptr @llvm.dbg.declare, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
attributes #0 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
attributes #1 = { "frame-pointer"="all" }
attributes #2 = { ssp "stack-protector-buffer-size"="5" "frame-pointer"="all" }
attributes #3 = { "frame-pointer"="none" }

!llvm.dbg.cu = !{!0, !2}
!llvm.module.flags = !{!18, !19, !20, !21, !22, !23, !24}
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/ARM/stack-size-section.ll
@@ -29,4 +29,4 @@ define void @dynalloc(i32 %N) #0 {
ret void
}

attributes #0 = { "frame-pointer"="all" }
attributes #0 = { "frame-pointer"="none" }
302 changes: 143 additions & 159 deletions llvm/test/CodeGen/ARM/swifterror.ll

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions llvm/test/CodeGen/ARM/v7k-abi-align.ll
@@ -117,7 +117,7 @@ define void @test_dpr_unwind_align_no_dprs() "frame-pointer"="all" {

; 128-bit vectors should use 128-bit (i.e. correctly aligned) slots on
; the stack.
define <4 x float> @test_v128_stack_pass([8 x double], float, <4 x float> %in) "frame-pointer"="all" {
define <4 x float> @test_v128_stack_pass([8 x double], float, <4 x float> %in) "frame-pointer"="none" {
; CHECK-LABEL: test_v128_stack_pass:
; CHECK: add r[[ADDR:[0-9]+]], sp, #16
; CHECK: vld1.64 {d0, d1}, [r[[ADDR]]:128]
@@ -140,7 +140,7 @@ define void @test_v128_stack_pass_varargs(<4 x float> %in) "frame-pointer"="all"

; To be compatible with AAPCS's va_start model (store r0-r3 at incoming SP, give
; a single pointer), 64-bit quantities must be passed
define i64 @test_64bit_gpr_align(i32, i64 %r2_r3, i32 %sp) "frame-pointer"="all" {
define i64 @test_64bit_gpr_align(i32, i64 %r2_r3, i32 %sp) "frame-pointer"="none" {
; CHECK-LABEL: test_64bit_gpr_align:
; CHECK: ldr [[RHS:r[0-9]+]], [sp]
; CHECK: adds r0, [[RHS]], r2
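
A sketch of where these arguments land under the standard AAPCS rules (assumed here, not spelled out in the test): r1 is skipped so the 64-bit value starts at an even register, and the final argument overflows to the stack, which is why the RHS above is loaded from [sp].

; Hypothetical register assignment for @test_64bit_gpr_align:
;   %0     -> r0
;   --        r1 skipped (64-bit values start at an even register)
;   %r2_r3 -> r2:r3
;   %sp    -> [sp, #0], the first stack-argument slot
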
16 changes: 10 additions & 6 deletions llvm/test/CodeGen/Thumb/frame-chain.ll
@@ -8,12 +8,16 @@
define dso_local noundef i32 @leaf(i32 noundef %0) {
; LEAF-FP-LABEL: leaf:
; LEAF-FP: @ %bb.0:
; LEAF-FP-NEXT: .pad #4
; LEAF-FP-NEXT: sub sp, #4
; LEAF-FP-NEXT: str r0, [sp]
; LEAF-FP-NEXT: adds r0, r0, #4
; LEAF-FP-NEXT: add sp, #4
; LEAF-FP-NEXT: bx lr
; LEAF-FP-NEXT: .save {r7, lr}
; LEAF-FP-NEXT: push {r7, lr}
; LEAF-FP-NEXT: .setfp r7, sp
; LEAF-FP-NEXT: add r7, sp, #0
; LEAF-FP-NEXT: .pad #4
; LEAF-FP-NEXT: sub sp, #4
; LEAF-FP-NEXT: str r0, [sp]
; LEAF-FP-NEXT: adds r0, r0, #4
; LEAF-FP-NEXT: add sp, #4
; LEAF-FP-NEXT: pop {r7, pc}
;
; LEAF-FP-AAPCS-LABEL: leaf:
; LEAF-FP-AAPCS: @ %bb.0:
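
From the LEAF-FP check lines above, the Thumb-1 frame record now looks like this (a sketch reconstructed from the CHECKs; offsets relative to the new r7):

;   [r7, #4]  : saved lr
;   [r7]      : caller's r7    <- r7 = sp after 'add r7, sp, #0'
;   [r7, #-4] : spill of r0 (the .pad #4 / 'str r0, [sp]' slot)
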
67 changes: 10 additions & 57 deletions llvm/test/CodeGen/Thumb2/avoidmuls.mir
@@ -1,67 +1,20 @@
# RUN: llc -run-pass=thumb2-reduce-size %s -o - | FileCheck %s
# RUN: llc -mtriple=thumbv7m-none-eabi -mcpu=cortex-m33 -run-pass=thumb2-reduce-size %s -o - | FileCheck %s --check-prefix=MUL
# RUN: llc -mtriple=thumbv7m-none-eabi --run-pass=thumb2-reduce-size %s -o - | FileCheck %s --check-prefix=MULS

--- |
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv8m.main-arm-none-eabi"

; Function Attrs: norecurse nounwind readnone
define i32 @test(i32 %x, i32 %y) local_unnamed_addr #0 {
entry:
%cmp6 = icmp sgt i32 %y, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
br label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
%sum.0.lcssa = phi i32 [ 1, %entry ], [ %mul, %for.body ]
ret i32 %sum.0.lcssa

for.body: ; preds = %for.body, %for.body.preheader
%lsr.iv1 = phi i32 [ %lsr.iv.next2, %for.body ], [ %x, %for.body.preheader ]
%lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ %y, %for.body.preheader ]
%sum.07 = phi i32 [ %mul, %for.body ], [ 1, %for.body.preheader ]
%mul = mul nsw i32 %lsr.iv1, %sum.07
%lsr.iv.next = add i32 %lsr.iv, -1
%lsr.iv.next2 = add i32 %lsr.iv1, 1
%exitcond = icmp eq i32 %lsr.iv.next, 0
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m33" "target-features"="-d32,+dsp,+fp-armv8,-fp64,+hwdiv,+strict-align,+thumb-mode,-crc,-dotprod,-hwdiv-arm,-ras" "unsafe-fp-math"="false" "use-soft-float"="false" }

...
---
name: test
tracksRegLiveness: true
liveins:
- { reg: '$r0', virtual-reg: '' }
- { reg: '$r1', virtual-reg: '' }
body: |
bb.0.entry:
successors: %bb.1.for.body, %bb.2.for.cond.cleanup
liveins: $r0, $r1

bb.0:
$r2 = tMOVr $r0, 14, _
$r0 = t2MOVi 1, 14, _, _
t2CMPri $r1, 1, 14, _, implicit-def $cpsr
t2Bcc %bb.2.for.cond.cleanup, 11, killed $cpsr

bb.1.for.body:
successors: %bb.2.for.cond.cleanup, %bb.1.for.body
liveins: $r0, $r1, $r2

$r0 = t2MUL $r2, killed $r0, 14, _
$r2 = t2ADDri killed $r2, 1, 14, _, _
$r1 = t2SUBri killed $r1, 1, 14, _, def $cpsr
t2Bcc %bb.1.for.body, 1, killed $cpsr

bb.2.for.cond.cleanup:
liveins: $r0

tBX_RET 14, _, implicit $r0

...
# CHECK-LABEL: test
# CHECK: tMUL
# CHECK-NOT: t2MUL
# MUL-LABEL: test
# MUL: t2MUL
# MUL-NOT: tMUL

# MULS-LABEL: test
# MULS: tMUL
# MULS-NOT: t2MUL
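
The two prefixes now pin down both behaviours of the size-reduction pass: by default t2MUL is narrowed to the 16-bit, flag-setting tMUL, while a core with the avoid-muls feature (cortex-m33 here) keeps the 32-bit t2MUL, which leaves the flags alone. An equivalent IR-level reduction, with hypothetical output noted in comments:

define i32 @mul(i32 %a, i32 %b) {
  ; default thumbv7m:   muls r0, r1, r0   (tMUL, updates CPSR)
  ; -mcpu=cortex-m33:   mul.w r0, r1, r0  (t2MUL, CPSR untouched)
  %r = mul i32 %a, %b
  ret i32 %r
}
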
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/Thumb2/frame-pointer.ll
@@ -14,7 +14,7 @@ define void @leaf() {

; Leaf function, frame pointer is requested but we don't need any stack frame,
; so don't create a frame pointer.
define void @leaf_nofpelim() "frame-pointer"="all" {
define void @leaf_nofpelim() "frame-pointer"="none" {
; CHECK-LABEL: leaf_nofpelim:
; CHECK-NOT: push
; CHECK-NOT: sp
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/Thumb2/frameless.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -frame-pointer=all | not grep mov
; RUN: llc < %s -mtriple=thumbv7-linux -frame-pointer=all | not grep mov
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -frame-pointer=none | not grep mov
; RUN: llc < %s -mtriple=thumbv7-linux -frame-pointer=none | not grep mov

define void @t() nounwind readnone {
ret void
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/Thumb2/frameless2.ll
@@ -1,4 +1,4 @@
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -frame-pointer=all | not grep r7
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -frame-pointer=none | not grep r7

%struct.noise3 = type { [3 x [17 x i32]] }
%struct.noiseguard = type { i32, i32, i32 }
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/Thumb2/machine-licm.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -relocation-model=dynamic-no-pic -frame-pointer=all | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -relocation-model=pic -frame-pointer=all | FileCheck %s --check-prefix=PIC
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -relocation-model=dynamic-no-pic -frame-pointer=none | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -relocation-model=pic -frame-pointer=none | FileCheck %s --check-prefix=PIC
; rdar://7353541
; rdar://7354376

150 changes: 150 additions & 0 deletions llvm/test/CodeGen/Thumb2/pacbti-m-frame-chain.ll
@@ -0,0 +1,150 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=thumbv8.1m.main-none-eabi < %s --force-dwarf-frame-section -frame-pointer=all -mattr=+aapcs-frame-chain | FileCheck %s

; int test1() {
; return 0;
; }
define i32 @test1() "sign-return-address"="non-leaf" {
; CHECK-LABEL: test1:
; CHECK: .cfi_sections .debug_frame
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: @ %bb.0: @ %entry
; CHECK-NEXT: pac r12, lr, sp
; CHECK-NEXT: .save {ra_auth_code}
; CHECK-NEXT: str r12, [sp, #-4]!
; CHECK-NEXT: .cfi_def_cfa_offset 4
; CHECK-NEXT: .cfi_offset ra_auth_code, -4
; CHECK-NEXT: .save {r11, lr}
; CHECK-NEXT: push.w {r11, lr}
; CHECK-NEXT: .cfi_def_cfa_offset 12
; CHECK-NEXT: .cfi_offset lr, -8
; CHECK-NEXT: .cfi_offset r11, -12
; CHECK-NEXT: .setfp r11, sp
; CHECK-NEXT: mov r11, sp
; CHECK-NEXT: .cfi_def_cfa_register r11
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: pop.w {r11, lr}
; CHECK-NEXT: ldr r12, [sp], #4
; CHECK-NEXT: aut r12, lr, sp
; CHECK-NEXT: bx lr
entry:
ret i32 0
}
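
Reading test1's .cfi_offset directives, the prologue checked here lays out the signed-return-address frame record as follows (offsets from the CFA):

;   CFA-4  : ra_auth_code (r12, the PAC-signed copy of lr)
;   CFA-8  : lr
;   CFA-12 : r11           <- frame pointer after 'mov r11, sp'
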

; void foo(int n) {
; int a[n];
; bar(a);
; }
define dso_local void @test2(i32 noundef %n) "sign-return-address"="non-leaf" {
; CHECK-LABEL: test2:
; CHECK: .cfi_startproc
; CHECK-NEXT: @ %bb.0: @ %entry
; CHECK-NEXT: pac r12, lr, sp
; CHECK-NEXT: .save {r4, r7, ra_auth_code}
; CHECK-NEXT: push.w {r4, r7, r12}
; CHECK-NEXT: .cfi_def_cfa_offset 12
; CHECK-NEXT: .cfi_offset ra_auth_code, -4
; CHECK-NEXT: .cfi_offset r7, -8
; CHECK-NEXT: .cfi_offset r4, -12
; CHECK-NEXT: .save {r11, lr}
; CHECK-NEXT: push.w {r11, lr}
; CHECK-NEXT: .cfi_def_cfa_offset 20
; CHECK-NEXT: .cfi_offset lr, -16
; CHECK-NEXT: .cfi_offset r11, -20
; CHECK-NEXT: .setfp r11, sp
; CHECK-NEXT: mov r11, sp
; CHECK-NEXT: .cfi_def_cfa_register r11
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: movs r1, #7
; CHECK-NEXT: add.w r0, r1, r0, lsl #2
; CHECK-NEXT: bic r0, r0, #7
; CHECK-NEXT: sub.w r0, sp, r0
; CHECK-NEXT: mov sp, r0
; CHECK-NEXT: bl take_ptr
; CHECK-NEXT: mov sp, r11
; CHECK-NEXT: pop.w {r11, lr}
; CHECK-NEXT: pop.w {r4, r7, r12}
; CHECK-NEXT: aut r12, lr, sp
; CHECK-NEXT: bx lr
entry:
%vla = alloca i32, i32 %n, align 4
call void @take_ptr(ptr noundef nonnull %vla)
ret void
}

; void test3(int c, float e, int z) {
; if (c)
; knr();
; take_ptr(alloca(z));
; if (e)
; knr();
; }
define void @test3(i32 noundef %c, float noundef %e, i32 noundef %z) "sign-return-address"="non-leaf" {
; CHECK-LABEL: test3:
; CHECK: .cfi_startproc
; CHECK-NEXT: @ %bb.0: @ %entry
; CHECK-NEXT: pac r12, lr, sp
; CHECK-NEXT: .save {r4, r5, r6, r7, ra_auth_code}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r12}
; CHECK-NEXT: .cfi_def_cfa_offset 20
; CHECK-NEXT: .cfi_offset ra_auth_code, -4
; CHECK-NEXT: .cfi_offset r7, -8
; CHECK-NEXT: .cfi_offset r6, -12
; CHECK-NEXT: .cfi_offset r5, -16
; CHECK-NEXT: .cfi_offset r4, -20
; CHECK-NEXT: .save {r11, lr}
; CHECK-NEXT: push.w {r11, lr}
; CHECK-NEXT: .cfi_def_cfa_offset 28
; CHECK-NEXT: .cfi_offset lr, -24
; CHECK-NEXT: .cfi_offset r11, -28
; CHECK-NEXT: .setfp r11, sp
; CHECK-NEXT: mov r11, sp
; CHECK-NEXT: .cfi_def_cfa_register r11
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: mov r5, r2
; CHECK-NEXT: mov r4, r1
; CHECK-NEXT: it ne
; CHECK-NEXT: blne knr
; CHECK-NEXT: adds r0, r5, #7
; CHECK-NEXT: bic r0, r0, #7
; CHECK-NEXT: sub.w r0, sp, r0
; CHECK-NEXT: mov sp, r0
; CHECK-NEXT: bl take_ptr
; CHECK-NEXT: mov r0, r4
; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: bl __aeabi_fcmpeq
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: it eq
; CHECK-NEXT: bleq knr
; CHECK-NEXT: mov sp, r11
; CHECK-NEXT: pop.w {r11, lr}
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r12}
; CHECK-NEXT: aut r12, lr, sp
; CHECK-NEXT: bx lr
entry:
%tobool.not = icmp eq i32 %c, 0
br i1 %tobool.not, label %if.end, label %if.then

if.then: ; preds = %entry
tail call void @knr()
br label %if.end

if.end: ; preds = %if.then, %entry
%0 = alloca i8, i32 %z, align 8
call void @take_ptr(ptr noundef nonnull %0)
%tobool1 = fcmp une float %e, 0.000000e+00
br i1 %tobool1, label %if.then2, label %if.end3

if.then2: ; preds = %if.end
call void @knr()
br label %if.end3

if.end3: ; preds = %if.then2, %if.end
ret void
}

declare void @knr(...)
declare void @take_ptr(ptr noundef)
18 changes: 9 additions & 9 deletions llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
@@ -1928,7 +1928,7 @@ define <8 x i16> @vec128_i16_unsigned_reg_reg(<8 x i16> %a1, <8 x i16> %a2) noun
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpminuw %xmm1, %xmm0, %xmm2
; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3
; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm3 = ~zmm3
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX512F-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
; AVX512F-NEXT: vpsubw %xmm2, %xmm1, %xmm1
@@ -1945,7 +1945,7 @@
; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} xmm2 = ~xmm2
; AVX512VL-FALLBACK-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
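
The X86 hunks in these midpoint files are assembly-comment changes only: each vpternlog immediate is unchanged, and the printer now decodes it into the boolean function it encodes. Bit i of the imm8 is the output for inputs (dst, src1, src2) read as a 3-bit index, dst being the high bit, with src1 and src2 the remaining AT&T operands read right to left. A worked decode of the immediates involved (a sketch):

imm8 = 15  = 0b00001111   ; set exactly where the dst bit is 0
  => dst = ~dst                            ; printed as "zmm3 = ~zmm3"
imm8 = 108 = 0b01101100   ; set for indices 2, 3, 5, 6
  => dst = src1 ^ (dst & src2)             ; "xmm2 = xmm1 ^ (xmm2 & mem)"
imm8 = 226 = 0b11100010   ; set for indices 1, 5, 6, 7
  => dst = src2 ^ (src1 & (dst ^ src2))    ; bitwise select: src1 ? dst : src2
imm8 = 184 = 0b10111000   ; set for indices 3, 4, 5, 7
  => dst = dst ^ (src2 & (dst ^ src1))     ; bitwise select: src2 ? src1 : dst
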
@@ -2500,7 +2500,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1
; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT: retq
@@ -2706,7 +2706,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpminub %xmm1, %xmm0, %xmm2
; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3
; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm3 = ~zmm3
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX512F-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
; AVX512F-NEXT: vpsubb %xmm2, %xmm1, %xmm1
@@ -2728,8 +2728,8 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} xmm2 = ~xmm2
; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} xmm1 = xmm2 ^ (xmm1 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT: retq
@@ -2961,7 +2961,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} xmm2 = xmm0 ^ (xmm2 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %xmm0, %xmm2, %xmm0
; AVX512VL-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512VL-FALLBACK-NEXT: retq
@@ -3192,7 +3192,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind
; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1
; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT: retq
@@ -3432,7 +3432,7 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1
; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT: retq
18 changes: 9 additions & 9 deletions llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
@@ -1434,7 +1434,7 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpminuw %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm3
; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm3 = ~zmm3
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubw %ymm2, %ymm1, %ymm1
@@ -1450,7 +1450,7 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm2, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} ymm2 = ~ymm2
; AVX512VL-FALLBACK-NEXT: vpxor %ymm2, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
@@ -2016,7 +2016,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2
; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
@@ -2169,7 +2169,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm3
; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm3 = ~zmm3
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1
@@ -2193,8 +2193,8 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2
; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm2, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm1
; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} ymm2 = ~ymm2
; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm1 = ymm2 ^ (ymm1 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
@@ -2372,7 +2372,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm2
; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm2 = ymm0 ^ (ymm2 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm2, %ymm0
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
@@ -2550,7 +2550,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2
; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
@@ -2733,7 +2733,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2
; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
40 changes: 20 additions & 20 deletions llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
@@ -288,7 +288,7 @@ define <32 x i16> @vec512_i16_signed_reg_reg(<32 x i16> %a1, <32 x i16> %a2) nou
; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0
@@ -315,7 +315,7 @@ define <32 x i16> @vec512_i16_signed_reg_reg(<32 x i16> %a1, <32 x i16> %a2) nou
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
@@ -365,7 +365,7 @@ define <32 x i16> @vec512_i16_unsigned_reg_reg(<32 x i16> %a1, <32 x i16> %a2) n
; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm5 & (zmm1 ^ zmm4))
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0
@@ -392,7 +392,7 @@ define <32 x i16> @vec512_i16_unsigned_reg_reg(<32 x i16> %a1, <32 x i16> %a2) n
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm5 & (zmm1 ^ zmm4))
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
@@ -445,7 +445,7 @@ define <32 x i16> @vec512_i16_signed_mem_reg(ptr %a1_addr, <32 x i16> %a2) nounw
; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vpsubw %ymm0, %ymm6, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
@@ -473,7 +473,7 @@ define <32 x i16> @vec512_i16_signed_mem_reg(ptr %a1_addr, <32 x i16> %a2) nounw
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm6, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0
; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0
@@ -526,7 +526,7 @@ define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, ptr %a2_addr) nounw
; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0
@@ -554,7 +554,7 @@ define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, ptr %a2_addr) nounw
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
@@ -608,7 +608,7 @@ define <32 x i16> @vec512_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwin
; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vpsubw %ymm0, %ymm6, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
@@ -637,7 +637,7 @@ define <32 x i16> @vec512_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwin
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm6, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0
; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0
@@ -700,7 +700,7 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin
; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2
; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
@@ -730,7 +730,7 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
@@ -784,7 +784,7 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw
; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2
; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm5 & (zmm1 ^ zmm4))
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
@@ -814,7 +814,7 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm5 & (zmm1 ^ zmm4))
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
@@ -872,7 +872,7 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind
; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0
; AVX512F-NEXT: vpsubb %ymm0, %ymm7, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
@@ -904,7 +904,7 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm7, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0
; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm0, %ymm0
@@ -962,7 +962,7 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind
; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
@@ -994,7 +994,7 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
@@ -1053,7 +1053,7 @@ define <64 x i8> @vec512_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0
; AVX512F-NEXT: vpsubb %ymm0, %ymm7, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
@@ -1086,7 +1086,7 @@ define <64 x i8> @vec512_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm7, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0
; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm0, %ymm0