96 changes: 48 additions & 48 deletions libcxx/include/__availability

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion libcxx/include/__type_traits/is_equality_comparable.h
@@ -17,7 +17,6 @@
#include <__type_traits/is_signed.h>
#include <__type_traits/is_void.h>
#include <__type_traits/remove_cv.h>
#include <__type_traits/remove_cvref.h>
#include <__type_traits/void_t.h>
#include <__utility/declval.h>

8 changes: 4 additions & 4 deletions lld/MachO/ObjC.cpp
@@ -437,7 +437,7 @@ class ObjcCategoryMerger {
void eraseMergedCategories();

void generateCatListForNonErasedCategories(
std::map<ConcatInputSection *, std::set<uint64_t>>
MapVector<ConcatInputSection *, std::set<uint64_t>>
catListToErasedOffsets);
void collectSectionWriteInfoFromIsec(const InputSection *isec,
InfoWriteSection &catWriteInfo);
@@ -491,7 +491,7 @@ class ObjcCategoryMerger {
InfoCategoryWriter infoCategoryWriter;
std::vector<ConcatInputSection *> &allInputSections;
// Map of base class Symbol to list of InfoInputCategory's for it
DenseMap<const Symbol *, std::vector<InfoInputCategory>> categoryMap;
MapVector<const Symbol *, std::vector<InfoInputCategory>> categoryMap;
// Set for tracking InputSection erased via eraseISec
DenseSet<InputSection *> erasedIsecs;

@@ -1104,7 +1104,7 @@ void ObjcCategoryMerger::collectAndValidateCategoriesData() {
// (not erased). For these not erased categories, we generate new __objc_catlist
// entries since the parent __objc_catlist entry will be erased
void ObjcCategoryMerger::generateCatListForNonErasedCategories(
const std::map<ConcatInputSection *, std::set<uint64_t>>
const MapVector<ConcatInputSection *, std::set<uint64_t>>
catListToErasedOffsets) {

// Go through all offsets of all __objc_catlist's that we process and if there
@@ -1171,7 +1171,7 @@ void ObjcCategoryMerger::eraseISec(ConcatInputSection *isec) {
// them.
void ObjcCategoryMerger::eraseMergedCategories() {
// Map of InputSection to a set of offsets of the categories that were merged
std::map<ConcatInputSection *, std::set<uint64_t>> catListToErasedOffsets;
MapVector<ConcatInputSection *, std::set<uint64_t>> catListToErasedOffsets;

for (auto &mapEntry : categoryMap) {
for (InfoInputCategory &catInfo : mapEntry.second) {
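
The std::map-to-MapVector switch above is about iteration: MapVector remembers insertion order, so walking categoryMap and catListToErasedOffsets is deterministic rather than dependent on pointer values (std::map keyed on pointers) or hashing (DenseMap), which keeps the linker's output stable across runs. A minimal standalone sketch of the property, assuming only llvm/ADT/MapVector.h:

    #include "llvm/ADT/MapVector.h"
    #include <cstdio>

    int main() {
      // MapVector iterates in insertion order; a std::map keyed on pointers
      // or a DenseMap would iterate in an order that can vary run to run.
      llvm::MapVector<int, const char *> M;
      M.insert({3, "three"});
      M.insert({1, "one"});
      M.insert({2, "two"});
      for (auto &KV : M)
        std::printf("%d -> %s\n", KV.first, KV.second); // prints 3, 1, 2
      return 0;
    }
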
3 changes: 2 additions & 1 deletion lldb/docs/use/qemu-testing.rst
@@ -172,6 +172,7 @@ forwarded for this to work.

.. note::
These options are used to create a "port map" within ``lldb-server``.
Unfortunately this map is not shared across all the processes it may create,
Unfortunately this map is not cleaned up on Windows on connection close,
and across a few uses you may run out of valid ports. To work around this,
restart the platform every so often, especially after running a set of tests.
This is tracked here: https://github.com/llvm/llvm-project/issues/90923
lldb/source/Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.cpp
@@ -506,6 +506,19 @@ DynamicLoaderPOSIXDYLD::GetStepThroughTrampolinePlan(Thread &thread,
Target &target = thread.GetProcess()->GetTarget();
const ModuleList &images = target.GetImages();

llvm::StringRef target_name = sym_name.GetStringRef();
// On AArch64, the trampoline name has a prefix (__AArch64ADRPThunk_ or
// __AArch64AbsLongThunk_) added to the function name. If we detect a
// trampoline with the prefix, we need to remove the prefix to find the
// function symbol.
if (target_name.consume_front("__AArch64ADRPThunk_") ||
target_name.consume_front("__AArch64AbsLongThunk_")) {
// An empty target name can happen for trampolines generated for
// section-referencing relocations.
if (!target_name.empty()) {
sym_name = ConstString(target_name);
}
}
images.FindSymbolsWithNameAndType(sym_name, eSymbolTypeCode, target_symbols);
if (!target_symbols.GetSize())
return thread_plan_sp;
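
The prefix handling above uses llvm::StringRef::consume_front, which strips the prefix in place and returns whether it matched. A standalone sketch of the same lookup fix (the symbol name is invented for illustration):

    #include "llvm/ADT/StringRef.h"
    #include <cstdio>

    int main() {
      llvm::StringRef Name = "__AArch64ADRPThunk_step_here"; // hypothetical
      // consume_front() advances Name past the prefix on a match. An empty
      // remainder means the thunk was generated for a section-referencing
      // relocation, so the original name is kept in that case.
      if (Name.consume_front("__AArch64ADRPThunk_") ||
          Name.consume_front("__AArch64AbsLongThunk_")) {
        if (!Name.empty())
          std::printf("lookup target: %s\n", Name.str().c_str());
      }
      return 0;
    }
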
19 changes: 18 additions & 1 deletion lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
@@ -2356,13 +2356,30 @@ unsigned ObjectFileELF::ParseSymbols(Symtab *symtab, user_id_t start_id,
bool symbol_size_valid =
symbol.st_size != 0 || symbol.getType() != STT_FUNC;

bool is_trampoline = false;
if (arch.IsValid() && (arch.GetMachine() == llvm::Triple::aarch64)) {
// On AArch64, trampolines are registered as code.
// If we detect a trampoline (which starts with __AArch64ADRPThunk_ or
// __AArch64AbsLongThunk_) we register the symbol as a trampoline. This
// way we will be able to detect the trampoline when we step into a function
// and step through the trampoline.
if (symbol_type == eSymbolTypeCode) {
llvm::StringRef trampoline_name = mangled.GetName().GetStringRef();
if (trampoline_name.starts_with("__AArch64ADRPThunk_") ||
trampoline_name.starts_with("__AArch64AbsLongThunk_")) {
symbol_type = eSymbolTypeTrampoline;
is_trampoline = true;
}
}
}

Symbol dc_symbol(
i + start_id, // ID is the original symbol table index.
mangled,
symbol_type, // Type of this symbol
is_global, // Is this globally visible?
false, // Is this symbol debug info?
false, // Is this symbol a trampoline?
is_trampoline, // Is this symbol a trampoline?
false, // Is this symbol artificial?
AddressRange(symbol_section_sp, // Section in which this symbol is
// defined or null.
15 changes: 15 additions & 0 deletions lldb/test/Shell/ExecControl/StepIn/Inputs/aarch64_thunk.cc
@@ -0,0 +1,15 @@
extern "C" int __attribute__((naked)) __AArch64ADRPThunk_step_here() {
asm (
"adrp x16, step_here\n"
"add x16, x16, :lo12:step_here\n"
"br x16"
);
}

extern "C" __attribute__((used)) int step_here() {
return 47;
}

int main() {
return __AArch64ADRPThunk_step_here();
}
17 changes: 17 additions & 0 deletions lldb/test/Shell/ExecControl/StepIn/step_through-aarch64-thunk.test
@@ -0,0 +1,17 @@
# REQUIRES: native && target-aarch64

# This test is specific to elf platforms.
# UNSUPPORTED: system-windows, system-darwin

# RUN: %clangxx_host %p/Inputs/aarch64_thunk.cc -g -o %t.out
# RUN: %lldb %t.out -s %s | FileCheck %s

b main
# CHECK: Breakpoint 1: where = step_through-aarch64-thunk.test.tmp.out`main

r
# CHECK: stop reason = breakpoint 1.1

s
# CHECK: stop reason = step in
# CHECK: frame #0: {{.*}} step_through-aarch64-thunk.test.tmp.out`::step_here()
49 changes: 36 additions & 13 deletions lldb/tools/lldb-server/lldb-platform.cpp
@@ -282,17 +282,12 @@ int main_platform(int argc, char *argv[]) {
}
}

do {
GDBRemoteCommunicationServerPlatform platform(
acceptor_up->GetSocketProtocol(), acceptor_up->GetSocketScheme());

if (port_offset > 0)
platform.SetPortOffset(port_offset);

if (!gdbserver_portmap.empty()) {
platform.SetPortMap(std::move(gdbserver_portmap));
}
GDBRemoteCommunicationServerPlatform platform(
acceptor_up->GetSocketProtocol(), acceptor_up->GetSocketScheme());
if (port_offset > 0)
platform.SetPortOffset(port_offset);

do {
const bool children_inherit_accept_socket = true;
Connection *conn = nullptr;
error = acceptor_up->Accept(children_inherit_accept_socket, conn);
@@ -301,13 +296,37 @@ int main_platform(int argc, char *argv[]) {
exit(socket_error);
}
printf("Connection established.\n");

if (g_server) {
// Collect child zombie processes.
#if !defined(_WIN32)
while (waitpid(-1, nullptr, WNOHANG) > 0)
;
::pid_t waitResult;
while ((waitResult = waitpid(-1, nullptr, WNOHANG)) > 0) {
// waitResult is the child pid
gdbserver_portmap.FreePortForProcess(waitResult);
}
#endif
if (fork()) {
// TODO: Clean up portmap for Windows when children die
// See https://github.com/llvm/llvm-project/issues/90923

// After reaping zombie children, get the next available port
GDBRemoteCommunicationServerPlatform::PortMap portmap_for_child;
llvm::Expected<uint16_t> available_port =
gdbserver_portmap.GetNextAvailablePort();
if (available_port)
portmap_for_child.AllowPort(*available_port);
else {
llvm::consumeError(available_port.takeError());
fprintf(stderr,
"no available gdbserver port for connection - dropping...\n");
delete conn;
continue;
}
platform.SetPortMap(std::move(portmap_for_child));

auto childPid = fork();
if (childPid) {
gdbserver_portmap.AssociatePortWithProcess(*available_port, childPid);
// Parent doesn't need a connection to the lldb client
delete conn;

@@ -323,7 +342,11 @@ int main_platform(int argc, char *argv[]) {
// If not running as a server, this process will not accept
// connections while a connection is active.
acceptor_up.reset();

// When not running in server mode, use all available ports
platform.SetPortMap(std::move(gdbserver_portmap));
}

platform.SetConnection(std::unique_ptr<Connection>(conn));

if (platform.IsConnected()) {
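
Taken together, the new server-mode flow is: reap exited children without blocking, free each child's reserved port, reserve exactly one port for the next connection, fork, and record the child's pid against that port. A reduced sketch of the reaping step; the std::map is a stand-in for the real port map type:

    #include <sys/wait.h>
    #include <cstdint>
    #include <map>

    // Stand-in for the gdbserver port map: reserved port keyed by child pid.
    static std::map<pid_t, uint16_t> g_port_by_pid;

    // Non-blocking reap loop mirroring the one added above: each exited
    // child returns the single port that was reserved for it to the pool.
    static void reapChildrenAndFreePorts() {
      pid_t child;
      while ((child = waitpid(-1, nullptr, WNOHANG)) > 0)
        g_port_by_pid.erase(child);
    }
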
10 changes: 10 additions & 0 deletions llvm/docs/CommandGuide/llvm-mca.rst
@@ -234,6 +234,16 @@ option specifies "``-``", then the output will also be sent to standard output.
no extra information, and InstrumentManager never overrides the default
schedule class for a given instruction.

.. option:: -skip-unsupported-instructions=<reason>

Force :program:`llvm-mca` to continue in the presence of instructions which do
not parse or which lack key scheduling information. Note that the resulting
analysis is affected, since unsupported instructions are ignored as if they
had not been supplied as part of the input.

The choice of ``<reason>`` controls when :program:`llvm-mca` will report an
error. ``<reason>`` may be ``none`` (the default), ``lack-sched``,
``parse-failure``, or ``any``.

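For example, the following invocation (input file name invented) analyzes
whatever parses in ``input.s`` and drops the rest, where the default ``none``
aborts on the first unparseable instruction::

  llvm-mca -skip-unsupported-instructions=parse-failure input.s
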
EXIT STATUS
-----------

4 changes: 2 additions & 2 deletions llvm/docs/LangRef.rst
@@ -9113,8 +9113,8 @@ instruction in most regards. The primary difference is that it
establishes an association with additional labels to define where control
flow goes after the call.

The output values of a '``callbr``' instruction are available only to
the '``fallthrough``' block, not to any '``indirect``' blocks(s).
The output values of a '``callbr``' instruction are available both in the
'``fallthrough``' block and in any '``indirect``' block(s).

The only use of this today is to implement the "goto" feature of gcc inline
assembly where additional labels can be provided as locations for the inline
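
A minimal hand-written illustration of the relaxed rule (not taken from the patch): ``%res`` below may now be used in both successors, where previously only the fallthrough block could reference it.

::

      %res = callbr i32 asm "", "=r,!i"()
              to label %direct [label %indirect]

    direct:                                    ; fallthrough successor
      ret i32 %res

    indirect:                                  ; may now also use %res
      ret i32 %res
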
10 changes: 10 additions & 0 deletions llvm/docs/ReleaseNotes.rst
@@ -85,7 +85,9 @@ Changes to the AMDGPU Backend

Changes to the ARM Backend
--------------------------

* FEAT_F32MM is no longer activated by default when using ``+sve`` on v8.6-A or greater. The feature is still available and can be used by adding ``+f32mm`` to the command line options.
* armv8-r now implies only fp-armv8d16sp, rather than neon and full fp-armv8. These features are still included by default for cortex-r52. The default cpu for armv8-r is now "generic", for compatibility with variants that do not include neon, fp64, and d32.

Changes to the AVR Backend
--------------------------
@@ -223,6 +225,14 @@ Changes to the LLVM tools
(`#89162 <https://github.com/llvm/llvm-project/pull/89162>`_)
``--raw-relr`` has been removed.

* llvm-mca now aborts by default if it is given bad input where previously it
would continue. Additionally, it can now continue when it encounters
instructions which lack scheduling information. The behaviour can be
controlled by the newly introduced
``--skip-unsupported-instructions=<none|lack-sched|parse-failure|any>``, as
documented in ``--help`` output and the command guide. (`#90474
<https://github.com/llvm/llvm-project/pull/90474>`_)

Changes to LLDB
---------------------------------

12 changes: 0 additions & 12 deletions llvm/include/llvm/Analysis/CFG.h
@@ -96,18 +96,6 @@ bool isPotentiallyReachableFromMany(
const SmallPtrSetImpl<BasicBlock *> *ExclusionSet,
const DominatorTree *DT = nullptr, const LoopInfo *LI = nullptr);

/// Determine whether there is a potentially a path from at least one block in
/// 'Worklist' to at least one block in 'StopSet' within a single function
/// without passing through any of the blocks in 'ExclusionSet'. Returns false
/// only if we can prove that once any block in 'Worklist' has been reached then
/// no blocks in 'StopSet' can be executed without passing through any blocks in
/// 'ExclusionSet'. Conservatively returns true.
bool isManyPotentiallyReachableFromMany(
SmallVectorImpl<BasicBlock *> &Worklist,
const SmallPtrSetImpl<const BasicBlock *> &StopSet,
const SmallPtrSetImpl<BasicBlock *> *ExclusionSet,
const DominatorTree *DT = nullptr, const LoopInfo *LI = nullptr);

/// Return true if the control flow in \p RPOTraversal is irreducible.
///
/// This is a generic implementation to detect CFG irreducibility based on loop
53 changes: 29 additions & 24 deletions llvm/include/llvm/Analysis/IndirectCallVisitor.h
@@ -27,31 +27,21 @@ struct PGOIndirectCallVisitor : public InstVisitor<PGOIndirectCallVisitor> {
std::vector<Instruction *> ProfiledAddresses;
PGOIndirectCallVisitor(InstructionType Type) : Type(Type) {}

void visitCallBase(CallBase &Call) {
if (!Call.isIndirectCall())
return;

if (Type == InstructionType::kIndirectCall) {
IndirectCalls.push_back(&Call);
return;
}

assert(Type == InstructionType::kVTableVal && "Control flow guaranteed");
// Given an indirect call instruction, try to find the following pattern
//
// %vtable = load ptr, ptr %obj
// %vfn = getelementptr inbounds ptr, ptr %vtable, i64 1
// %2 = load ptr, ptr %vfn
// %call = tail call i32 %2
//
// A heuristic is used to find the address feeding instructions.
static Instruction *tryGetVTableInstruction(CallBase *CB) {
assert(CB != nullptr && "Caller guaranteed");
LoadInst *LI = dyn_cast<LoadInst>(CB->getCalledOperand());

LoadInst *LI = dyn_cast<LoadInst>(Call.getCalledOperand());
// The code pattern to look for
//
// %vtable = load ptr, ptr %b
// %vfn = getelementptr inbounds ptr, ptr %vtable, i64 1
// %2 = load ptr, ptr %vfn
// %call = tail call i32 %2(ptr %b)
//
// %vtable is the vtable address value to profile, and
// %2 is the indirect call target address to profile.
if (LI != nullptr) {
Value *Ptr = LI->getPointerOperand();
Value *VTablePtr = Ptr->stripInBoundsConstantOffsets();
// This is a heuristic to find address feeding instructions.
Value *FuncPtr = LI->getPointerOperand(); // GEP (or bitcast)
Value *VTablePtr = FuncPtr->stripInBoundsConstantOffsets();
// FIXME: Add support in the frontend so LLVM type intrinsics are
// emitted without LTO. This way, added intrinsics could filter
// non-vtable instructions and reduce instrumentation overhead.
@@ -63,7 +53,22 @@ struct PGOIndirectCallVisitor : public InstVisitor<PGOIndirectCallVisitor> {
// address is negligible if exists at all. Comparing loaded address
// with symbol address guarantees correctness.
if (VTablePtr != nullptr && isa<Instruction>(VTablePtr))
ProfiledAddresses.push_back(cast<Instruction>(VTablePtr));
return cast<Instruction>(VTablePtr);
}
return nullptr;
}

void visitCallBase(CallBase &Call) {
if (Call.isIndirectCall()) {
IndirectCalls.push_back(&Call);

if (Type != InstructionType::kVTableVal)
return;

Instruction *VPtr =
PGOIndirectCallVisitor::tryGetVTableInstruction(&Call);
if (VPtr)
ProfiledAddresses.push_back(VPtr);
}
}

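
A hedged usage sketch for the refactored visitor, assuming InstructionType is the nested enum taken by the constructor (as the surrounding code suggests) and F is an llvm::Function:

    // InstVisitor::visit walks every instruction in F and dispatches each
    // call site to visitCallBase above.
    PGOIndirectCallVisitor Visitor(
        PGOIndirectCallVisitor::InstructionType::kVTableVal);
    Visitor.visit(F);
    // IndirectCalls now holds every indirect call; ProfiledAddresses holds
    // the vtable-address instructions found by tryGetVTableInstruction.
    for (llvm::Instruction *VTableAddr : Visitor.ProfiledAddresses)
      VTableAddr->print(llvm::errs());
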
5 changes: 5 additions & 0 deletions llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -848,6 +848,11 @@ class CombinerHelper {
bool matchExtractVectorElementWithBuildVectorTrunc(const MachineOperand &MO,
BuildFnTy &MatchInfo);

/// Combine extract vector element with a shuffle vector on the vector
/// register.
bool matchExtractVectorElementWithShuffleVector(const MachineOperand &MO,
BuildFnTy &MatchInfo);

/// Combine extract vector element with an insert vector element on the vector
/// register and different indices.
bool matchExtractVectorElementWithDifferentIndices(const MachineOperand &MO,
12 changes: 12 additions & 0 deletions llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -294,6 +294,18 @@ class GBuildVectorTrunc : public GMergeLikeInstr {
}
};

/// Represents a G_SHUFFLE_VECTOR.
class GShuffleVector : public GenericMachineInstr {
public:
Register getSrc1Reg() const { return getOperand(1).getReg(); }
Register getSrc2Reg() const { return getOperand(2).getReg(); }
ArrayRef<int> getMask() const { return getOperand(3).getShuffleMask(); }

static bool classof(const MachineInstr *MI) {
return MI->getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR;
}
};

/// Represents a G_PTR_ADD.
class GPtrAdd : public GenericMachineInstr {
public:
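
classof is what lets LLVM's casting utilities treat a plain MachineInstr as a GShuffleVector. A small hedged sketch, where MI is some MachineInstr visited during a combine:

    if (auto *Shuffle = llvm::dyn_cast<llvm::GShuffleVector>(&MI)) {
      llvm::Register LHS = Shuffle->getSrc1Reg();
      llvm::Register RHS = Shuffle->getSrc2Reg();
      llvm::ArrayRef<int> Mask = Shuffle->getMask();
      // A mask element < 0 means undefined; elements below the element count
      // of LHS select from LHS, the rest select from RHS.
      (void)LHS; (void)RHS; (void)Mask;
    }
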
3 changes: 1 addition & 2 deletions llvm/include/llvm/IR/IntrinsicInst.h
@@ -707,8 +707,7 @@ class VPBinOpIntrinsic : public VPIntrinsic {
/// This is the common base class for constrained floating point intrinsics.
class ConstrainedFPIntrinsic : public IntrinsicInst {
public:
bool isUnaryOp() const;
bool isTernaryOp() const;
unsigned getNonMetadataArgCount() const;
std::optional<RoundingMode> getRoundingMode() const;
std::optional<fp::ExceptionBehavior> getExceptionBehavior() const;
bool isDefaultFPEnvironment() const;
4 changes: 4 additions & 0 deletions llvm/include/llvm/IR/Intrinsics.h
@@ -109,6 +109,10 @@ namespace Intrinsic {
/// Floating-Point Intrinsics".
bool isConstrainedFPIntrinsic(ID QID);

/// Returns true if the intrinsic ID is for one of the "Constrained
/// Floating-Point Intrinsics" that take rounding mode metadata.
bool hasConstrainedFPRoundingModeOperand(ID QID);

/// This is a type descriptor which explains the type requirements of an
/// intrinsic. This is returned by getIntrinsicInfoTableEntries.
struct IITDescriptor {
8 changes: 8 additions & 0 deletions llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -1501,6 +1501,13 @@ def extract_vector_element_freeze : GICombineRule<
[{ return Helper.matchExtractVectorElementWithFreeze(${root}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>;

def extract_vector_element_shuffle_vector : GICombineRule<
(defs root:$root, build_fn_matchinfo:$matchinfo),
(match (G_SHUFFLE_VECTOR $src, $src1, $src2, $mask),
(G_EXTRACT_VECTOR_ELT $root, $src, $idx),
[{ return Helper.matchExtractVectorElementWithShuffleVector(${root}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>;

// Combines concat operations
def concat_matchinfo : GIDefMatchData<"SmallVector<Register>">;
def combine_concat_vector : GICombineRule<
@@ -1578,6 +1585,7 @@ extract_vector_element_build_vector_trunc6,
extract_vector_element_build_vector_trunc7,
extract_vector_element_build_vector_trunc8,
extract_vector_element_freeze,
extract_vector_element_shuffle_vector,
insert_vector_element_extract_vector_element
]>;

4 changes: 2 additions & 2 deletions llvm/include/llvm/TargetParser/ARMTargetParser.def
@@ -183,7 +183,7 @@ ARM_ARCH("armv9.5-a", ARMV9_5A, "9.5-A", "+v9.5a", ARMBuildAttrs::CPUArch::v9_A,
ARM::AEK_HWDIVTHUMB | ARM::AEK_DSP | ARM::AEK_CRC | ARM::AEK_RAS |
ARM::AEK_DOTPROD | ARM::AEK_BF16 | ARM::AEK_I8MM))
ARM_ARCH("armv8-r", ARMV8R, "8-R", "+v8r", ARMBuildAttrs::CPUArch::v8_R,
FK_NEON_FP_ARMV8,
FK_FPV5_SP_D16,
(ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
ARM::AEK_HWDIVTHUMB | ARM::AEK_DSP | ARM::AEK_CRC))
ARM_ARCH("armv8-m.base", ARMV8MBaseline, "8-M.Baseline", "+v8m.base",
@@ -329,7 +329,7 @@ ARM_CPU_NAME("cortex-r7", ARMV7R, FK_VFPV3_D16_FP16, false,
(ARM::AEK_MP | ARM::AEK_HWDIVARM))
ARM_CPU_NAME("cortex-r8", ARMV7R, FK_VFPV3_D16_FP16, false,
(ARM::AEK_MP | ARM::AEK_HWDIVARM))
ARM_CPU_NAME("cortex-r52", ARMV8R, FK_NEON_FP_ARMV8, true, ARM::AEK_NONE)
ARM_CPU_NAME("cortex-r52", ARMV8R, FK_NEON_FP_ARMV8, false, ARM::AEK_NONE)
ARM_CPU_NAME("sc300", ARMV7M, FK_NONE, false, ARM::AEK_NONE)
ARM_CPU_NAME("cortex-m3", ARMV7M, FK_NONE, true, ARM::AEK_NONE)
ARM_CPU_NAME("cortex-m4", ARMV7EM, FK_FPV4_SP_D16, true, ARM::AEK_NONE)
32 changes: 7 additions & 25 deletions llvm/lib/Analysis/CFG.cpp
@@ -134,21 +134,10 @@ bool llvm::isPotentiallyReachableFromMany(
SmallVectorImpl<BasicBlock *> &Worklist, const BasicBlock *StopBB,
const SmallPtrSetImpl<BasicBlock *> *ExclusionSet, const DominatorTree *DT,
const LoopInfo *LI) {
return isManyPotentiallyReachableFromMany(
Worklist, llvm::SmallPtrSet<const BasicBlock *, 1>{StopBB}, ExclusionSet,
DT, LI);
}

bool llvm::isManyPotentiallyReachableFromMany(
SmallVectorImpl<BasicBlock *> &Worklist,
const SmallPtrSetImpl<const BasicBlock *> &StopSet,
const SmallPtrSetImpl<BasicBlock *> *ExclusionSet, const DominatorTree *DT,
const LoopInfo *LI) {
// When a stop block is unreachable, it's dominated from everywhere,
// When the stop block is unreachable, it's dominated from everywhere,
// regardless of whether there's a path between the two blocks.
llvm::DenseMap<const BasicBlock *, bool> StopBBReachable;
for (auto *BB : StopSet)
StopBBReachable[BB] = DT && DT->isReachableFromEntry(BB);
if (DT && !DT->isReachableFromEntry(StopBB))
DT = nullptr;

// We can't skip directly from a block that dominates the stop block if the
// exclusion block is potentially in between.
@@ -166,23 +155,19 @@ bool llvm::isManyPotentiallyReachableFromMany(
}
}

llvm::DenseMap<const BasicBlock *, const Loop *> StopLoops;
for (auto *StopBB : StopSet)
StopLoops[StopBB] = LI ? getOutermostLoop(LI, StopBB) : nullptr;
const Loop *StopLoop = LI ? getOutermostLoop(LI, StopBB) : nullptr;

unsigned Limit = DefaultMaxBBsToExplore;
SmallPtrSet<const BasicBlock*, 32> Visited;
do {
BasicBlock *BB = Worklist.pop_back_val();
if (!Visited.insert(BB).second)
continue;
if (StopSet.contains(BB))
if (BB == StopBB)
return true;
if (ExclusionSet && ExclusionSet->count(BB))
continue;
if (DT && llvm::any_of(StopSet, [&](const BasicBlock *StopBB) {
return StopBBReachable[BB] && DT->dominates(BB, StopBB);
}))
if (DT && DT->dominates(BB, StopBB))
return true;

const Loop *Outer = nullptr;
@@ -194,10 +179,7 @@ bool llvm::isManyPotentiallyReachableFromMany(
// excluded block. Clear Outer so we process BB's successors.
if (LoopsWithHoles.count(Outer))
Outer = nullptr;
if (llvm::any_of(StopSet, [&](const BasicBlock *StopBB) {
const Loop *StopLoop = StopLoops[StopBB];
return StopLoop && StopLoop == Outer;
}))
if (StopLoop && Outer == StopLoop)
return true;
}

5 changes: 5 additions & 0 deletions llvm/lib/Analysis/ValueTracking.cpp
@@ -2173,6 +2173,11 @@ bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth,
if (OrZero || RHSBits.One.getBoolValue() || LHSBits.One.getBoolValue())
return true;
}

// LShr(UINT_MAX, Y) + 1 is a power of two (if add is nuw) or zero.
if (OrZero || Q.IIQ.hasNoUnsignedWrap(VOBO))
if (match(I, m_Add(m_LShr(m_AllOnes(), m_Value()), m_One())))
return true;
return false;
}
case Instruction::Select:
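
The reasoning behind the new ValueTracking case, spelled out for one width: for i8, lshr(-1, y) is 2^(8-y) - 1 for y in [0,7], so adding one yields exactly 2^(8-y) unless the add wraps; nuw makes the wrap poison (so the analysis may assume it does not happen), and with OrZero the wrapped value 0 is acceptable anyway. A hand-written IR example, not from the patch:

    define i8 @pow2(i8 %y) {
      ; %m = 2^(8-%y) - 1, e.g. %y = 3 gives 31.
      %m = lshr i8 -1, %y
      ; With nuw the %y = 0 wrap (255 + 1) is poison, so %p is a power of
      ; two whenever it is well defined: 32 for %y = 3.
      %p = add nuw i8 %m, 1
      ret i8 %p
    }
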
106 changes: 106 additions & 0 deletions llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp
@@ -325,6 +325,112 @@ bool CombinerHelper::matchExtractVectorElementWithBuildVectorTrunc(
return true;
}

bool CombinerHelper::matchExtractVectorElementWithShuffleVector(
const MachineOperand &MO, BuildFnTy &MatchInfo) {
GExtractVectorElement *Extract =
cast<GExtractVectorElement>(getDefIgnoringCopies(MO.getReg(), MRI));

//
// %zero:_(s64) = G_CONSTANT i64 0
// %sv:_(<4 x s32>) = G_SHUFFLE_VECTOR %arg1(<4 x s32>), %arg2(<4 x s32>),
// shufflemask(0, 0, 0, 0)
// %extract:_(s32) = G_EXTRACT_VECTOR_ELT %sv(<4 x s32>), %zero(s64)
//
// -->
//
// %zero1:_(s64) = G_CONSTANT i64 0
// %extract:_(s32) = G_EXTRACT_VECTOR_ELT %arg1(<4 x s32>), %zero1(s64)
//
//
// %three:_(s64) = G_CONSTANT i64 3
// %sv:_(<4 x s32>) = G_SHUFFLE_VECTOR %arg1(<4 x s32>), %arg2(<4 x s32>),
// shufflemask(0, 0, 0, -1)
// %extract:_(s32) = G_EXTRACT_VECTOR_ELT %sv(<4 x s32>), %three(s64)
//
// -->
//
// %extract:_(s32) = G_IMPLICIT_DEF
//
//
// With an opaque (non-constant) index we cannot look through the shuffle,
// so the pattern is left unchanged:
//
// %sv:_(<4 x s32>) = G_SHUFFLE_VECTOR %arg1(<4 x s32>), %arg2(<4 x s32>),
// shufflemask(0, 0, 0, -1)
// %extract:_(s32) = G_EXTRACT_VECTOR_ELT %sv(<4 x s32>), %opaque(s64)
//

// We try to get the value of the Index register.
std::optional<ValueAndVReg> MaybeIndex =
getIConstantVRegValWithLookThrough(Extract->getIndexReg(), MRI);
if (!MaybeIndex)
return false;

GShuffleVector *Shuffle =
cast<GShuffleVector>(getDefIgnoringCopies(Extract->getVectorReg(), MRI));

ArrayRef<int> Mask = Shuffle->getMask();

unsigned Offset = MaybeIndex->Value.getZExtValue();
int SrcIdx = Mask[Offset];

LLT Src1Type = MRI.getType(Shuffle->getSrc1Reg());
// At the IR level a <1 x ty> shuffle vector is valid, but we want to extract
// from a vector.
assert(Src1Type.isVector() && "expected to extract from a vector");
unsigned LHSWidth = Src1Type.isVector() ? Src1Type.getNumElements() : 1;

// Note that there is no one-use check.
Register Dst = Extract->getReg(0);
LLT DstTy = MRI.getType(Dst);

if (SrcIdx < 0 &&
isLegalOrBeforeLegalizer({TargetOpcode::G_IMPLICIT_DEF, {DstTy}})) {
MatchInfo = [=](MachineIRBuilder &B) { B.buildUndef(Dst); };
return true;
}

// If the legality check failed, then we still have to abort.
if (SrcIdx < 0)
return false;

Register NewVector;

// We check in which vector and at what offset to look through.
if (SrcIdx < (int)LHSWidth) {
NewVector = Shuffle->getSrc1Reg();
// SrcIdx unchanged
} else { // SrcIdx >= LHSWidth
NewVector = Shuffle->getSrc2Reg();
SrcIdx -= LHSWidth;
}

LLT IdxTy = MRI.getType(Extract->getIndexReg());
LLT NewVectorTy = MRI.getType(NewVector);

// We check the legality of the look through.
if (!isLegalOrBeforeLegalizer(
{TargetOpcode::G_EXTRACT_VECTOR_ELT, {DstTy, NewVectorTy, IdxTy}}) ||
!isConstantLegalOrBeforeLegalizer({IdxTy}))
return false;

// We look through the shuffle vector.
MatchInfo = [=](MachineIRBuilder &B) {
auto Idx = B.buildConstant(IdxTy, SrcIdx);
B.buildExtractVectorElement(Dst, NewVector, Idx);
};

return true;
}

bool CombinerHelper::matchInsertVectorElementOOB(MachineInstr &MI,
BuildFnTy &MatchInfo) {
GInsertVectorElement *Insert = cast<GInsertVectorElement>(&MI);
7 changes: 2 additions & 5 deletions llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -2053,11 +2053,8 @@ bool IRTranslator::translateConstrainedFPIntrinsic(
Flags |= MachineInstr::NoFPExcept;

SmallVector<llvm::SrcOp, 4> VRegs;
VRegs.push_back(getOrCreateVReg(*FPI.getArgOperand(0)));
if (!FPI.isUnaryOp())
VRegs.push_back(getOrCreateVReg(*FPI.getArgOperand(1)));
if (FPI.isTernaryOp())
VRegs.push_back(getOrCreateVReg(*FPI.getArgOperand(2)));
for (unsigned I = 0, E = FPI.getNonMetadataArgCount(); I != E; ++I)
VRegs.push_back(getOrCreateVReg(*FPI.getArgOperand(I)));

MIRBuilder.buildInstr(Opcode, {getOrCreateVReg(FPI)}, VRegs, Flags);
return true;
3 changes: 3 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -786,6 +786,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue ScalarizeVecRes_InregOp(SDNode *N);
SDValue ScalarizeVecRes_VecInregOp(SDNode *N);

SDValue ScalarizeVecRes_ADDRSPACECAST(SDNode *N);
SDValue ScalarizeVecRes_BITCAST(SDNode *N);
SDValue ScalarizeVecRes_BUILD_VECTOR(SDNode *N);
SDValue ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N);
@@ -853,6 +854,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_ADDRSPACECAST(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_FFREXP(SDNode *N, unsigned ResNo, SDValue &Lo, SDValue &Hi);
void SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -956,6 +958,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
// Widen Vector Result Promotion.
void WidenVectorResult(SDNode *N, unsigned ResNo);
SDValue WidenVecRes_MERGE_VALUES(SDNode* N, unsigned ResNo);
SDValue WidenVecRes_ADDRSPACECAST(SDNode *N);
SDValue WidenVecRes_AssertZext(SDNode* N);
SDValue WidenVecRes_BITCAST(SDNode* N);
SDValue WidenVecRes_BUILD_VECTOR(SDNode* N);
65 changes: 65 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -23,6 +23,7 @@
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TypeSize.h"
@@ -116,6 +117,9 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::FCANONICALIZE:
R = ScalarizeVecRes_UnaryOp(N);
break;
case ISD::ADDRSPACECAST:
R = ScalarizeVecRes_ADDRSPACECAST(N);
break;
case ISD::FFREXP:
R = ScalarizeVecRes_FFREXP(N, ResNo);
break;
@@ -475,6 +479,31 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_VecInregOp(SDNode *N) {
llvm_unreachable("Illegal extend_vector_inreg opcode");
}

SDValue DAGTypeLegalizer::ScalarizeVecRes_ADDRSPACECAST(SDNode *N) {
EVT DestVT = N->getValueType(0).getVectorElementType();
SDValue Op = N->getOperand(0);
EVT OpVT = Op.getValueType();
SDLoc DL(N);
// The result needs scalarizing, but it's not a given that the source does.
// This is a workaround for targets where it's impossible to scalarize the
// result of a conversion, because the source type is legal.
// For instance, this happens on AArch64: v1i1 is illegal but v1i{8,16,32}
// are widened to v8i8, v4i16, and v2i32, which is legal, because v1i64 is
// legal and was not scalarized.
// See the similar logic in ScalarizeVecRes_SETCC
if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
Op = GetScalarizedVector(Op);
} else {
EVT VT = OpVT.getVectorElementType();
Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
DAG.getVectorIdxConstant(0, DL));
}
auto *AddrSpaceCastN = cast<AddrSpaceCastSDNode>(N);
unsigned SrcAS = AddrSpaceCastN->getSrcAddressSpace();
unsigned DestAS = AddrSpaceCastN->getDestAddressSpace();
return DAG.getAddrSpaceCast(DL, DestVT, Op, SrcAS, DestAS);
}

SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) {
// If the operand is wider than the vector element type then it is implicitly
// truncated. Make that explicit here.
@@ -1122,6 +1151,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::FCANONICALIZE:
SplitVecRes_UnaryOp(N, Lo, Hi);
break;
case ISD::ADDRSPACECAST:
SplitVecRes_ADDRSPACECAST(N, Lo, Hi);
break;
case ISD::FFREXP:
SplitVecRes_FFREXP(N, ResNo, Lo, Hi);
break;
@@ -2353,6 +2385,26 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo,
Hi = DAG.getNode(Opcode, dl, HiVT, {Hi, MaskHi, EVLHi}, Flags);
}

void DAGTypeLegalizer::SplitVecRes_ADDRSPACECAST(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDLoc dl(N);
auto [LoVT, HiVT] = DAG.GetSplitDestVTs(N->getValueType(0));

// If the input also splits, handle it directly for a compile time speedup.
// Otherwise split it by hand.
EVT InVT = N->getOperand(0).getValueType();
if (getTypeAction(InVT) == TargetLowering::TypeSplitVector)
GetSplitVector(N->getOperand(0), Lo, Hi);
else
std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);

auto *AddrSpaceCastN = cast<AddrSpaceCastSDNode>(N);
unsigned SrcAS = AddrSpaceCastN->getSrcAddressSpace();
unsigned DestAS = AddrSpaceCastN->getDestAddressSpace();
Lo = DAG.getAddrSpaceCast(dl, LoVT, Lo, SrcAS, DestAS);
Hi = DAG.getAddrSpaceCast(dl, HiVT, Hi, SrcAS, DestAS);
}

void DAGTypeLegalizer::SplitVecRes_FFREXP(SDNode *N, unsigned ResNo,
SDValue &Lo, SDValue &Hi) {
SDLoc dl(N);
@@ -4121,6 +4173,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
report_fatal_error("Do not know how to widen the result of this operator!");

case ISD::MERGE_VALUES: Res = WidenVecRes_MERGE_VALUES(N, ResNo); break;
case ISD::ADDRSPACECAST:
Res = WidenVecRes_ADDRSPACECAST(N);
break;
case ISD::AssertZext: Res = WidenVecRes_AssertZext(N); break;
case ISD::BITCAST: Res = WidenVecRes_BITCAST(N); break;
case ISD::BUILD_VECTOR: Res = WidenVecRes_BUILD_VECTOR(N); break;
@@ -5086,6 +5141,16 @@ SDValue DAGTypeLegalizer::WidenVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo) {
return GetWidenedVector(WidenVec);
}

SDValue DAGTypeLegalizer::WidenVecRes_ADDRSPACECAST(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue InOp = GetWidenedVector(N->getOperand(0));
auto *AddrSpaceCastN = cast<AddrSpaceCastSDNode>(N);

return DAG.getAddrSpaceCast(SDLoc(N), WidenVT, InOp,
AddrSpaceCastN->getSrcAddressSpace(),
AddrSpaceCastN->getDestAddressSpace());
}

SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
SDValue InOp = N->getOperand(0);
EVT InVT = InOp.getValueType();
12 changes: 2 additions & 10 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7962,16 +7962,8 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
SDValue Chain = DAG.getRoot();
SmallVector<SDValue, 4> Opers;
Opers.push_back(Chain);
if (FPI.isUnaryOp()) {
Opers.push_back(getValue(FPI.getArgOperand(0)));
} else if (FPI.isTernaryOp()) {
Opers.push_back(getValue(FPI.getArgOperand(0)));
Opers.push_back(getValue(FPI.getArgOperand(1)));
Opers.push_back(getValue(FPI.getArgOperand(2)));
} else {
Opers.push_back(getValue(FPI.getArgOperand(0)));
Opers.push_back(getValue(FPI.getArgOperand(1)));
}
for (unsigned I = 0, E = FPI.getNonMetadataArgCount(); I != E; ++I)
Opers.push_back(getValue(FPI.getArgOperand(I)));

auto pushOutChain = [this](SDValue Result, fp::ExceptionBehavior EB) {
assert(Result.getNode()->getNumValues() == 2);
12 changes: 12 additions & 0 deletions llvm/lib/IR/Function.cpp
@@ -1493,7 +1493,19 @@ bool Intrinsic::isConstrainedFPIntrinsic(ID QID) {
#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \
case Intrinsic::INTRINSIC:
#include "llvm/IR/ConstrainedOps.def"
#undef INSTRUCTION
return true;
default:
return false;
}
}

bool Intrinsic::hasConstrainedFPRoundingModeOperand(Intrinsic::ID QID) {
switch (QID) {
#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \
case Intrinsic::INTRINSIC: \
return ROUND_MODE == 1;
#include "llvm/IR/ConstrainedOps.def"
#undef INSTRUCTION
default:
return false;
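
Both helpers use the repeated-#include X-macro idiom: ConstrainedOps.def expands INSTRUCTION once per intrinsic, and the includer defines INSTRUCTION to generate one switch case per entry. A self-contained sketch of the idiom with invented entries (not the real ConstrainedOps.def contents):

    #include <cstdio>
    #include <cstring>

    // Stands in for the .def file: one INSTRUCTION(name, nargs, has_rounding)
    // entry per operation.
    #define FAKE_OPS(INSTRUCTION)                                              \
      INSTRUCTION(fadd, 2, 1)                                                  \
      INSTRUCTION(fcmp, 2, 0)

    static bool hasRoundingOperand(const char *Name) {
    #define INSTRUCTION(NAME, NARG, ROUND_MODE)                                \
      if (std::strcmp(Name, #NAME) == 0)                                       \
        return ROUND_MODE == 1;
      FAKE_OPS(INSTRUCTION)
    #undef INSTRUCTION
      return false;
    }

    int main() { std::printf("%d\n", hasRoundingOperand("fadd")); } // prints 1
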
25 changes: 3 additions & 22 deletions llvm/lib/IR/IRBuilder.cpp
@@ -1029,17 +1029,7 @@ CallInst *IRBuilderBase::CreateConstrainedFPCast(
UseFMF = FMFSource->getFastMathFlags();

CallInst *C;
bool HasRoundingMD = false;
switch (ID) {
default:
break;
#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \
case Intrinsic::INTRINSIC: \
HasRoundingMD = ROUND_MODE; \
break;
#include "llvm/IR/ConstrainedOps.def"
}
if (HasRoundingMD) {
if (Intrinsic::hasConstrainedFPRoundingModeOperand(ID)) {
Value *RoundingV = getConstrainedFPRounding(Rounding);
C = CreateIntrinsic(ID, {DestTy, V->getType()}, {V, RoundingV, ExceptV},
nullptr, Name);
@@ -1088,17 +1078,8 @@ CallInst *IRBuilderBase::CreateConstrainedFPCall(
llvm::SmallVector<Value *, 6> UseArgs;

append_range(UseArgs, Args);
bool HasRoundingMD = false;
switch (Callee->getIntrinsicID()) {
default:
break;
#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \
case Intrinsic::INTRINSIC: \
HasRoundingMD = ROUND_MODE; \
break;
#include "llvm/IR/ConstrainedOps.def"
}
if (HasRoundingMD)

if (Intrinsic::hasConstrainedFPRoundingModeOperand(Callee->getIntrinsicID()))
UseArgs.push_back(getConstrainedFPRounding(Rounding));
UseArgs.push_back(getConstrainedFPExcept(Except));

40 changes: 13 additions & 27 deletions llvm/lib/IR/IntrinsicInst.cpp
@@ -365,37 +365,23 @@ FCmpInst::Predicate ConstrainedFPCmpIntrinsic::getPredicate() const {
return getFPPredicateFromMD(getArgOperand(2));
}

bool ConstrainedFPIntrinsic::isUnaryOp() const {
switch (getIntrinsicID()) {
default:
return false;
#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \
case Intrinsic::INTRINSIC: \
return NARG == 1;
#include "llvm/IR/ConstrainedOps.def"
}
}
unsigned ConstrainedFPIntrinsic::getNonMetadataArgCount() const {
// All constrained fp intrinsics have "fpexcept" metadata.
unsigned NumArgs = arg_size() - 1;

bool ConstrainedFPIntrinsic::isTernaryOp() const {
switch (getIntrinsicID()) {
default:
return false;
#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \
case Intrinsic::INTRINSIC: \
return NARG == 3;
#include "llvm/IR/ConstrainedOps.def"
}
// Some intrinsics have "round" metadata.
if (Intrinsic::hasConstrainedFPRoundingModeOperand(getIntrinsicID()))
NumArgs -= 1;

// Compare intrinsics take their predicate as metadata.
if (isa<ConstrainedFPCmpIntrinsic>(this))
NumArgs -= 1;

return NumArgs;
}

bool ConstrainedFPIntrinsic::classof(const IntrinsicInst *I) {
switch (I->getIntrinsicID()) {
#define INSTRUCTION(NAME, NARGS, ROUND_MODE, INTRINSIC) \
case Intrinsic::INTRINSIC:
#include "llvm/IR/ConstrainedOps.def"
return true;
default:
return false;
}
return Intrinsic::isConstrainedFPIntrinsic(I->getIntrinsicID());
}

ElementCount VPIntrinsic::getStaticVectorLength() const {
29 changes: 13 additions & 16 deletions llvm/lib/IR/Verifier.cpp
@@ -5384,11 +5384,13 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
}
#define BEGIN_REGISTER_VP_INTRINSIC(VPID, ...) case Intrinsic::VPID:
#include "llvm/IR/VPIntrinsics.def"
#undef BEGIN_REGISTER_VP_INTRINSIC
visitVPIntrinsic(cast<VPIntrinsic>(Call));
break;
#define INSTRUCTION(NAME, NARGS, ROUND_MODE, INTRINSIC) \
case Intrinsic::INTRINSIC:
#include "llvm/IR/ConstrainedOps.def"
#undef INSTRUCTION
visitConstrainedFPIntrinsic(cast<ConstrainedFPIntrinsic>(Call));
break;
case Intrinsic::dbg_declare: // llvm.dbg.declare
@@ -6527,19 +6529,13 @@ void Verifier::visitVPIntrinsic(VPIntrinsic &VPI) {
}

void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) {
unsigned NumOperands;
bool HasRoundingMD;
switch (FPI.getIntrinsicID()) {
#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \
case Intrinsic::INTRINSIC: \
NumOperands = NARG; \
HasRoundingMD = ROUND_MODE; \
break;
#include "llvm/IR/ConstrainedOps.def"
default:
llvm_unreachable("Invalid constrained FP intrinsic!");
}
unsigned NumOperands = FPI.getNonMetadataArgCount();
bool HasRoundingMD =
Intrinsic::hasConstrainedFPRoundingModeOperand(FPI.getIntrinsicID());

// Add the expected number of metadata operands.
NumOperands += (1 + HasRoundingMD);

// Compare intrinsics carry an extra predicate metadata operand.
if (isa<ConstrainedFPCmpIntrinsic>(FPI))
NumOperands += 1;
@@ -6553,8 +6549,8 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) {
Type *ResultTy = FPI.getType();
Check(!ValTy->isVectorTy() && !ResultTy->isVectorTy(),
"Intrinsic does not support vectors", &FPI);
}
break;
}

case Intrinsic::experimental_constrained_lround:
case Intrinsic::experimental_constrained_llround: {
@@ -6593,8 +6589,8 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) {
"Intrinsic first argument and result vector lengths must be equal",
&FPI);
}
}
break;
}

case Intrinsic::experimental_constrained_sitofp:
case Intrinsic::experimental_constrained_uitofp: {
@@ -6616,7 +6612,8 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) {
"Intrinsic first argument and result vector lengths must be equal",
&FPI);
}
} break;
break;
}

case Intrinsic::experimental_constrained_fptrunc:
case Intrinsic::experimental_constrained_fpext: {
@@ -6645,8 +6642,8 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) {
"Intrinsic first argument's type must be smaller than result type",
&FPI);
}
}
break;
}

default:
break;
8 changes: 6 additions & 2 deletions llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -7436,7 +7436,8 @@ AMDGPUAsmParser::parseSendMsgBody(OperandInfoTy &Msg,
Op.IsDefined = true;
Op.Loc = getLoc();
if (isToken(AsmToken::Identifier) &&
(Op.Val = getMsgOpId(Msg.Val, getTokenStr())) >= 0) {
(Op.Val = getMsgOpId(Msg.Val, getTokenStr(), getSTI())) !=
OPR_ID_UNKNOWN) {
lex(); // skip operation name
} else if (!parseExpr(Op.Val, "an operation name")) {
return false;
@@ -7484,7 +7485,10 @@ AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg,
return false;
}
if (!isValidMsgOp(Msg.Val, Op.Val, getSTI(), Strict)) {
Error(Op.Loc, "invalid operation id");
if (Op.Val == OPR_ID_UNSUPPORTED)
Error(Op.Loc, "specified operation id is not supported on this GPU");
else
Error(Op.Loc, "invalid operation id");
return false;
}
if (Strict && !msgSupportsStream(Msg.Val, Op.Val, getSTI()) &&
3 changes: 0 additions & 3 deletions llvm/lib/Target/AMDGPU/SIDefines.h
@@ -468,7 +468,6 @@ enum Id { // Message ID, width(4) [3:0].
};

enum Op { // Both GS and SYS operation IDs.
OP_UNKNOWN_ = -1,
OP_SHIFT_ = 4,
OP_NONE_ = 0,
// Bits used for operation encoding
@@ -479,14 +478,12 @@ enum Op { // Both GS and SYS operation IDs.
OP_GS_CUT = 1,
OP_GS_EMIT = 2,
OP_GS_EMIT_CUT = 3,
OP_GS_LAST_,
OP_GS_FIRST_ = OP_GS_NOP,
// SYS operations are encoded in bits 6:4
OP_SYS_ECC_ERR_INTERRUPT = 1,
OP_SYS_REG_RD = 2,
OP_SYS_HOST_TRAP_ACK = 3,
OP_SYS_TTRACE_PC = 4,
OP_SYS_LAST_,
OP_SYS_FIRST_ = OP_SYS_ECC_ERR_INTERRUPT,
};

122 changes: 101 additions & 21 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -12,6 +12,60 @@
namespace llvm {
namespace AMDGPU {

//===----------------------------------------------------------------------===//
// Custom Operands.
//
// A table of custom operands shall describe "primary" operand names first
// followed by aliases if any. It is not required but recommended to arrange
// operands so that operand encoding match operand position in the table. This
// will make getNameFromOperandTable() a bit more efficient. Unused slots in the
// table shall have an empty name.
//
//===----------------------------------------------------------------------===//

/// Map from the encoding of a sendmsg/hwreg asm operand to its name.
template <size_t N>
static StringRef getNameFromOperandTable(const CustomOperand (&Table)[N],
unsigned Encoding,
const MCSubtargetInfo &STI) {
auto isValidIndexForEncoding = [&](size_t Idx) {
return Idx < N && Table[Idx].Encoding == Encoding &&
!Table[Idx].Name.empty() &&
(!Table[Idx].Cond || Table[Idx].Cond(STI));
};

// This is an optimization that should work in most cases. As a side effect,
// it may cause selection of an alias instead of a primary operand name in
// case of sparse tables.
if (isValidIndexForEncoding(Encoding))
return Table[Encoding].Name;

for (size_t Idx = 0; Idx != N; ++Idx)
if (isValidIndexForEncoding(Idx))
return Table[Idx].Name;

return "";
}

/// Map from a symbolic name for a sendmsg/hwreg asm operand to its encoding.
template <size_t N>
static int64_t getEncodingFromOperandTable(const CustomOperand (&Table)[N],
StringRef Name,
const MCSubtargetInfo &STI) {
int64_t InvalidEncoding = OPR_ID_UNKNOWN;
for (const CustomOperand &Entry : Table) {
if (Entry.Name != Name)
continue;

if (!Entry.Cond || Entry.Cond(STI))
return Entry.Encoding;

InvalidEncoding = OPR_ID_UNSUPPORTED;
}

return InvalidEncoding;
}

namespace DepCtr {

// NOLINTBEGIN
@@ -34,10 +88,11 @@ const int DEP_CTR_SIZE =

namespace SendMsg {

// Disable lint checking for this block since it makes the table unreadable.
// Disable lint checking here since it makes these tables unreadable.
// NOLINTBEGIN
// clang-format off
const CustomOperand<const MCSubtargetInfo &> Msg[] = {

static constexpr CustomOperand MsgOperands[] = {
{{""}},
{{"MSG_INTERRUPT"}, ID_INTERRUPT},
{{"MSG_GS"}, ID_GS_PreGFX11, isNotGFX11Plus},
@@ -63,27 +118,47 @@ const CustomOperand<const MCSubtargetInfo &> Msg[] = {
{{"MSG_RTN_GET_TBA_TO_PC"}, ID_RTN_GET_TBA_TO_PC, isGFX11Plus},
{{"MSG_RTN_GET_SE_AID_ID"}, ID_RTN_GET_SE_AID_ID, isGFX12Plus},
};

static constexpr CustomOperand SysMsgOperands[] = {
{{""}},
{{"SYSMSG_OP_ECC_ERR_INTERRUPT"}, OP_SYS_ECC_ERR_INTERRUPT},
{{"SYSMSG_OP_REG_RD"}, OP_SYS_REG_RD},
{{"SYSMSG_OP_HOST_TRAP_ACK"}, OP_SYS_HOST_TRAP_ACK, isNotGFX9Plus},
{{"SYSMSG_OP_TTRACE_PC"}, OP_SYS_TTRACE_PC},
};

static constexpr CustomOperand StreamMsgOperands[] = {
{{"GS_OP_NOP"}, OP_GS_NOP},
{{"GS_OP_CUT"}, OP_GS_CUT},
{{"GS_OP_EMIT"}, OP_GS_EMIT},
{{"GS_OP_EMIT_CUT"}, OP_GS_EMIT_CUT},
};

// clang-format on
// NOLINTEND

const int MSG_SIZE = static_cast<int>(
sizeof(Msg) / sizeof(CustomOperand<const MCSubtargetInfo &>));
int64_t getMsgId(StringRef Name, const MCSubtargetInfo &STI) {
return getEncodingFromOperandTable(MsgOperands, Name, STI);
}

// These two must be in sync with llvm::AMDGPU::SendMsg::Op enum members, see SIDefines.h.
const char *const OpSysSymbolic[OP_SYS_LAST_] = {
nullptr,
"SYSMSG_OP_ECC_ERR_INTERRUPT",
"SYSMSG_OP_REG_RD",
"SYSMSG_OP_HOST_TRAP_ACK",
"SYSMSG_OP_TTRACE_PC"
};
StringRef getMsgName(uint64_t Encoding, const MCSubtargetInfo &STI) {
return getNameFromOperandTable(MsgOperands, Encoding, STI);
}

const char *const OpGsSymbolic[OP_GS_LAST_] = {
"GS_OP_NOP",
"GS_OP_CUT",
"GS_OP_EMIT",
"GS_OP_EMIT_CUT"
};
int64_t getMsgOpId(int64_t MsgId, StringRef Name, const MCSubtargetInfo &STI) {
if (MsgId == ID_SYSMSG)
return getEncodingFromOperandTable(SysMsgOperands, Name, STI);
return getEncodingFromOperandTable(StreamMsgOperands, Name, STI);
}

StringRef getMsgOpName(int64_t MsgId, uint64_t Encoding,
const MCSubtargetInfo &STI) {
assert(msgRequiresOp(MsgId, STI) && "must have an operand");

if (MsgId == ID_SYSMSG)
return getNameFromOperandTable(SysMsgOperands, Encoding, STI);
return getNameFromOperandTable(StreamMsgOperands, Encoding, STI);
}

} // namespace SendMsg

@@ -92,7 +167,7 @@ namespace Hwreg {
// Disable lint checking for this block since it makes the table unreadable.
// NOLINTBEGIN
// clang-format off
const CustomOperand<const MCSubtargetInfo &> Opr[] = {
static constexpr CustomOperand Operands[] = {
{{""}},
{{"HW_REG_MODE"}, ID_MODE},
{{"HW_REG_STATUS"}, ID_STATUS},
@@ -155,8 +230,13 @@ const CustomOperand<const MCSubtargetInfo &> Opr[] = {
// clang-format on
// NOLINTEND

const int OPR_SIZE = static_cast<int>(
sizeof(Opr) / sizeof(CustomOperand<const MCSubtargetInfo &>));
int64_t getHwregId(StringRef Name, const MCSubtargetInfo &STI) {
return getEncodingFromOperandTable(Operands, Name, STI);
}

StringRef getHwreg(uint64_t Encoding, const MCSubtargetInfo &STI) {
return getNameFromOperandTable(Operands, Encoding, STI);
}

} // namespace Hwreg

34 changes: 24 additions & 10 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.h
@@ -25,10 +25,10 @@ const int OPR_ID_UNSUPPORTED = -2;
const int OPR_ID_DUPLICATE = -3;
const int OPR_VAL_INVALID = -4;

template <class T> struct CustomOperand {
struct CustomOperand {
StringLiteral Name;
int Encoding = 0;
bool (*Cond)(T Context) = nullptr;
unsigned Encoding = 0;
bool (*Cond)(const MCSubtargetInfo &STI) = nullptr;
};

struct CustomOperandVal {
@@ -60,20 +60,34 @@ extern const int DEP_CTR_SIZE;

} // namespace DepCtr

namespace SendMsg { // Symbolic names for the sendmsg(...) syntax.
// Symbolic names for the sendmsg(msg_id, operation, stream) syntax.
namespace SendMsg {

/// Map from a symbolic name for a msg_id to the message portion of the
/// immediate encoding. A negative return value indicates that the Name was
/// unknown or unsupported on this target.
int64_t getMsgId(StringRef Name, const MCSubtargetInfo &STI);

/// Map from an encoding to the symbolic name for a msg_id immediate. This is
/// the inverse of getMsgId().
StringRef getMsgName(uint64_t Encoding, const MCSubtargetInfo &STI);

extern const CustomOperand<const MCSubtargetInfo &> Msg[];
extern const int MSG_SIZE;
/// Map from a symbolic name for a sendmsg operation to the operation portion of
/// the immediate encoding. A negative return value indicates that the Name was
/// unknown or unsupported on this target.
int64_t getMsgOpId(int64_t MsgId, StringRef Name, const MCSubtargetInfo &STI);

extern const char *const OpSysSymbolic[OP_SYS_LAST_];
extern const char *const OpGsSymbolic[OP_GS_LAST_];
/// Map from an encoding to the symbolic name for a sendmsg operation. This is
/// the inverse of getMsgOpId().
StringRef getMsgOpName(int64_t MsgId, uint64_t Encoding,
const MCSubtargetInfo &STI);

} // namespace SendMsg

namespace Hwreg { // Symbolic names for the hwreg(...) syntax.

extern const CustomOperand<const MCSubtargetInfo &> Opr[];
extern const int OPR_SIZE;
int64_t getHwregId(StringRef Name, const MCSubtargetInfo &STI);
StringRef getHwreg(uint64_t Encoding, const MCSubtargetInfo &STI);

} // namespace Hwreg

119 changes: 8 additions & 111 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1495,62 +1495,6 @@ unsigned encodeStorecntDscnt(const IsaVersion &Version,
return encodeStorecntDscnt(Version, Decoded.StoreCnt, Decoded.DsCnt);
}

//===----------------------------------------------------------------------===//
// Custom Operands.
//
// A table of custom operands shall describe "primary" operand names
// first followed by aliases if any. It is not required but recommended
// to arrange operands so that operand encoding match operand position
// in the table. This will make disassembly a bit more efficient.
// Unused slots in the table shall have an empty name.
//
//===----------------------------------------------------------------------===//

template <class T>
static bool isValidOpr(int Idx, const CustomOperand<T> OpInfo[], int OpInfoSize,
T Context) {
return 0 <= Idx && Idx < OpInfoSize && !OpInfo[Idx].Name.empty() &&
(!OpInfo[Idx].Cond || OpInfo[Idx].Cond(Context));
}

template <class T>
static int getOprIdx(std::function<bool(const CustomOperand<T> &)> Test,
const CustomOperand<T> OpInfo[], int OpInfoSize,
T Context) {
int InvalidIdx = OPR_ID_UNKNOWN;
for (int Idx = 0; Idx < OpInfoSize; ++Idx) {
if (Test(OpInfo[Idx])) {
if (!OpInfo[Idx].Cond || OpInfo[Idx].Cond(Context))
return Idx;
InvalidIdx = OPR_ID_UNSUPPORTED;
}
}
return InvalidIdx;
}

template <class T>
static int getOprIdx(const StringRef Name, const CustomOperand<T> OpInfo[],
int OpInfoSize, T Context) {
auto Test = [=](const CustomOperand<T> &Op) { return Op.Name == Name; };
return getOprIdx<T>(Test, OpInfo, OpInfoSize, Context);
}

template <class T>
static int getOprIdx(int Id, const CustomOperand<T> OpInfo[], int OpInfoSize,
T Context, bool QuickCheck = true) {
auto Test = [=](const CustomOperand<T> &Op) {
return Op.Encoding == Id && !Op.Name.empty();
};
// This is an optimization that should work in most cases.
// As a side effect, it may cause selection of an alias
// instead of a primary operand name in case of sparse tables.
if (QuickCheck && isValidOpr<T>(Id, OpInfo, OpInfoSize, Context) &&
OpInfo[Id].Encoding == Id) {
return Id;
}
return getOprIdx<T>(Test, OpInfo, OpInfoSize, Context);
}

//===----------------------------------------------------------------------===//
// Custom Operand Values
//===----------------------------------------------------------------------===//
@@ -1701,24 +1645,6 @@ unsigned encodeFieldSaSdst(unsigned SaSdst) {

} // namespace DepCtr

//===----------------------------------------------------------------------===//
// hwreg
//===----------------------------------------------------------------------===//

namespace Hwreg {

int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI) {
int Idx = getOprIdx<const MCSubtargetInfo &>(Name, Opr, OPR_SIZE, STI);
return (Idx < 0) ? Idx : Opr[Idx].Encoding;
}

StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI) {
int Idx = getOprIdx<const MCSubtargetInfo &>(Id, Opr, OPR_SIZE, STI);
return (Idx < 0) ? "" : Opr[Idx].Name;
}

} // namespace Hwreg

//===----------------------------------------------------------------------===//
// exp tgt
//===----------------------------------------------------------------------===//
@@ -1919,56 +1845,25 @@ static uint64_t getMsgIdMask(const MCSubtargetInfo &STI) {
return isGFX11Plus(STI) ? ID_MASK_GFX11Plus_ : ID_MASK_PreGFX11_;
}

int64_t getMsgId(const StringRef Name, const MCSubtargetInfo &STI) {
int Idx = getOprIdx<const MCSubtargetInfo &>(Name, Msg, MSG_SIZE, STI);
return (Idx < 0) ? Idx : Msg[Idx].Encoding;
}

bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI) {
return (MsgId & ~(getMsgIdMask(STI))) == 0;
}

StringRef getMsgName(int64_t MsgId, const MCSubtargetInfo &STI) {
int Idx = getOprIdx<const MCSubtargetInfo &>(MsgId, Msg, MSG_SIZE, STI);
return (Idx < 0) ? "" : Msg[Idx].Name;
}

int64_t getMsgOpId(int64_t MsgId, const StringRef Name) {
const char* const *S = (MsgId == ID_SYSMSG) ? OpSysSymbolic : OpGsSymbolic;
const int F = (MsgId == ID_SYSMSG) ? OP_SYS_FIRST_ : OP_GS_FIRST_;
const int L = (MsgId == ID_SYSMSG) ? OP_SYS_LAST_ : OP_GS_LAST_;
for (int i = F; i < L; ++i) {
if (Name == S[i]) {
return i;
}
}
return OP_UNKNOWN_;
}

bool isValidMsgOp(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI,
bool Strict) {
assert(isValidMsgId(MsgId, STI));

if (!Strict)
return 0 <= OpId && isUInt<OP_WIDTH_>(OpId);

if (MsgId == ID_SYSMSG)
return OP_SYS_FIRST_ <= OpId && OpId < OP_SYS_LAST_;
if (!isGFX11Plus(STI)) {
switch (MsgId) {
case ID_GS_PreGFX11:
return (OP_GS_FIRST_ <= OpId && OpId < OP_GS_LAST_) && OpId != OP_GS_NOP;
case ID_GS_DONE_PreGFX11:
return OP_GS_FIRST_ <= OpId && OpId < OP_GS_LAST_;
}
if (msgRequiresOp(MsgId, STI)) {
if (MsgId == ID_GS_PreGFX11 && OpId == OP_GS_NOP)
return false;

return !getMsgOpName(MsgId, OpId, STI).empty();
}
return OpId == OP_NONE_;
}

StringRef getMsgOpName(int64_t MsgId, int64_t OpId,
const MCSubtargetInfo &STI) {
assert(msgRequiresOp(MsgId, STI));
return (MsgId == ID_SYSMSG)? OpSysSymbolic[OpId] : OpGsSymbolic[OpId];
return OpId == OP_NONE_;
}

bool isValidMsgStream(int64_t MsgId, int64_t OpId, int64_t StreamId,
@@ -2186,6 +2081,8 @@ bool isGFX9Plus(const MCSubtargetInfo &STI) {
return isGFX9(STI) || isGFX10Plus(STI);
}

bool isNotGFX9Plus(const MCSubtargetInfo &STI) { return !isGFX9Plus(STI); }

bool isGFX10(const MCSubtargetInfo &STI) {
return STI.hasFeature(AMDGPU::FeatureGFX10);
}
19 changes: 1 addition & 18 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1078,12 +1078,6 @@ struct HwregSize : EncodingField<15, 11, 32> {

using HwregEncoding = EncodingFields<HwregId, HwregOffset, HwregSize>;

LLVM_READONLY
int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI);

LLVM_READNONE
StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI);

} // namespace Hwreg

namespace DepCtr {
Expand Down Expand Up @@ -1173,18 +1167,6 @@ unsigned getDefaultFormatEncoding(const MCSubtargetInfo &STI);

namespace SendMsg {

LLVM_READONLY
int64_t getMsgId(const StringRef Name, const MCSubtargetInfo &STI);

LLVM_READONLY
int64_t getMsgOpId(int64_t MsgId, const StringRef Name);

LLVM_READNONE
StringRef getMsgName(int64_t MsgId, const MCSubtargetInfo &STI);

LLVM_READNONE
StringRef getMsgOpName(int64_t MsgId, int64_t OpId, const MCSubtargetInfo &STI);

LLVM_READNONE
bool isValidMsgId(int64_t MsgId, const MCSubtargetInfo &STI);

Expand Down Expand Up @@ -1276,6 +1258,7 @@ bool isGFX9_GFX10_GFX11(const MCSubtargetInfo &STI);
bool isGFX8_GFX9_GFX10(const MCSubtargetInfo &STI);
bool isGFX8Plus(const MCSubtargetInfo &STI);
bool isGFX9Plus(const MCSubtargetInfo &STI);
bool isNotGFX9Plus(const MCSubtargetInfo &STI);
bool isGFX10(const MCSubtargetInfo &STI);
bool isGFX10_GFX11(const MCSubtargetInfo &STI);
bool isGFX10Plus(const MCSubtargetInfo &STI);
Expand Down
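The getHwregId/getHwreg and SendMsg name lookups removed above are subsumed by the table-driven HwregEncoding fields. A minimal sketch of the EncodingField idea, assuming the <HighBit, LowBit, Default> template parameters suggested by the HwregSize declaration; the class below is illustrative, not the in-tree one:

#include <cstdint>

// Sketch: each field knows its bit range inside the packed hwreg immediate.
template <unsigned HighBit, unsigned LowBit, uint64_t Default = 0>
struct EncodingFieldSketch {
  static_assert(HighBit >= LowBit, "invalid bit range");
  static constexpr uint64_t Mask =
      ((1ULL << (HighBit - LowBit + 1)) - 1) << LowBit;

  uint64_t Value = Default;

  constexpr uint64_t encode() const { return (Value << LowBit) & Mask; }
  static constexpr uint64_t decode(uint64_t Imm) {
    return (Imm & Mask) >> LowBit;
  }
};

// Per the header above, HwregSize occupies bits [15:11] with default 32; a
// combined encoder only has to OR together the encode() of each field.
using HwregSizeSketch = EncodingFieldSketch<15, 11, 32>;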
5 changes: 2 additions & 3 deletions llvm/lib/Target/ARM/ARMArchitectures.td
Original file line number Diff line number Diff line change
Expand Up @@ -293,9 +293,8 @@ def ARMv8r : Architecture<"armv8-r", "ARMv8r", [HasV8Ops,
FeatureDSP,
FeatureCRC,
FeatureMP,
FeatureVirtualization,
FeatureFPARMv8,
FeatureNEON]>;
FeatureFPARMv8_D16_SP,
FeatureVirtualization]>;

def ARMv8mBaseline : Architecture<"armv8-m.base", "ARMv8mBaseline",
[HasV8MBaselineOps,
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/ARM/ARMProcessors.td
Original file line number Diff line number Diff line change
Expand Up @@ -573,5 +573,7 @@ def : ProcNoItin<"kryo", [ARMv8a, ProcKryo,
FeatureCRC]>;

def : ProcessorModel<"cortex-r52", CortexR52Model, [ARMv8r, ProcR52,
FeatureFPARMv8,
FeatureNEON,
FeatureUseMISched,
FeatureFPAO]>;
337 changes: 90 additions & 247 deletions llvm/lib/Target/AVR/AVRInstrInfo.td

Large diffs are not rendered by default.

17 changes: 9 additions & 8 deletions llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1671,10 +1671,9 @@ static LoongArchISD::NodeType getLoongArchWOpcode(unsigned Opcode) {
return LoongArchISD::SRA_W;
case ISD::SRL:
return LoongArchISD::SRL_W;
case ISD::ROTL:
case ISD::ROTR:
return LoongArchISD::ROTR_W;
case ISD::ROTL:
return LoongArchISD::ROTL_W;
case ISD::CTTZ:
return LoongArchISD::CTZ_W;
case ISD::CTLZ:
Expand Down Expand Up @@ -1704,6 +1703,10 @@ static SDValue customLegalizeToWOp(SDNode *N, SelectionDAG &DAG, int NumOp,
case 2: {
NewOp0 = DAG.getNode(ExtOpc, DL, MVT::i64, N->getOperand(0));
SDValue NewOp1 = DAG.getNode(ExtOpc, DL, MVT::i64, N->getOperand(1));
if (N->getOpcode() == ISD::ROTL) {
SDValue TmpOp = DAG.getConstant(32, DL, MVT::i64);
NewOp1 = DAG.getNode(ISD::SUB, DL, MVT::i64, TmpOp, NewOp1);
}
NewRes = DAG.getNode(WOpcode, DL, MVT::i64, NewOp0, NewOp1);
break;
}
Expand Down Expand Up @@ -1841,7 +1844,6 @@ void LoongArchTargetLowering::ReplaceNodeResults(
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
case ISD::ROTR:
assert(VT == MVT::i32 && Subtarget.is64Bit() &&
"Unexpected custom legalisation");
if (N->getOperand(1).getOpcode() != ISD::Constant) {
Expand All @@ -1850,11 +1852,10 @@ void LoongArchTargetLowering::ReplaceNodeResults(
}
break;
case ISD::ROTL:
ConstantSDNode *CN;
if ((CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))) {
Results.push_back(customLegalizeToWOp(N, DAG, 2));
break;
}
case ISD::ROTR:
assert(VT == MVT::i32 && Subtarget.is64Bit() &&
"Unexpected custom legalisation");
Results.push_back(customLegalizeToWOp(N, DAG, 2));
break;
case ISD::FP_TO_SINT: {
assert(VT == MVT::i32 && Subtarget.is64Bit() &&
Expand Down
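The LoongArch hunks above drop the dedicated ROTL_W node: an i32 rotate-left is now legalized through ROTR_W by rewriting the amount as 32 - amt (the extra SUB emitted in customLegalizeToWOp), relying on the identity rotl32(x, n) == rotr32(x, 32 - n). A standalone scalar check of that identity (a sketch, not LLVM code):

#include <cassert>
#include <cstdint>

static uint32_t rotl32(uint32_t X, unsigned N) {
  return (X << (N & 31)) | (X >> ((32 - N) & 31));
}
static uint32_t rotr32(uint32_t X, unsigned N) {
  return (X >> (N & 31)) | (X << ((32 - N) & 31));
}

int main() {
  for (unsigned N = 0; N < 32; ++N)
    assert(rotl32(0xDEADBEEF, N) == rotr32(0xDEADBEEF, 32 - N));
  return 0;
}

The same rewrite shows up in the .td change below, where the ROTRI_W pattern with ImmSubFrom32 replaces the loongarch_rotl_w patterns.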
7 changes: 2 additions & 5 deletions llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,6 @@ def loongarch_sll_w : SDNode<"LoongArchISD::SLL_W", SDT_LoongArchIntBinOpW>;
def loongarch_sra_w : SDNode<"LoongArchISD::SRA_W", SDT_LoongArchIntBinOpW>;
def loongarch_srl_w : SDNode<"LoongArchISD::SRL_W", SDT_LoongArchIntBinOpW>;
def loongarch_rotr_w : SDNode<"LoongArchISD::ROTR_W", SDT_LoongArchIntBinOpW>;
def loongarch_rotl_w : SDNode<"LoongArchISD::ROTL_W", SDT_LoongArchIntBinOpW>;
def loongarch_crc_w_b_w
: SDNode<"LoongArchISD::CRC_W_B_W", SDT_LoongArchIntBinOpW, [SDNPHasChain]>;
def loongarch_crc_w_h_w
Expand Down Expand Up @@ -1116,12 +1115,10 @@ def : PatGprGpr<srem, MOD_D>;
def : PatGprGpr<urem, MOD_DU>;
def : PatGprGpr<rotr, ROTR_D>;
def : PatGprGpr<loongarch_rotr_w, ROTR_W>;
def : PatGprGpr_32<rotr, ROTR_W>;
def : PatGprImm<rotr, ROTRI_D, uimm6>;
def : PatGprImm_32<rotr, ROTRI_W, uimm5>;
def : Pat<(loongarch_rotl_w GPR:$rj, uimm5:$imm),
(ROTRI_W GPR:$rj, (ImmSubFrom32 uimm5:$imm))>;
def : Pat<(sext_inreg (loongarch_rotl_w GPR:$rj, uimm5:$imm), i32),
(ROTRI_W GPR:$rj, (ImmSubFrom32 uimm5:$imm))>;
def : PatGprImm<loongarch_rotr_w, ROTRI_W, uimm5>;
// TODO: Select "_W[U]" instructions for i32xi32 if only lower 32 bits of the
// product are used.
def : PatGprGpr<mul, MUL_D>;
Expand Down
50 changes: 45 additions & 5 deletions llvm/lib/Target/RISCV/RISCVISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1087,6 +1087,23 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
}
}

// TODO: Could we merge some code with zvfhmin?
if (Subtarget.hasVInstructionsBF16()) {
for (MVT VT : BF16VecVTs) {
if (!isTypeLegal(VT))
continue;
setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom);
setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom);
setOperationAction({ISD::STRICT_FP_ROUND, ISD::STRICT_FP_EXTEND}, VT,
Custom);
setOperationAction({ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR,
ISD::EXTRACT_SUBVECTOR},
VT, Custom);
setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom);
// TODO: Promote to fp32.
}
}

if (Subtarget.hasVInstructionsF32()) {
for (MVT VT : F32VecVTs) {
if (!isTypeLegal(VT))
Expand Down Expand Up @@ -1302,6 +1319,19 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
continue;
}

if (VT.getVectorElementType() == MVT::bf16) {
setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom);
setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom);
setOperationAction({ISD::STRICT_FP_ROUND, ISD::STRICT_FP_EXTEND}, VT,
Custom);
setOperationAction({ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR,
ISD::EXTRACT_SUBVECTOR},
VT, Custom);
setOperationAction({ISD::LOAD, ISD::STORE}, VT, Custom);
// TODO: Promote to fp32.
continue;
}

// We use EXTRACT_SUBVECTOR as a "cast" from scalable to fixed.
setOperationAction({ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR}, VT,
Custom);
Expand Down Expand Up @@ -2561,6 +2591,10 @@ static bool useRVVForFixedLengthVectorVT(MVT VT,
if (!Subtarget.hasVInstructionsF16Minimal())
return false;
break;
case MVT::bf16:
if (!Subtarget.hasVInstructionsBF16())
return false;
break;
case MVT::f32:
if (!Subtarget.hasVInstructionsF32())
return false;
Expand Down Expand Up @@ -2612,6 +2646,7 @@ static MVT getContainerForFixedLengthVector(const TargetLowering &TLI, MVT VT,
case MVT::i16:
case MVT::i32:
case MVT::i64:
case MVT::bf16:
case MVT::f16:
case MVT::f32:
case MVT::f64: {
Expand Down Expand Up @@ -8101,8 +8136,10 @@ RISCVTargetLowering::lowerStrictFPExtendOrRoundLike(SDValue Op,

// RVV can only widen/truncate fp to types double/half the size of the source.
if ((VT.getVectorElementType() == MVT::f64 &&
SrcVT.getVectorElementType() == MVT::f16) ||
(VT.getVectorElementType() == MVT::f16 &&
(SrcVT.getVectorElementType() == MVT::f16 ||
SrcVT.getVectorElementType() == MVT::bf16)) ||
((VT.getVectorElementType() == MVT::f16 ||
VT.getVectorElementType() == MVT::bf16) &&
SrcVT.getVectorElementType() == MVT::f64)) {
// For double rounding, the intermediate rounding should be round-to-odd.
unsigned InterConvOpc = Op.getOpcode() == ISD::STRICT_FP_EXTEND
Expand Down Expand Up @@ -8146,9 +8183,12 @@ RISCVTargetLowering::lowerVectorFPExtendOrRoundLike(SDValue Op,
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();

bool IsDirectExtend = IsExtend && (VT.getVectorElementType() != MVT::f64 ||
SrcVT.getVectorElementType() != MVT::f16);
bool IsDirectTrunc = !IsExtend && (VT.getVectorElementType() != MVT::f16 ||
bool IsDirectExtend =
IsExtend && (VT.getVectorElementType() != MVT::f64 ||
(SrcVT.getVectorElementType() != MVT::f16 &&
SrcVT.getVectorElementType() != MVT::bf16));
bool IsDirectTrunc = !IsExtend && ((VT.getVectorElementType() != MVT::f16 &&
VT.getVectorElementType() != MVT::bf16) ||
SrcVT.getVectorElementType() != MVT::f64);

bool IsDirectConv = IsDirectExtend || IsDirectTrunc;
Expand Down
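The round-to-odd comment above is the crux of the two-step f64 -> f32 -> bf16 narrowing: if both steps rounded to nearest, the first rounding could land exactly on a tie of the second and flip the result. A toy integer model of the hazard (a sketch of the idea, not the FP lowering itself):

#include <cassert>
#include <cstdint>

// Round V to a multiple of Step, ties to even quotient (models RNE).
static int64_t roundRNE(int64_t V, int64_t Step) {
  int64_t Q = V / Step, R = V % Step;
  if (2 * R > Step || (2 * R == Step && (Q & 1)))
    ++Q;
  return Q * Step;
}

// Round to odd: truncate, then force the quotient odd if anything was lost.
static int64_t roundOdd(int64_t V, int64_t Step) {
  int64_t Q = V / Step;
  if (V % Step != 0)
    Q |= 1;
  return Q * Step;
}

int main() {
  // Direct narrowing of 5 to a multiple of 8 gives 8 (5 is nearer 8 than 0).
  assert(roundRNE(5, 8) == 8);
  // Double rounding with RNE in the middle is wrong: 5 -> 4 -> 0.
  assert(roundRNE(roundRNE(5, 2), 8) == 0);
  // Round-to-odd in the middle keeps the sticky information: 5 -> 6 -> 8.
  assert(roundRNE(roundOdd(5, 2), 8) == 8);
  return 0;
}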
32 changes: 16 additions & 16 deletions llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
Original file line number Diff line number Diff line change
Expand Up @@ -355,24 +355,24 @@ defset list<VTypeInfo> AllVectors = {
V_M8, f64, FPR64>;
}
}
}

defset list<VTypeInfo> AllBFloatVectors = {
defset list<VTypeInfo> NoGroupBFloatVectors = {
defset list<VTypeInfo> FractionalGroupBFloatVectors = {
def VBF16MF4: VTypeInfo<vbfloat16mf4_t, vbool64_t, 16, V_MF4, bf16, FPR16>;
def VBF16MF2: VTypeInfo<vbfloat16mf2_t, vbool32_t, 16, V_MF2, bf16, FPR16>;
defset list<VTypeInfo> AllBFloatVectors = {
defset list<VTypeInfo> NoGroupBFloatVectors = {
defset list<VTypeInfo> FractionalGroupBFloatVectors = {
def VBF16MF4: VTypeInfo<vbfloat16mf4_t, vbool64_t, 16, V_MF4, bf16, FPR16>;
def VBF16MF2: VTypeInfo<vbfloat16mf2_t, vbool32_t, 16, V_MF2, bf16, FPR16>;
}
def VBF16M1: VTypeInfo<vbfloat16m1_t, vbool16_t, 16, V_M1, bf16, FPR16>;
}

defset list<GroupVTypeInfo> GroupBFloatVectors = {
def VBF16M2: GroupVTypeInfo<vbfloat16m2_t, vbfloat16m1_t, vbool8_t, 16,
V_M2, bf16, FPR16>;
def VBF16M4: GroupVTypeInfo<vbfloat16m4_t, vbfloat16m1_t, vbool4_t, 16,
V_M4, bf16, FPR16>;
def VBF16M8: GroupVTypeInfo<vbfloat16m8_t, vbfloat16m1_t, vbool2_t, 16,
V_M8, bf16, FPR16>;
}
def VBF16M1: VTypeInfo<vbfloat16m1_t, vbool16_t, 16, V_M1, bf16, FPR16>;
}

defset list<GroupVTypeInfo> GroupBFloatVectors = {
def VBF16M2: GroupVTypeInfo<vbfloat16m2_t, vbfloat16m1_t, vbool8_t, 16,
V_M2, bf16, FPR16>;
def VBF16M4: GroupVTypeInfo<vbfloat16m4_t, vbfloat16m1_t, vbool4_t, 16,
V_M4, bf16, FPR16>;
def VBF16M8: GroupVTypeInfo<vbfloat16m8_t, vbfloat16m1_t, vbool2_t, 16,
V_M8, bf16, FPR16>;
}
}

Expand Down
14 changes: 14 additions & 0 deletions llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
Original file line number Diff line number Diff line change
Expand Up @@ -1495,6 +1495,20 @@ foreach fvtiToFWti = AllWidenableFloatVectors in {
fvti.AVL, fvti.Log2SEW, TA_MA)>;
}

foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in {
defvar fvti = fvtiToFWti.Vti;
defvar fwti = fvtiToFWti.Wti;
let Predicates = [HasVInstructionsBF16] in
def : Pat<(fvti.Vector (fpround (fwti.Vector fwti.RegClass:$rs1))),
(!cast<Instruction>("PseudoVFNCVTBF16_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW)
(fvti.Vector (IMPLICIT_DEF)),
fwti.RegClass:$rs1,
// Value to indicate no rounding mode change in
// RISCVInsertReadWriteCSR
FRM_DYN,
fvti.AVL, fvti.Log2SEW, TA_MA)>;
}

//===----------------------------------------------------------------------===//
// Vector Splats
//===----------------------------------------------------------------------===//
Expand Down
30 changes: 30 additions & 0 deletions llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
Original file line number Diff line number Diff line change
Expand Up @@ -2670,6 +2670,20 @@ foreach fvtiToFWti = AllWidenableFloatVectors in {
GPR:$vl, fvti.Log2SEW, TA_MA)>;
}

foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in {
defvar fvti = fvtiToFWti.Vti;
defvar fwti = fvtiToFWti.Wti;
let Predicates = [HasVInstructionsBF16] in
def : Pat<(fwti.Vector (any_riscv_fpextend_vl
(fvti.Vector fvti.RegClass:$rs1),
(fvti.Mask V0),
VLOpFrag)),
(!cast<Instruction>("PseudoVFWCVTBF16_F_F_V_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK")
(fwti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs1,
(fvti.Mask V0),
GPR:$vl, fvti.Log2SEW, TA_MA)>;
}

// 13.19 Narrowing Floating-Point/Integer Type-Convert Instructions
defm : VPatNConvertFP2IVL_W_RM<riscv_vfcvt_xu_f_vl, "PseudoVFNCVT_XU_F_W">;
defm : VPatNConvertFP2IVL_W_RM<riscv_vfcvt_x_f_vl, "PseudoVFNCVT_X_F_W">;
Expand Down Expand Up @@ -2714,6 +2728,22 @@ foreach fvtiToFWti = AllWidenableFloatVectors in {
}
}

foreach fvtiToFWti = AllWidenableBFloatToFloatVectors in {
defvar fvti = fvtiToFWti.Vti;
defvar fwti = fvtiToFWti.Wti;
let Predicates = [HasVInstructionsBF16] in
def : Pat<(fvti.Vector (any_riscv_fpround_vl
(fwti.Vector fwti.RegClass:$rs1),
(fwti.Mask V0), VLOpFrag)),
(!cast<Instruction>("PseudoVFNCVTBF16_F_F_W_"#fvti.LMul.MX#"_E"#fvti.SEW#"_MASK")
(fvti.Vector (IMPLICIT_DEF)), fwti.RegClass:$rs1,
(fwti.Mask V0),
// Value to indicate no rounding mode change in
// RISCVInsertReadWriteCSR
FRM_DYN,
GPR:$vl, fvti.Log2SEW, TA_MA)>;
}

// 14. Vector Reduction Operations

// 14.1. Vector Single-Width Integer Reduction Instructions
Expand Down
12 changes: 12 additions & 0 deletions llvm/lib/Target/X86/X86.td
Original file line number Diff line number Diff line change
Expand Up @@ -739,6 +739,10 @@ def TuningFastMOVBE
: SubtargetFeature<"fast-movbe", "HasFastMOVBE", "true",
"Prefer a movbe over a single-use load + bswap / single-use bswap + store">;

def TuningFastImm16
: SubtargetFeature<"fast-imm16", "HasFastImm16", "true",
"Prefer a i16 instruction with i16 immediate over extension to i32">;

def TuningUseSLMArithCosts
: SubtargetFeature<"use-slm-arith-costs", "UseSLMArithCosts", "true",
"Use Silvermont specific arithmetic costs">;
Expand Down Expand Up @@ -1146,6 +1150,7 @@ def ProcessorFeatures {
TuningSlowDivide32,
TuningSlowDivide64,
TuningSlowTwoMemOps,
TuningFastImm16,
TuningLEAUsesAG,
TuningPadShortFunctions,
TuningInsertVZEROUPPER,
Expand All @@ -1166,6 +1171,7 @@ def ProcessorFeatures {
TuningSlowPMULLD,
TuningFast7ByteNOP,
TuningFastMOVBE,
TuningFastImm16,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER,
TuningNoDomainDelay];
Expand All @@ -1187,6 +1193,7 @@ def ProcessorFeatures {
TuningSlowLEA,
TuningSlowIncDec,
TuningFastMOVBE,
TuningFastImm16,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER,
TuningNoDomainDelay];
Expand All @@ -1201,6 +1208,7 @@ def ProcessorFeatures {
TuningSlowLEA,
TuningSlowIncDec,
TuningFastMOVBE,
TuningFastImm16,
TuningInsertVZEROUPPER,
TuningNoDomainDelay];
list<SubtargetFeature> GLPFeatures =
Expand Down Expand Up @@ -1321,6 +1329,7 @@ def ProcessorFeatures {
TuningPreferMaskRegisters,
TuningFastGather,
TuningFastMOVBE,
TuningFastImm16,
TuningSlowPMADDWD];
// TODO Add AVX5124FMAPS/AVX5124VNNIW features
list<SubtargetFeature> KNMFeatures =
Expand Down Expand Up @@ -1364,6 +1373,7 @@ def ProcessorFeatures {
TuningFastScalarShiftMasks,
TuningFastVectorShiftMasks,
TuningSlowSHLD,
TuningFastImm16,
TuningSBBDepBreaking,
TuningInsertVZEROUPPER];

Expand All @@ -1384,6 +1394,7 @@ def ProcessorFeatures {
TuningFastScalarShiftMasks,
TuningFastVectorShiftMasks,
TuningFastMOVBE,
TuningFastImm16,
TuningSBBDepBreaking,
TuningSlowSHLD];
list<SubtargetFeature> BtVer2Features =
Expand Down Expand Up @@ -1488,6 +1499,7 @@ def ProcessorFeatures {
TuningFastScalarShiftMasks,
TuningFastVariablePerLaneShuffle,
TuningFastMOVBE,
TuningFastImm16,
TuningSlowSHLD,
TuningSBBDepBreaking,
TuningInsertVZEROUPPER,
Expand Down
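For context on what TuningFastImm16 trades off: a 16-bit immediate forces a 0x66 operand-size prefix that changes the instruction's length, and such length-changing prefixes (LCPs) stall the pre-decoders on many big Intel cores, while the Atom-derived and AMD families tagged above handle them cheaply. Illustrative encodings (a sketch from the opcode map, not generated output):

// cmp  ax, 0x1234   ->  66 3D 34 12       ; 0x66 prefix + imm16: an LCP
// cmp eax, 0x1234   ->  3D 34 12 00 00    ; imm32 form after promotion
// On cores with TuningFastImm16 the first form decodes without penalty, so
// EmitCmp (see the X86ISelLowering.cpp hunk below) keeps the i16 compare.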
55 changes: 50 additions & 5 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22690,7 +22690,7 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,

// Only promote the compare up to I32 if it is a 16 bit operation
// with an immediate. 16 bit immediates are to be avoided.
if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() &&
!DAG.getMachineFunction().getFunction().hasMinSize()) {
ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
Expand Down Expand Up @@ -29022,6 +29022,29 @@ SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
}

// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
assert((Amt < 8) && "Shift/Rotation amount out of range");
switch (Opcode) {
case ISD::BITREVERSE:
return 0x8040201008040201ULL;
case ISD::SHL:
return ((0x0102040810204080ULL >> (Amt)) &
(0x0101010101010101ULL * (0xFF >> (Amt))));
case ISD::SRL:
return ((0x0102040810204080ULL << (Amt)) &
(0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
case ISD::SRA:
return (getGFNICtrlImm(ISD::SRL, Amt) |
(0x8080808080808080ULL >> (64 - (8 * Amt))));
case ISD::ROTL:
return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
case ISD::ROTR:
return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
}
llvm_unreachable("Unsupported GFNI opcode");
}

// Return true if the required (according to Opcode) shift-imm form is natively
// supported by the Subtarget
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
Expand Down Expand Up @@ -29209,6 +29232,14 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
if (VT == MVT::v16i8 && Subtarget.hasXOP())
return SDValue();

if (Subtarget.hasGFNI()) {
uint64_t ShiftMask = getGFNICtrlImm(Op.getOpcode(), ShiftAmt);
MVT MaskVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(ShiftMask, dl, MaskVT));
return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
DAG.getTargetConstant(0, dl, MVT::i8));
}

if (Op.getOpcode() == ISD::SHL) {
// Make a large shift.
SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
Expand Down Expand Up @@ -29892,13 +29923,15 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

if (EltSizeInBits == 8 && ShXAmt > 1 &&
(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT))) {
if (EltSizeInBits == 8 &&
(Subtarget.hasXOP() ||
(useVPTERNLOG(Subtarget, VT) &&
supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) {
// For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
// bit-select - lower using vXi16 shifts and then perform the bitmask at
// the original vector width to handle cases where we split.
MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
SDValue ShX =
Expand Down Expand Up @@ -30103,6 +30136,17 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
}

// Attempt to use GFNI gf2p8affine to rotate vXi8 by a uniform constant.
if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
uint64_t RotMask = getGFNICtrlImm(Opcode, RotAmt);
MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(RotMask, DL, MaskVT));
return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
DAG.getTargetConstant(0, DL, MVT::i8));
}

// Split 256-bit integers on XOP/pre-AVX2 targets.
if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
return splitVectorIntBinary(Op, DAG, DL);
Expand Down Expand Up @@ -31426,7 +31470,8 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
// If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
if (Subtarget.hasGFNI()) {
MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
SDValue Matrix =
DAG.getConstant(getGFNICtrlImm(ISD::BITREVERSE), DL, MatrixVT);
Matrix = DAG.getBitcast(VT, Matrix);
return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
DAG.getTargetConstant(0, DL, MVT::i8));
Expand Down
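getGFNICtrlImm above packs an 8x8 bit matrix into a qword: each control byte is one matrix row, and per the published GF2P8AFFINEQB definition, output bit i of each byte is the parity of control byte [7 - i] ANDed with the source byte. A standalone sketch that emulates one byte lane (imm8 = 0) and checks two of the masks:

#include <cassert>
#include <cstdint>

// Emulate one byte lane of GF2P8AFFINEQB with imm8 = 0.
static uint8_t gf2p8affineByte(uint64_t Ctrl, uint8_t Src) {
  uint8_t Ret = 0;
  for (int I = 0; I < 8; ++I) {
    uint8_t Row = uint8_t(Ctrl >> (8 * (7 - I))); // row feeding output bit I
    if (__builtin_parity(Row & Src))
      Ret |= uint8_t(1u << I);
  }
  return Ret;
}

int main() {
  for (unsigned X = 0; X < 256; ++X) {
    uint8_t B = uint8_t(X);
    // The BITREVERSE matrix is the anti-diagonal.
    uint8_t Rev = 0;
    for (int I = 0; I < 8; ++I)
      Rev |= uint8_t(((B >> I) & 1) << (7 - I));
    assert(gf2p8affineByte(0x8040201008040201ULL, B) == Rev);
    // SHL-by-3 mask, built with the same formula as getGFNICtrlImm.
    uint64_t Shl3 = (0x0102040810204080ULL >> 3) &
                    (0x0101010101010101ULL * (0xFF >> 3));
    assert(gf2p8affineByte(Shl3, B) == uint8_t(B << 3));
  }
  return 0;
}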
21 changes: 21 additions & 0 deletions llvm/lib/Target/X86/X86TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,24 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
Op1Info.getNoProps(), Op2Info.getNoProps());
}

static const CostKindTblEntry GFNIUniformConstCostTable[] = {
{ ISD::SHL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
{ ISD::SRL, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
{ ISD::SRA, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
{ ISD::SHL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
{ ISD::SRL, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
{ ISD::SRA, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
{ ISD::SHL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
{ ISD::SRL, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
{ ISD::SRA, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
};

if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasGFNI())
if (const auto *Entry =
CostTableLookup(GFNIUniformConstCostTable, ISD, LT.second))
if (auto KindCost = Entry->Cost[CostKind])
return LT.first * *KindCost;

static const CostKindTblEntry AVX512BWUniformConstCostTable[] = {
{ ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand.
{ ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand.
Expand Down Expand Up @@ -3869,6 +3887,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
{ ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
{ ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
{ ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
{ X86ISD::VROTLI, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
{ X86ISD::VROTLI, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
{ X86ISD::VROTLI, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
};
static const CostKindTblEntry GLMCostTbl[] = {
{ ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
Expand Down
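Reading the new cost tuples: CostKindTblEntry stores one cost per TTI cost kind, indexed by Entry->Cost[CostKind] in the lookup above. Assuming the usual field order in this file, { 1, 6, 1, 2 } for gf2p8affineqb breaks down as:

// { RecipThroughput, Latency, CodeSize, SizeAndLatency }
// { 1,               6,       1,        2              }  // gf2p8affineqb
// i.e. one issue per cycle, ~6 cycle latency, and a single instruction of
// code size -- cheaper than the psllw+pand sequences in the tables below.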
60 changes: 17 additions & 43 deletions llvm/lib/Transforms/Coroutines/CoroFrame.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/PtrUseVisitor.h"
#include "llvm/Analysis/StackLifetime.h"
#include "llvm/Config/llvm-config.h"
Expand Down Expand Up @@ -1441,22 +1440,17 @@ namespace {
struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> {
using Base = PtrUseVisitor<AllocaUseVisitor>;
AllocaUseVisitor(const DataLayout &DL, const DominatorTree &DT,
const coro::Shape &CoroShape,
const SuspendCrossingInfo &Checker,
const CoroBeginInst &CB, const SuspendCrossingInfo &Checker,
bool ShouldUseLifetimeStartInfo)
: PtrUseVisitor(DL), DT(DT), CoroShape(CoroShape), Checker(Checker),
ShouldUseLifetimeStartInfo(ShouldUseLifetimeStartInfo) {
for (AnyCoroSuspendInst *SuspendInst : CoroShape.CoroSuspends)
CoroSuspendBBs.insert(SuspendInst->getParent());
}
: PtrUseVisitor(DL), DT(DT), CoroBegin(CB), Checker(Checker),
ShouldUseLifetimeStartInfo(ShouldUseLifetimeStartInfo) {}

void visit(Instruction &I) {
Users.insert(&I);
Base::visit(I);
// If the pointer is escaped prior to CoroBegin, we have to assume it would
// be written into before CoroBegin as well.
if (PI.isEscaped() &&
!DT.dominates(CoroShape.CoroBegin, PI.getEscapingInst())) {
if (PI.isEscaped() && !DT.dominates(&CoroBegin, PI.getEscapingInst())) {
MayWriteBeforeCoroBegin = true;
}
}
Expand Down Expand Up @@ -1559,19 +1553,10 @@ struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> {
// When we find that the lifetime markers refer to a
// subrange of the original alloca, ignore the lifetime
// markers to avoid misleading the analysis.
if (!IsOffsetKnown || !Offset.isZero())
return Base::visitIntrinsicInst(II);
switch (II.getIntrinsicID()) {
default:
if (II.getIntrinsicID() != Intrinsic::lifetime_start || !IsOffsetKnown ||
!Offset.isZero())
return Base::visitIntrinsicInst(II);
case Intrinsic::lifetime_start:
LifetimeStarts.insert(&II);
LifetimeStartBBs.push_back(II.getParent());
break;
case Intrinsic::lifetime_end:
LifetimeEndBBs.insert(II.getParent());
break;
}
LifetimeStarts.insert(&II);
}

void visitCallBase(CallBase &CB) {
Expand Down Expand Up @@ -1601,17 +1586,14 @@ struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> {

private:
const DominatorTree &DT;
const coro::Shape &CoroShape;
const CoroBeginInst &CoroBegin;
const SuspendCrossingInfo &Checker;
// All aliases of the original AllocaInst, created before CoroBegin and used
// after CoroBegin. Each entry contains the instruction and the offset in the
// original Alloca. They need to be recreated after CoroBegin off the frame.
DenseMap<Instruction *, std::optional<APInt>> AliasOffetMap{};
SmallPtrSet<Instruction *, 4> Users{};
SmallPtrSet<IntrinsicInst *, 2> LifetimeStarts{};
SmallVector<BasicBlock *> LifetimeStartBBs{};
SmallPtrSet<BasicBlock *, 2> LifetimeEndBBs{};
SmallPtrSet<const BasicBlock *, 2> CoroSuspendBBs{};
bool MayWriteBeforeCoroBegin{false};
bool ShouldUseLifetimeStartInfo{true};

Expand All @@ -1623,19 +1605,10 @@ struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> {
// every basic block that uses the pointer to see if they cross suspension
// points. The uses cover both direct uses as well as indirect uses.
if (ShouldUseLifetimeStartInfo && !LifetimeStarts.empty()) {
// If there is no explicit lifetime.end, then assume the address can
// cross suspension points.
if (LifetimeEndBBs.empty())
return true;

// If there is a path from a lifetime.start to a suspend without a
// corresponding lifetime.end, then the alloca's lifetime persists
// beyond that suspension point and the alloca must go on the frame.
llvm::SmallVector<BasicBlock *> Worklist(LifetimeStartBBs);
if (isManyPotentiallyReachableFromMany(Worklist, CoroSuspendBBs,
&LifetimeEndBBs, &DT))
return true;

for (auto *I : Users)
for (auto *S : LifetimeStarts)
if (Checker.isDefinitionAcrossSuspend(*S, I))
return true;
// Addresses are guaranteed to be identical after every lifetime.start so
// we cannot use the local stack if the address escaped and there is a
// suspend point between lifetime markers. This should also cover the
Expand Down Expand Up @@ -1673,13 +1646,13 @@ struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> {
}

void handleMayWrite(const Instruction &I) {
if (!DT.dominates(CoroShape.CoroBegin, &I))
if (!DT.dominates(&CoroBegin, &I))
MayWriteBeforeCoroBegin = true;
}

bool usedAfterCoroBegin(Instruction &I) {
for (auto &U : I.uses())
if (DT.dominates(CoroShape.CoroBegin, U))
if (DT.dominates(&CoroBegin, U))
return true;
return false;
}
Expand All @@ -1688,7 +1661,7 @@ struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> {
// We track all aliases created prior to CoroBegin but used after.
// These aliases may need to be recreated after CoroBegin if the alloca
// need to live on the frame.
if (DT.dominates(CoroShape.CoroBegin, &I) || !usedAfterCoroBegin(I))
if (DT.dominates(&CoroBegin, &I) || !usedAfterCoroBegin(I))
return;

if (!IsOffsetKnown) {
Expand Down Expand Up @@ -2857,7 +2830,8 @@ static void collectFrameAlloca(AllocaInst *AI, coro::Shape &Shape,
bool ShouldUseLifetimeStartInfo =
(Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon &&
Shape.ABI != coro::ABI::RetconOnce);
AllocaUseVisitor Visitor{AI->getModule()->getDataLayout(), DT, Shape, Checker,
AllocaUseVisitor Visitor{AI->getModule()->getDataLayout(), DT,
*Shape.CoroBegin, Checker,
ShouldUseLifetimeStartInfo};
Visitor.visitPtr(*AI);
if (!Visitor.getShouldLiveOnFrame())
Expand Down
7 changes: 6 additions & 1 deletion llvm/lib/Transforms/IPO/FunctionAttrs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1186,10 +1186,15 @@ static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes,
switch (RVI->getOpcode()) {
// Extend the analysis by looking upwards.
case Instruction::BitCast:
case Instruction::GetElementPtr:
case Instruction::AddrSpaceCast:
FlowsToReturn.insert(RVI->getOperand(0));
continue;
case Instruction::GetElementPtr:
if (cast<GEPOperator>(RVI)->isInBounds()) {
FlowsToReturn.insert(RVI->getOperand(0));
continue;
}
return false;
case Instruction::Select: {
SelectInst *SI = cast<SelectInst>(RVI);
FlowsToReturn.insert(SI->getTrueValue());
Expand Down
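The GetElementPtr case now refuses to look through non-inbounds GEPs when proving a nonnull return: plain pointer arithmetic may wrap a nonnull base back to null, while an inbounds GEP cannot. A toy integer model of why the isInBounds() guard is needed (a sketch, not the full LLVM semantics):

#include <cassert>
#include <cstdint>

// Model a GEP without the inbounds flag as wrapping address arithmetic.
static uintptr_t gepNoInbounds(uintptr_t Base, intptr_t Off) {
  return Base + uintptr_t(Off);
}

int main() {
  uintptr_t NonNullBase = 0x1000;
  // A non-inbounds GEP can legally produce null from a nonnull base, so the
  // analysis must stop at it instead of propagating nonnull through.
  assert(gepNoInbounds(NonNullBase, -intptr_t(NonNullBase)) == 0);
  return 0;
}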
7 changes: 7 additions & 0 deletions llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -562,6 +562,13 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) {
IC.Builder.CreateBinaryIntrinsic(Intrinsic::cttz, C, Op1);
return BinaryOperator::CreateSub(ConstCttz, X);
}

// cttz(add(lshr(UINT_MAX, %val), 1)) --> sub(width, %val)
if (match(Op0, m_Add(m_LShr(m_AllOnes(), m_Value(X)), m_One()))) {
Value *Width =
ConstantInt::get(II.getType(), II.getType()->getScalarSizeInBits());
return BinaryOperator::CreateSub(Width, X);
}
} else {
// ctlz(lshr(%const, %val), 1) --> add(ctlz(%const, 1), %val)
if (match(Op0, m_LShr(m_ImmConstant(C), m_Value(X))) &&
Expand Down
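The new cttz fold relies on (UINT_MAX >> %val) + 1 collapsing to a single set bit at position width - %val: the shift leaves exactly width - %val low ones, and the add carries them into one bit. A quick standalone check (starting at %val = 1, since %val = 0 wraps the add to zero, where cttz's result depends on its zero-is-poison flag):

#include <cassert>
#include <cstdint>

int main() {
  const unsigned Width = 32;
  for (unsigned Val = 1; Val < Width; ++Val) {
    uint32_t X = (UINT32_MAX >> Val) + 1;  // one bit left, at Width - Val
    assert(__builtin_ctz(X) == int(Width - Val));
  }
  return 0;
}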
14 changes: 1 addition & 13 deletions llvm/lib/Transforms/Utils/CloneFunction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -386,18 +386,6 @@ struct PruningFunctionCloner {
};
} // namespace

static bool hasRoundingModeOperand(Intrinsic::ID CIID) {
switch (CIID) {
#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \
case Intrinsic::INTRINSIC: \
return ROUND_MODE == 1;
#define FUNCTION INSTRUCTION
#include "llvm/IR/ConstrainedOps.def"
default:
llvm_unreachable("Unexpected constrained intrinsic id");
}
}

Instruction *
PruningFunctionCloner::cloneInstruction(BasicBlock::const_iterator II) {
const Instruction &OldInst = *II;
Expand Down Expand Up @@ -455,7 +443,7 @@ PruningFunctionCloner::cloneInstruction(BasicBlock::const_iterator II) {
// The last arguments of a constrained intrinsic are metadata that
// represent rounding mode (absents in some intrinsics) and exception
// behavior. The inlined function uses default settings.
if (hasRoundingModeOperand(CIID))
if (Intrinsic::hasConstrainedFPRoundingModeOperand(CIID))
Args.push_back(
MetadataAsValue::get(Ctx, MDString::get(Ctx, "round.tonearest")));
Args.push_back(
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/Analysis/CostModel/ARM/arith.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve,+mve4beat < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE4
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv8r-none-eabi -mattr=+neon,+fp-armv8 < %s | FileCheck %s --check-prefix=CHECK-V8R
; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=thumbv8.1m.main -mattr=+mve < %s | FileCheck %s --check-prefix=CHECK-MVE-SIZE

target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/Analysis/CostModel/ARM/cast.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE-RECIP
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN-RECIP
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE-RECIP
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R-RECIP
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv8r-none-eabi -mattr=+neon,+fp-armv8 < %s | FileCheck %s --check-prefix=CHECK-V8R-RECIP
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE-SIZE
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN-SIZE
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE-SIZE
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R-SIZE
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=armv8r-none-eabi -mattr=+neon,+fp-armv8 < %s | FileCheck %s --check-prefix=CHECK-V8R-SIZE

target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"

Expand Down
4 changes: 2 additions & 2 deletions llvm/test/Analysis/CostModel/ARM/cast_ldst.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE-RECIP
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN-RECIP
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE-RECIP
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R-RECIP
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv8r-none-eabi -mattr=+neon,+fp-armv8 < %s | FileCheck %s --check-prefix=CHECK-V8R-RECIP
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE-SIZE
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN-SIZE
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE-SIZE
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R-SIZE
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=armv8r-none-eabi -mattr=+neon,+fp-armv8 < %s | FileCheck %s --check-prefix=CHECK-V8R-SIZE

target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"

Expand Down
4 changes: 2 additions & 2 deletions llvm/test/Analysis/CostModel/ARM/cmps.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE-RECIP
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN-RECIP
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE-RECIP
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R-RECIP
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv8r-none-eabi -mattr=+neon,+fp-armv8 < %s | FileCheck %s --check-prefix=CHECK-V8R-RECIP
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE-SIZE
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN-SIZE
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE-SIZE
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R-SIZE
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=armv8r-none-eabi -mattr=+neon,+fp-armv8 < %s | FileCheck %s --check-prefix=CHECK-V8R-SIZE

target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"

Expand Down
2 changes: 1 addition & 1 deletion llvm/test/Analysis/CostModel/ARM/divrem.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp < %s | FileCheck %s --check-prefix=CHECK-MVE
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.main-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-MAIN
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=thumbv8m.base-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8M-BASE
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv8r-none-eabi < %s | FileCheck %s --check-prefix=CHECK-V8R
; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=armv8r-none-eabi -mattr=+neon,+fp-armv8 < %s | FileCheck %s --check-prefix=CHECK-V8R

target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"

Expand Down