182 changes: 161 additions & 21 deletions lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/MipsABIFlags.h"
#include "lldb/Target/Process.h"

#define CASE_AND_STREAM(s, def, width) \
case def: \
Expand Down Expand Up @@ -3007,9 +3008,10 @@ void ObjectFileELF::ParseSymtab(Symtab &lldb_symtab) {
// section, nomatter if .symtab was already parsed or not. This is because
// minidebuginfo normally removes the .symtab symbols which have their
// matching .dynsym counterparts.
Section *dynsym = nullptr;
if (!symtab ||
GetSectionList()->FindSectionByName(ConstString(".gnu_debugdata"))) {
Section *dynsym =
dynsym =
section_list->FindSectionByType(eSectionTypeELFDynamicSymbols, true)
.get();
if (dynsym) {
Expand All @@ -3019,6 +3021,20 @@ void ObjectFileELF::ParseSymtab(Symtab &lldb_symtab) {
m_address_class_map.merge(address_class_map);
}
}
if (!dynsym) {
// Try and read the dynamic symbol table from the .dynamic section.
uint32_t num_symbols = 0;
std::optional<DataExtractor> symtab_data =
GetDynsymDataFromDynamic(num_symbols);
std::optional<DataExtractor> strtab_data = GetDynstrData();
if (symtab_data && strtab_data) {
auto [num_symbols_parsed, address_class_map] =
ParseSymbols(&lldb_symtab, symbol_id, section_list, num_symbols,
symtab_data.value(), strtab_data.value());
symbol_id += num_symbols_parsed;
m_address_class_map.merge(address_class_map);
}
}

// DT_JMPREL
// If present, this entry's d_ptr member holds the address of
Expand Down Expand Up @@ -3828,6 +3844,33 @@ ObjectFileELF::MapFileDataWritable(const FileSpec &file, uint64_t Size,
Offset);
}

std::optional<DataExtractor>
ObjectFileELF::ReadDataFromDynamic(const ELFDynamic *dyn, uint64_t length,
                                   uint64_t offset) {
  // ELFDynamic values contain a "d_ptr" member that will be a load address if
  // we have an ELF file read from memory, or it will be a file address if it
  // was read from an ELF file. This function fetches \a length bytes starting
  // \a offset bytes past ELFDynamic::d_ptr, or returns std::nullopt if the
  // data isn't available.
  if (dyn == nullptr)
    return std::nullopt;

  const lldb::addr_t d_ptr_addr = dyn->d_ptr + offset;
  if (ProcessSP process_sp = m_process_wp.lock()) {
    // In-memory ELF file: d_ptr is a load address, so read process memory.
    if (DataBufferSP data_sp = ReadMemory(process_sp, d_ptr_addr, length))
      return DataExtractor(data_sp, GetByteOrder(), GetAddressByteSize());
    return std::nullopt;
  }

  // ELF file without a process: d_ptr is a file address. Find the section
  // that contains it and return the requested slice of that section's data.
  Address addr;
  if (!addr.ResolveAddressUsingFileSections(d_ptr_addr, GetSectionList()))
    return std::nullopt;
  const auto section_sp = addr.GetSection();
  if (!section_sp)
    return std::nullopt;
  DataExtractor data;
  section_sp->GetSectionData(data);
  return DataExtractor(data, d_ptr_addr - section_sp->GetFileAddress(),
                       length);
}

std::optional<DataExtractor> ObjectFileELF::GetDynstrData() {
if (SectionList *section_list = GetSectionList()) {
// Find the SHT_DYNAMIC section.
Expand Down Expand Up @@ -3855,31 +3898,15 @@ std::optional<DataExtractor> ObjectFileELF::GetDynstrData() {
// and represent the dynamic symbol tables's string table. These are needed
// by the dynamic loader and we can read them from a process' address space.
//
// When loading and ELF file from memory, only the program headers end up
// being mapped into memory, and we can find these values in the PT_DYNAMIC
// segment.
// When loading an ELF file from memory, only the program headers are
// guaranteed to end up being mapped into memory, and we can find these values
// in the PT_DYNAMIC segment.
const ELFDynamic *strtab = FindDynamicSymbol(DT_STRTAB);
const ELFDynamic *strsz = FindDynamicSymbol(DT_STRSZ);
if (strtab == nullptr || strsz == nullptr)
return std::nullopt;

if (ProcessSP process_sp = m_process_wp.lock()) {
if (DataBufferSP data_sp =
ReadMemory(process_sp, strtab->d_ptr, strsz->d_val))
return DataExtractor(data_sp, GetByteOrder(), GetAddressByteSize());
} else {
// We have an ELF file with no section headers or we didn't find the
// .dynamic section. Try and find the .dynstr section.
Address addr;
if (addr.ResolveAddressUsingFileSections(strtab->d_ptr, GetSectionList())) {
DataExtractor data;
addr.GetSection()->GetSectionData(data);
return DataExtractor(data,
strtab->d_ptr - addr.GetSection()->GetFileAddress(),
strsz->d_val);
}
}
return std::nullopt;
return ReadDataFromDynamic(strtab, strsz->d_val, /*offset=*/0);
}

std::optional<lldb_private::DataExtractor> ObjectFileELF::GetDynamicData() {
Expand Down Expand Up @@ -3912,3 +3939,116 @@ std::optional<lldb_private::DataExtractor> ObjectFileELF::GetDynamicData() {
}
return std::nullopt;
}

std::optional<uint32_t> ObjectFileELF::GetNumSymbolsFromDynamicHash() {
  const ELFDynamic *hash = FindDynamicSymbol(DT_HASH);
  if (hash == nullptr)
    return std::nullopt;

  // The DT_HASH header looks like this:
  struct DtHashHeader {
    uint32_t nbucket;
    uint32_t nchain;
  };
  auto data = ReadDataFromDynamic(hash, sizeof(DtHashHeader));
  // A truncated header must not be mistaken for "zero symbols": reading past
  // the end of a DataExtractor yields 0 instead of failing.
  if (!data || data->GetByteSize() < sizeof(DtHashHeader))
    return std::nullopt;

  // We don't need the number of buckets value "nbucket", we just need the
  // "nchain" value which contains the number of symbols.
  offset_t offset = offsetof(DtHashHeader, nchain);
  return data->GetU32(&offset);
}

std::optional<uint32_t> ObjectFileELF::GetNumSymbolsFromDynamicGnuHash() {
const ELFDynamic *gnu_hash = FindDynamicSymbol(DT_GNU_HASH);
if (gnu_hash == nullptr)
return std::nullopt;

// Create a DT_GNU_HASH header
// https://flapenguin.me/elf-dt-gnu-hash
struct DtGnuHashHeader {
uint32_t nbuckets = 0;
uint32_t symoffset = 0;
uint32_t bloom_size = 0;
uint32_t bloom_shift = 0;
};
uint32_t num_symbols = 0;
// Read enogh data for the DT_GNU_HASH header so we can extract the values.
if (auto data = ReadDataFromDynamic(gnu_hash, sizeof(DtGnuHashHeader))) {
offset_t offset = 0;
DtGnuHashHeader header;
header.nbuckets = data->GetU32(&offset);
header.symoffset = data->GetU32(&offset);
header.bloom_size = data->GetU32(&offset);
header.bloom_shift = data->GetU32(&offset);
const size_t addr_size = GetAddressByteSize();
const addr_t buckets_offset =
sizeof(DtGnuHashHeader) + addr_size * header.bloom_size;
std::vector<uint32_t> buckets;
if (auto bucket_data = ReadDataFromDynamic(gnu_hash, header.nbuckets * 4, buckets_offset)) {
offset = 0;
for (uint32_t i = 0; i < header.nbuckets; ++i)
buckets.push_back(bucket_data->GetU32(&offset));
// Locate the chain that handles the largest index bucket.
uint32_t last_symbol = 0;
for (uint32_t bucket_value : buckets)
last_symbol = std::max(bucket_value, last_symbol);
if (last_symbol < header.symoffset) {
num_symbols = header.symoffset;
} else {
// Walk the bucket's chain to add the chain length to the total.
const addr_t chains_base_offset = buckets_offset + header.nbuckets * 4;
for (;;) {
if (auto chain_entry_data = ReadDataFromDynamic(gnu_hash, 4, chains_base_offset + (last_symbol - header.symoffset) * 4)) {
offset = 0;
uint32_t chain_entry = chain_entry_data->GetU32(&offset);
++last_symbol;
// If the low bit is set, this entry is the end of the chain.
if (chain_entry & 1)
break;
} else {
break;
}
}
num_symbols = last_symbol;
}
}
}
if (num_symbols > 0)
return num_symbols;

return std::nullopt;
}

std::optional<DataExtractor>
ObjectFileELF::GetDynsymDataFromDynamic(uint32_t &num_symbols) {
  // Every ELF file which represents an executable or shared library has
  // mandatory .dynamic entries. The DT_SYMTAB value contains a pointer to the
  // symbol table, and DT_SYMENT contains the size of a symbol table entry.
  // We then can use either the DT_HASH or DT_GNU_HASH to find the number of
  // symbols in the symbol table as the symbol count is not stored in the
  // .dynamic section as a key/value pair.
  //
  // When loading an ELF file from memory, only the program headers end up
  // being mapped into memory, and we can find these values in the PT_DYNAMIC
  // segment.
  num_symbols = 0;
  const ELFDynamic *symtab = FindDynamicSymbol(DT_SYMTAB);
  const ELFDynamic *syment = FindDynamicSymbol(DT_SYMENT);
  // DT_SYMTAB and DT_SYMENT are mandatory.
  if (symtab == nullptr || syment == nullptr)
    return std::nullopt;

  // Prefer DT_HASH, but fall back to DT_GNU_HASH when DT_HASH is missing or
  // reports no symbols, so an unusable DT_HASH does not hide a valid count.
  std::optional<uint32_t> syms = GetNumSymbolsFromDynamicHash();
  if (!syms || *syms == 0)
    syms = GetNumSymbolsFromDynamicGnuHash();
  if (!syms || *syms == 0)
    return std::nullopt;

  num_symbols = *syms;
  return ReadDataFromDynamic(symtab, syment->d_val * num_symbols);
}
41 changes: 41 additions & 0 deletions lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,47 @@ class ObjectFileELF : public lldb_private::ObjectFile {
/// \return The bytes that represent the string table data or \c std::nullopt
/// if an error occured.
std::optional<lldb_private::DataExtractor> GetDynstrData();

/// Read the bytes pointed to by the \a dyn dynamic entry.
///
/// ELFDynamic::d_ptr values contain file addresses if we load the ELF file
/// from a file on disk, or they contain load addresses if they were read
/// from memory. This function will correctly extract the data in both cases
/// if it is available.
///
/// \param[in] dyn The dynamic entry to use to fetch the data from.
///
/// \param[in] length The number of bytes to read.
///
/// \param[in] offset The number of bytes to skip after the d_ptr value
/// before reading data.
///
/// \return The bytes that represent the dynamic entry's data or
/// \c std::nullopt if an error occurred or the data is not available.
std::optional<lldb_private::DataExtractor>
ReadDataFromDynamic(const elf::ELFDynamic *dyn, uint64_t length,
uint64_t offset = 0);

/// Get the bytes that represent the dynamic symbol table, located through
/// the entries of the .dynamic section.
///
/// This function uses the DT_SYMTAB value from the .dynamic section to read
/// the symbol table data, and DT_SYMENT for the size of one symbol entry.
/// The number of symbols in the symbol table is calculated by looking at
/// the DT_HASH or DT_GNU_HASH values as the symbol count isn't stored in
/// the .dynamic section.
///
/// \param[out] num_symbols Reset to zero on entry and set to the number of
/// symbols once a non-zero count has been determined.
///
/// \return The bytes that represent the symbol table data found through
/// the .dynamic section or \c std::nullopt if an error
/// occurred or if there is no dynamic symbol data available.
std::optional<lldb_private::DataExtractor>
GetDynsymDataFromDynamic(uint32_t &num_symbols);

/// Get the number of symbols from the DT_HASH dynamic entry.
///
/// \return The "nchain" value read from the DT_HASH table header, or
/// \c std::nullopt if there is no DT_HASH entry or its header could
/// not be read.
std::optional<uint32_t> GetNumSymbolsFromDynamicHash();

/// Get the number of symbols from the DT_GNU_HASH dynamic entry.
///
/// The count is derived by finding the largest bucket value in the table
/// and walking that bucket's chain until its end marker.
///
/// \return The number of symbols, or \c std::nullopt if there is no
/// DT_GNU_HASH entry, the table could not be read, or the resulting
/// count is zero.
std::optional<uint32_t> GetNumSymbolsFromDynamicGnuHash();

};

#endif // LLDB_SOURCE_PLUGINS_OBJECTFILE_ELF_OBJECTFILEELF_H
42 changes: 42 additions & 0 deletions lldb/test/Shell/ObjectFile/ELF/elf-dynsym.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// This test verifies that loading an ELF file that has no section headers can
// load the dynamic symbol table using the DT_SYMTAB, DT_SYMENT, DT_HASH or
// the DT_GNU_HASH .dynamic key/value pairs that are loaded via the PT_DYNAMIC
// segment.

// RUN: llvm-mc -triple=x86_64-pc-linux -filetype=obj \
// RUN: -o - - <<<".globl defined, undefined; defined:" | \
// RUN: ld.lld /dev/stdin -o - --hash-style=gnu -export-dynamic -shared \
// RUN: -z nosectionheader -o %t.gnu
// RUN: %lldb %t.gnu -b \
// RUN: -o "image dump objfile" \
// RUN: | FileCheck %s --dump-input=always --check-prefix=GNU
// GNU: (lldb) image dump objfile
// GNU: Dumping headers for 1 module(s).
// GNU: ObjectFileELF, file =
// GNU: ELF Header
// GNU: e_type = 0x0003 ET_DYN
// Make sure there are no section headers
// GNU: e_shnum = 0x00000000
// Make sure we were able to load the symbols
// GNU: Symtab, file = {{.*}}elf-dynsym.test.tmp.gnu, num_symbols = 2:
// GNU-DAG: undefined
// GNU-DAG: defined

// RUN: llvm-mc -triple=x86_64-pc-linux -filetype=obj \
// RUN: -o - - <<<".globl defined, undefined; defined:" | \
// RUN: ld.lld /dev/stdin -o - --hash-style=sysv -export-dynamic -shared \
// RUN: -z nosectionheader -o %t.sysv
// RUN: %lldb %t.sysv -b \
// RUN: -o "image dump objfile" \
// RUN: | FileCheck %s --dump-input=always --check-prefix=HASH
// HASH: (lldb) image dump objfile
// HASH: Dumping headers for 1 module(s).
// HASH: ObjectFileELF, file =
// HASH: ELF Header
// HASH: e_type = 0x0003 ET_DYN
// Make sure there are no section headers
// HASH: e_shnum = 0x00000000
// Make sure we were able to load the symbols
// HASH: Symtab, file = {{.*}}elf-dynsym.test.tmp.sysv, num_symbols = 2:
// HASH-DAG: undefined
// HASH-DAG: defined
11 changes: 10 additions & 1 deletion llvm/docs/AMDGPUUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,13 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following
work-item
IDs

``gfx950`` ``amdgcn`` dGPU - sramecc - Architected *TBA*
- tgsplit flat
- xnack scratch .. TODO::
- kernarg preload - Packed
work-item Add product
IDs names.

**GCN GFX10.1 (RDNA 1)** [AMD-GCN-GFX10-RDNA1]_
-----------------------------------------------------------------------------------------------------------------------
``gfx1010`` ``amdgcn`` dGPU - cumode - Absolute - *rocm-amdhsa* - Radeon RX 5700
Expand Down Expand Up @@ -2178,7 +2185,7 @@ The AMDGPU backend uses the following ELF header:
``EF_AMDGPU_MACH_AMDGCN_GFX942`` 0x04c ``gfx942``
*reserved* 0x04d Reserved.
``EF_AMDGPU_MACH_AMDGCN_GFX1201`` 0x04e ``gfx1201``
*reserved* 0x04f Reserved.
``EF_AMDGPU_MACH_AMDGCN_GFX950`` 0x04f ``gfx950``
*reserved* 0x050 Reserved.
``EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC`` 0x051 ``gfx9-generic``
``EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC`` 0x052 ``gfx10-1-generic``
Expand Down Expand Up @@ -5468,6 +5475,8 @@ The fields used by CP for code objects before V3 also match those specified in
roundup(lds-size / (64 * 4))
GFX7-GFX11
roundup(lds-size / (128 * 4))
GFX950
roundup(lds-size / (320 * 4))

24 1 bit ENABLE_EXCEPTION_IEEE_754_FP Wavefront starts execution
_INVALID_OPERATION with specified exceptions
Expand Down
2 changes: 0 additions & 2 deletions llvm/include/llvm/Analysis/InlineAdvisor.h
Original file line number Diff line number Diff line change
Expand Up @@ -287,15 +287,13 @@ class PluginInlineAdvisorAnalysis
: public AnalysisInfoMixin<PluginInlineAdvisorAnalysis> {
public:
static AnalysisKey Key;
static bool HasBeenRegistered;

typedef InlineAdvisor *(*AdvisorFactory)(Module &M,
FunctionAnalysisManager &FAM,
InlineParams Params,
InlineContext IC);

PluginInlineAdvisorAnalysis(AdvisorFactory Factory) : Factory(Factory) {
HasBeenRegistered = true;
assert(Factory != nullptr &&
"The plugin advisor factory should not be a null pointer.");
}
Expand Down
5 changes: 0 additions & 5 deletions llvm/include/llvm/Analysis/InlineOrder.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ class PluginInlineOrderAnalysis
ModuleAnalysisManager &MAM, Module &M);

PluginInlineOrderAnalysis(InlineOrderFactory Factory) : Factory(Factory) {
HasBeenRegistered = true;
assert(Factory != nullptr &&
"The plugin inline order factory should not be a null pointer.");
}
Expand All @@ -71,11 +70,7 @@ class PluginInlineOrderAnalysis
Result run(Module &, ModuleAnalysisManager &) { return {Factory}; }
Result getResult() { return {Factory}; }

static bool isRegistered() { return HasBeenRegistered; }
static void unregister() { HasBeenRegistered = false; }

private:
static bool HasBeenRegistered;
InlineOrderFactory Factory;
};

Expand Down
2 changes: 1 addition & 1 deletion llvm/include/llvm/BinaryFormat/ELF.h
Original file line number Diff line number Diff line change
Expand Up @@ -811,7 +811,7 @@ enum : unsigned {
EF_AMDGPU_MACH_AMDGCN_GFX942 = 0x04c,
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4D = 0x04d,
EF_AMDGPU_MACH_AMDGCN_GFX1201 = 0x04e,
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4F = 0x04f,
EF_AMDGPU_MACH_AMDGCN_GFX950 = 0x04f,
EF_AMDGPU_MACH_AMDGCN_RESERVED_0X50 = 0x050,
EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC = 0x051,
EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC = 0x052,
Expand Down
12 changes: 8 additions & 4 deletions llvm/include/llvm/IR/PassManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,11 @@ template <typename IRUnitT, typename... ExtraArgTs> class AnalysisManager {
AnalysisResultLists.clear();
}

/// Query whether the analysis pass \p PassT has been registered with this
/// manager, i.e. whether its ID is present in the registered-pass map.
template <typename PassT> bool isPassRegistered() const {
  return AnalysisPasses.count(PassT::ID()) != 0;
}

/// Get the result of an analysis pass for a given IR unit.
///
/// Runs the analysis if a cached result is not available.
Expand Down Expand Up @@ -458,10 +463,9 @@ template <typename IRUnitT, typename... ExtraArgTs> class AnalysisManager {
/// and this function returns true.
///
/// (Note: Although the return value of this function indicates whether or not
/// an analysis was previously registered, there intentionally isn't a way to
/// query this directly. Instead, you should just register all the analyses
/// you might want and let this class run them lazily. This idiom lets us
/// minimize the number of times we have to look up analyses in our
/// an analysis was previously registered, you should just register all the
/// analyses you might want and let this class run them lazily. This idiom
/// lets us minimize the number of times we have to look up analyses in our
/// hashtable.)
template <typename PassBuilderT>
bool registerPass(PassBuilderT &&PassBuilder) {
Expand Down
4 changes: 4 additions & 0 deletions llvm/include/llvm/ProfileData/InstrProfWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,10 @@ class InstrProfWriter {
const llvm::SmallVector<memprof::FrameId> &CallStack,
function_ref<void(Error)> Warn);

/// Add the entire MemProfData \p Incoming to the writer context.
bool addMemProfData(memprof::IndexedMemProfData Incoming,
function_ref<void(Error)> Warn);

// Add a binary id to the binary ids list.
void addBinaryIds(ArrayRef<llvm::object::BuildID> BIs);

Expand Down
25 changes: 13 additions & 12 deletions llvm/include/llvm/TargetParser/TargetParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,18 +86,19 @@ enum GPUKind : uint32_t {
GK_GFX940 = 68,
GK_GFX941 = 69,
GK_GFX942 = 70,

GK_GFX1010 = 71,
GK_GFX1011 = 72,
GK_GFX1012 = 73,
GK_GFX1013 = 74,
GK_GFX1030 = 75,
GK_GFX1031 = 76,
GK_GFX1032 = 77,
GK_GFX1033 = 78,
GK_GFX1034 = 79,
GK_GFX1035 = 80,
GK_GFX1036 = 81,
GK_GFX950 = 71,

GK_GFX1010 = 72,
GK_GFX1011 = 73,
GK_GFX1012 = 74,
GK_GFX1013 = 75,
GK_GFX1030 = 76,
GK_GFX1031 = 77,
GK_GFX1032 = 78,
GK_GFX1033 = 79,
GK_GFX1034 = 80,
GK_GFX1035 = 81,
GK_GFX1036 = 82,

GK_GFX1100 = 90,
GK_GFX1101 = 91,
Expand Down
3 changes: 1 addition & 2 deletions llvm/lib/Analysis/InlineAdvisor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,13 +199,12 @@ void InlineAdvice::recordInliningWithCalleeDeleted() {

AnalysisKey InlineAdvisorAnalysis::Key;
AnalysisKey PluginInlineAdvisorAnalysis::Key;
bool PluginInlineAdvisorAnalysis::HasBeenRegistered = false;

bool InlineAdvisorAnalysis::Result::tryCreate(
InlineParams Params, InliningAdvisorMode Mode,
const ReplayInlinerSettings &ReplaySettings, InlineContext IC) {
auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
if (PluginInlineAdvisorAnalysis::HasBeenRegistered) {
if (MAM.isPassRegistered<PluginInlineAdvisorAnalysis>()) {
auto &DA = MAM.getResult<PluginInlineAdvisorAnalysis>(M);
Advisor.reset(DA.Factory(M, FAM, Params, IC));
return !!Advisor;
Expand Down
3 changes: 1 addition & 2 deletions llvm/lib/Analysis/InlineOrder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,6 @@ class PriorityInlineOrder : public InlineOrder<std::pair<CallBase *, int>> {
} // namespace

AnalysisKey llvm::PluginInlineOrderAnalysis::Key;
bool llvm::PluginInlineOrderAnalysis::HasBeenRegistered;

std::unique_ptr<InlineOrder<std::pair<CallBase *, int>>>
llvm::getDefaultInlineOrder(FunctionAnalysisManager &FAM,
Expand Down Expand Up @@ -313,7 +312,7 @@ llvm::getDefaultInlineOrder(FunctionAnalysisManager &FAM,
std::unique_ptr<InlineOrder<std::pair<CallBase *, int>>>
llvm::getInlineOrder(FunctionAnalysisManager &FAM, const InlineParams &Params,
ModuleAnalysisManager &MAM, Module &M) {
if (llvm::PluginInlineOrderAnalysis::isRegistered()) {
if (MAM.isPassRegistered<PluginInlineOrderAnalysis>()) {
LLVM_DEBUG(dbgs() << " Current used priority: plugin ---- \n");
return MAM.getResult<PluginInlineOrderAnalysis>(M).Factory(FAM, Params, MAM,
M);
Expand Down
12 changes: 7 additions & 5 deletions llvm/lib/CodeGen/MachineBlockPlacement.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3558,14 +3558,16 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {

if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), MLI,
/*AfterPlacement=*/true)) {
// Redo the layout if tail merging creates/removes/moves blocks.
BlockToChain.clear();
ComputedEdges.clear();
// Must redo the post-dominator tree if blocks were changed.
if (MPDT)
MPDT->recalculate(MF);
ChainAllocator.DestroyAll();
buildCFGChains();
if (!UseExtTspForSize) {
// Redo the layout if tail merging creates/removes/moves blocks.
BlockToChain.clear();
ComputedEdges.clear();
ChainAllocator.DestroyAll();
buildCFGChains();
}
}
}

Expand Down
36 changes: 36 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::EXTRACT_VECTOR_ELT:
Res = PromoteIntRes_EXTRACT_VECTOR_ELT(N); break;
case ISD::LOAD: Res = PromoteIntRes_LOAD(cast<LoadSDNode>(N)); break;
case ISD::VP_LOAD:
Res = PromoteIntRes_VP_LOAD(cast<VPLoadSDNode>(N));
break;
case ISD::MLOAD: Res = PromoteIntRes_MLOAD(cast<MaskedLoadSDNode>(N));
break;
case ISD::MGATHER: Res = PromoteIntRes_MGATHER(cast<MaskedGatherSDNode>(N));
Expand Down Expand Up @@ -957,6 +960,23 @@ SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) {
return Res;
}

/// Promote the integer result of a VP_LOAD by re-emitting it with the
/// promoted result type and rewiring users of the old chain result.
SDValue DAGTypeLegalizer::PromoteIntRes_VP_LOAD(VPLoadSDNode *N) {
  assert(!N->isIndexed() && "Indexed vp_load during type legalization!");

  // The type the original result is promoted to.
  EVT PromotedVT =
      TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));

  // A non-extending load becomes an any-extending load of the wider type;
  // an existing extension kind is kept as is.
  ISD::LoadExtType ExtKind = N->getExtensionType();
  if (ExtKind == ISD::NON_EXTLOAD)
    ExtKind = ISD::EXTLOAD;

  SDValue Promoted = DAG.getLoadVP(
      N->getAddressingMode(), ExtKind, PromotedVT, SDLoc(N), N->getChain(),
      N->getBasePtr(), N->getOffset(), N->getMask(), N->getVectorLength(),
      N->getMemoryVT(), N->getMemOperand());

  // Legalize the chain result: everything that used the old chain must now
  // use the chain of the new load.
  ReplaceValueWith(SDValue(N, 1), Promoted.getValue(1));
  return Promoted;
}

SDValue DAGTypeLegalizer::PromoteIntRes_MLOAD(MaskedLoadSDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue ExtPassThru = GetPromotedInteger(N->getPassThru());
Expand Down Expand Up @@ -1957,6 +1977,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::STRICT_SINT_TO_FP: Res = PromoteIntOp_STRICT_SINT_TO_FP(N); break;
case ISD::STORE: Res = PromoteIntOp_STORE(cast<StoreSDNode>(N),
OpNo); break;
case ISD::VP_STORE:
Res = PromoteIntOp_VP_STORE(cast<VPStoreSDNode>(N), OpNo);
break;
case ISD::MSTORE: Res = PromoteIntOp_MSTORE(cast<MaskedStoreSDNode>(N),
OpNo); break;
case ISD::MLOAD: Res = PromoteIntOp_MLOAD(cast<MaskedLoadSDNode>(N),
Expand Down Expand Up @@ -2378,6 +2401,19 @@ SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){
N->getMemoryVT(), N->getMemOperand());
}

/// Promote the stored-value operand of a VP_STORE by emitting a truncating
/// VP store of the promoted value with the original memory type.
SDValue DAGTypeLegalizer::PromoteIntOp_VP_STORE(VPStoreSDNode *N,
                                                unsigned OpNo) {
  assert(OpNo == 1 && "Unexpected operand for promotion");
  assert(!N->isIndexed() && "expecting unindexed vp_store!");

  SDLoc DL(N);
  SDValue PromotedData = GetPromotedInteger(N->getValue());
  // Storing with the original memory VT truncates the promoted value back
  // to the width that was stored before promotion.
  return DAG.getTruncStoreVP(N->getChain(), DL, PromotedData, N->getBasePtr(),
                             N->getMask(), N->getVectorLength(),
                             N->getMemoryVT(), N->getMemOperand(),
                             N->isCompressingStore());
}

SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N,
unsigned OpNo) {
SDValue DataOp = N->getValue();
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue PromoteIntRes_FREEZE(SDNode *N);
SDValue PromoteIntRes_INT_EXTEND(SDNode *N);
SDValue PromoteIntRes_LOAD(LoadSDNode *N);
SDValue PromoteIntRes_VP_LOAD(VPLoadSDNode *N);
SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N);
SDValue PromoteIntRes_MGATHER(MaskedGatherSDNode *N);
SDValue PromoteIntRes_VECTOR_COMPRESS(SDNode *N);
Expand Down Expand Up @@ -420,6 +421,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue PromoteIntOp_ExpOp(SDNode *N);
SDValue PromoteIntOp_VECREDUCE(SDNode *N);
SDValue PromoteIntOp_VP_REDUCE(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo);
SDValue PromoteIntOp_SET_ROUNDING(SDNode *N);
SDValue PromoteIntOp_STACKMAP(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo);
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Object/ELFObjectFile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -550,6 +550,8 @@ StringRef ELFObjectFileBase::getAMDGPUCPUName() const {
return "gfx941";
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX942:
return "gfx942";
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX950:
return "gfx950";

// AMDGCN GFX10.
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010:
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/ObjectYAML/ELFYAML.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -609,6 +609,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX940, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX941, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX942, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX950, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1010, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1011, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1012, EF_AMDGPU_MACH);
Expand Down
29 changes: 29 additions & 0 deletions llvm/lib/ProfileData/InstrProfWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,35 @@ bool InstrProfWriter::addMemProfCallStack(
return true;
}

/// Merge the entire MemProf data set \p Incoming into the writer context.
///
/// Each component (frames, call stacks, records) is moved in wholesale when
/// the writer holds none of that kind yet; otherwise entries are merged one
/// at a time.
///
/// \returns true on success, false as soon as merging a frame or a call
/// stack fails.
bool InstrProfWriter::addMemProfData(memprof::IndexedMemProfData Incoming,
                                     function_ref<void(Error)> Warn) {
  // TODO: Once we remove support for MemProf format Version V1, assert that
  // the three components (frames, call stacks, and records) are either all
  // empty or populated.

  // The per-entry helpers return true on success, so bail out only when one
  // of them reports failure.
  if (MemProfData.Frames.empty())
    MemProfData.Frames = std::move(Incoming.Frames);
  else
    for (const auto &[Id, F] : Incoming.Frames)
      if (!addMemProfFrame(Id, F, Warn))
        return false;

  if (MemProfData.CallStacks.empty())
    MemProfData.CallStacks = std::move(Incoming.CallStacks);
  else
    for (const auto &[CSId, CS] : Incoming.CallStacks)
      if (!addMemProfCallStack(CSId, CS, Warn))
        return false;

  if (MemProfData.Records.empty())
    MemProfData.Records = std::move(Incoming.Records);
  else
    for (const auto &[GUID, Record] : Incoming.Records)
      addMemProfRecord(GUID, Record);

  return true;
}

void InstrProfWriter::addBinaryIds(ArrayRef<llvm::object::BuildID> BIs) {
llvm::append_range(BinaryIds, BIs);
}
Expand Down
51 changes: 48 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,18 @@ def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts",
"Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions"
>;

// Subtarget feature "minimum3-maximum3-f32": marks subtargets that have the
// v_minimum3_f32 and v_maximum3_f32 instructions (HasMinimum3Maximum3F32).
def FeatureMinimum3Maximum3F32 : SubtargetFeature<"minimum3-maximum3-f32",
"HasMinimum3Maximum3F32",
"true",
"Has v_minimum3_f32 and v_maximum3_f32 instructions"
>;

// Subtarget feature "minimum3-maximum3-f16": marks subtargets that have the
// v_minimum3_f16 and v_maximum3_f16 instructions (HasMinimum3Maximum3F16).
def FeatureMinimum3Maximum3F16 : SubtargetFeature<"minimum3-maximum3-f16",
"HasMinimum3Maximum3F16",
"true",
"Has v_minimum3_f16 and v_maximum3_f16 instructions"
>;

def FeatureSupportsXNACK : SubtargetFeature<"xnack-support",
"SupportsXNACK",
"true",
Expand Down Expand Up @@ -360,6 +372,12 @@ def FeatureGFX940Insts : SubtargetFeature<"gfx940-insts",
"Additional instructions for GFX940+"
>;

// Subtarget feature "gfx950-insts": marks subtargets that have the additional
// instructions introduced with GFX950 (GFX950Insts).
def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts",
"GFX950Insts",
"true",
"Additional instructions for GFX950+"
>;

def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts",
"GFX10Insts",
"true",
Expand Down Expand Up @@ -1174,7 +1192,7 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",

def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
"gfx9",
[FeatureFP64, FeatureAddressableLocalMemorySize65536,
[FeatureFP64,
FeatureWavefrontSize64, FeatureFlatAddressSpace,
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
Expand Down Expand Up @@ -1257,6 +1275,7 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
FeatureUnalignedDSAccess, FeatureTrue16BitInsts,
FeatureDefaultComponentBroadcast, FeatureMaxHardClauseLength32,
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
FeatureMinimum3Maximum3F32, FeatureMinimum3Maximum3F16,
FeatureAgentScopeFineGrainedRemoteMemoryAtomics
]
>;
Expand Down Expand Up @@ -1339,6 +1358,7 @@ def FeatureISAVersion8_1_0 : FeatureSet<

def FeatureISAVersion9_0_Common : FeatureSet<
[FeatureGFX9,
FeatureAddressableLocalMemorySize65536,
FeatureLDSBankCount32,
FeatureImageInsts,
FeatureMadMacF32Insts]>;
Expand All @@ -1356,7 +1376,8 @@ def FeatureISAVersion9_Generic : FeatureSet<

def FeatureISAVersion9_0_MI_Common : FeatureSet<
!listconcat(FeatureISAVersion9_0_Common.Features,
[FeatureFmaMixInsts,
[FeatureAddressableLocalMemorySize65536,
FeatureFmaMixInsts,
FeatureDLInsts,
FeatureDot1Insts,
FeatureDot2Insts,
Expand Down Expand Up @@ -1470,9 +1491,19 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureFlatBufferGlobalAtomicFaddF64Inst
]>;

// Common feature set for ISA version 9.5: everything in
// FeatureISAVersion9_4_Common plus the 163840 addressable-local-memory-size
// feature, the FP8 instruction and conversion features,
// FeatureCvtFP8VOP1Bug, and the GFX950 instruction feature.
def FeatureISAVersion9_5_Common : FeatureSet<
!listconcat(FeatureISAVersion9_4_Common.Features,
[FeatureAddressableLocalMemorySize163840,
FeatureFP8Insts,
FeatureFP8ConversionInsts,
FeatureCvtFP8VOP1Bug,
FeatureGFX950Insts,
])>;

def FeatureISAVersion9_4_0 : FeatureSet<
!listconcat(FeatureISAVersion9_4_Common.Features,
[
FeatureAddressableLocalMemorySize65536,
FeatureForceStoreSC0SC1,
FeatureFP8Insts,
FeatureFP8ConversionInsts,
Expand All @@ -1483,6 +1514,7 @@ def FeatureISAVersion9_4_0 : FeatureSet<
def FeatureISAVersion9_4_1 : FeatureSet<
!listconcat(FeatureISAVersion9_4_Common.Features,
[
FeatureAddressableLocalMemorySize65536,
FeatureForceStoreSC0SC1,
FeatureFP8Insts,
FeatureFP8ConversionInsts,
Expand All @@ -1493,6 +1525,7 @@ def FeatureISAVersion9_4_1 : FeatureSet<
def FeatureISAVersion9_4_2 : FeatureSet<
!listconcat(FeatureISAVersion9_4_Common.Features,
[
FeatureAddressableLocalMemorySize65536,
FeatureFP8Insts,
FeatureFP8ConversionInsts,
FeatureCvtFP8VOP1Bug,
Expand All @@ -1501,7 +1534,10 @@ def FeatureISAVersion9_4_2 : FeatureSet<

def FeatureISAVersion9_4_Generic : FeatureSet<
!listconcat(FeatureISAVersion9_4_Common.Features,
[FeatureRequiresCOV6])>;
[FeatureAddressableLocalMemorySize65536,
FeatureRequiresCOV6])>;

// ISA version 9.5.0 uses the common 9.5 feature set without any additions.
def FeatureISAVersion9_5_0 : FeatureSet<FeatureISAVersion9_5_Common.Features>;

def FeatureISAVersion10_Common : FeatureSet<
[FeatureGFX10,
Expand Down Expand Up @@ -1989,6 +2025,15 @@ def isGFX12Plus :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12">,
AssemblerPredicate<(all_of FeatureGFX12Insts)>;

// Predicate that holds when the subtarget reports hasMinimum3Maximum3F32();
// for the assembler it requires FeatureMinimum3Maximum3F32.
def HasMinimum3Maximum3F32 :
Predicate<"Subtarget->hasMinimum3Maximum3F32()">,
AssemblerPredicate<(all_of FeatureMinimum3Maximum3F32)>;

// Predicate that holds when the subtarget reports hasMinimum3Maximum3F16();
// for the assembler it requires FeatureMinimum3Maximum3F16.
def HasMinimum3Maximum3F16 :
Predicate<"Subtarget->hasMinimum3Maximum3F16()">,
AssemblerPredicate<(all_of FeatureMinimum3Maximum3F16)>;


def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
AssemblerPredicate<(all_of FeatureFlatAddressSpace)>;

Expand Down
12 changes: 8 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1172,12 +1172,16 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.DX10Clamp = Mode.DX10Clamp;

// Select the LDS allocation-granularity shift from the subtarget's
// addressable local memory size feature bits. Exactly one branch assigns
// LDSAlignShift, so it is always initialized after this chain.
unsigned LDSAlignShift;
if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
  // LDS is allocated in 320 dword blocks.
  // NOTE(review): the sibling branches pair shift 9 with 128 dwords and
  // shift 8 with 64 dwords (i.e. 2^shift bytes). By that pattern a shift of
  // 11 would mean 512-dword blocks, and 320 dwords is not a power of two.
  // Confirm whether the comment or the shift reflects the intended value.
  LDSAlignShift = 11;
} else if (STM.getFeatureBits().test(
               FeatureAddressableLocalMemorySize65536)) {
  // LDS is allocated in 128 dword blocks.
  LDSAlignShift = 9;
} else {
  // LDS is allocated in 64 dword blocks.
  LDSAlignShift = 8;
}

ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class SubtargetFeatureAddressableLocalMemorySize <int Value> : SubtargetFeature<

def FeatureAddressableLocalMemorySize32768 : SubtargetFeatureAddressableLocalMemorySize<32768>;
def FeatureAddressableLocalMemorySize65536 : SubtargetFeatureAddressableLocalMemorySize<65536>;
def FeatureAddressableLocalMemorySize163840 : SubtargetFeatureAddressableLocalMemorySize<163840>;

class SubtargetFeatureWavefrontSize <int ValueLog2> : SubtargetFeature<
"wavefrontsize"#!shl(1, ValueLog2),
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNProcessors.td
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,10 @@ def : ProcessorModel<"gfx942", SIDPGFX940FullSpeedModel,
FeatureISAVersion9_4_2.Features
>;

def : ProcessorModel<"gfx950", SIDPGFX940FullSpeedModel,
FeatureISAVersion9_5_0.Features
>;

// [gfx900, gfx902, gfx904, gfx906, gfx909, gfx90c]
def : ProcessorModel<"gfx9-generic", SIQuarterSpeedModel,
FeatureISAVersion9_Generic.Features
Expand Down
12 changes: 11 additions & 1 deletion llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool GFX9Insts = false;
bool GFX90AInsts = false;
bool GFX940Insts = false;
bool GFX950Insts = false;
bool GFX10Insts = false;
bool GFX11Insts = false;
bool GFX12Insts = false;
Expand Down Expand Up @@ -241,7 +242,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasForceStoreSC0SC1 = false;
bool HasRequiredExportPriority = false;
bool HasVmemWriteVgprInOrder = false;

bool HasMinimum3Maximum3F32 = false;
bool HasMinimum3Maximum3F16 = false;
bool RequiresCOV6 = false;

// Dummy feature to use for assembler in tablegen.
Expand Down Expand Up @@ -1306,6 +1308,14 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
/// \returns true if the target has instructions with xf32 format support.
bool hasXF32Insts() const { return HasXF32Insts; }

/// \returns the HasMinimum3Maximum3F32 subtarget flag, which is the
/// predicate guarding v_minimum3_f32 / v_maximum3_f32.
bool hasMinimum3Maximum3F32() const { return HasMinimum3Maximum3F32; }

/// \returns the HasMinimum3Maximum3F16 subtarget flag, which is the
/// predicate guarding v_minimum3_f16 / v_maximum3_f16.
bool hasMinimum3Maximum3F16() const { return HasMinimum3Maximum3F16; }

/// \returns The maximum number of instructions that can be enclosed in an
/// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
/// instruction.
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX940: AK = GK_GFX940; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX941: AK = GK_GFX941; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX942: AK = GK_GFX942; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX950: AK = GK_GFX950; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: AK = GK_GFX1012; break;
Expand Down Expand Up @@ -182,6 +183,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
case GK_GFX940: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX940;
case GK_GFX941: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX941;
case GK_GFX942: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX942;
case GK_GFX950: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX950;
case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010;
case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011;
case GK_GFX1012: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012;
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -916,6 +916,8 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
return 32768;
if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize65536))
return 65536;
if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize163840))
return 163840;
return 0;
}

Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ let mayRaiseFPException = 0 in {
defm V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>;
} // End mayRaiseFPException = 0

// Three-operand f32 minimum/maximum. Gated on the dedicated
// HasMinimum3Maximum3F32 predicate rather than on a hardware generation.
let SubtargetPredicate = HasMinimum3Maximum3F32, ReadsModeReg = 0 in {
  defm V_MINIMUM3_F32 : VOP3Inst <"v_minimum3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfminimum3>;
  defm V_MAXIMUM3_F32 : VOP3Inst <"v_maximum3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmaximum3>;
} // End SubtargetPredicate = HasMinimum3Maximum3F32, ReadsModeReg = 0
Expand Down Expand Up @@ -625,7 +625,7 @@ defm V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3
defm V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmax3>;
defm V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumax3>;

// Three-operand f16 minimum/maximum. Gated on the dedicated
// HasMinimum3Maximum3F16 predicate rather than on a hardware generation.
let SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0 in {
  defm V_MINIMUM3_F16 : VOP3Inst <"v_minimum3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfminimum3>;
  defm V_MAXIMUM3_F16 : VOP3Inst <"v_maximum3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmaximum3>;
} // End SubtargetPredicate = HasMinimum3Maximum3F16, ReadsModeReg = 0
Expand Down
12 changes: 4 additions & 8 deletions llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -614,11 +614,9 @@ bool MipsSEDAGToDAGISel::selectVSplatMaskL(SDValue N, SDValue &Imm) const {

if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) &&
ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
// Extract the run of set bits starting with bit zero from the bitwise
// inverse of ImmValue, and test that the inverse of this is the same
// as the original value.
if (ImmValue == ~(~ImmValue & ~(~ImmValue + 1))) {

// Check if we have a leading one, then check if the whole value is a
// shifted mask.
if (ImmValue.isNegative() && ImmValue.isShiftedMask()) {
Imm = CurDAG->getTargetConstant(ImmValue.popcount() - 1, SDLoc(N), EltTy);
return true;
}
Expand Down Expand Up @@ -647,9 +645,7 @@ bool MipsSEDAGToDAGISel::selectVSplatMaskR(SDValue N, SDValue &Imm) const {

if (selectVSplat(N.getNode(), ImmValue, EltTy.getSizeInBits()) &&
ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
// Extract the run of set bits starting with bit zero, and test that the
// result is the same as the original value
if (ImmValue == (ImmValue & ~(ImmValue + 1))) {
if (ImmValue.isMask()) {
Imm = CurDAG->getTargetConstant(ImmValue.popcount() - 1, SDLoc(N), EltTy);
return true;
}
Expand Down
22 changes: 14 additions & 8 deletions llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,12 @@ struct RISCVOperand final : public MCParsedAsmOperand {
RISCVMCRegisterClasses[RISCV::GPRRegClassID].contains(Reg.RegNum);
}

/// True when this operand is a register that belongs to the GPRPair
/// register class.
bool isGPRPair() const {
  // Reg is only consulted once the operand is known to be a register.
  if (Kind != KindTy::Register)
    return false;
  const auto &PairClass = RISCVMCRegisterClasses[RISCV::GPRPairRegClassID];
  return PairClass.contains(Reg.RegNum);
}

bool isGPRF16() const {
return Kind == KindTy::Register &&
RISCVMCRegisterClasses[RISCV::GPRF16RegClassID].contains(Reg.RegNum);
Expand All @@ -491,17 +497,17 @@ struct RISCVOperand final : public MCParsedAsmOperand {
RISCVMCRegisterClasses[RISCV::GPRF32RegClassID].contains(Reg.RegNum);
}

bool isGPRAsFPR() const { return isGPR() && Reg.IsGPRAsFPR; }
bool isGPRAsFPR16() const { return isGPRF16() && Reg.IsGPRAsFPR; }
bool isGPRAsFPR32() const { return isGPRF32() && Reg.IsGPRAsFPR; }
bool isGPRPairAsFPR() const { return isGPRPair() && Reg.IsGPRAsFPR; }

bool isGPRPair() const {
bool isGPRF64Pair() const {
return Kind == KindTy::Register &&
RISCVMCRegisterClasses[RISCV::GPRPairRegClassID].contains(
RISCVMCRegisterClasses[RISCV::GPRF64PairRegClassID].contains(
Reg.RegNum);
}

bool isGPRAsFPR() const { return isGPR() && Reg.IsGPRAsFPR; }
bool isGPRAsFPR16() const { return isGPRF16() && Reg.IsGPRAsFPR; }
bool isGPRAsFPR32() const { return isGPRF32() && Reg.IsGPRAsFPR; }
bool isGPRPairAsFPR64() const { return isGPRF64Pair() && Reg.IsGPRAsFPR; }

static bool evaluateConstantImm(const MCExpr *Expr, int64_t &Imm,
RISCVMCExpr::VariantKind &VK) {
if (auto *RE = dyn_cast<RISCVMCExpr>(Expr)) {
Expand Down Expand Up @@ -2399,7 +2405,7 @@ ParseStatus RISCVAsmParser::parseGPRPairAsFPR64(OperandVector &Operands) {
const MCRegisterInfo *RI = getContext().getRegisterInfo();
MCRegister Pair = RI->getMatchingSuperReg(
Reg, RISCV::sub_gpr_even,
&RISCVMCRegisterClasses[RISCV::GPRPairRegClassID]);
&RISCVMCRegisterClasses[RISCV::GPRF64PairRegClassID]);
Operands.push_back(RISCVOperand::createReg(Pair, S, E, /*isGPRAsFPR=*/true));
return ParseStatus::Success;
}
Expand Down
24 changes: 18 additions & 6 deletions llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -952,27 +952,36 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, Res);
return;
}
case RISCVISD::BuildGPRPair:
case RISCVISD::BuildPairF64: {
if (!Subtarget->hasStdExtZdinx())
if (Opcode == RISCVISD::BuildPairF64 && !Subtarget->hasStdExtZdinx())
break;

assert(!Subtarget->is64Bit() && "Unexpected subtarget");
assert((!Subtarget->is64Bit() || Opcode == RISCVISD::BuildGPRPair) &&
"BuildPairF64 only handled here on rv32i_zdinx");

int RegClassID = (Opcode == RISCVISD::BuildGPRPair)
? RISCV::GPRPairRegClassID
: RISCV::GPRF64PairRegClassID;
MVT OutType = (Opcode == RISCVISD::BuildGPRPair) ? MVT::Untyped : MVT::f64;

SDValue Ops[] = {
CurDAG->getTargetConstant(RISCV::GPRPairRegClassID, DL, MVT::i32),
CurDAG->getTargetConstant(RegClassID, DL, MVT::i32),
Node->getOperand(0),
CurDAG->getTargetConstant(RISCV::sub_gpr_even, DL, MVT::i32),
Node->getOperand(1),
CurDAG->getTargetConstant(RISCV::sub_gpr_odd, DL, MVT::i32)};

SDNode *N =
CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::f64, Ops);
CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, OutType, Ops);
ReplaceNode(Node, N);
return;
}
case RISCVISD::SplitGPRPair:
case RISCVISD::SplitF64: {
if (Subtarget->hasStdExtZdinx()) {
assert(!Subtarget->is64Bit() && "Unexpected subtarget");
if (Subtarget->hasStdExtZdinx() || Opcode != RISCVISD::SplitF64) {
assert((!Subtarget->is64Bit() || Opcode == RISCVISD::SplitGPRPair) &&
"SplitF64 only handled here on rv32i_zdinx");

if (!SDValue(Node, 0).use_empty()) {
SDValue Lo = CurDAG->getTargetExtractSubreg(RISCV::sub_gpr_even, DL, VT,
Expand All @@ -990,6 +999,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
return;
}

assert(Opcode != RISCVISD::SplitGPRPair &&
"SplitGPRPair should already be handled");

if (!Subtarget->hasStdExtZfa())
break;
assert(Subtarget->hasStdExtD() && !Subtarget->is64Bit() &&
Expand Down
81 changes: 75 additions & 6 deletions llvm/lib/Target/RISCV/RISCVISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
if (Subtarget.is64Bit())
addRegisterClass(MVT::f64, &RISCV::GPRRegClass);
else
addRegisterClass(MVT::f64, &RISCV::GPRPairRegClass);
addRegisterClass(MVT::f64, &RISCV::GPRF64PairRegClass);
}

static const MVT::SimpleValueType BoolVecVTs[] = {
Expand Down Expand Up @@ -2233,6 +2233,17 @@ MVT RISCVTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
return PartVT;
}

unsigned
RISCVTargetLowering::getNumRegisters(LLVMContext &Context, EVT VT,
                                     std::optional<MVT> RegisterVT) const {
  // Inline-assembly pair operand: a double-XLen integer (i128 on RV64, i64
  // on RV32) that is assigned an untyped register occupies one register.
  const MVT PairIntVT = Subtarget.is64Bit() ? MVT::i128 : MVT::i64;
  const bool UsesUntypedReg =
      RegisterVT.has_value() && *RegisterVT == MVT::Untyped;
  if (VT == PairIntVT && UsesUntypedReg)
    return 1;

  // Everything else uses the generic computation.
  return TargetLowering::getNumRegisters(Context, VT, RegisterVT);
}

unsigned RISCVTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
Expand Down Expand Up @@ -20196,6 +20207,8 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(TAIL)
NODE_NAME_CASE(SELECT_CC)
NODE_NAME_CASE(BR_CC)
NODE_NAME_CASE(BuildGPRPair)
NODE_NAME_CASE(SplitGPRPair)
NODE_NAME_CASE(BuildPairF64)
NODE_NAME_CASE(SplitF64)
NODE_NAME_CASE(ADD_LO)
Expand Down Expand Up @@ -20456,6 +20469,7 @@ RISCVTargetLowering::getConstraintType(StringRef Constraint) const {
default:
break;
case 'f':
case 'R':
return C_RegisterClass;
case 'I':
case 'J':
Expand Down Expand Up @@ -20493,7 +20507,7 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (VT == MVT::f32 && Subtarget.hasStdExtZfinx())
return std::make_pair(0U, &RISCV::GPRF32NoX0RegClass);
if (VT == MVT::f64 && Subtarget.hasStdExtZdinx() && !Subtarget.is64Bit())
return std::make_pair(0U, &RISCV::GPRPairNoX0RegClass);
return std::make_pair(0U, &RISCV::GPRF64PairNoX0RegClass);
return std::make_pair(0U, &RISCV::GPRNoX0RegClass);
case 'f':
if (VT == MVT::f16) {
Expand All @@ -20510,11 +20524,15 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (Subtarget.hasStdExtD())
return std::make_pair(0U, &RISCV::FPR64RegClass);
if (Subtarget.hasStdExtZdinx() && !Subtarget.is64Bit())
return std::make_pair(0U, &RISCV::GPRPairNoX0RegClass);
return std::make_pair(0U, &RISCV::GPRF64PairNoX0RegClass);
if (Subtarget.hasStdExtZdinx() && Subtarget.is64Bit())
return std::make_pair(0U, &RISCV::GPRNoX0RegClass);
}
break;
case 'R':
if (VT == MVT::f64 && !Subtarget.is64Bit() && Subtarget.hasStdExtZdinx())
return std::make_pair(0U, &RISCV::GPRF64PairCRegClass);
return std::make_pair(0U, &RISCV::GPRPairNoX0RegClass);
default:
break;
}
Expand Down Expand Up @@ -20552,7 +20570,7 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (VT == MVT::f32 && Subtarget.hasStdExtZfinx())
return std::make_pair(0U, &RISCV::GPRF32CRegClass);
if (VT == MVT::f64 && Subtarget.hasStdExtZdinx() && !Subtarget.is64Bit())
return std::make_pair(0U, &RISCV::GPRPairCRegClass);
return std::make_pair(0U, &RISCV::GPRF64PairCRegClass);
if (!VT.isVector())
return std::make_pair(0U, &RISCV::GPRCRegClass);
} else if (Constraint == "cf") {
Expand All @@ -20570,7 +20588,7 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (Subtarget.hasStdExtD())
return std::make_pair(0U, &RISCV::FPR64CRegClass);
if (Subtarget.hasStdExtZdinx() && !Subtarget.is64Bit())
return std::make_pair(0U, &RISCV::GPRPairCRegClass);
return std::make_pair(0U, &RISCV::GPRF64PairCRegClass);
if (Subtarget.hasStdExtZdinx() && Subtarget.is64Bit())
return std::make_pair(0U, &RISCV::GPRCRegClass);
}
Expand Down Expand Up @@ -20734,7 +20752,7 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// Subtarget into account.
if (Res.second == &RISCV::GPRF16RegClass ||
Res.second == &RISCV::GPRF32RegClass ||
Res.second == &RISCV::GPRPairRegClass)
Res.second == &RISCV::GPRF64PairRegClass)
return std::make_pair(Res.first, &RISCV::GPRRegClass);

return Res;
Expand Down Expand Up @@ -21360,6 +21378,16 @@ bool RISCVTargetLowering::splitValueIntoRegisterParts(
unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
bool IsABIRegCopy = CC.has_value();
EVT ValueVT = Val.getValueType();

if (ValueVT == (Subtarget.is64Bit() ? MVT::i128 : MVT::i64) &&
NumParts == 1 && PartVT == MVT::Untyped) {
// Pairs in Inline Assembly
MVT XLenVT = Subtarget.getXLenVT();
auto [Lo, Hi] = DAG.SplitScalar(Val, DL, XLenVT, XLenVT);
Parts[0] = DAG.getNode(RISCVISD::BuildGPRPair, DL, MVT::Untyped, Lo, Hi);
return true;
}

if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
PartVT == MVT::f32) {
// Cast the [b]f16 to i16, extend to i32, pad with ones to make a float
Expand Down Expand Up @@ -21436,6 +21464,17 @@ SDValue RISCVTargetLowering::joinRegisterPartsIntoValue(
SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
bool IsABIRegCopy = CC.has_value();

if (ValueVT == (Subtarget.is64Bit() ? MVT::i128 : MVT::i64) &&
NumParts == 1 && PartVT == MVT::Untyped) {
// Pairs in Inline Assembly
MVT XLenVT = Subtarget.getXLenVT();
SDValue Res = DAG.getNode(RISCVISD::SplitGPRPair, DL,
DAG.getVTList(XLenVT, XLenVT), Parts[0]);
return DAG.getNode(ISD::BUILD_PAIR, DL, ValueVT, Res.getValue(0),
Res.getValue(1));
}

if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
PartVT == MVT::f32) {
SDValue Val = Parts[0];
Expand Down Expand Up @@ -22012,6 +22051,36 @@ SDValue RISCVTargetLowering::expandIndirectJTBranch(const SDLoc &dl,
return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
}

// If an output pattern produces multiple instructions tablegen may pick an
// arbitrary type from an instructions destination register class to use for the
// VT of that MachineSDNode. This VT may be used to look up the representative
// register class. If the type isn't legal, the default implementation will
// not find a register class.
//
// Some integer types smaller than XLen are listed in the GPR register class to
// support isel patterns for GISel, but are not legal in SelectionDAG. The
// arbitrary type tablegen picks may be one of these smaller types.
//
// f16 and bf16 are both valid for the FPR16 or GPRF16 register class. It's
// possible for tablegen to pick bf16 as the arbitrary type for an f16 pattern.
std::pair<const TargetRegisterClass *, uint8_t>
RISCVTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
MVT VT) const {
switch (VT.SimpleTy) {
default:
break;
case MVT::i8:
case MVT::i16:
case MVT::i32:
return TargetLowering::findRepresentativeClass(TRI, Subtarget.getXLenVT());
case MVT::bf16:
case MVT::f16:
return TargetLowering::findRepresentativeClass(TRI, MVT::f32);
}

return TargetLowering::findRepresentativeClass(TRI, VT);
}

namespace llvm::RISCVVIntrinsicsTable {

#define GET_RISCVVIntrinsicsTable_IMPL
Expand Down
20 changes: 20 additions & 0 deletions llvm/lib/Target/RISCV/RISCVISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,18 @@ enum NodeType : unsigned {
SELECT_CC,
BR_CC,

/// Turn a pair of `i<xlen>`s into an even-odd register pair (`untyped`).
/// - Output: `untyped` even-odd register pair
/// - Input 0: `i<xlen>` low-order bits, for even register.
/// - Input 1: `i<xlen>` high-order bits, for odd register.
BuildGPRPair,

/// Turn an even-odd register pair (`untyped`) into a pair of `i<xlen>`s.
/// - Output 0: `i<xlen>` low-order bits, from even register.
/// - Output 1: `i<xlen>` high-order bits, from odd register.
/// - Input: `untyped` even-odd register pair
SplitGPRPair,

/// Turns a pair of `i32`s into an `f64`. Needed for rv32d/ilp32.
/// - Output: `f64`.
/// - Input 0: low-order bits (31-0) (as `i32`), for even register.
Expand Down Expand Up @@ -547,6 +559,11 @@ class RISCVTargetLowering : public TargetLowering {
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
EVT VT) const override;

/// Return the number of registers for a given MVT, for inline assembly
unsigned
getNumRegisters(LLVMContext &Context, EVT VT,
std::optional<MVT> RegisterVT = std::nullopt) const override;

/// Return the number of registers for a given MVT, ensuring vectors are
/// treated as a series of gpr sized integers.
unsigned getNumRegistersForCallingConv(LLVMContext &Context,
Expand Down Expand Up @@ -1051,6 +1068,9 @@ class RISCVTargetLowering : public TargetLowering {

SDValue emitFlushICache(SelectionDAG &DAG, SDValue InChain, SDValue Start,
SDValue End, SDValue Flags, SDLoc DL) const;

std::pair<const TargetRegisterClass *, uint8_t>
findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override;
};

namespace RISCVVIntrinsicsTable {
Expand Down
12 changes: 6 additions & 6 deletions llvm/lib/Target/RISCV/RISCVInstrInfoD.td
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def AddrRegImmINX : ComplexPattern<iPTR, 2, "SelectAddrRegImmRV32Zdinx">;
// Assembler operand class for a GPR pair used where an FPR64 is expected.
// Operands are parsed by parseGPRPairAsFPR64 and matched with the
// isGPRPairAsFPR64 predicate.
def GPRPairAsFPR : AsmOperandClass {
  let Name = "GPRPairAsFPR";
  let ParserMethod = "parseGPRPairAsFPR64";
  let PredicateMethod = "isGPRPairAsFPR64";
  let RenderMethod = "addRegOperands";
}

Expand All @@ -52,7 +52,7 @@ def FPR64INX : RegisterOperand<GPR> {
let DecoderMethod = "DecodeGPRRegisterClass";
}

def FPR64IN32X : RegisterOperand<GPRPair> {
def FPR64IN32X : RegisterOperand<GPRF64Pair> {
let ParserMatchClass = GPRPairAsFPR;
}

Expand Down Expand Up @@ -523,15 +523,15 @@ def PseudoFROUND_D_IN32X : PseudoFROUND<FPR64IN32X, f64>;

/// Loads
let isCall = 0, mayLoad = 1, mayStore = 0, Size = 8, isCodeGenOnly = 1 in
def PseudoRV32ZdinxLD : Pseudo<(outs GPRPair:$dst), (ins GPR:$rs1, simm12:$imm12), []>;
def PseudoRV32ZdinxLD : Pseudo<(outs GPRF64Pair:$dst), (ins GPR:$rs1, simm12:$imm12), []>;
def : Pat<(f64 (load (AddrRegImmINX (XLenVT GPR:$rs1), simm12:$imm12))),
(PseudoRV32ZdinxLD GPR:$rs1, simm12:$imm12)>;

/// Stores
let isCall = 0, mayLoad = 0, mayStore = 1, Size = 8, isCodeGenOnly = 1 in
def PseudoRV32ZdinxSD : Pseudo<(outs), (ins GPRPair:$rs2, GPRNoX0:$rs1, simm12:$imm12), []>;
def : Pat<(store (f64 GPRPair:$rs2), (AddrRegImmINX (XLenVT GPR:$rs1), simm12:$imm12)),
(PseudoRV32ZdinxSD GPRPair:$rs2, GPR:$rs1, simm12:$imm12)>;
def PseudoRV32ZdinxSD : Pseudo<(outs), (ins GPRF64Pair:$rs2, GPRNoX0:$rs1, simm12:$imm12), []>;
def : Pat<(store (f64 GPRF64Pair:$rs2), (AddrRegImmINX (XLenVT GPR:$rs1), simm12:$imm12)),
(PseudoRV32ZdinxSD GPRF64Pair:$rs2, GPR:$rs1, simm12:$imm12)>;
} // Predicates = [HasStdExtZdinx, IsRV32]

let Predicates = [HasStdExtD, IsRV32] in {
Expand Down
23 changes: 20 additions & 3 deletions llvm/lib/Target/RISCV/RISCVRegisterInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,8 @@ let RegAltNameIndices = [ABIRegAltName] in {

def XLenVT : ValueTypeByHwMode<[RV32, RV64],
[i32, i64]>;
defvar XLenPairVT = untyped;

// Allow f64 in GPR for ZDINX on RV64.
def XLenFVT : ValueTypeByHwMode<[RV64],
[f64]>;
Expand Down Expand Up @@ -323,7 +325,7 @@ let RegAltNameIndices = [ABIRegAltName] in {

let RegInfos = XLenPairRI,
DecoderMethod = "DecodeGPRPairRegisterClass" in {
def GPRPair : RISCVRegisterClass<[XLenPairFVT], 64, (add
def GPRPair : RISCVRegisterClass<[XLenPairVT], 64, (add
X10_X11, X12_X13, X14_X15, X16_X17,
X6_X7,
X28_X29, X30_X31,
Expand All @@ -332,11 +334,11 @@ def GPRPair : RISCVRegisterClass<[XLenPairFVT], 64, (add
X0_Pair, X2_X3, X4_X5
)>;

def GPRPairNoX0 : RISCVRegisterClass<[XLenPairFVT], 64, (sub GPRPair, X0_Pair)>;
def GPRPairNoX0 : RISCVRegisterClass<[XLenPairVT], 64, (sub GPRPair, X0_Pair)>;
} // let RegInfos = XLenPairRI, DecoderMethod = "DecodeGPRPairRegisterClass"

let RegInfos = XLenPairRI in
def GPRPairC : RISCVRegisterClass<[XLenPairFVT], 64, (add
def GPRPairC : RISCVRegisterClass<[XLenPairVT], 64, (add
X10_X11, X12_X13, X14_X15, X8_X9
)>;

Expand Down Expand Up @@ -462,6 +464,21 @@ def GPRF32C : RISCVRegisterClass<[f32], 32, (add (sequence "X%u_W", 10, 15),
(sequence "X%u_W", 8, 9))>;
def GPRF32NoX0 : RISCVRegisterClass<[f32], 32, (sub GPRF32, X0_W)>;

let DecoderMethod = "DecodeGPRPairRegisterClass" in
def GPRF64Pair : RISCVRegisterClass<[XLenPairFVT], 64, (add
X10_X11, X12_X13, X14_X15, X16_X17,
X6_X7,
X28_X29, X30_X31,
X8_X9,
X18_X19, X20_X21, X22_X23, X24_X25, X26_X27,
X0_Pair, X2_X3, X4_X5
)>;

def GPRF64PairC : RISCVRegisterClass<[XLenPairFVT], 64, (add
X10_X11, X12_X13, X14_X15, X8_X9
)>;

def GPRF64PairNoX0 : RISCVRegisterClass<[XLenPairFVT], 64, (sub GPRF64Pair, X0_Pair)>;

//===----------------------------------------------------------------------===//
// Vector type mapping to LLVM types.
Expand Down
11 changes: 9 additions & 2 deletions llvm/lib/TargetParser/TargetParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ constexpr GPUInfo AMDGCNGPUs[] = {
{{"gfx940"}, {"gfx940"}, GK_GFX940, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC},
{{"gfx941"}, {"gfx941"}, GK_GFX941, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC},
{{"gfx942"}, {"gfx942"}, GK_GFX942, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC},
{{"gfx950"}, {"gfx950"}, GK_GFX950, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC},
{{"gfx1010"}, {"gfx1010"}, GK_GFX1010, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP},
{{"gfx1011"}, {"gfx1011"}, GK_GFX1011, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP},
{{"gfx1012"}, {"gfx1012"}, GK_GFX1012, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP},
Expand Down Expand Up @@ -262,6 +263,7 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) {
case GK_GFX940: return {9, 4, 0};
case GK_GFX941: return {9, 4, 1};
case GK_GFX942: return {9, 4, 2};
case GK_GFX950: return {9, 5, 0};
case GK_GFX1010: return {10, 1, 0};
case GK_GFX1011: return {10, 1, 1};
case GK_GFX1012: return {10, 1, 2};
Expand Down Expand Up @@ -361,7 +363,8 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["wavefrontsize32"] = true;
Features["wavefrontsize64"] = true;
} else if (T.isAMDGCN()) {
switch (parseArchAMDGCN(GPU)) {
AMDGPU::GPUKind Kind = parseArchAMDGCN(GPU);
switch (Kind) {
case GK_GFX1201:
case GK_GFX1200:
case GK_GFX12_GENERIC:
Expand Down Expand Up @@ -466,12 +469,16 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["s-memtime-inst"] = true;
Features["gws"] = true;
break;
case GK_GFX950:
Features["gfx950-insts"] = true;
[[fallthrough]];
case GK_GFX942:
case GK_GFX941:
case GK_GFX940:
Features["fp8-insts"] = true;
Features["fp8-conversion-insts"] = true;
Features["xf32-insts"] = true;
if (Kind != GK_GFX950)
Features["xf32-insts"] = true;
[[fallthrough]];
case GK_GFX9_4_GENERIC:
Features["gfx940-insts"] = true;
Expand Down
345 changes: 232 additions & 113 deletions llvm/test/CodeGen/AMDGPU/bf16-conversions.ll

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -mattr=-xnack < %s | FileCheck --check-prefixes=GFX942-NOXNACK %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -mattr=+xnack < %s | FileCheck --check-prefixes=GFX942-XNACK %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck --check-prefixes=GFX950 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -mattr=-xnack < %s | FileCheck --check-prefixes=GFX950-NOXNACK %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -mattr=+xnack < %s | FileCheck --check-prefixes=GFX950-XNACK %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX1010 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-xnack < %s | FileCheck --check-prefixes=GFX1010-NOXNACK %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+xnack < %s | FileCheck --check-prefixes=GFX1010-XNACK %s
Expand Down Expand Up @@ -180,6 +183,9 @@
; GFX942: .amdgcn_target "amdgcn-amd-amdhsa--gfx942"
; GFX942-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx942:xnack-"
; GFX942-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx942:xnack+"
; GFX950: .amdgcn_target "amdgcn-amd-amdhsa--gfx950"
; GFX950-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx950:xnack-"
; GFX950-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx950:xnack+"
; GFX1010: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010"
; GFX1010-NOXNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010:xnack-"
; GFX1010-XNACK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1010:xnack+"
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx940 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX940 %s
; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx941 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX941 %s
; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx942 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX942 %s
; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx950 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX950 %s
; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1010 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1010 %s
; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1011 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1011 %s
; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1012 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1012 %s
Expand Down Expand Up @@ -139,6 +140,7 @@
; GFX940: EF_AMDGPU_MACH_AMDGCN_GFX940 (0x40)
; GFX941: EF_AMDGPU_MACH_AMDGCN_GFX941 (0x4B)
; GFX942: EF_AMDGPU_MACH_AMDGCN_GFX942 (0x4C)
; GFX950: EF_AMDGPU_MACH_AMDGCN_GFX950 (0x4F)
; GFX1010: EF_AMDGPU_MACH_AMDGCN_GFX1010 (0x33)
; GFX1011: EF_AMDGPU_MACH_AMDGCN_GFX1011 (0x34)
; GFX1012: EF_AMDGPU_MACH_AMDGCN_GFX1012 (0x35)
Expand Down
8 changes: 8 additions & 0 deletions llvm/test/CodeGen/AMDGPU/elf-header-flags-sramecc.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx940 < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX940 %s
; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx940 -mattr=+sramecc < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX940 %s

; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx950 < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX950 %s
; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx950 -mattr=+sramecc < %s | llvm-readobj --file-header - | FileCheck --check-prefix=SRAM-ECC-GFX950 %s

; NO-SRAM-ECC-GFX906: Flags [
; NO-SRAM-ECC-GFX906-NEXT: EF_AMDGPU_FEATURE_XNACK_V3 (0x100)
; NO-SRAM-ECC-GFX906-NEXT: EF_AMDGPU_MACH_AMDGCN_GFX906 (0x2F)
Expand Down Expand Up @@ -44,6 +47,11 @@
; SRAM-ECC-GFX940: EF_AMDGPU_MACH_AMDGCN_GFX940 (0x40)
; SRAM-ECC-GFX940: ]

; SRAM-ECC-GFX950: Flags [
; SRAM-ECC-GFX950: EF_AMDGPU_FEATURE_SRAMECC_V3 (0x200)
; SRAM-ECC-GFX950: EF_AMDGPU_MACH_AMDGCN_GFX950 (0x4F)
; SRAM-ECC-GFX950: ]

define amdgpu_kernel void @elf_header() {
ret void
}
7 changes: 7 additions & 0 deletions llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10-MESA %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-PAL %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11-MESA %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-PAL %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-MESA %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-PAL %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-MESA %s

Expand All @@ -17,6 +19,11 @@
; GFX11-MESA: .long 45100
; GFX11-MESA-NEXT: .long 1024

; GFX950-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x200

; GFX950-MESA: .long 45100
; GFX950-MESA-NEXT: .long 512

; GFX1200-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x400

; GFX1200-MESA: .long 45100
Expand Down
594 changes: 437 additions & 157 deletions llvm/test/CodeGen/AMDGPU/fmaximum3.ll

Large diffs are not rendered by default.

594 changes: 437 additions & 157 deletions llvm/test/CodeGen/AMDGPU/fminimum3.ll

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT160K %s
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-4-generic -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-generic -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx941 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90c -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT64K %s
; RUN: not llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx600 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERROR-LIMIT32K %s

; gfx950 supports up to 160 KiB (163840 bytes) of LDS memory. The generic targets do not.
; This is a negative test: it checks the diagnostic emitted when the LDS size exceeds the maximum usable limit.

; ERROR-LIMIT160K: error: <unknown>:0:0: local memory (163844) exceeds limit (163840) in function 'test_lds_limit'
; ERROR-LIMIT64K: error: <unknown>:0:0: local memory (163844) exceeds limit (65536) in function 'test_lds_limit'
; ERROR-LIMIT32K: error: <unknown>:0:0: local memory (163844) exceeds limit (32768) in function 'test_lds_limit'
@dst = addrspace(3) global [40961 x i32] poison

; @dst is [40961 x i32], i.e. 40961 * 4 = 163844 bytes of addrspace(3) memory,
; which is the "local memory" size quoted in the expected diagnostics above.
; The kernel uses @dst by storing %val to element 100 of the array.
define amdgpu_kernel void @test_lds_limit(i32 %val) {
%gep = getelementptr [40961 x i32], ptr addrspace(3) @dst, i32 0, i32 100
store i32 %val, ptr addrspace(3) %gep
ret void
}
31 changes: 31 additions & 0 deletions llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx950.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=MESA %s

; gfx950 supports up to 160 KiB of configurable LDS memory.
; This test checks an allocation just above the old 128 KiB size and an allocation at the new 160 KiB maximum.

@lds.i32 = addrspace(3) global i32 poison
@lds.array.size.131076 = addrspace(3) global [32768 x i32] poison
@lds.array.size.163840 = addrspace(3) global [40959 x i32] poison

; GCN-LABEL: test_lds_array_size_131076:
; GCN: .amdhsa_group_segment_fixed_size 131076
; GCN: ; LDSByteSize: 131076 bytes/workgroup
; MESA: granulated_lds_size = 65
; Loads element 20 of the [32768 x i32] array (32768 * 4 = 131072 bytes) and
; stores it to @lds.i32 (4 bytes). Both addrspace(3) globals are used, so the
; kernel's total is 131072 + 4 = 131076 bytes, the size expected above.
define amdgpu_kernel void @test_lds_array_size_131076() {
%gep = getelementptr inbounds [32768 x i32], ptr addrspace(3) @lds.array.size.131076, i32 0, i32 20
%val = load i32, ptr addrspace(3) %gep
store i32 %val, ptr addrspace(3) @lds.i32
ret void
}

; GCN-LABEL: test_lds_array_size_163840:
; GCN: .amdhsa_group_segment_fixed_size 163840
; GCN: ; LDSByteSize: 163840 bytes/workgroup
; MESA: granulated_lds_size = 80
; Loads element 20 of the [40959 x i32] array (40959 * 4 = 163836 bytes) and
; stores it to @lds.i32 (4 bytes). Both addrspace(3) globals are used, so the
; kernel's total is 163836 + 4 = 163840 bytes, the size expected above.
define amdgpu_kernel void @test_lds_array_size_163840() {
%gep = getelementptr inbounds [40959 x i32], ptr addrspace(3) @lds.array.size.163840 , i32 0, i32 20
%val = load i32, ptr addrspace(3) %gep
store i32 %val, ptr addrspace(3) @lds.i32
ret void
}
26 changes: 26 additions & 0 deletions llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx950.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=PAL %s

; GFX950 supports up to 160 KiB of configurable LDS memory.
; This test checks the min and max size of LDS that can be allocated.

; PAL: .shader_functions:
; PAL: test_lds_array_i32:
; PAL: .lds_size: 0x28000
; PAL: test_lds_i32:
; PAL: .lds_size: 0x4


@lds.i32 = addrspace(3) global i32 poison
@lds.array.i32 = addrspace(3) global [40959 x i32] poison

; Small case: the only addrspace(3) global used is the 4-byte @lds.i32, which
; corresponds to the expected .lds_size of 0x4 for this function.
define amdgpu_gfx void @test_lds_i32(i32 %val) {
store i32 %val, ptr addrspace(3) @lds.i32
ret void
}

; Large case: uses the [40959 x i32] array (40959 * 4 = 163836 bytes) and the
; 4-byte @lds.i32, 163840 bytes in total, i.e. the expected .lds_size of
; 0x28000 for this function.
define amdgpu_gfx void @test_lds_array_i32() {
%gep = getelementptr inbounds [40959 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20
%val = load i32, ptr addrspace(3) %gep
store i32 %val, ptr addrspace(3) @lds.i32
ret void
}
2 changes: 2 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s

Expand Down
1,224 changes: 586 additions & 638 deletions llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll

Large diffs are not rendered by default.

1,113 changes: 527 additions & 586 deletions llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll

Large diffs are not rendered by default.

1,569 changes: 755 additions & 814 deletions llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll

Large diffs are not rendered by default.

1,223 changes: 586 additions & 637 deletions llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll

Large diffs are not rendered by default.

1,113 changes: 527 additions & 586 deletions llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll

Large diffs are not rendered by default.

1,569 changes: 755 additions & 814 deletions llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll

Large diffs are not rendered by default.

61 changes: 37 additions & 24 deletions llvm/test/CodeGen/NVPTX/bf16x2-instructions-approx.ll
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 --enable-unsafe-fp-math | FileCheck --check-prefixes=CHECK %s
; RUN: %if ptxas-11.8 %{ llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx71 --enable-unsafe-fp-math | %ptxas-verify -arch=sm_80 %}

Expand All @@ -6,36 +7,48 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
declare <2 x bfloat> @llvm.sin.f16(<2 x bfloat> %a) #0
declare <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a) #0

; CHECK-LABEL: test_sin(
; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_sin_param_0];
; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
; CHECK-DAG: cvt.f32.bf16 [[AF0:%f[0-9]+]], [[A0]];
; CHECK-DAG: cvt.f32.bf16 [[AF1:%f[0-9]+]], [[A1]];
; CHECK-DAG: sin.approx.f32 [[RF0:%f[0-9]+]], [[AF0]];
; CHECK-DAG: sin.approx.f32 [[RF1:%f[0-9]+]], [[AF1]];
; CHECK-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[RF0]];
; CHECK-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[RF1]];
; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
; CHECK: st.param.b32 [func_retval0], [[R]];
; CHECK: ret;
; sin of a <2 x bfloat> vector. The expected PTX unpacks the two 16-bit lanes
; from the b32 parameter, converts each lane to f32, applies sin.approx.f32,
; converts the result back to bf16 and repacks both lanes into a b32 return.
; NOTE(review): the intrinsic is declared as @llvm.sin.f16 although its operand
; and result types are <2 x bfloat> -- confirm the name suffix is intended.
define <2 x bfloat> @test_sin(<2 x bfloat> %a) #0 #1 {
; CHECK-LABEL: test_sin(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .f32 %f<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_sin_param_0];
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
; CHECK-NEXT: cvt.f32.bf16 %f1, %rs2;
; CHECK-NEXT: sin.approx.f32 %f2, %f1;
; CHECK-NEXT: cvt.rn.bf16.f32 %rs3, %f2;
; CHECK-NEXT: cvt.f32.bf16 %f3, %rs1;
; CHECK-NEXT: sin.approx.f32 %f4, %f3;
; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f4;
; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3};
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%r = call <2 x bfloat> @llvm.sin.f16(<2 x bfloat> %a)
ret <2 x bfloat> %r
}

; CHECK-LABEL: test_cos(
; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_cos_param_0];
; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
; CHECK-DAG: cvt.f32.bf16 [[AF0:%f[0-9]+]], [[A0]];
; CHECK-DAG: cvt.f32.bf16 [[AF1:%f[0-9]+]], [[A1]];
; CHECK-DAG: cos.approx.f32 [[RF0:%f[0-9]+]], [[AF0]];
; CHECK-DAG: cos.approx.f32 [[RF1:%f[0-9]+]], [[AF1]];
; CHECK-DAG: cvt.rn.bf16.f32 [[R0:%rs[0-9]+]], [[RF0]];
; CHECK-DAG: cvt.rn.bf16.f32 [[R1:%rs[0-9]+]], [[RF1]];
; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
; CHECK: st.param.b32 [func_retval0], [[R]];
; CHECK: ret;
; cos of a <2 x bfloat> vector. The expected PTX unpacks the two 16-bit lanes
; from the b32 parameter, converts each lane to f32, applies cos.approx.f32,
; converts the result back to bf16 and repacks both lanes into a b32 return.
; NOTE(review): the intrinsic is declared as @llvm.cos.f16 although its operand
; and result types are <2 x bfloat> -- confirm the name suffix is intended.
define <2 x bfloat> @test_cos(<2 x bfloat> %a) #0 #1 {
; CHECK-LABEL: test_cos(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<5>;
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .f32 %f<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [test_cos_param_0];
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
; CHECK-NEXT: cvt.f32.bf16 %f1, %rs2;
; CHECK-NEXT: cos.approx.f32 %f2, %f1;
; CHECK-NEXT: cvt.rn.bf16.f32 %rs3, %f2;
; CHECK-NEXT: cvt.f32.bf16 %f3, %rs1;
; CHECK-NEXT: cos.approx.f32 %f4, %f3;
; CHECK-NEXT: cvt.rn.bf16.f32 %rs4, %f4;
; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3};
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%r = call <2 x bfloat> @llvm.cos.f16(<2 x bfloat> %a)
ret <2 x bfloat> %r
}
Expand Down
1,027 changes: 683 additions & 344 deletions llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll

Large diffs are not rendered by default.

2,858 changes: 1,870 additions & 988 deletions llvm/test/CodeGen/NVPTX/f16x2-instructions.ll

Large diffs are not rendered by default.

1,110 changes: 701 additions & 409 deletions llvm/test/CodeGen/NVPTX/i16x2-instructions.ll

Large diffs are not rendered by default.

41 changes: 27 additions & 14 deletions llvm/test/CodeGen/NVPTX/i8x2-instructions.ll
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 -asm-verbose=false \
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 \
; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
; RUN: | FileCheck %s
; RUN: %if ptxas %{ \
Expand All @@ -9,25 +10,37 @@

target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"

; CHECK-LABEL: test_bitcast_2xi8_i16(
; CHECK: ld.param.u32 %r1, [test_bitcast_2xi8_i16_param_0];
; CHECK: mov.b32 {%rs1, %rs2}, %r1;
; CHECK: shl.b16 %rs3, %rs2, 8;
; CHECK: and.b16 %rs4, %rs1, 255;
; CHECK: or.b16 %rs5, %rs4, %rs3;
; CHECK: cvt.u32.u16 %r2, %rs5;
; CHECK: st.param.b32 [func_retval0], %r2;
; Bitcast <2 x i8> to i16. In the expected PTX the two elements arrive as the
; 16-bit halves of a b32 parameter; the second is shifted left by 8, the first
; is masked to its low 8 bits, and the two are ORed into the i16 result.
define i16 @test_bitcast_2xi8_i16(<2 x i8> %a) {
; CHECK-LABEL: test_bitcast_2xi8_i16(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<6>;
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_2xi8_i16_param_0];
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
; CHECK-NEXT: shl.b16 %rs3, %rs2, 8;
; CHECK-NEXT: and.b16 %rs4, %rs1, 255;
; CHECK-NEXT: or.b16 %rs5, %rs4, %rs3;
; CHECK-NEXT: cvt.u32.u16 %r2, %rs5;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%res = bitcast <2 x i8> %a to i16
ret i16 %res
}

; CHECK-LABEL: test_bitcast_i16_2xi8(
; CHECK: ld.param.u16 %rs1, [test_bitcast_i16_2xi8_param_0];
; CHECK: shr.u16 %rs2, %rs1, 8;
; CHECK: mov.b32 %r1, {%rs1, %rs2};
; CHECK: st.param.b32 [func_retval0], %r1;
; Bitcast i16 to <2 x i8>. In the expected PTX the high byte is obtained with a
; logical right shift by 8, and the original value and the shifted value are
; packed as the two 16-bit halves of the b32 return value.
define <2 x i8> @test_bitcast_i16_2xi8(i16 %a) {
; CHECK-LABEL: test_bitcast_i16_2xi8(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u16 %rs1, [test_bitcast_i16_2xi8_param_0];
; CHECK-NEXT: shr.u16 %rs2, %rs1, 8;
; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-NEXT: ret;
%res = bitcast i16 %a to <2 x i8>
ret <2 x i8> %res
}
73 changes: 73 additions & 0 deletions llvm/test/CodeGen/RISCV/rv32-inline-asm-pairs.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
; RUN: | FileCheck %s

; Passes an i64 through inline asm on riscv32 using an early-clobber "R" output
; and an "R" input. In the expected output the asm operands print as a2 and a0,
; and the result is then copied from a2/a3 into the a0/a1 return registers.
; NOTE(review): the function name says "Pr" but the constraint used is "R" --
; confirm whether the name should follow the constraint letter.
define i64 @test_Pr_wide_scalar_simple(i64 noundef %0) nounwind {
; CHECK-LABEL: test_Pr_wide_scalar_simple:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: #APP
; CHECK-NEXT: # a2 <- a0
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: mv a1, a3
; CHECK-NEXT: ret
entry:
%1 = call i64 asm sideeffect "/* $0 <- $1 */", "=&R,R"(i64 %0)
ret i64 %1
}

; Builds an i64 with %0 in both 32-bit halves (zext, shl by 32, or), passes it
; through inline asm with "R" operands, then ORs the low and high halves of the
; asm result. In the expected output the input halves are a0 and a1 (a1 is a
; copy of a0) and the result halves are a2 and a3.
; NOTE(review): the function name says "Pr" but the constraint used is "R" --
; confirm whether the name should follow the constraint letter.
define i32 @test_Pr_wide_scalar_with_ops(i32 noundef %0) nounwind {
; CHECK-LABEL: test_Pr_wide_scalar_with_ops:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: mv a1, a0
; CHECK-NEXT: #APP
; CHECK-NEXT: # a2 <- a0
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: or a0, a2, a3
; CHECK-NEXT: ret
entry:
%1 = zext i32 %0 to i64
%2 = shl i64 %1, 32
%3 = or i64 %1, %2
%4 = call i64 asm sideeffect "/* $0 <- $1 */", "=&R,R"(i64 %3)
%5 = trunc i64 %4 to i32
%6 = lshr i64 %4, 32
%7 = trunc i64 %6 to i32
%8 = or i32 %5, %7
ret i32 %8
}

; Inline asm with two outputs, each tied to an input through the matching
; constraints "0" and "1": a pointer in an "r" operand and an i64 in an "R"
; operand. Both values round-trip through allocas before and after the asm,
; and the function returns the i64 reloaded from its alloca.
; NOTE(review): the function name says "Pr" but the constraint used is "R" --
; confirm whether the name should follow the constraint letter.
define i64 @test_Pr_wide_scalar_inout(ptr %0, i64 noundef %1) nounwind {
; CHECK-LABEL: test_Pr_wide_scalar_inout:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: mv a3, a2
; CHECK-NEXT: sw a0, 12(sp)
; CHECK-NEXT: mv a2, a1
; CHECK-NEXT: sw a1, 0(sp)
; CHECK-NEXT: sw a3, 4(sp)
; CHECK-NEXT: #APP
; CHECK-NEXT: # a0; a2
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: sw a0, 12(sp)
; CHECK-NEXT: sw a2, 0(sp)
; CHECK-NEXT: sw a3, 4(sp)
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: mv a1, a3
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
entry:
%2 = alloca ptr, align 4
%3 = alloca i64, align 8
store ptr %0, ptr %2, align 4
store i64 %1, ptr %3, align 8
%4 = load ptr, ptr %2, align 4
%5 = load i64, ptr %3, align 8
%6 = call { ptr, i64 } asm sideeffect "/* $0; $1 */", "=r,=R,0,1"(ptr %4, i64 %5)
%7 = extractvalue { ptr, i64} %6, 0
%8 = extractvalue { ptr, i64 } %6, 1
store ptr %7, ptr %2, align 4
store i64 %8, ptr %3, align 8
%9 = load i64, ptr %3, align 8
ret i64 %9
}
73 changes: 73 additions & 0 deletions llvm/test/CodeGen/RISCV/rv64-inline-asm-pairs.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
; RUN: | FileCheck %s

; Passes an i128 through inline asm on riscv64 using an early-clobber "R"
; output and an "R" input. In the expected output the asm operands print as a2
; and a0, and the result is then copied from a2/a3 into the a0/a1 return
; registers.
define i128 @test_R_wide_scalar_simple(i128 noundef %0) nounwind {
; CHECK-LABEL: test_R_wide_scalar_simple:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: #APP
; CHECK-NEXT: # a2 <- a0
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: mv a1, a3
; CHECK-NEXT: ret
entry:
%1 = call i128 asm sideeffect "/* $0 <- $1 */", "=&R,R"(i128 %0)
ret i128 %1
}

; Builds an i128 with %0 in both 64-bit halves (zext, shl by 64, or), passes it
; through inline asm with "R" operands, then ORs the low and high halves of the
; asm result. In the expected output the input halves are a0 and a1 (a1 is a
; copy of a0) and the result halves are a2 and a3.
define i64 @test_R_wide_scalar_with_ops(i64 noundef %0) nounwind {
; CHECK-LABEL: test_R_wide_scalar_with_ops:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: mv a1, a0
; CHECK-NEXT: #APP
; CHECK-NEXT: # a2 <- a0
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: or a0, a2, a3
; CHECK-NEXT: ret
entry:
%1 = zext i64 %0 to i128
%2 = shl i128 %1, 64
%3 = or i128 %1, %2
%4 = call i128 asm sideeffect "/* $0 <- $1 */", "=&R,R"(i128 %3)
%5 = trunc i128 %4 to i64
%6 = lshr i128 %4, 64
%7 = trunc i128 %6 to i64
%8 = or i64 %5, %7
ret i64 %8
}

; Inline asm with two outputs, each tied to an input through the matching
; constraints "0" and "1": a pointer in an "r" operand and an i128 in an "R"
; operand. Both values round-trip through allocas before and after the asm,
; and the function returns the i128 reloaded from its alloca.
define i128 @test_R_wide_scalar_inout(ptr %0, i128 noundef %1) nounwind {
; CHECK-LABEL: test_R_wide_scalar_inout:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi sp, sp, -32
; CHECK-NEXT: mv a3, a2
; CHECK-NEXT: sd a0, 24(sp)
; CHECK-NEXT: mv a2, a1
; CHECK-NEXT: sd a1, 0(sp)
; CHECK-NEXT: sd a3, 8(sp)
; CHECK-NEXT: #APP
; CHECK-NEXT: # a0; a2
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: sd a0, 24(sp)
; CHECK-NEXT: sd a2, 0(sp)
; CHECK-NEXT: sd a3, 8(sp)
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: mv a1, a3
; CHECK-NEXT: addi sp, sp, 32
; CHECK-NEXT: ret
entry:
%2 = alloca ptr, align 8
%3 = alloca i128, align 16
store ptr %0, ptr %2, align 8
store i128 %1, ptr %3, align 16
%4 = load ptr, ptr %2, align 8
%5 = load i128, ptr %3, align 16
%6 = call { ptr, i128 } asm sideeffect "/* $0; $1 */", "=r,=R,0,1"(ptr %4, i128 %5)
%7 = extractvalue { ptr, i128} %6, 0
%8 = extractvalue { ptr, i128 } %6, 1
store ptr %7, ptr %2, align 8
store i128 %8, ptr %3, align 16
%9 = load i128, ptr %3, align 16
ret i128 %9
}
16 changes: 14 additions & 2 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpstore.ll
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,18 @@ define void @vpstore_v4i8(<4 x i8> %val, ptr %ptr, <4 x i1> %m, i32 zeroext %evl
ret void
}

declare void @llvm.vp.store.v8i7.v8i7.p0(<8 x i7>, ptr, <8 x i1>, i32)

; vp.store of <8 x i7>, an element type that is not a whole number of bytes.
; The expected codegen is a masked store with 8-bit elements (e8, vse8.v).
; NOTE(review): the intrinsic name carries the vector type suffix twice
; (.v8i7.v8i7.p0), unlike the neighbouring @llvm.vp.store.v8i8.p0 declaration
; -- confirm this is intended.
define void @vpstore_v8i7(<8 x i7> %val, ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpstore_v8i7:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
; CHECK-NEXT: vse8.v v8, (a0), v0.t
; CHECK-NEXT: ret
call void @llvm.vp.store.v8i7.v8i7.p0(<8 x i7> %val, ptr %ptr, <8 x i1> %m, i32 %evl)
ret void
}

declare void @llvm.vp.store.v8i8.p0(<8 x i8>, ptr, <8 x i1>, i32)

define void @vpstore_v8i8(<8 x i8> %val, ptr %ptr, <8 x i1> %m, i32 zeroext %evl) {
Expand Down Expand Up @@ -285,10 +297,10 @@ define void @vpstore_v32f64(<32 x double> %val, ptr %ptr, <32 x i1> %m, i32 zero
; CHECK: # %bb.0:
; CHECK-NEXT: li a3, 16
; CHECK-NEXT: mv a2, a1
; CHECK-NEXT: bltu a1, a3, .LBB23_2
; CHECK-NEXT: bltu a1, a3, .LBB24_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: li a2, 16
; CHECK-NEXT: .LBB23_2:
; CHECK-NEXT: .LBB24_2:
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v8, (a0), v0.t
; CHECK-NEXT: addi a2, a1, -16
Expand Down
28 changes: 20 additions & 8 deletions llvm/test/CodeGen/RISCV/rvv/vpload.ll
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,18 @@ define <vscale x 3 x i8> @vpload_nxv3i8(ptr %ptr, <vscale x 3 x i1> %m, i32 zero
ret <vscale x 3 x i8> %load
}

declare <vscale x 4 x i6> @llvm.vp.load.nxv4i6.nxv4i6.p0(<vscale x 4 x i6>*, <vscale x 4 x i1>, i32)

; vp.load of <vscale x 4 x i6>, an element type that is not a whole number of
; bytes. The expected codegen is a masked load with 8-bit elements (e8, vle8.v).
; NOTE(review): this test spells the pointer as <vscale x 4 x i6>* and repeats
; the vector type in the intrinsic name, while the neighbouring tests use the
; opaque `ptr` type (e.g. @llvm.vp.load.nxv4i8.p0) -- confirm this is intended.
define <vscale x 4 x i6> @vpload_nxv4i6(<vscale x 4 x i6>* %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpload_nxv4i6:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
; CHECK-NEXT: vle8.v v8, (a0), v0.t
; CHECK-NEXT: ret
%load = call <vscale x 4 x i6> @llvm.vp.load.nxv4i6.nxv4i6.p0(<vscale x 4 x i6>* %ptr, <vscale x 4 x i1> %m, i32 %evl)
ret <vscale x 4 x i6> %load
}

declare <vscale x 4 x i8> @llvm.vp.load.nxv4i8.p0(ptr, <vscale x 4 x i1>, i32)

define <vscale x 4 x i8> @vpload_nxv4i8(ptr %ptr, <vscale x 4 x i1> %m, i32 zeroext %evl) {
Expand Down Expand Up @@ -523,10 +535,10 @@ define <vscale x 16 x double> @vpload_nxv16f64(ptr %ptr, <vscale x 16 x i1> %m,
; CHECK-NEXT: add a4, a0, a4
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v16, (a4), v0.t
; CHECK-NEXT: bltu a1, a2, .LBB43_2
; CHECK-NEXT: bltu a1, a2, .LBB44_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a1, a2
; CHECK-NEXT: .LBB43_2:
; CHECK-NEXT: .LBB44_2:
; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v8, (a0), v0.t
Expand All @@ -553,10 +565,10 @@ define <vscale x 16 x double> @vpload_nxv17f64(ptr %ptr, ptr %out, <vscale x 17
; CHECK-NEXT: csrr a3, vlenb
; CHECK-NEXT: slli a5, a3, 1
; CHECK-NEXT: mv a4, a2
; CHECK-NEXT: bltu a2, a5, .LBB44_2
; CHECK-NEXT: bltu a2, a5, .LBB45_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a4, a5
; CHECK-NEXT: .LBB44_2:
; CHECK-NEXT: .LBB45_2:
; CHECK-NEXT: sub a6, a4, a3
; CHECK-NEXT: slli a7, a3, 3
; CHECK-NEXT: srli t0, a3, 3
Expand All @@ -572,21 +584,21 @@ define <vscale x 16 x double> @vpload_nxv17f64(ptr %ptr, ptr %out, <vscale x 17
; CHECK-NEXT: sltu a2, a2, a5
; CHECK-NEXT: addi a2, a2, -1
; CHECK-NEXT: and a2, a2, a5
; CHECK-NEXT: bltu a2, a3, .LBB44_4
; CHECK-NEXT: bltu a2, a3, .LBB45_4
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: mv a2, a3
; CHECK-NEXT: .LBB44_4:
; CHECK-NEXT: .LBB45_4:
; CHECK-NEXT: slli a5, a3, 4
; CHECK-NEXT: srli a6, a3, 2
; CHECK-NEXT: vsetvli a7, zero, e8, mf2, ta, ma
; CHECK-NEXT: vslidedown.vx v0, v8, a6
; CHECK-NEXT: add a5, a0, a5
; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v24, (a5), v0.t
; CHECK-NEXT: bltu a4, a3, .LBB44_6
; CHECK-NEXT: bltu a4, a3, .LBB45_6
; CHECK-NEXT: # %bb.5:
; CHECK-NEXT: mv a4, a3
; CHECK-NEXT: .LBB44_6:
; CHECK-NEXT: .LBB45_6:
; CHECK-NEXT: vmv1r.v v0, v8
; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v8, (a0), v0.t
Expand Down
28 changes: 20 additions & 8 deletions llvm/test/CodeGen/RISCV/rvv/vpstore.ll
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,18 @@ define void @vpstore_nxv4i16(<vscale x 4 x i16> %val, ptr %ptr, <vscale x 4 x i1
ret void
}

declare void @llvm.vp.store.nxv8i12.nxv8i12.p0(<vscale x 8 x i12>, <vscale x 8 x i12>*, <vscale x 8 x i1>, i32)

; vp.store of <vscale x 8 x i12>, an element type that is not a whole number of
; bytes. The expected codegen is a masked store with 16-bit elements (e16,
; vse16.v).
; NOTE(review): this test spells the pointer as <vscale x 8 x i12>* and repeats
; the vector type in the intrinsic name, while the neighbouring tests use the
; opaque `ptr` type (e.g. @llvm.vp.store.nxv8i16.p0) -- confirm this is intended.
define void @vpstore_nxv8i12(<vscale x 8 x i12> %val, <vscale x 8 x i12>* %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vpstore_nxv8i12:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
; CHECK-NEXT: vse16.v v8, (a0), v0.t
; CHECK-NEXT: ret
call void @llvm.vp.store.nxv8i12.nxv8i12.p0(<vscale x 8 x i12> %val, <vscale x 8 x i12>* %ptr, <vscale x 8 x i1> %m, i32 %evl)
ret void
}

declare void @llvm.vp.store.nxv8i16.p0(<vscale x 8 x i16>, ptr, <vscale x 8 x i1>, i32)

define void @vpstore_nxv8i16(<vscale x 8 x i16> %val, ptr %ptr, <vscale x 8 x i1> %m, i32 zeroext %evl) {
Expand Down Expand Up @@ -421,10 +433,10 @@ define void @vpstore_nxv16f64(<vscale x 16 x double> %val, ptr %ptr, <vscale x 1
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: mv a3, a1
; CHECK-NEXT: bltu a1, a2, .LBB34_2
; CHECK-NEXT: bltu a1, a2, .LBB35_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a3, a2
; CHECK-NEXT: .LBB34_2:
; CHECK-NEXT: .LBB35_2:
; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v8, (a0), v0.t
; CHECK-NEXT: srli a3, a2, 3
Expand Down Expand Up @@ -462,15 +474,15 @@ define void @vpstore_nxv17f64(<vscale x 17 x double> %val, ptr %ptr, <vscale x 1
; CHECK-NEXT: csrr a3, vlenb
; CHECK-NEXT: slli a4, a3, 1
; CHECK-NEXT: mv a5, a2
; CHECK-NEXT: bltu a2, a4, .LBB35_2
; CHECK-NEXT: bltu a2, a4, .LBB36_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a5, a4
; CHECK-NEXT: .LBB35_2:
; CHECK-NEXT: .LBB36_2:
; CHECK-NEXT: mv a6, a5
; CHECK-NEXT: bltu a5, a3, .LBB35_4
; CHECK-NEXT: bltu a5, a3, .LBB36_4
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: mv a6, a3
; CHECK-NEXT: .LBB35_4:
; CHECK-NEXT: .LBB36_4:
; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: vl8re64.v v16, (a0)
; CHECK-NEXT: vsetvli zero, a6, e64, m8, ta, ma
Expand All @@ -492,10 +504,10 @@ define void @vpstore_nxv17f64(<vscale x 17 x double> %val, ptr %ptr, <vscale x 1
; CHECK-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli zero, a5, e64, m8, ta, ma
; CHECK-NEXT: vse64.v v8, (a6), v0.t
; CHECK-NEXT: bltu a0, a3, .LBB35_6
; CHECK-NEXT: bltu a0, a3, .LBB36_6
; CHECK-NEXT: # %bb.5:
; CHECK-NEXT: mv a0, a3
; CHECK-NEXT: .LBB35_6:
; CHECK-NEXT: .LBB36_6:
; CHECK-NEXT: slli a2, a3, 4
; CHECK-NEXT: srli a3, a3, 2
; CHECK-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
Expand Down
28 changes: 27 additions & 1 deletion llvm/test/CodeGen/RISCV/zdinx-asm-constraint.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
; RUN: llc -mtriple=riscv32 -mattr=+zdinx -verify-machineinstrs < %s \
; RUN: -target-abi=ilp32 -mattr=+zhinx | FileCheck %s

;; These tests cover the use of `r` and `cr` constraints for floating point values on rv32.
;; These tests cover the use of `r`, `R`, and `cr` constraints for floating point values on rv32.
;;
;; In particular, there is significant complexity around using paired GPRs for double values on rv32.

Expand All @@ -26,6 +26,32 @@ entry:
ret void
}

;; Uses the `R` constraint for all three double operands of an inline-asm
;; fsgnjx.d on rv32 with zdinx, then stores the result to element 1 of %a.
;; In the expected output %b is moved from a1/a2 to s0/s1 and %c from a3/a4 to
;; a4/a5 before the asm; the result in a2/a3 is written as two words at byte
;; offsets 8 and 12 from %a.
define dso_local void @zdinx_asm_R(ptr nocapture noundef writeonly %a, double noundef %b, double noundef %c) nounwind {
; CHECK-LABEL: zdinx_asm_R:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: sw s0, 12(sp) # 4-byte Folded Spill
; CHECK-NEXT: sw s1, 8(sp) # 4-byte Folded Spill
; CHECK-NEXT: mv a5, a4
; CHECK-NEXT: mv s1, a2
; CHECK-NEXT: mv a4, a3
; CHECK-NEXT: mv s0, a1
; CHECK-NEXT: #APP
; CHECK-NEXT: fsgnjx.d a2, s0, a4
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: sw a2, 8(a0)
; CHECK-NEXT: sw a3, 12(a0)
; CHECK-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
; CHECK-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
entry:
%arrayidx = getelementptr inbounds double, ptr %a, i32 1
%0 = tail call double asm "fsgnjx.d $0, $1, $2", "=R,R,R"(double %b, double %c)
store double %0, ptr %arrayidx, align 8
ret void
}

define dso_local void @zfinx_asm(ptr nocapture noundef writeonly %a, float noundef %b, float noundef %c) nounwind {
; CHECK-LABEL: zfinx_asm:
; CHECK: # %bb.0: # %entry
Expand Down
1 change: 1 addition & 0 deletions llvm/test/MC/AMDGPU/flat-scratch-gfx940.s
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
// RUN: llvm-mc -triple=amdgcn -mcpu=gfx940 -show-encoding %s | FileCheck -check-prefix=GFX940 %s
// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck -check-prefix=GFX940 %s

scratch_load_dword a2, v4, s6
// GFX940: scratch_load_dword a2, v4, s6 ; encoding: [0x00,0x60,0x50,0xdc,0x04,0x00,0x86,0x02]
Expand Down
1 change: 1 addition & 0 deletions llvm/test/MC/AMDGPU/gfx940_asm_features.s
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
// RUN: llvm-mc -triple=amdgcn -mcpu=gfx940 -show-encoding %s | FileCheck --check-prefix=GFX940 --strict-whitespace %s
// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=GFX940 --strict-whitespace %s
// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefixes=NOT-GFX940,GFX90A --implicit-check-not=error: %s
// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=NOT-GFX940,GFX10 --implicit-check-not=error: %s

Expand Down
179 changes: 179 additions & 0 deletions llvm/test/MC/AMDGPU/gfx950-unsupported.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx950 %s 2>&1 | FileCheck -check-prefix=ERR %s

//===----------------------------------------------------------------------===//
// v_mfma_f32_32x32x4_xf32
//===----------------------------------------------------------------------===//

v_mfma_f32_32x32x4_xf32 a[0:3], v[2:3], v[4:5], a[2:5]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 v[0:3], v[0:3], v[0:3], v[0:3]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 v[0:3], v[0:3], v[0:3], v[0:3]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 v[0:3], a[0:3], v[0:3], 1.0
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 a[0:3], v[0:3], a[0:3], 1.0
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 v[0:3], v[0:3], v[0:3], v[0:3] blgp:5
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] abid:1
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 a[0:3], v[0:3], v[0:3], a[4:7]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 v[0:3], a[0:3], a[0:3], v[4:7]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 a[0:3], v[2:3], v[4:5], a[2:5]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 v[0:3], v[0:3], v[0:3], v[0:3]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 v[0:3], v[0:3], v[0:3], v[0:3]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 v[0:3], a[0:3], v[0:3], 1.0
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 a[0:3], v[0:3], a[0:3], 1.0
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 v[0:3], v[0:3], v[0:3], v[0:3] blgp:5
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] abid:1
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 a[0:3], v[0:3], v[0:3], a[4:7]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_32x32x4_xf32 v[0:3], a[0:3], a[0:3], v[4:7]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU


//===----------------------------------------------------------------------===//
// v_mfma_f32_16x16x8_xf32
//===----------------------------------------------------------------------===//
// Negative tests - every v_mfma_f32_16x16x8_xf32 form in this section is
// expected to be diagnosed with "instruction not supported on this GPU".
// The forms cover v[...] and a[...] register operands in different positions,
// a 1.0 literal as the last operand, and the blgp / cbsz / abid modifiers.
// Each check line is tied to the instruction directly above it through the
// @LINE-1 expression; the column is matched loosely with a regex.
// NOTE(review) - the all-v[0:3] and all-a[0:3] forms each appear twice in a
// row; confirm whether that repetition is intentional.

v_mfma_f32_16x16x8_xf32 a[0:3], v[2:3], v[4:5], a[2:5]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 v[0:3], a[0:3], v[0:3], 1.0
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 a[0:3], v[0:3], a[0:3], 1.0
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3] blgp:5
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] abid:1
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 a[0:3], v[0:3], v[0:3], a[4:7]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 v[0:3], a[0:3], a[0:3], v[4:7]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU


// Second group of v_mfma_f32_16x16x8_xf32 negative tests. Every form is
// expected to produce the same "instruction not supported on this GPU"
// diagnostic on the line directly above its check line.
// NOTE(review) - this group repeats the operand forms of the group directly
// above it line for line; confirm whether the repetition is intentional
// (for example left over from a test that used two mnemonic spellings) or
// whether one copy can be dropped.
v_mfma_f32_16x16x8_xf32 a[0:3], v[2:3], v[4:5], a[2:5]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 v[0:3], a[0:3], v[0:3], 1.0
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 a[0:3], v[0:3], a[0:3], 1.0
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 v[0:3], v[0:3], v[0:3], v[0:3] blgp:5
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] blgp:1
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] abid:1
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 a[0:3], a[0:3], a[0:3], a[0:3] cbsz:3 abid:1
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 a[0:3], v[0:3], v[0:3], a[4:7]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU

v_mfma_f32_16x16x8_xf32 v[0:3], a[0:3], a[0:3], v[4:7]
// ERR: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
13 changes: 13 additions & 0 deletions llvm/test/MC/AMDGPU/gfx950_invalid_encoding.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# RUN: llvm-mc -disassemble -arch=amdgcn -mcpu=gfx950 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX950 %s

# Each 8-byte sequence below is passed to the disassembler for gfx950 and is
# expected to be reported as an invalid instruction encoding. stderr is merged
# into stdout so the diagnostics reach FileCheck, and the implicit-check-not
# option makes the test fail on any additional, unchecked diagnostic.
# The four inputs differ only in their second byte (0x00 or 0x80) and their
# third byte (0xbe or 0xbf).
# NOTE(review) - presumably these are the gfx940 encodings of the two xf32 MFMA
# instructions, with and without the high bit of the second byte set; confirm
# against the gfx940 MFMA encoding tests.

# GFX950: warning: invalid instruction encoding
0x00,0x80,0xbe,0xd3,0x02,0x09,0x0a,0x04

# GFX950: warning: invalid instruction encoding
0x00,0x00,0xbe,0xd3,0x02,0x09,0x0a,0x04

# GFX950: warning: invalid instruction encoding
0x00,0x00,0xbf,0xd3,0x02,0x09,0x0a,0x04

# GFX950: warning: invalid instruction encoding
0x00,0x80,0xbf,0xd3,0x02,0x09,0x0a,0x04
1 change: 1 addition & 0 deletions llvm/test/MC/Disassembler/AMDGPU/gfx940_features.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# RUN: llvm-mc -triple=amdgcn -mcpu=gfx940 -disassemble -show-encoding %s | FileCheck -strict-whitespace --check-prefix=GFX940 %s
# RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -disassemble -show-encoding %s | FileCheck -strict-whitespace --check-prefix=GFX940 %s

# GFX940: global_load_dword v2, v[2:3], off sc0 ; encoding: [0x00,0x80,0x51,0xdc,0x02,0x00,0x7f,0x02]
0x00,0x80,0x51,0xdc,0x02,0x00,0x7f,0x02
Expand Down
7 changes: 7 additions & 0 deletions llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,10 @@
# RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX942 | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX942 %s
# RUN: obj2yaml %t.o.AMDGCN_GFX942 | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX942 %s

# RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX950/' %s | yaml2obj -o %t.o.AMDGCN_GFX950
# RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX950 | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX950 %s
# RUN: obj2yaml %t.o.AMDGCN_GFX950 | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX950 %s

# RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX1010/' %s | yaml2obj -o %t.o.AMDGCN_GFX1010
# RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX1010 | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX1010 %s
# RUN: obj2yaml %t.o.AMDGCN_GFX1010 | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX1010 %s
Expand Down Expand Up @@ -411,6 +415,9 @@
# ELF-AMDGCN-GFX942: EF_AMDGPU_MACH_AMDGCN_GFX942 (0x4C)
# YAML-AMDGCN-GFX942: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX942 ]

# ELF-AMDGCN-GFX950: EF_AMDGPU_MACH_AMDGCN_GFX950 (0x4F)
# YAML-AMDGCN-GFX950: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX950 ]

# ELF-AMDGCN-GFX1010: EF_AMDGPU_MACH_AMDGCN_GFX1010 (0x33)
# YAML-AMDGCN-GFX1010: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX1010 ]

Expand Down
Loading