Skip to content

Commit

Permalink
[OpenMP][AMDGPU] More detail in AMDGPU kernel launch info
Browse files Browse the repository at this point in the history
Makes the info that is printed for kernel launches configurable for
different plugins. Adds all machinery to print the detailed launch
info that the current AMD plugin provides and includes e.g. register
spill counts.

The files msgpack.cpp, msgpack.def, and msgpack.h are copied from the old plugin
and are untouched. The contents of UtilitiesHSA.cpp and .h are copied together from
various files from the old plugin. The code was originally written by
Jon Chesterfield. I updated the function and type names visible to the outside, i.e.
in headers, to respect the LLVM conventions.

Reviewed By: jhuber6

Differential Revision: https://reviews.llvm.org/D144521
  • Loading branch information
jplehr committed Feb 28, 2023
1 parent 0ec4cae commit b82ac74
Show file tree
Hide file tree
Showing 5 changed files with 294 additions and 18 deletions.
74 changes: 74 additions & 0 deletions openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -373,10 +373,22 @@ struct AMDGPUDeviceImageTy : public DeviceImageTy {
Expected<hsa_executable_symbol_t>
findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;

/// Look up the additional per-kernel information (e.g., register spill
/// counts) stored for \p Identifier. Returns an empty optional when no entry
/// exists for that name; otherwise returns a copy of the stored entry.
std::optional<utils::KernelMetaDataTy>
getKernelInfo(StringRef Identifier) const {
  const auto Entry = KernelInfoMap.find(Identifier);
  if (Entry != KernelInfoMap.end())
    return Entry->second;

  // No metadata recorded under this name.
  return std::nullopt;
}

private:
/// The executable loaded on the agent.
hsa_executable_t Executable;
hsa_code_object_t CodeObject;
/// Per-kernel metadata read from the image, keyed by kernel name. Filled by
/// loadExecutable() and queried through getKernelInfo().
StringMap<utils::KernelMetaDataTy> KernelInfoMap;
};

/// Class implementing the AMDGPU kernel functionalities which derives from the
Expand Down Expand Up @@ -426,6 +438,12 @@ struct AMDGPUKernelTy : public GenericKernelTy {
// TODO: Read the kernel descriptor for the max threads per block. May be
// read from the image.

// Get additional kernel info read from image
KernelInfo = AMDImage.getKernelInfo(getName());
if (!KernelInfo.has_value())
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device.getDeviceId(),
"Could not read extra information for kernel %s.", getName());

return Plugin::success();
}

Expand All @@ -434,6 +452,11 @@ struct AMDGPUKernelTy : public GenericKernelTy {
uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;

/// Print more elaborate kernel launch info for AMDGPU (e.g., register and
/// spill counts taken from KernelInfo). Overrides the plugin-specific hook
/// that GenericKernelTy::printLaunchInfo calls after emitting the generic
/// launch message.
Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs, uint32_t NumThreads,
uint64_t NumBlocks) const override;

/// The default number of blocks is common to the whole device.
uint32_t getDefaultNumBlocks(GenericDeviceTy &GenericDevice) const override {
return GenericDevice.getDefaultNumBlocks();
Expand Down Expand Up @@ -462,6 +485,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {

/// The size of implicit kernel arguments.
const uint32_t ImplicitArgsSize;

/// Additional info for the AMD GPU kernel as obtained from the image via
/// getKernelInfo(); empty when the image holds no entry for this kernel.
std::optional<utils::KernelMetaDataTy> KernelInfo;
};

/// Class representing an HSA signal. Signals are used to define dependencies
Expand Down Expand Up @@ -2200,6 +2226,10 @@ Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
if (Result)
return Plugin::error("Loaded HSA executable does not validate");

if (auto Err =
utils::readAMDGPUMetaDataFromImage(getMemoryBuffer(), KernelInfoMap))
return Err;

return Plugin::success();
}

Expand Down Expand Up @@ -2571,6 +2601,50 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
GroupSize, ArgsMemoryManager);
}

/// Print the AMDGPU-specific details of a kernel launch: work-group
/// configuration, argument count, LDS usage, register and spill counts.
///
/// \param GenericDevice device the kernel is launched on.
/// \param KernelArgs    launch arguments; only the first dimension of
///                      NumTeams / ThreadLimit is reported.
/// \param NumThreads    number of threads per group used for this launch.
/// \param NumBlocks     number of groups used for this launch.
/// \returns always success; missing metadata is not treated as an error.
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
                                             KernelArgsTy &KernelArgs,
                                             uint32_t NumThreads,
                                             uint64_t NumBlocks) const {
  // Only do all this when the output is requested
  if (!(getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL))
    return Plugin::success();

  // We don't have data to print additional info, but no hard error
  if (!KernelInfo.has_value())
    return Plugin::success();

  const utils::KernelMetaDataTy &MetaData = *KernelInfo;

  // The locals below are given explicit types so that every printf-style
  // conversion specifier in the message matches the type of its argument.

  // General Info
  const unsigned ConstWGSize = getDefaultNumThreads(GenericDevice);
  const unsigned long long NumGroups = NumBlocks;
  // Report the thread count actually used for this launch rather than the
  // device default, which is already printed as ConstWGSize.
  const unsigned ThreadsPerGroup = NumThreads;
  const unsigned NumTeams = KernelArgs.NumTeams[0];       // Only first dimension
  const unsigned ThreadLimit = KernelArgs.ThreadLimit[0]; // Only first dimension

  // Kernel Arguments Info
  const unsigned ArgNum = KernelArgs.NumArgs;
  const unsigned long long LoopTripCount = KernelArgs.Tripcount;

  // Details for AMDGPU kernels
  const unsigned GroupSegmentSize = MetaData.GroupSegmentList;
  const unsigned SGPRCount = MetaData.SGPRCount;
  const unsigned VGPRCount = MetaData.VGPRCount;
  const unsigned SGPRSpillCount = MetaData.SGPRSpillCount;
  const unsigned VGPRSpillCount = MetaData.VGPRSpillCount;

  // TODO set correctly once host services available
  const bool HostCallRequired = false;
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
       "SGN:%s ConstWGSize:%u args:%u teamsXthrds:(%4lluX%4u) "
       "reqd:(%4uX%4u) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
       "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%llu rpc:%d n:%s\n",
       getExecutionModeName(), ConstWGSize, ArgNum, NumGroups, ThreadsPerGroup,
       NumTeams, ThreadLimit, GroupSegmentSize, SGPRCount, VGPRCount,
       SGPRSpillCount, VGPRSpillCount, LoopTripCount, HostCallRequired,
       getName());

  return Plugin::success();
}

/// Factory hook returning a newly heap-allocated AMDGPU plugin instance.
/// NOTE(review): the caller presumably takes ownership of the returned
/// pointer -- confirm against the plugin interface.
GenericPluginTy *Plugin::createPlugin() {
  GenericPluginTy *NewPlugin = new AMDGPUPluginTy();
  return NewPlugin;
}

GenericDeviceTy *Plugin::createDevice(int32_t DeviceId, int32_t NumDevices) {
Expand Down
168 changes: 168 additions & 0 deletions openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,14 @@

#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"

#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/BinaryFormat/MsgPackDocument.h"
#include "llvm/Support/MemoryBufferRef.h"

#include "llvm/Support/YAMLTraits.h"

namespace llvm {
namespace omp {
Expand Down Expand Up @@ -127,6 +135,166 @@ bool isImageCompatibleWithEnv(const __tgt_image_info *Info,
return true;
}

/// Per-kernel metadata of an AMDGPU kernel.
///
/// Every field is zero-initialized so that a field whose key is absent from
/// the image metadata reads as 0 instead of an indeterminate value.
struct KernelMetaDataTy {
  // The reader in this header does not populate KernelObject,
  // KernelSegmentSize, ExplicitArgumentCount or ImplicitArgumentCount.
  uint64_t KernelObject = 0;
  /// Fixed group segment size of the kernel; reported as "lds_usage" in the
  /// kernel launch info.
  uint32_t GroupSegmentList = 0;
  /// Value of the ".private_segment_fixed_size" metadata entry.
  uint32_t PrivateSegmentSize = 0;
  /// Value of the ".sgpr_count" metadata entry.
  uint32_t SGPRCount = 0;
  /// Value of the ".vgpr_count" metadata entry.
  uint32_t VGPRCount = 0;
  /// Value of the ".sgpr_spill_count" metadata entry.
  uint32_t SGPRSpillCount = 0;
  /// Value of the ".vgpr_spill_count" metadata entry.
  uint32_t VGPRSpillCount = 0;
  uint32_t KernelSegmentSize = 0;
  uint32_t ExplicitArgumentCount = 0;
  uint32_t ImplicitArgumentCount = 0;
};
namespace {

/// Reads the AMDGPU specific per-kernel-metadata from an image.
class KernelInfoReader {
public:
KernelInfoReader(StringMap<KernelMetaDataTy> &KIM) : KernelInfoMap(KIM) {}

/// Process ELF note to read AMDGPU metadata from respective information
/// fields.
Error processNote(const object::ELF64LE::Note &Note) {
if (Note.getName() != "AMDGPU")
return Error::success(); // We are not interested in other things

assert(Note.getType() == ELF::NT_AMDGPU_METADATA &&
"Parse AMDGPU MetaData");
auto Desc = Note.getDesc();
StringRef MsgPackString =
StringRef(reinterpret_cast<const char *>(Desc.data()), Desc.size());
msgpack::Document MsgPackDoc;
if (!MsgPackDoc.readFromBlob(MsgPackString, /*Multi=*/false))
return Error::success();

AMDGPU::HSAMD::V3::MetadataVerifier Verifier(true);
if (!Verifier.verify(MsgPackDoc.getRoot()))
return Error::success();

auto RootMap = MsgPackDoc.getRoot().getMap(true);

if (auto Err = iterateAMDKernels(RootMap))
return Err;

return Error::success();
}

private:
/// Extracts the relevant information via simple string look-up in the msgpack
/// document elements.
Error extractKernelData(msgpack::MapDocNode::MapTy::value_type V,
std::string &KernelName,
KernelMetaDataTy &KernelData) {
if (!V.first.isString())
return Error::success();

const auto isKey = [](const msgpack::DocNode &DK, StringRef SK) {
return DK.getString() == SK;
};

if (isKey(V.first, ".name")) {
KernelName = V.second.toString();
} else if (isKey(V.first, ".sgpr_count")) {
KernelData.SGPRCount = V.second.getUInt();
} else if (isKey(V.first, ".sgpr_spill_count")) {
KernelData.SGPRSpillCount = V.second.getUInt();
} else if (isKey(V.first, ".vgpr_count")) {
KernelData.VGPRCount = V.second.getUInt();
} else if (isKey(V.first, ".vgpr_spill_count")) {
KernelData.VGPRSpillCount = V.second.getUInt();
} else if (isKey(V.first, ".private_segment_fixed_size")) {
KernelData.PrivateSegmentSize = V.second.getUInt();
} else if (isKey(V.first, ".group_segement_fixed_size")) {
KernelData.GroupSegmentList = V.second.getUInt();
}

return Error::success();
}

/// Get the "amdhsa.kernels" element from the msgpack Document
Expected<msgpack::ArrayDocNode> getAMDKernelsArray(msgpack::MapDocNode &MDN) {
auto Res = MDN.find("amdhsa.kernels");
if (Res == MDN.end())
return createStringError(inconvertibleErrorCode(),
"Could not find amdhsa.kernels key");

auto Pair = *Res;
assert(Pair.second.isArray() &&
"AMDGPU kernel entries are arrays of entries");

return Pair.second.getArray();
}

/// Iterate all entries for one "amdhsa.kernels" entry. Each entry is a
/// MapDocNode that either maps a string to a single value (most of them) or
/// to another array of things. Currently, we only handle the case that maps
/// to scalar value.
Error generateKernelInfo(msgpack::ArrayDocNode::ArrayTy::iterator It) {
KernelMetaDataTy KernelData;
std::string KernelName;
auto Entry = (*It).getMap();
for (auto MI = Entry.begin(), E = Entry.end(); MI != E; ++MI)
if (auto Err = extractKernelData(*MI, KernelName, KernelData))
return Err;

KernelInfoMap.insert({KernelName, KernelData});
return Error::success();
}

/// Go over the list of AMD kernels in the "amdhsa.kernels" entry
Error iterateAMDKernels(msgpack::MapDocNode &MDN) {
auto KernelsOrErr = getAMDKernelsArray(MDN);
if (auto Err = KernelsOrErr.takeError())
return Err;

auto KernelsArr = *KernelsOrErr;
for (auto It = KernelsArr.begin(), E = KernelsArr.end(); It != E; ++It) {
if (!It->isMap())
continue; // we expect <key,value> pairs

// Obtain the value for the different entries. Each array entry is a
// MapDocNode
if (auto Err = generateKernelInfo(It))
return Err;
}
return Error::success();
}

// Kernel names are the keys
StringMap<KernelMetaDataTy> &KernelInfoMap;
};
} // namespace

/// Reads the AMDGPU specific metadata from the ELF file and propagates the
/// KernelInfoMap
Error readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer,
StringMap<KernelMetaDataTy> &KernelInfoMap) {
Error Err = Error::success(); // Used later as out-parameter

auto ELFOrError = object::ELF64LEFile::create(MemBuffer.getBuffer());
if (auto Err = ELFOrError.takeError())
return Err;

const object::ELF64LEFile ELFObj = ELFOrError.get();
ArrayRef<object::ELF64LE::Shdr> Sections = cantFail(ELFObj.sections());
KernelInfoReader Reader(KernelInfoMap);
for (const auto &S : Sections) {
if (S.sh_type != ELF::SHT_NOTE)
continue;

for (const auto N : ELFObj.notes(S, Err)) {
if (Err)
return Err;
// Fills the KernelInfoTabel entries in the reader
if ((Err = Reader.processNote(N)))
return Err;
}
}

return Error::success();
}
} // namespace utils
} // namespace plugin
} // namespace target
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,25 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
return initImpl(GenericDevice, Image);
}

/// Print the generic kernel launch message (kernel name, number of blocks and
/// threads, execution mode) and then hand over to the plugin-specific
/// printLaunchInfoDetails() hook.
///
/// \returns whatever printLaunchInfoDetails() returns.
Error GenericKernelTy::printLaunchInfo(GenericDeviceTy &GenericDevice,
                                       KernelArgsTy &KernelArgs,
                                       uint32_t NumThreads,
                                       uint64_t NumBlocks) const {
  // NumThreads is unsigned, so it is printed with %u rather than %d.
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
       "Launching kernel %s with %" PRIu64
       " blocks and %u threads in %s mode\n",
       getName(), NumBlocks, NumThreads, getExecutionModeName());
  return printLaunchInfoDetails(GenericDevice, KernelArgs, NumThreads,
                                NumBlocks);
}

/// Default implementation of the plugin-specific launch info hook: prints
/// nothing and reports success. Plugins override it to add their own details.
Error GenericKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
                                              KernelArgsTy &KernelArgs,
                                              uint32_t NumThreads,
                                              uint64_t NumBlocks) const {
  // There are no generic details beyond what printLaunchInfo() already emits.
  return Plugin::success();
}

Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
ptrdiff_t *ArgOffsets, KernelArgsTy &KernelArgs,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
Expand All @@ -232,10 +251,9 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
uint64_t NumBlocks = getNumBlocks(GenericDevice, KernelArgs.NumTeams,
KernelArgs.Tripcount, NumThreads);

INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
"Launching kernel %s with %" PRIu64
" blocks and %d threads in %s mode\n",
getName(), NumBlocks, NumThreads, getExecutionModeName());
if (auto Err =
printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks))
return Err;

return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs,
KernelArgsPtr, AsyncInfoWrapper);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,32 @@ struct GenericKernelTy {
return false;
}

protected:
/// Return a printable name for the kernel's execution mode.
const char *getExecutionModeName() const {
  const char *ModeName = nullptr;
  switch (ExecutionMode) {
  case OMP_TGT_EXEC_MODE_SPMD:
    ModeName = "SPMD";
    break;
  case OMP_TGT_EXEC_MODE_GENERIC:
    ModeName = "Generic";
    break;
  case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
    ModeName = "Generic-SPMD";
    break;
  }

  // No case matched: the stored mode is not one of the known values.
  if (!ModeName)
    llvm_unreachable("Unknown execution mode!");
  return ModeName;
}

/// Prints generic kernel launch information (kernel name, number of blocks
/// and threads, execution mode) and then calls printLaunchInfoDetails() for
/// the plugin-specific part, returning that call's result.
Error printLaunchInfo(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs, uint32_t NumThreads,
uint64_t NumBlocks) const;

/// Prints plugin-specific kernel launch information after generic kernel
/// launch information. The default implementation prints nothing and returns
/// success; plugins may override it to report additional details.
virtual Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs,
uint32_t NumThreads,
uint64_t NumBlocks) const;

private:
/// Prepare the arguments before launching the kernel.
void *prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
Expand Down Expand Up @@ -225,19 +251,6 @@ struct GenericKernelTy {
}
/// Tell whether the kernel's execution mode is SPMD.
bool isSPMDMode() const {
  const bool IsSPMD = (ExecutionMode == OMP_TGT_EXEC_MODE_SPMD);
  return IsSPMD;
}

/// Return a printable name for the kernel's execution mode.
const char *getExecutionModeName() const {
  const char *ModeName = nullptr;
  switch (ExecutionMode) {
  case OMP_TGT_EXEC_MODE_SPMD:
    ModeName = "SPMD";
    break;
  case OMP_TGT_EXEC_MODE_GENERIC:
    ModeName = "Generic";
    break;
  case OMP_TGT_EXEC_MODE_GENERIC_SPMD:
    ModeName = "Generic-SPMD";
    break;
  }

  // No case matched: the stored mode is not one of the known values.
  if (!ModeName)
    llvm_unreachable("Unknown execution mode!");
  return ModeName;
}

/// The kernel name.
const char *Name;

Expand Down
Loading

0 comments on commit b82ac74

Please sign in to comment.