Skip to content

Commit

Permalink
[CUDA][HIP] Improve variable registration with the new driver (#73177)
Browse files Browse the repository at this point in the history
Summary:
This patch adds support for registering texture / surface variables from
CUDA / HIP. Additionally, we now properly track the `extern` and `const`
flags that are also used in these runtime functions.

This does not implement the `managed` variables yet as those seem to
require some extra handling I'm not familiar with. The issue is that the
current offload entry isn't large enough to carry size and alignment
information along with an extra global.
  • Loading branch information
jhuber6 committed Dec 7, 2023
1 parent fb35bb4 commit 97f3be2
Show file tree
Hide file tree
Showing 8 changed files with 221 additions and 101 deletions.
27 changes: 20 additions & 7 deletions clang/lib/CodeGen/CGCUDANV.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1132,26 +1132,39 @@ void CGNVCUDARuntime::createOffloadingEntries() {
for (KernelInfo &I : EmittedKernels)
llvm::offloading::emitOffloadingEntry(
M, KernelHandles[I.Kernel->getName()],
getDeviceSideName(cast<NamedDecl>(I.D)), 0,
DeviceVarFlags::OffloadGlobalEntry, Section);
getDeviceSideName(cast<NamedDecl>(I.D)), /*Flags=*/0, /*Data=*/0,
llvm::offloading::OffloadGlobalEntry, Section);

for (VarInfo &I : DeviceVars) {
uint64_t VarSize =
CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
int32_t Flags =
(I.Flags.isExtern()
? static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern)
: 0) |
(I.Flags.isConstant()
? static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant)
: 0) |
(I.Flags.isNormalized()
? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
: 0);
if (I.Flags.getKind() == DeviceVarFlags::Variable) {
llvm::offloading::emitOffloadingEntry(
M, I.Var, getDeviceSideName(I.D), VarSize,
I.Flags.isManaged() ? DeviceVarFlags::OffloadGlobalManagedEntry
: DeviceVarFlags::OffloadGlobalEntry,
Section);
(I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry
: llvm::offloading::OffloadGlobalEntry) |
Flags,
/*Data=*/0, Section);
} else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
llvm::offloading::emitOffloadingEntry(
M, I.Var, getDeviceSideName(I.D), VarSize,
DeviceVarFlags::OffloadGlobalSurfaceEntry, Section);
llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
I.Flags.getSurfTexType(), Section);
} else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
llvm::offloading::emitOffloadingEntry(
M, I.Var, getDeviceSideName(I.D), VarSize,
DeviceVarFlags::OffloadGlobalTextureEntry, Section);
llvm::offloading::OffloadGlobalTextureEntry | Flags,
I.Flags.getSurfTexType(), Section);
}
}
}
Expand Down
14 changes: 1 addition & 13 deletions clang/lib/CodeGen/CGCUDARuntime.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include "clang/AST/GlobalDecl.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/IR/GlobalValue.h"

namespace llvm {
Expand Down Expand Up @@ -52,19 +53,6 @@ class CGCUDARuntime {
Texture, // Builtin texture
};

/// The kind flag for an offloading entry.
enum OffloadEntryKindFlag : uint32_t {
/// Mark the entry as a global entry. This indicates the presense of a
/// kernel if the size field is zero and a variable otherwise.
OffloadGlobalEntry = 0x0,
/// Mark the entry as a managed global variable.
OffloadGlobalManagedEntry = 0x1,
/// Mark the entry as a surface variable.
OffloadGlobalSurfaceEntry = 0x2,
/// Mark the entry as a texture variable.
OffloadGlobalTextureEntry = 0x3,
};

private:
unsigned Kind : 2;
unsigned Extern : 1;
Expand Down
86 changes: 60 additions & 26 deletions clang/test/CodeGenCUDA/offloading-entries.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,31 +17,47 @@
//.
// CUDA: @.omp_offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00"
// CUDA: @.omp_offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__foov, ptr @.omp_offloading.entry_name, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries", align 1
// CUDA: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [8 x i8] c"_Z3barv\00"
// CUDA: @.omp_offloading.entry._Z3barv = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__barv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries", align 1
// CUDA: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00"
// CUDA: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "cuda_offloading_entries", align 1
// CUDA: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00"
// CUDA: @.omp_offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { ptr @_Z21__device_stub__kernelv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries", align 1
// CUDA: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00"
// CUDA: @.omp_offloading.entry.var = weak constant %struct.__tgt_offload_entry { ptr @var, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "cuda_offloading_entries", align 1
// CUDA: @.omp_offloading.entry_name.3 = internal unnamed_addr constant [5 x i8] c"surf\00"
// CUDA: @.omp_offloading.entry.surf = weak constant %struct.__tgt_offload_entry { ptr @surf, ptr @.omp_offloading.entry_name.3, i64 4, i32 2, i32 1 }, section "cuda_offloading_entries", align 1
// CUDA: @.omp_offloading.entry_name.4 = internal unnamed_addr constant [4 x i8] c"tex\00"
// CUDA: @.omp_offloading.entry.tex = weak constant %struct.__tgt_offload_entry { ptr @tex, ptr @.omp_offloading.entry_name.4, i64 4, i32 3, i32 1 }, section "cuda_offloading_entries", align 1
//.
// HIP: @.omp_offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00"
// HIP: @.omp_offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { ptr @_Z3foov, ptr @.omp_offloading.entry_name, i64 0, i32 0, i32 0 }, section "hip_offloading_entries", align 1
// HIP: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [8 x i8] c"_Z3barv\00"
// HIP: @.omp_offloading.entry._Z3barv = weak constant %struct.__tgt_offload_entry { ptr @_Z3barv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "hip_offloading_entries", align 1
// HIP: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00"
// HIP: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "hip_offloading_entries", align 1
// HIP: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00"
// HIP: @.omp_offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { ptr @_Z6kernelv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "hip_offloading_entries", align 1
// HIP: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00"
// HIP: @.omp_offloading.entry.var = weak constant %struct.__tgt_offload_entry { ptr @var, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "hip_offloading_entries", align 1
// HIP: @.omp_offloading.entry_name.3 = internal unnamed_addr constant [5 x i8] c"surf\00"
// HIP: @.omp_offloading.entry.surf = weak constant %struct.__tgt_offload_entry { ptr @surf, ptr @.omp_offloading.entry_name.3, i64 4, i32 2, i32 1 }, section "hip_offloading_entries", align 1
// HIP: @.omp_offloading.entry_name.4 = internal unnamed_addr constant [4 x i8] c"tex\00"
// HIP: @.omp_offloading.entry.tex = weak constant %struct.__tgt_offload_entry { ptr @tex, ptr @.omp_offloading.entry_name.4, i64 4, i32 3, i32 1 }, section "hip_offloading_entries", align 1
//.
// CUDA-COFF: @.omp_offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00"
// CUDA-COFF: @.omp_offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__foov, ptr @.omp_offloading.entry_name, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries$OE", align 1
// CUDA-COFF: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [8 x i8] c"_Z3barv\00"
// CUDA-COFF: @.omp_offloading.entry._Z3barv = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__barv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries$OE", align 1
// CUDA-COFF: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00"
// CUDA-COFF: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "cuda_offloading_entries$OE", align 1
// CUDA-COFF: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00"
// CUDA-COFF: @.omp_offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { ptr @_Z21__device_stub__kernelv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries$OE", align 1
// CUDA-COFF: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00"
// CUDA-COFF: @.omp_offloading.entry.var = weak constant %struct.__tgt_offload_entry { ptr @var, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "cuda_offloading_entries$OE", align 1
// CUDA-COFF: @.omp_offloading.entry_name.3 = internal unnamed_addr constant [5 x i8] c"surf\00"
// CUDA-COFF: @.omp_offloading.entry.surf = weak constant %struct.__tgt_offload_entry { ptr @surf, ptr @.omp_offloading.entry_name.3, i64 4, i32 2, i32 1 }, section "cuda_offloading_entries$OE", align 1
// CUDA-COFF: @.omp_offloading.entry_name.4 = internal unnamed_addr constant [4 x i8] c"tex\00"
// CUDA-COFF: @.omp_offloading.entry.tex = weak constant %struct.__tgt_offload_entry { ptr @tex, ptr @.omp_offloading.entry_name.4, i64 4, i32 3, i32 1 }, section "cuda_offloading_entries$OE", align 1
//.
// HIP-COFF: @.omp_offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00"
// HIP-COFF: @.omp_offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { ptr @_Z3foov, ptr @.omp_offloading.entry_name, i64 0, i32 0, i32 0 }, section "hip_offloading_entries$OE", align 1
// HIP-COFF: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [8 x i8] c"_Z3barv\00"
// HIP-COFF: @.omp_offloading.entry._Z3barv = weak constant %struct.__tgt_offload_entry { ptr @_Z3barv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "hip_offloading_entries$OE", align 1
// HIP-COFF: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00"
// HIP-COFF: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "hip_offloading_entries$OE", align 1
// HIP-COFF: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00"
// HIP-COFF: @.omp_offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { ptr @_Z6kernelv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "hip_offloading_entries$OE", align 1
// HIP-COFF: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00"
// HIP-COFF: @.omp_offloading.entry.var = weak constant %struct.__tgt_offload_entry { ptr @var, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "hip_offloading_entries$OE", align 1
// HIP-COFF: @.omp_offloading.entry_name.3 = internal unnamed_addr constant [5 x i8] c"surf\00"
// HIP-COFF: @.omp_offloading.entry.surf = weak constant %struct.__tgt_offload_entry { ptr @surf, ptr @.omp_offloading.entry_name.3, i64 4, i32 2, i32 1 }, section "hip_offloading_entries$OE", align 1
// HIP-COFF: @.omp_offloading.entry_name.4 = internal unnamed_addr constant [4 x i8] c"tex\00"
// HIP-COFF: @.omp_offloading.entry.tex = weak constant %struct.__tgt_offload_entry { ptr @tex, ptr @.omp_offloading.entry_name.4, i64 4, i32 3, i32 1 }, section "hip_offloading_entries$OE", align 1
//.
// CUDA-LABEL: @_Z18__device_stub__foov(
// CUDA-NEXT: entry:
Expand Down Expand Up @@ -72,34 +88,52 @@
// HIP-COFF-NEXT: ret void
//
__global__ void foo() {}
__device__ int var = 1;
const __device__ int constant = 1;
extern __device__ int external;

// CUDA-LABEL: @_Z18__device_stub__barv(
// CUDA-LABEL: @_Z21__device_stub__kernelv(
// CUDA-NEXT: entry:
// CUDA-NEXT: [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z18__device_stub__barv)
// CUDA-NEXT: [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z21__device_stub__kernelv)
// CUDA-NEXT: br label [[SETUP_END:%.*]]
// CUDA: setup.end:
// CUDA-NEXT: ret void
//
// HIP-LABEL: @_Z18__device_stub__barv(
// HIP-LABEL: @_Z21__device_stub__kernelv(
// HIP-NEXT: entry:
// HIP-NEXT: [[TMP0:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z3barv)
// HIP-NEXT: [[TMP0:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z6kernelv)
// HIP-NEXT: br label [[SETUP_END:%.*]]
// HIP: setup.end:
// HIP-NEXT: ret void
//
// CUDA-COFF-LABEL: @_Z18__device_stub__barv(
// CUDA-COFF-LABEL: @_Z21__device_stub__kernelv(
// CUDA-COFF-NEXT: entry:
// CUDA-COFF-NEXT: [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z18__device_stub__barv)
// CUDA-COFF-NEXT: [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z21__device_stub__kernelv)
// CUDA-COFF-NEXT: br label [[SETUP_END:%.*]]
// CUDA-COFF: setup.end:
// CUDA-COFF-NEXT: ret void
//
// HIP-COFF-LABEL: @_Z18__device_stub__barv(
// HIP-COFF-LABEL: @_Z21__device_stub__kernelv(
// HIP-COFF-NEXT: entry:
// HIP-COFF-NEXT: [[TMP0:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z3barv)
// HIP-COFF-NEXT: [[TMP0:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z6kernelv)
// HIP-COFF-NEXT: br label [[SETUP_END:%.*]]
// HIP-COFF: setup.end:
// HIP-COFF-NEXT: ret void
//
__global__ void bar() {}
__device__ int x = 1;
__global__ void kernel() { external = 1; }

struct surfaceReference { int desc; };

template <typename T, int dim = 1>
struct __attribute__((device_builtin_surface_type)) surface : public surfaceReference {};

surface<void> surf;

struct textureReference {
int desc;
};

template <typename T, int dim = 1, int mode = 0>
struct __attribute__((device_builtin_texture_type)) texture : public textureReference {};

texture<void> tex;

0 comments on commit 97f3be2

Please sign in to comment.