diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index 66147f656071f..520b0c4f11767 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -1132,26 +1132,39 @@ void CGNVCUDARuntime::createOffloadingEntries() { for (KernelInfo &I : EmittedKernels) llvm::offloading::emitOffloadingEntry( M, KernelHandles[I.Kernel->getName()], - getDeviceSideName(cast(I.D)), 0, - DeviceVarFlags::OffloadGlobalEntry, Section); + getDeviceSideName(cast(I.D)), /*Flags=*/0, /*Data=*/0, + llvm::offloading::OffloadGlobalEntry, Section); for (VarInfo &I : DeviceVars) { uint64_t VarSize = CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType()); + int32_t Flags = + (I.Flags.isExtern() + ? static_cast(llvm::offloading::OffloadGlobalExtern) + : 0) | + (I.Flags.isConstant() + ? static_cast(llvm::offloading::OffloadGlobalConstant) + : 0) | + (I.Flags.isNormalized() + ? static_cast(llvm::offloading::OffloadGlobalNormalized) + : 0); if (I.Flags.getKind() == DeviceVarFlags::Variable) { llvm::offloading::emitOffloadingEntry( M, I.Var, getDeviceSideName(I.D), VarSize, - I.Flags.isManaged() ? DeviceVarFlags::OffloadGlobalManagedEntry - : DeviceVarFlags::OffloadGlobalEntry, - Section); + (I.Flags.isManaged() ? 
llvm::offloading::OffloadGlobalManagedEntry + : llvm::offloading::OffloadGlobalEntry) | + Flags, + /*Data=*/0, Section); } else if (I.Flags.getKind() == DeviceVarFlags::Surface) { llvm::offloading::emitOffloadingEntry( M, I.Var, getDeviceSideName(I.D), VarSize, - DeviceVarFlags::OffloadGlobalSurfaceEntry, Section); + llvm::offloading::OffloadGlobalSurfaceEntry | Flags, + I.Flags.getSurfTexType(), Section); } else if (I.Flags.getKind() == DeviceVarFlags::Texture) { llvm::offloading::emitOffloadingEntry( M, I.Var, getDeviceSideName(I.D), VarSize, - DeviceVarFlags::OffloadGlobalTextureEntry, Section); + llvm::offloading::OffloadGlobalTextureEntry | Flags, + I.Flags.getSurfTexType(), Section); } } } diff --git a/clang/lib/CodeGen/CGCUDARuntime.h b/clang/lib/CodeGen/CGCUDARuntime.h index 9a9c6d26cc63c..c7af8f1cf0fe9 100644 --- a/clang/lib/CodeGen/CGCUDARuntime.h +++ b/clang/lib/CodeGen/CGCUDARuntime.h @@ -17,6 +17,7 @@ #include "clang/AST/GlobalDecl.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Frontend/Offloading/Utility.h" #include "llvm/IR/GlobalValue.h" namespace llvm { @@ -52,19 +53,6 @@ class CGCUDARuntime { Texture, // Builtin texture }; - /// The kind flag for an offloading entry. - enum OffloadEntryKindFlag : uint32_t { - /// Mark the entry as a global entry. This indicates the presense of a - /// kernel if the size field is zero and a variable otherwise. - OffloadGlobalEntry = 0x0, - /// Mark the entry as a managed global variable. - OffloadGlobalManagedEntry = 0x1, - /// Mark the entry as a surface variable. - OffloadGlobalSurfaceEntry = 0x2, - /// Mark the entry as a texture variable. - OffloadGlobalTextureEntry = 0x3, - }; - private: unsigned Kind : 2; unsigned Extern : 1; diff --git a/clang/test/CodeGenCUDA/offloading-entries.cu b/clang/test/CodeGenCUDA/offloading-entries.cu index 46235051f1e4f..4f5cf65ecd0bd 100644 --- a/clang/test/CodeGenCUDA/offloading-entries.cu +++ b/clang/test/CodeGenCUDA/offloading-entries.cu @@ -17,31 +17,47 @@ //. 
// CUDA: @.omp_offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00" // CUDA: @.omp_offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__foov, ptr @.omp_offloading.entry_name, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries", align 1 -// CUDA: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [8 x i8] c"_Z3barv\00" -// CUDA: @.omp_offloading.entry._Z3barv = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__barv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries", align 1 -// CUDA: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00" -// CUDA: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "cuda_offloading_entries", align 1 +// CUDA: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00" +// CUDA: @.omp_offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { ptr @_Z21__device_stub__kernelv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries", align 1 +// CUDA: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00" +// CUDA: @.omp_offloading.entry.var = weak constant %struct.__tgt_offload_entry { ptr @var, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "cuda_offloading_entries", align 1 +// CUDA: @.omp_offloading.entry_name.3 = internal unnamed_addr constant [5 x i8] c"surf\00" +// CUDA: @.omp_offloading.entry.surf = weak constant %struct.__tgt_offload_entry { ptr @surf, ptr @.omp_offloading.entry_name.3, i64 4, i32 2, i32 1 }, section "cuda_offloading_entries", align 1 +// CUDA: @.omp_offloading.entry_name.4 = internal unnamed_addr constant [4 x i8] c"tex\00" +// CUDA: @.omp_offloading.entry.tex = weak constant %struct.__tgt_offload_entry { ptr @tex, ptr 
@.omp_offloading.entry_name.4, i64 4, i32 3, i32 1 }, section "cuda_offloading_entries", align 1 //. // HIP: @.omp_offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00" // HIP: @.omp_offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { ptr @_Z3foov, ptr @.omp_offloading.entry_name, i64 0, i32 0, i32 0 }, section "hip_offloading_entries", align 1 -// HIP: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [8 x i8] c"_Z3barv\00" -// HIP: @.omp_offloading.entry._Z3barv = weak constant %struct.__tgt_offload_entry { ptr @_Z3barv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "hip_offloading_entries", align 1 -// HIP: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00" -// HIP: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "hip_offloading_entries", align 1 +// HIP: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00" +// HIP: @.omp_offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { ptr @_Z6kernelv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "hip_offloading_entries", align 1 +// HIP: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00" +// HIP: @.omp_offloading.entry.var = weak constant %struct.__tgt_offload_entry { ptr @var, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "hip_offloading_entries", align 1 +// HIP: @.omp_offloading.entry_name.3 = internal unnamed_addr constant [5 x i8] c"surf\00" +// HIP: @.omp_offloading.entry.surf = weak constant %struct.__tgt_offload_entry { ptr @surf, ptr @.omp_offloading.entry_name.3, i64 4, i32 2, i32 1 }, section "hip_offloading_entries", align 1 +// HIP: @.omp_offloading.entry_name.4 = internal unnamed_addr constant [4 x i8] c"tex\00" +// HIP: @.omp_offloading.entry.tex = weak constant 
%struct.__tgt_offload_entry { ptr @tex, ptr @.omp_offloading.entry_name.4, i64 4, i32 3, i32 1 }, section "hip_offloading_entries", align 1 //. // CUDA-COFF: @.omp_offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00" // CUDA-COFF: @.omp_offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__foov, ptr @.omp_offloading.entry_name, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries$OE", align 1 -// CUDA-COFF: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [8 x i8] c"_Z3barv\00" -// CUDA-COFF: @.omp_offloading.entry._Z3barv = weak constant %struct.__tgt_offload_entry { ptr @_Z18__device_stub__barv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries$OE", align 1 -// CUDA-COFF: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00" -// CUDA-COFF: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "cuda_offloading_entries$OE", align 1 +// CUDA-COFF: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00" +// CUDA-COFF: @.omp_offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { ptr @_Z21__device_stub__kernelv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "cuda_offloading_entries$OE", align 1 +// CUDA-COFF: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00" +// CUDA-COFF: @.omp_offloading.entry.var = weak constant %struct.__tgt_offload_entry { ptr @var, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "cuda_offloading_entries$OE", align 1 +// CUDA-COFF: @.omp_offloading.entry_name.3 = internal unnamed_addr constant [5 x i8] c"surf\00" +// CUDA-COFF: @.omp_offloading.entry.surf = weak constant %struct.__tgt_offload_entry { ptr @surf, ptr @.omp_offloading.entry_name.3, i64 4, i32 2, i32 1 }, section 
"cuda_offloading_entries$OE", align 1 +// CUDA-COFF: @.omp_offloading.entry_name.4 = internal unnamed_addr constant [4 x i8] c"tex\00" +// CUDA-COFF: @.omp_offloading.entry.tex = weak constant %struct.__tgt_offload_entry { ptr @tex, ptr @.omp_offloading.entry_name.4, i64 4, i32 3, i32 1 }, section "cuda_offloading_entries$OE", align 1 //. // HIP-COFF: @.omp_offloading.entry_name = internal unnamed_addr constant [8 x i8] c"_Z3foov\00" // HIP-COFF: @.omp_offloading.entry._Z3foov = weak constant %struct.__tgt_offload_entry { ptr @_Z3foov, ptr @.omp_offloading.entry_name, i64 0, i32 0, i32 0 }, section "hip_offloading_entries$OE", align 1 -// HIP-COFF: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [8 x i8] c"_Z3barv\00" -// HIP-COFF: @.omp_offloading.entry._Z3barv = weak constant %struct.__tgt_offload_entry { ptr @_Z3barv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "hip_offloading_entries$OE", align 1 -// HIP-COFF: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [2 x i8] c"x\00" -// HIP-COFF: @.omp_offloading.entry.x = weak constant %struct.__tgt_offload_entry { ptr @x, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "hip_offloading_entries$OE", align 1 +// HIP-COFF: @.omp_offloading.entry_name.1 = internal unnamed_addr constant [11 x i8] c"_Z6kernelv\00" +// HIP-COFF: @.omp_offloading.entry._Z6kernelv = weak constant %struct.__tgt_offload_entry { ptr @_Z6kernelv, ptr @.omp_offloading.entry_name.1, i64 0, i32 0, i32 0 }, section "hip_offloading_entries$OE", align 1 +// HIP-COFF: @.omp_offloading.entry_name.2 = internal unnamed_addr constant [4 x i8] c"var\00" +// HIP-COFF: @.omp_offloading.entry.var = weak constant %struct.__tgt_offload_entry { ptr @var, ptr @.omp_offloading.entry_name.2, i64 4, i32 0, i32 0 }, section "hip_offloading_entries$OE", align 1 +// HIP-COFF: @.omp_offloading.entry_name.3 = internal unnamed_addr constant [5 x i8] c"surf\00" +// HIP-COFF: @.omp_offloading.entry.surf = 
weak constant %struct.__tgt_offload_entry { ptr @surf, ptr @.omp_offloading.entry_name.3, i64 4, i32 2, i32 1 }, section "hip_offloading_entries$OE", align 1 +// HIP-COFF: @.omp_offloading.entry_name.4 = internal unnamed_addr constant [4 x i8] c"tex\00" +// HIP-COFF: @.omp_offloading.entry.tex = weak constant %struct.__tgt_offload_entry { ptr @tex, ptr @.omp_offloading.entry_name.4, i64 4, i32 3, i32 1 }, section "hip_offloading_entries$OE", align 1 //. // CUDA-LABEL: @_Z18__device_stub__foov( // CUDA-NEXT: entry: @@ -72,34 +88,52 @@ // HIP-COFF-NEXT: ret void // __global__ void foo() {} +__device__ int var = 1; +const __device__ int constant = 1; +extern __device__ int external; -// CUDA-LABEL: @_Z18__device_stub__barv( +// CUDA-LABEL: @_Z21__device_stub__kernelv( // CUDA-NEXT: entry: -// CUDA-NEXT: [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z18__device_stub__barv) +// CUDA-NEXT: [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z21__device_stub__kernelv) // CUDA-NEXT: br label [[SETUP_END:%.*]] // CUDA: setup.end: // CUDA-NEXT: ret void // -// HIP-LABEL: @_Z18__device_stub__barv( +// HIP-LABEL: @_Z21__device_stub__kernelv( // HIP-NEXT: entry: -// HIP-NEXT: [[TMP0:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z3barv) +// HIP-NEXT: [[TMP0:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z6kernelv) // HIP-NEXT: br label [[SETUP_END:%.*]] // HIP: setup.end: // HIP-NEXT: ret void // -// CUDA-COFF-LABEL: @_Z18__device_stub__barv( +// CUDA-COFF-LABEL: @_Z21__device_stub__kernelv( // CUDA-COFF-NEXT: entry: -// CUDA-COFF-NEXT: [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z18__device_stub__barv) +// CUDA-COFF-NEXT: [[TMP0:%.*]] = call i32 @cudaLaunch(ptr @_Z21__device_stub__kernelv) // CUDA-COFF-NEXT: br label [[SETUP_END:%.*]] // CUDA-COFF: setup.end: // CUDA-COFF-NEXT: ret void // -// HIP-COFF-LABEL: @_Z18__device_stub__barv( +// HIP-COFF-LABEL: @_Z21__device_stub__kernelv( // HIP-COFF-NEXT: entry: -// HIP-COFF-NEXT: [[TMP0:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z3barv) +// HIP-COFF-NEXT: 
[[TMP0:%.*]] = call i32 @hipLaunchByPtr(ptr @_Z6kernelv) // HIP-COFF-NEXT: br label [[SETUP_END:%.*]] // HIP-COFF: setup.end: // HIP-COFF-NEXT: ret void // -__global__ void bar() {} -__device__ int x = 1; +__global__ void kernel() { external = 1; } + +struct surfaceReference { int desc; }; + +template +struct __attribute__((device_builtin_surface_type)) surface : public surfaceReference {}; + +surface surf; + +struct textureReference { + int desc; +}; + +template +struct __attribute__((device_builtin_texture_type)) texture : public textureReference {}; + +texture tex; diff --git a/clang/test/Driver/linker-wrapper-image.c b/clang/test/Driver/linker-wrapper-image.c index 73d3c40810c35..4a17a8324b462 100644 --- a/clang/test/Driver/linker-wrapper-image.c +++ b/clang/test/Driver/linker-wrapper-image.c @@ -80,24 +80,33 @@ // CUDA-NEXT: br i1 icmp ne (ptr @__start_cuda_offloading_entries, ptr @__stop_cuda_offloading_entries), label %while.entry, label %while.end // CUDA: while.entry: -// CUDA-NEXT: %entry1 = phi ptr [ @__start_cuda_offloading_entries, %entry ], [ %7, %if.end ] -// CUDA-NEXT: %1 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 0 -// CUDA-NEXT: %addr = load ptr, ptr %1, align 8 -// CUDA-NEXT: %2 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 1 -// CUDA-NEXT: %name = load ptr, ptr %2, align 8 -// CUDA-NEXT: %3 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 2 -// CUDA-NEXT: %size = load i64, ptr %3, align 4 -// CUDA-NEXT: %4 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 3 -// CUDA-NEXT: %flag = load i32, ptr %4, align 4 -// CUDA-NEXT: %5 = icmp eq i64 %size, 0 -// CUDA-NEXT: br i1 %5, label %if.then, label %if.else +// CUDA-NEXT: %entry1 = phi ptr [ @__start_cuda_offloading_entries, %entry ], [ %11, %if.end ] +// CUDA-NEXT: %1 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 0 +// CUDA-NEXT: %addr = load ptr, 
ptr %1, align 8 +// CUDA-NEXT: %2 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 1 +// CUDA-NEXT: %name = load ptr, ptr %2, align 8 +// CUDA-NEXT: %3 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 2 +// CUDA-NEXT: %size = load i64, ptr %3, align 4 +// CUDA-NEXT: %4 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 3 +// CUDA-NEXT: %flags = load i32, ptr %4, align 4 +// CUDA-NEXT: %5 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 4 +// CUDA-NEXT: %textype = load i32, ptr %5, align 4 +// CUDA-NEXT: %type = and i32 %flags, 7 +// CUDA-NEXT: %6 = and i32 %flags, 8 +// CUDA-NEXT: %extern = lshr i32 %6, 3 +// CUDA-NEXT: %7 = and i32 %flags, 16 +// CUDA-NEXT: %constant = lshr i32 %7, 4 +// CUDA-NEXT: %8 = and i32 %flags, 32 +// CUDA-NEXT: %normalized = lshr i32 %8, 5 +// CUDA-NEXT: %9 = icmp eq i64 %size, 0 +// CUDA-NEXT: br i1 %9, label %if.then, label %if.else // CUDA: if.then: -// CUDA-NEXT: %6 = call i32 @__cudaRegisterFunction(ptr %0, ptr %addr, ptr %name, ptr %name, i32 -1, ptr null, ptr null, ptr null, ptr null, ptr null) +// CUDA-NEXT: %10 = call i32 @__cudaRegisterFunction(ptr %0, ptr %addr, ptr %name, ptr %name, i32 -1, ptr null, ptr null, ptr null, ptr null, ptr null) // CUDA-NEXT: br label %if.end // CUDA: if.else: -// CUDA-NEXT: switch i32 %flag, label %if.end [ +// CUDA-NEXT: switch i32 %type, label %if.end [ // CUDA-NEXT: i32 0, label %sw.global // CUDA-NEXT: i32 1, label %sw.managed // CUDA-NEXT: i32 2, label %sw.surface @@ -105,22 +114,24 @@ // CUDA-NEXT: ] // CUDA: sw.global: -// CUDA-NEXT: call void @__cudaRegisterVar(ptr %0, ptr %addr, ptr %name, ptr %name, i32 0, i64 %size, i32 0, i32 0) +// CUDA-NEXT: call void @__cudaRegisterVar(ptr %0, ptr %addr, ptr %name, ptr %name, i32 %extern, i64 %size, i32 %constant, i32 0) // CUDA-NEXT: br label %if.end // CUDA: sw.managed: // CUDA-NEXT: br label %if.end // CUDA: sw.surface: +// CUDA-NEXT: 
call void @__cudaRegisterSurface(ptr %0, ptr %addr, ptr %name, ptr %name, i32 %textype, i32 %extern) // CUDA-NEXT: br label %if.end // CUDA: sw.texture: +// CUDA-NEXT: call void @__cudaRegisterTexture(ptr %0, ptr %addr, ptr %name, ptr %name, i32 %textype, i32 %normalized, i32 %extern) // CUDA-NEXT: br label %if.end // CUDA: if.end: -// CUDA-NEXT: %7 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 1 -// CUDA-NEXT: %8 = icmp eq ptr %7, @__stop_cuda_offloading_entries -// CUDA-NEXT: br i1 %8, label %while.end, label %while.entry +// CUDA-NEXT: %11 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 1 +// CUDA-NEXT: %12 = icmp eq ptr %11, @__stop_cuda_offloading_entries +// CUDA-NEXT: br i1 %12, label %while.end, label %while.entry // CUDA: while.end: // CUDA-NEXT: ret void @@ -168,7 +179,7 @@ // HIP-NEXT: br i1 icmp ne (ptr @__start_hip_offloading_entries, ptr @__stop_hip_offloading_entries), label %while.entry, label %while.end // HIP: while.entry: -// HIP-NEXT: %entry1 = phi ptr [ @__start_hip_offloading_entries, %entry ], [ %7, %if.end ] +// HIP-NEXT: %entry1 = phi ptr [ @__start_hip_offloading_entries, %entry ], [ %11, %if.end ] // HIP-NEXT: %1 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 0 // HIP-NEXT: %addr = load ptr, ptr %1, align 8 // HIP-NEXT: %2 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 1 @@ -176,16 +187,25 @@ // HIP-NEXT: %3 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 2 // HIP-NEXT: %size = load i64, ptr %3, align 4 // HIP-NEXT: %4 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 3 -// HIP-NEXT: %flag = load i32, ptr %4, align 4 -// HIP-NEXT: %5 = icmp eq i64 %size, 0 -// HIP-NEXT: br i1 %5, label %if.then, label %if.else +// HIP-NEXT: %flags = load i32, ptr %4, align 4 +// HIP-NEXT: %5 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 0, i32 4 +// HIP-NEXT: 
%textype = load i32, ptr %5, align 4 +// HIP-NEXT: %type = and i32 %flags, 7 +// HIP-NEXT: %6 = and i32 %flags, 8 +// HIP-NEXT: %extern = lshr i32 %6, 3 +// HIP-NEXT: %7 = and i32 %flags, 16 +// HIP-NEXT: %constant = lshr i32 %7, 4 +// HIP-NEXT: %8 = and i32 %flags, 32 +// HIP-NEXT: %normalized = lshr i32 %8, 5 +// HIP-NEXT: %9 = icmp eq i64 %size, 0 +// HIP-NEXT: br i1 %9, label %if.then, label %if.else // HIP: if.then: -// HIP-NEXT: %6 = call i32 @__hipRegisterFunction(ptr %0, ptr %addr, ptr %name, ptr %name, i32 -1, ptr null, ptr null, ptr null, ptr null, ptr null) +// HIP-NEXT: %10 = call i32 @__hipRegisterFunction(ptr %0, ptr %addr, ptr %name, ptr %name, i32 -1, ptr null, ptr null, ptr null, ptr null, ptr null) // HIP-NEXT: br label %if.end // HIP: if.else: -// HIP-NEXT: switch i32 %flag, label %if.end [ +// HIP-NEXT: switch i32 %type, label %if.end [ // HIP-NEXT: i32 0, label %sw.global // HIP-NEXT: i32 1, label %sw.managed // HIP-NEXT: i32 2, label %sw.surface @@ -193,22 +213,24 @@ // HIP-NEXT: ] // HIP: sw.global: -// HIP-NEXT: call void @__hipRegisterVar(ptr %0, ptr %addr, ptr %name, ptr %name, i32 0, i64 %size, i32 0, i32 0) +// HIP-NEXT: call void @__hipRegisterVar(ptr %0, ptr %addr, ptr %name, ptr %name, i32 %extern, i64 %size, i32 %constant, i32 0) // HIP-NEXT: br label %if.end // HIP: sw.managed: // HIP-NEXT: br label %if.end // HIP: sw.surface: +// HIP-NEXT: call void @__hipRegisterSurface(ptr %0, ptr %addr, ptr %name, ptr %name, i32 %textype, i32 %extern) // HIP-NEXT: br label %if.end // HIP: sw.texture: +// HIP-NEXT: call void @__hipRegisterTexture(ptr %0, ptr %addr, ptr %name, ptr %name, i32 %textype, i32 %normalized, i32 %extern) // HIP-NEXT: br label %if.end // HIP: if.end: -// HIP-NEXT: %7 = getelementptr inbounds %struct.__tgt_offload_entry, ptr %entry1, i64 1 -// HIP-NEXT: %8 = icmp eq ptr %7, @__stop_hip_offloading_entries -// HIP-NEXT: br i1 %8, label %while.end, label %while.entry +// HIP-NEXT: %11 = getelementptr inbounds 
%struct.__tgt_offload_entry, ptr %entry1, i64 1 +// HIP-NEXT: %12 = icmp eq ptr %11, @__stop_hip_offloading_entries +// HIP-NEXT: br i1 %12, label %while.end, label %while.entry // HIP: while.end: // HIP-NEXT: ret void diff --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp index 3e1dd874216cc..58d9e1e85ceff 100644 --- a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp @@ -26,19 +26,6 @@ namespace { constexpr unsigned CudaFatMagic = 0x466243b1; constexpr unsigned HIPFatMagic = 0x48495046; -/// Copied from clang/CGCudaRuntime.h. -enum OffloadEntryKindFlag : uint32_t { - /// Mark the entry as a global entry. This indicates the presense of a - /// kernel if the size size field is zero and a variable otherwise. - OffloadGlobalEntry = 0x0, - /// Mark the entry as a managed global variable. - OffloadGlobalManagedEntry = 0x1, - /// Mark the entry as a surface variable. - OffloadGlobalSurfaceEntry = 0x2, - /// Mark the entry as a texture variable. - OffloadGlobalTextureEntry = 0x3, -}; - IntegerType *getSizeTTy(Module &M) { return M.getDataLayout().getIntPtrType(M.getContext()); } @@ -333,6 +320,24 @@ Function *createRegisterGlobalsFunction(Module &M, bool IsHIP) { FunctionCallee RegVar = M.getOrInsertFunction( IsHIP ? "__hipRegisterVar" : "__cudaRegisterVar", RegVarTy); + // Get the __cudaRegisterSurface function declaration. + auto *RegSurfaceTy = + FunctionType::get(Type::getVoidTy(C), + {Int8PtrPtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy, + Type::getInt32Ty(C), Type::getInt32Ty(C)}, + /*isVarArg=*/false); + FunctionCallee RegSurface = M.getOrInsertFunction( + IsHIP ? "__hipRegisterSurface" : "__cudaRegisterSurface", RegSurfaceTy); + + // Get the __cudaRegisterTexture function declaration. 
+  auto *RegTextureTy = FunctionType::get( +      Type::getVoidTy(C), +      {Int8PtrPtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy, Type::getInt32Ty(C), +       Type::getInt32Ty(C), Type::getInt32Ty(C)}, +      /*isVarArg=*/false); +  FunctionCallee RegTexture = M.getOrInsertFunction( +      IsHIP ? "__hipRegisterTexture" : "__cudaRegisterTexture", RegTextureTy); +  auto *RegGlobalsTy = FunctionType::get(Type::getVoidTy(C), Int8PtrPtrTy, /*isVarArg*/ false); auto *RegGlobalsFn = @@ -375,7 +380,31 @@ Function *createRegisterGlobalsFunction(Module &M, bool IsHIP) { Builder.CreateInBoundsGEP(offloading::getEntryTy(M), Entry, {ConstantInt::get(getSizeTTy(M), 0), ConstantInt::get(Type::getInt32Ty(C), 3)}); - auto *Flags = Builder.CreateLoad(Type::getInt32Ty(C), FlagsPtr, "flag"); + auto *Flags = Builder.CreateLoad(Type::getInt32Ty(C), FlagsPtr, "flags"); + auto *DataPtr = + Builder.CreateInBoundsGEP(offloading::getEntryTy(M), Entry, + {ConstantInt::get(getSizeTTy(M), 0), + ConstantInt::get(Type::getInt32Ty(C), 4)}); + auto *Data = Builder.CreateLoad(Type::getInt32Ty(C), DataPtr, "textype"); + auto *Kind = Builder.CreateAnd( + Flags, ConstantInt::get(Type::getInt32Ty(C), 0x7), "type"); + + // Extract the flags stored in the bit-field and convert them to C booleans. 
+ auto *ExternBit = Builder.CreateAnd( + Flags, ConstantInt::get(Type::getInt32Ty(C), + llvm::offloading::OffloadGlobalExtern)); + auto *Extern = Builder.CreateLShr( + ExternBit, ConstantInt::get(Type::getInt32Ty(C), 3), "extern"); + auto *ConstantBit = Builder.CreateAnd( + Flags, ConstantInt::get(Type::getInt32Ty(C), + llvm::offloading::OffloadGlobalConstant)); + auto *Const = Builder.CreateLShr( + ConstantBit, ConstantInt::get(Type::getInt32Ty(C), 4), "constant"); + auto *NormalizedBit = Builder.CreateAnd( + Flags, ConstantInt::get(Type::getInt32Ty(C), + llvm::offloading::OffloadGlobalNormalized)); + auto *Normalized = Builder.CreateLShr( + NormalizedBit, ConstantInt::get(Type::getInt32Ty(C), 5), "normalized"); auto *FnCond = Builder.CreateICmpEQ(Size, ConstantInt::getNullValue(getSizeTTy(M))); Builder.CreateCondBr(FnCond, IfThenBB, IfElseBB); @@ -392,30 +421,37 @@ Function *createRegisterGlobalsFunction(Module &M, bool IsHIP) { Builder.CreateBr(IfEndBB); Builder.SetInsertPoint(IfElseBB); - auto *Switch = Builder.CreateSwitch(Flags, IfEndBB); + auto *Switch = Builder.CreateSwitch(Kind, IfEndBB); // Create global variable registration code. Builder.SetInsertPoint(SwGlobalBB); - Builder.CreateCall(RegVar, {RegGlobalsFn->arg_begin(), Addr, Name, Name, - ConstantInt::get(Type::getInt32Ty(C), 0), Size, - ConstantInt::get(Type::getInt32Ty(C), 0), - ConstantInt::get(Type::getInt32Ty(C), 0)}); + Builder.CreateCall(RegVar, + {RegGlobalsFn->arg_begin(), Addr, Name, Name, Extern, Size, + Const, ConstantInt::get(Type::getInt32Ty(C), 0)}); Builder.CreateBr(IfEndBB); - Switch->addCase(Builder.getInt32(OffloadGlobalEntry), SwGlobalBB); + Switch->addCase(Builder.getInt32(llvm::offloading::OffloadGlobalEntry), + SwGlobalBB); // Create managed variable registration code. 
Builder.SetInsertPoint(SwManagedBB); Builder.CreateBr(IfEndBB); - Switch->addCase(Builder.getInt32(OffloadGlobalManagedEntry), SwManagedBB); + Switch->addCase(Builder.getInt32(llvm::offloading::OffloadGlobalManagedEntry), + SwManagedBB); // Create surface variable registration code. Builder.SetInsertPoint(SwSurfaceBB); + Builder.CreateCall( + RegSurface, {RegGlobalsFn->arg_begin(), Addr, Name, Name, Data, Extern}); Builder.CreateBr(IfEndBB); - Switch->addCase(Builder.getInt32(OffloadGlobalSurfaceEntry), SwSurfaceBB); + Switch->addCase(Builder.getInt32(llvm::offloading::OffloadGlobalSurfaceEntry), + SwSurfaceBB); // Create texture variable registration code. Builder.SetInsertPoint(SwTextureBB); + Builder.CreateCall(RegTexture, {RegGlobalsFn->arg_begin(), Addr, Name, Name, + Data, Normalized, Extern}); Builder.CreateBr(IfEndBB); - Switch->addCase(Builder.getInt32(OffloadGlobalTextureEntry), SwTextureBB); + Switch->addCase(Builder.getInt32(llvm::offloading::OffloadGlobalTextureEntry), + SwTextureBB); Builder.SetInsertPoint(IfEndBB); auto *NewEntry = Builder.CreateInBoundsGEP( diff --git a/llvm/include/llvm/Frontend/Offloading/Utility.h b/llvm/include/llvm/Frontend/Offloading/Utility.h index 631d5a5a3db68..520c192996a06 100644 --- a/llvm/include/llvm/Frontend/Offloading/Utility.h +++ b/llvm/include/llvm/Frontend/Offloading/Utility.h @@ -6,12 +6,35 @@ // //===----------------------------------------------------------------------===// +#ifndef LLVM_FRONTEND_OFFLOADING_UTILITY_H +#define LLVM_FRONTEND_OFFLOADING_UTILITY_H + #include "llvm/IR/Module.h" #include "llvm/Object/OffloadBinary.h" namespace llvm { namespace offloading { +/// Offloading entry flags for CUDA / HIP. The first three bits indicate the +/// type of entry while the others are a bit field for additional information. +enum OffloadEntryKindFlag : uint32_t { + /// Mark the entry as a global entry. This indicates the presence of a +  /// kernel if the size field is zero and a variable otherwise. 
+ OffloadGlobalEntry = 0x0, + /// Mark the entry as a managed global variable. + OffloadGlobalManagedEntry = 0x1, + /// Mark the entry as a surface variable. + OffloadGlobalSurfaceEntry = 0x2, + /// Mark the entry as a texture variable. + OffloadGlobalTextureEntry = 0x3, + /// Mark the entry as being extern. + OffloadGlobalExtern = 0x1 << 3, + /// Mark the entry as being constant. + OffloadGlobalConstant = 0x1 << 4, + /// Mark the entry as being a normalized surface. + OffloadGlobalNormalized = 0x1 << 5, +}; + /// Returns the type of the offloading entry we use to store kernels and /// globals that will be registered with the offloading runtime. StructType *getEntryTy(Module &M); @@ -25,7 +48,7 @@ StructType *getEntryTy(Module &M); /// char *name; // Name of the function or global. /// size_t size; // Size of the entry info (0 if it a function). /// int32_t flags; -/// int32_t reserved; +/// int32_t data; /// }; /// /// \param M The module to be used @@ -33,9 +56,11 @@ StructType *getEntryTy(Module &M); /// \param Name The symbol name associated with the global. /// \param Size The size in bytes of the global (0 for functions). /// \param Flags Flags associated with the entry. +/// \param Data Extra data storage associated with the entry. /// \param SectionName The section this entry will be placed at. void emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name, - uint64_t Size, int32_t Flags, StringRef SectionName); + uint64_t Size, int32_t Flags, int32_t Data, + StringRef SectionName); /// Creates a pair of globals used to iterate the array of offloading entries by /// accessing the section variables provided by the linker. 
@@ -44,3 +69,5 @@ getOffloadEntryArray(Module &M, StringRef SectionName); } // namespace offloading } // namespace llvm + +#endif // LLVM_FRONTEND_OFFLOADING_UTILITY_H diff --git a/llvm/lib/Frontend/Offloading/Utility.cpp b/llvm/lib/Frontend/Offloading/Utility.cpp index 1c08f02c17f5f..25f609517ebeb 100644 --- a/llvm/lib/Frontend/Offloading/Utility.cpp +++ b/llvm/lib/Frontend/Offloading/Utility.cpp @@ -29,7 +29,7 @@ StructType *offloading::getEntryTy(Module &M) { // TODO: Rework this interface to be more generic. void offloading::emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name, - uint64_t Size, int32_t Flags, + uint64_t Size, int32_t Flags, int32_t Data, StringRef SectionName) { llvm::Triple Triple(M.getTargetTriple()); @@ -51,7 +51,7 @@ void offloading::emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name, ConstantExpr::getPointerBitCastOrAddrSpaceCast(Str, Int8PtrTy), ConstantInt::get(SizeTy, Size), ConstantInt::get(Int32Ty, Flags), - ConstantInt::get(Int32Ty, 0), + ConstantInt::get(Int32Ty, Data), }; Constant *EntryInitializer = ConstantStruct::get(getEntryTy(M), EntryData); diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 690d6cbaa67b3..c354baa854b78 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -6165,7 +6165,7 @@ void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr, StringRef Name) { if (!Config.isGPU()) { llvm::offloading::emitOffloadingEntry( - M, ID, Name.empty() ? Addr->getName() : Name, Size, Flags, + M, ID, Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0, "omp_offloading_entries"); return; }