Skip to content

Commit

Permalink
[clang][CodeGen] Fix GPU-specific attributes being dropped by bitcode…
Browse files Browse the repository at this point in the history
… linking

Device libs make use of patterns like this:
```
__attribute__((target("gfx11-insts")))
static unsigned do_intrin_stuff(void)
{
  return __builtin_amdgcn_s_sendmsg_rtnl(0x0);
}
```
For functions that are assumed to be eliminated if the currennt GPU target doesn't support them.
At O0 such functions aren't eliminated by common optimizations but often by AMDGPURemoveIncompatibleFunctions instead, which sees the "+gfx11-insts" attribute on, say, GFX9 and knows it's not valid, so it removes the function.

D142907 accidentally made it so such attributes were dropped during bitcode linking, making it impossible for RemoveIncompatibleFunctions to catch the functions and causing ISel to catch fire eventually.

This fixes the issue and adds a new test to ensure we don't accidentally fall into this trap again.

Fixes SWDEV-403642

Reviewed By: arsenm, yaxunl

Differential Revision: https://reviews.llvm.org/D152251
  • Loading branch information
Pierre-vh committed Jun 7, 2023
1 parent dcc8f94 commit 23431b5
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 14 deletions.
3 changes: 2 additions & 1 deletion clang/lib/CodeGen/CGCall.cpp
Expand Up @@ -2025,7 +2025,8 @@ void CodeGenModule::mergeDefaultFunctionDefinitionAttributes(
llvm::AttrBuilder FuncAttrs(F.getContext());
getTrivialDefaultFunctionAttributes(F.getName(), F.hasOptNone(),
/*AttrOnCallSite=*/false, FuncAttrs);
GetCPUAndFeaturesAttributes(GlobalDecl(), FuncAttrs);
GetCPUAndFeaturesAttributes(GlobalDecl(), FuncAttrs,
/*AddTargetFeatures=*/false);

if (!WillInternalize && F.isInterposable()) {
// Do not promote "dynamic" denormal-fp-math to this translation unit's
Expand Down
5 changes: 3 additions & 2 deletions clang/lib/CodeGen/CodeGenModule.cpp
Expand Up @@ -2226,7 +2226,8 @@ void CodeGenModule::SetCommonAttributes(GlobalDecl GD, llvm::GlobalValue *GV) {
}

bool CodeGenModule::GetCPUAndFeaturesAttributes(GlobalDecl GD,
llvm::AttrBuilder &Attrs) {
llvm::AttrBuilder &Attrs,
bool SetTargetFeatures) {
// Add target-cpu and target-features attributes to functions. If
// we have a decl for the function and it has a target attribute then
// parse that and add it to the feature set.
Expand Down Expand Up @@ -2286,7 +2287,7 @@ bool CodeGenModule::GetCPUAndFeaturesAttributes(GlobalDecl GD,
Attrs.addAttribute("tune-cpu", TuneCPU);
AddedAttr = true;
}
if (!Features.empty()) {
if (!Features.empty() && SetTargetFeatures) {
llvm::sort(Features);
Attrs.addAttribute("target-features", llvm::join(Features, ","));
AddedAttr = true;
Expand Down
3 changes: 2 additions & 1 deletion clang/lib/CodeGen/CodeGenModule.h
Expand Up @@ -1583,7 +1583,8 @@ class CodeGenModule : public CodeGenTypeCache {
ForDefinition_t IsForDefinition = NotForDefinition);

bool GetCPUAndFeaturesAttributes(GlobalDecl GD,
llvm::AttrBuilder &AttrBuilder);
llvm::AttrBuilder &AttrBuilder,
bool SetTargetFeatures = true);
void setNonAliasAttributes(GlobalDecl GD, llvm::GlobalObject *GO);

/// Set function attributes for a function declaration.
Expand Down
7 changes: 7 additions & 0 deletions clang/test/CodeGenCUDA/Inputs/ocml-sample-target-attrs.cl
@@ -0,0 +1,7 @@
typedef unsigned long ulong;

__attribute__((target("gfx11-insts")))
ulong do_intrin_stuff(void)
{
return __builtin_amdgcn_s_sendmsg_rtnl(0x0);
}
28 changes: 18 additions & 10 deletions clang/test/CodeGenCUDA/link-builtin-bitcode-denormal-fp-mode.cu
Expand Up @@ -132,24 +132,32 @@ __global__ void kernel_f64(double* out, double* a, double* b, double* c) {

// Default mode relies on the implicit check-not for the denormal-fp-math.

// PSZ: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" {{.*}} "target-cpu"="gfx803" {{.*}} }
// PSZ: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign" {{.*}} "target-cpu"="gfx803" {{.*}} }
// PSZ: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign" {{.*}} "target-cpu"="gfx803" {{.*}} }
// PSZ: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign"
// PSZ-SAME: "target-cpu"="gfx803"
// PSZ: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
// PSZ-SAME: "target-cpu"="gfx803"
// PSZ: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
// PSZ-SAME: "target-cpu"="gfx803"

// FIXME: Should check-not "denormal-fp-math" within the line
// IEEEF64-PSZF32: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign" {{.*}} "target-cpu"="gfx803" {{.*}} }
// IEEEF64-PSZF32: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign" {{.*}} "target-cpu"="gfx803" {{.*}} }
// IEEEF64-PSZF32: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign" {{.*}} "target-cpu"="gfx803" {{.*}} }
// IEEEF64-PSZF32: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
// IEEEF64-PSZF32-SAME: "target-cpu"="gfx803"
// IEEEF64-PSZF32: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
// IEEEF64-PSZF32-SAME: "target-cpu"="gfx803"
// IEEEF64-PSZF32: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
// IEEEF64-PSZF32-SAME: "target-cpu"="gfx803"

// IEEEF32-PSZF64-DYNF32: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" {{.*}} "target-cpu"="gfx803" {{.*}} }
// implicit check-not
// implicit check-not


// IEEEF32-PSZF64-DYNFULL: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" {{.*}} "target-cpu"="gfx803" {{.*}} }
// IEEEF32-PSZF64-DYNFULL: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" {{.*}} "target-cpu"="gfx803" {{.*}} }
// IEEEF32-PSZF64-DYNFULL: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" {{.*}} "target-cpu"="gfx803" {{.*}} }

// IEEEF32-PSZF64-DYNFULL: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee"
// IEEEF32-PSZF64-DYNFULL-SAME: "target-cpu"="gfx803"
// IEEEF32-PSZF64-DYNFULL: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee"
// IEEEF32-PSZF64-DYNFULL-SAME: "target-cpu"="gfx803"
// IEEEF32-PSZF64-DYNFULL: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee"
// IEEEF32-PSZF64-DYNFULL-SAME: "target-cpu"="gfx803"

// -mlink-bitcode-file doesn't internalize or propagate attributes.
// NOINTERNALIZE-IEEEF32-PSZF64-DYNFULL: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" {{.*}} "target-cpu"="gfx803" {{.*}} }
Expand Down
48 changes: 48 additions & 0 deletions clang/test/CodeGenCUDA/link-builtin-bitcode-gpu-attrs-preserved.cu
@@ -0,0 +1,48 @@
// Verify the behavior of the +gfxN-insts in the way that
// rocm-device-libs should be built with. e.g. If the device libraries has a function
// with "+gfx11-insts", that attribute should still be present after linking and not
// overwritten with the current target's settings.

// This is important because at this time, many device-libs functions that are only
// available on some GPUs put an attribute such as "+gfx11-insts" so that
// AMDGPURemoveIncompatibleFunctions can detect & remove them if needed.

// Build the fake device library in the way rocm-device-libs should be built.
//
// RUN: %clang_cc1 -x cl -triple amdgcn-amd-amdhsa\
// RUN: -mcode-object-version=none -emit-llvm-bc \
// RUN: %S/Inputs/ocml-sample-target-attrs.cl -o %t.bc

// Check the default behavior
// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx803 -fcuda-is-device \
// RUN: -mlink-builtin-bitcode %t.bc \
// RUN: -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,INTERNALIZE

// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx1101 -fcuda-is-device \
// RUN: -mlink-builtin-bitcode %t.bc -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,INTERNALIZE

// Check the case where no internalization is performed
// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx803 \
// RUN: -fcuda-is-device -mlink-bitcode-file %t.bc -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,NOINTERNALIZE

// Check the case where no internalization is performed
// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx1101 \
// RUN: -fcuda-is-device -mlink-bitcode-file %t.bc -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,NOINTERNALIZE


// CHECK: define {{.*}} i64 @do_intrin_stuff() #[[ATTR:[0-9]+]]
// INTERNALIZE: attributes #[[ATTR]] = {{.*}} "target-cpu"="gfx{{.*}}" "target-features"="+gfx11-insts"
// NOINTERNALIZE: attributes #[[ATTR]] = {{.*}} "target-features"="+gfx11-insts"

#define __device__ __attribute__((device))
#define __global__ __attribute__((global))

typedef unsigned long ulong;

extern "C" {
__device__ ulong do_intrin_stuff(void);

__global__ void kernel_f16(ulong* out) {
*out = do_intrin_stuff();
}
}

0 comments on commit 23431b5

Please sign in to comment.