diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 82372b09899101..068f206f44847d 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -227,6 +227,7 @@ LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr function LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions") LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code") LANGOPT(GPUAllowDeviceInit, 1, 0, "allowing device side global init functions for HIP") +LANGOPT(GPUMaxThreadsPerBlock, 32, 256, "default max threads per block for kernel launch bounds for HIP") LANGOPT(SYCLIsDevice , 1, 0, "Generate code for SYCL device") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 808cca76c6be1a..e48817931efd35 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -606,6 +606,9 @@ def fno_hip_new_launch_api : Flag<["-"], "fno-hip-new-launch-api">; def fgpu_allow_device_init : Flag<["-"], "fgpu-allow-device-init">, Flags<[CC1Option]>, HelpText<"Allow device side init function in HIP">; def fno_gpu_allow_device_init : Flag<["-"], "fno-gpu-allow-device-init">; +def gpu_max_threads_per_block_EQ : Joined<["--"], "gpu-max-threads-per-block=">, + Flags<[CC1Option]>, + HelpText<"Default max threads per block for kernel launch bounds for HIP">; def libomptarget_nvptx_path_EQ : Joined<["--"], "libomptarget-nvptx-path=">, Group, HelpText<"Path to libomptarget-nvptx libraries">; def dD : Flag<["-"], "dD">, Group, Flags<[CC1Option]>, diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp index 6c6400652a6d6b..7068fa0fcc6920 100644 --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -8072,8 +8072,11 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes( } else assert(Max == 0 && "Max must be zero"); } else if (IsOpenCLKernel || IsHIPKernel) { - // By default, restrict the maximum size to 256. - F->addFnAttr("amdgpu-flat-work-group-size", "1,256"); + // By default, restrict the maximum size to a value specified by + // --gpu-max-threads-per-block=n or its default value. + std::string AttrVal = + std::string("1,") + llvm::utostr(M.getLangOpts().GPUMaxThreadsPerBlock); + F->addFnAttr("amdgpu-flat-work-group-size", AttrVal); } if (const auto *Attr = FD->getAttr()) { diff --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp index f68b5cd681846f..f89e648948aba0 100644 --- a/clang/lib/Driver/ToolChains/HIP.cpp +++ b/clang/lib/Driver/ToolChains/HIP.cpp @@ -307,6 +307,14 @@ void HIPToolChain::addClangTargetOptions( false)) CC1Args.push_back("-fgpu-rdc"); + StringRef MaxThreadsPerBlock = + DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ); + if (!MaxThreadsPerBlock.empty()) { + std::string ArgStr = + std::string("--gpu-max-threads-per-block=") + MaxThreadsPerBlock.str(); + CC1Args.push_back(DriverArgs.MakeArgStringRef(ArgStr)); + } + if (DriverArgs.hasFlag(options::OPT_fgpu_allow_device_init, options::OPT_fno_gpu_allow_device_init, false)) CC1Args.push_back("-fgpu-allow-device-init"); diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 5f332aff75c24a..6f6f43ca284ba2 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -2559,6 +2559,12 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK, << Args.getLastArg(OPT_fgpu_allow_device_init)->getAsString(Args); } Opts.HIPUseNewLaunchAPI = Args.hasArg(OPT_fhip_new_launch_api); + if (Opts.HIP) + Opts.GPUMaxThreadsPerBlock = getLastArgIntValue( + Args, OPT_gpu_max_threads_per_block_EQ, Opts.GPUMaxThreadsPerBlock); + else if (Args.hasArg(OPT_gpu_max_threads_per_block_EQ)) + Diags.Report(diag::warn_ignored_hip_only_option) + << Args.getLastArg(OPT_gpu_max_threads_per_block_EQ)->getAsString(Args); if (Opts.ObjC) { if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) { diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu index 70eb9091d8d496..ece8685932d260 100644 --- a/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu +++ b/clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu @@ -1,13 +1,21 @@ // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \ -// RUN: -fcuda-is-device -emit-llvm -o - %s | FileCheck %s +// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefixes=CHECK,DEFAULT %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa --gpu-max-threads-per-block=1024 \ +// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefixes=CHECK,MAX1024 %s // RUN: %clang_cc1 -triple nvptx \ // RUN: -fcuda-is-device -emit-llvm -o - %s | FileCheck %s \ // RUN: -check-prefix=NAMD // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm \ -// RUN: -verify -o - %s | FileCheck -check-prefix=NAMD %s +// RUN: -verify -o - -x hip %s | FileCheck -check-prefix=NAMD %s #include "Inputs/cuda.h" +__global__ void flat_work_group_size_default() { +// CHECK: define amdgpu_kernel void @_Z28flat_work_group_size_defaultv() [[FLAT_WORK_GROUP_SIZE_DEFAULT:#[0-9]+]] +} + __attribute__((amdgpu_flat_work_group_size(32, 64))) // expected-no-diagnostics __global__ void flat_work_group_size_32_64() { // CHECK: define amdgpu_kernel void @_Z26flat_work_group_size_32_64v() [[FLAT_WORK_GROUP_SIZE_32_64:#[0-9]+]] @@ -31,7 +39,9 @@ __global__ void num_vgpr_64() { // NAMD-NOT: "amdgpu-num-vgpr" // NAMD-NOT: "amdgpu-num-sgpr" -// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = { convergent noinline nounwind optnone "amdgpu-flat-work-group-size"="32,64" -// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = { convergent noinline nounwind optnone "amdgpu-waves-per-eu"="2" -// CHECK-DAG: attributes [[NUM_SGPR_32]] = { convergent noinline nounwind optnone "amdgpu-num-sgpr"="32" -// CHECK-DAG: attributes [[NUM_VGPR_64]] = { convergent noinline nounwind optnone "amdgpu-num-vgpr"="64" +// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,256" +// MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024" +// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64" +// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2" +// CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}}"amdgpu-num-sgpr"="32" +// CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}}"amdgpu-num-vgpr"="64" diff --git a/clang/test/Driver/hip-options.hip b/clang/test/Driver/hip-options.hip new file mode 100644 index 00000000000000..b2ad0424b30610 --- /dev/null +++ b/clang/test/Driver/hip-options.hip @@ -0,0 +1,10 @@ +// REQUIRES: clang-driver +// REQUIRES: x86-registered-target +// REQUIRES: amdgpu-registered-target + +// RUN: %clang -### -x hip --gpu-max-threads-per-block=1024 %s 2>&1 | FileCheck %s + +// Check that there are commands for both host- and device-side compilations. +// +// CHECK: clang{{.*}}" "-cc1" {{.*}} "-fcuda-is-device" +// CHECK-SAME: "--gpu-max-threads-per-block=1024"