Skip to content

Commit

Permalink
[HIP] Add option --gpu-max-threads-per-block=n
Browse files Browse the repository at this point in the history
Add this option to change the default launch bounds.

Differential Revision: https://reviews.llvm.org/D71221
  • Loading branch information
yxsamliu committed Jan 7, 2020
1 parent ee81180 commit 9f2d8b5
Show file tree
Hide file tree
Showing 7 changed files with 49 additions and 8 deletions.
1 change: 1 addition & 0 deletions clang/include/clang/Basic/LangOptions.def
Expand Up @@ -227,6 +227,7 @@ LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr function
LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code")
LANGOPT(GPUAllowDeviceInit, 1, 0, "allowing device side global init functions for HIP")
LANGOPT(GPUMaxThreadsPerBlock, 32, 256, "default max threads per block for kernel launch bounds for HIP")

LANGOPT(SYCLIsDevice , 1, 0, "Generate code for SYCL device")

Expand Down
3 changes: 3 additions & 0 deletions clang/include/clang/Driver/Options.td
Expand Up @@ -606,6 +606,9 @@ def fno_hip_new_launch_api : Flag<["-"], "fno-hip-new-launch-api">;
def fgpu_allow_device_init : Flag<["-"], "fgpu-allow-device-init">,
Flags<[CC1Option]>, HelpText<"Allow device side init function in HIP">;
def fno_gpu_allow_device_init : Flag<["-"], "fno-gpu-allow-device-init">;
def gpu_max_threads_per_block_EQ : Joined<["--"], "gpu-max-threads-per-block=">,
Flags<[CC1Option]>,
HelpText<"Default max threads per block for kernel launch bounds for HIP">;
def libomptarget_nvptx_path_EQ : Joined<["--"], "libomptarget-nvptx-path=">, Group<i_Group>,
HelpText<"Path to libomptarget-nvptx libraries">;
def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
Expand Down
7 changes: 5 additions & 2 deletions clang/lib/CodeGen/TargetInfo.cpp
Expand Up @@ -8072,8 +8072,11 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes(
} else
assert(Max == 0 && "Max must be zero");
} else if (IsOpenCLKernel || IsHIPKernel) {
// By default, restrict the maximum size to 256.
F->addFnAttr("amdgpu-flat-work-group-size", "1,256");
// By default, restrict the maximum size to a value specified by
// --gpu-max-threads-per-block=n or its default value.
std::string AttrVal =
std::string("1,") + llvm::utostr(M.getLangOpts().GPUMaxThreadsPerBlock);
F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
}

if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>()) {
Expand Down
8 changes: 8 additions & 0 deletions clang/lib/Driver/ToolChains/HIP.cpp
Expand Up @@ -307,6 +307,14 @@ void HIPToolChain::addClangTargetOptions(
false))
CC1Args.push_back("-fgpu-rdc");

StringRef MaxThreadsPerBlock =
DriverArgs.getLastArgValue(options::OPT_gpu_max_threads_per_block_EQ);
if (!MaxThreadsPerBlock.empty()) {
std::string ArgStr =
std::string("--gpu-max-threads-per-block=") + MaxThreadsPerBlock.str();
CC1Args.push_back(DriverArgs.MakeArgStringRef(ArgStr));
}

if (DriverArgs.hasFlag(options::OPT_fgpu_allow_device_init,
options::OPT_fno_gpu_allow_device_init, false))
CC1Args.push_back("-fgpu-allow-device-init");
Expand Down
6 changes: 6 additions & 0 deletions clang/lib/Frontend/CompilerInvocation.cpp
Expand Up @@ -2559,6 +2559,12 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK,
<< Args.getLastArg(OPT_fgpu_allow_device_init)->getAsString(Args);
}
Opts.HIPUseNewLaunchAPI = Args.hasArg(OPT_fhip_new_launch_api);
if (Opts.HIP)
Opts.GPUMaxThreadsPerBlock = getLastArgIntValue(
Args, OPT_gpu_max_threads_per_block_EQ, Opts.GPUMaxThreadsPerBlock);
else if (Args.hasArg(OPT_gpu_max_threads_per_block_EQ))
Diags.Report(diag::warn_ignored_hip_only_option)
<< Args.getLastArg(OPT_gpu_max_threads_per_block_EQ)->getAsString(Args);

if (Opts.ObjC) {
if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) {
Expand Down
22 changes: 16 additions & 6 deletions clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
@@ -1,13 +1,21 @@
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
// RUN: -fcuda-is-device -emit-llvm -o - %s | FileCheck %s
// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefixes=CHECK,DEFAULT %s
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa --gpu-max-threads-per-block=1024 \
// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefixes=CHECK,MAX1024 %s
// RUN: %clang_cc1 -triple nvptx \
// RUN: -fcuda-is-device -emit-llvm -o - %s | FileCheck %s \
// RUN: -check-prefix=NAMD
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm \
// RUN: -verify -o - %s | FileCheck -check-prefix=NAMD %s
// RUN: -verify -o - -x hip %s | FileCheck -check-prefix=NAMD %s

#include "Inputs/cuda.h"

__global__ void flat_work_group_size_default() {
// CHECK: define amdgpu_kernel void @_Z28flat_work_group_size_defaultv() [[FLAT_WORK_GROUP_SIZE_DEFAULT:#[0-9]+]]
}

__attribute__((amdgpu_flat_work_group_size(32, 64))) // expected-no-diagnostics
__global__ void flat_work_group_size_32_64() {
// CHECK: define amdgpu_kernel void @_Z26flat_work_group_size_32_64v() [[FLAT_WORK_GROUP_SIZE_32_64:#[0-9]+]]
Expand All @@ -31,7 +39,9 @@ __global__ void num_vgpr_64() {
// NAMD-NOT: "amdgpu-num-vgpr"
// NAMD-NOT: "amdgpu-num-sgpr"

// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = { convergent noinline nounwind optnone "amdgpu-flat-work-group-size"="32,64"
// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = { convergent noinline nounwind optnone "amdgpu-waves-per-eu"="2"
// CHECK-DAG: attributes [[NUM_SGPR_32]] = { convergent noinline nounwind optnone "amdgpu-num-sgpr"="32"
// CHECK-DAG: attributes [[NUM_VGPR_64]] = { convergent noinline nounwind optnone "amdgpu-num-vgpr"="64"
// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,256"
// MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64"
// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"
// CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}}"amdgpu-num-sgpr"="32"
// CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}}"amdgpu-num-vgpr"="64"
10 changes: 10 additions & 0 deletions clang/test/Driver/hip-options.hip
@@ -0,0 +1,10 @@
// REQUIRES: clang-driver
// REQUIRES: x86-registered-target
// REQUIRES: amdgpu-registered-target

// RUN: %clang -### -x hip --gpu-max-threads-per-block=1024 %s 2>&1 | FileCheck %s

// Check that there are commands for both host- and device-side compilations.
//
// CHECK: clang{{.*}}" "-cc1" {{.*}} "-fcuda-is-device"
// CHECK-SAME: "--gpu-max-threads-per-block=1024"

0 comments on commit 9f2d8b5

Please sign in to comment.