Skip to content

Commit

Permalink
[AMDGPU] Change default AMDHSA Code Object version to 5 (#79038)
Browse files Browse the repository at this point in the history
Also update LIT tests and docs.
For more details, see
https://llvm.org/docs/AMDGPUUsage.html#code-object-v5-metadata

Corresponding llvm-objdump AMDGPU lit tests are updated
in a follow-up PR.
  • Loading branch information
saiislam committed Jan 23, 2024
1 parent f47c406 commit 082f87c
Show file tree
Hide file tree
Showing 16 changed files with 34 additions and 29 deletions.
3 changes: 3 additions & 0 deletions clang/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1118,6 +1118,9 @@ AMDGPU Support
arguments in C ABI. Callee is responsible for allocating stack memory and
copying the value of the struct if modified. Note that AMDGPU backend still
supports byval for struct arguments.
- The default value for ``-mcode-object-version`` is now 5.
See `AMDHSA Code Object V5 Metadata <https://llvm.org/docs/AMDGPUUsage.html#code-object-v5-metadata>`_
for more details.

X86 Support
^^^^^^^^^^^
Expand Down
4 changes: 2 additions & 2 deletions clang/include/clang/Driver/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -4781,12 +4781,12 @@ defm amdgpu_ieee : BoolOption<"m", "amdgpu-ieee",
NegFlag<SetFalse, [], [ClangOption, CC1Option]>>, Group<m_Group>;

def mcode_object_version_EQ : Joined<["-"], "mcode-object-version=">, Group<m_Group>,
HelpText<"Specify code object ABI version. Defaults to 4. (AMDGPU only)">,
HelpText<"Specify code object ABI version. Defaults to 5. (AMDGPU only)">,
Visibility<[ClangOption, FlangOption, CC1Option, FC1Option]>,
Values<"none,4,5">,
NormalizedValuesScope<"llvm::CodeObjectVersionKind">,
NormalizedValues<["COV_None", "COV_4", "COV_5"]>,
MarshallingInfoEnum<TargetOpts<"CodeObjectVersion">, "COV_4">;
MarshallingInfoEnum<TargetOpts<"CodeObjectVersion">, "COV_5">;

defm cumode : SimpleMFlag<"cumode",
"Specify CU wavefront", "Specify WGP wavefront",
Expand Down
2 changes: 1 addition & 1 deletion clang/test/CodeGen/amdgpu-address-spaces.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ int [[clang::address_space(999)]] bbb = 1234;
// CHECK: @u = addrspace(5) global i32 undef, align 4
// CHECK: @aaa = addrspace(6) global i32 1000, align 4
// CHECK: @bbb = addrspace(999) global i32 1234, align 4
// CHECK: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 400
// CHECK: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500
//.
// CHECK-LABEL: define dso_local amdgpu_kernel void @foo(
// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
Expand Down
2 changes: 1 addition & 1 deletion clang/test/CodeGenCUDA/amdgpu-code-object-version.cu
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Create module flag for code object version.

// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
// RUN: -o - %s | FileCheck %s -check-prefix=V4
// RUN: -o - %s | FileCheck %s -check-prefix=V5

// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \
// RUN: -mcode-object-version=4 -o - %s | FileCheck -check-prefix=V4 %s
Expand Down
4 changes: 2 additions & 2 deletions clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
// RUN: -fcuda-is-device -mcode-object-version=4 -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefix=PRECOV5 %s


// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
// RUN: -fcuda-is-device -mcode-object-version=5 -emit-llvm -o - -x hip %s \
// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefix=COV5 %s

// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
Expand Down
4 changes: 2 additions & 2 deletions clang/test/CodeGenHIP/default-attributes.hip
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,11 @@ __global__ void kernel() {
// OPT: attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
// OPT: attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
//.
// OPTNONE: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
// OPTNONE: !0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
// OPTNONE: !1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
// OPTNONE: !2 = !{i32 1, !"wchar_size", i32 4}
//.
// OPT: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
// OPT: !0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
// OPT: !1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
// OPT: !2 = !{i32 1, !"wchar_size", i32 4}
//.
4 changes: 2 additions & 2 deletions clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl
Original file line number Diff line number Diff line change
Expand Up @@ -703,7 +703,7 @@ kernel void test_target_features_kernel(global int *i) {
// GFX900: attributes #8 = { nounwind }
// GFX900: attributes #9 = { convergent nounwind }
//.
// NOCPU: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
// NOCPU: !0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
// NOCPU: !1 = !{i32 1, !"wchar_size", i32 4}
// NOCPU: !2 = !{i32 2, i32 0}
// NOCPU: !3 = !{i32 1, i32 0, i32 1, i32 0}
Expand All @@ -721,7 +721,7 @@ kernel void test_target_features_kernel(global int *i) {
// NOCPU: !15 = !{i32 1}
// NOCPU: !16 = !{!"int*"}
//.
// GFX900: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
// GFX900: !0 = !{i32 1, !"amdgpu_code_object_version", i32 500}
// GFX900: !1 = !{i32 1, !"wchar_size", i32 4}
// GFX900: !2 = !{i32 2, i32 0}
// GFX900: !3 = !{!4, !4, i64 0}
Expand Down
10 changes: 5 additions & 5 deletions clang/test/CodeGenOpenCL/builtins-amdgcn.cl
Original file line number Diff line number Diff line change
Expand Up @@ -601,13 +601,13 @@ void test_get_local_id(int d, global int *out)
}

// CHECK-LABEL: @test_get_workgroup_size(
// CHECK: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
// CHECK: getelementptr inbounds i8, ptr addrspace(4) %{{.*}}, i64 4
// CHECK: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
// CHECK: getelementptr inbounds i8, ptr addrspace(4) %{{.*}}, i64 12
// CHECK: load i16, ptr addrspace(4) %{{.*}}, align 4, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
// CHECK: getelementptr inbounds i8, ptr addrspace(4) %{{.*}}, i64 6
// CHECK: getelementptr inbounds i8, ptr addrspace(4) %{{.*}}, i64 14
// CHECK: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
// CHECK: getelementptr inbounds i8, ptr addrspace(4) %{{.*}}, i64 8
// CHECK: load i16, ptr addrspace(4) %{{.*}}, align 4, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
// CHECK: getelementptr inbounds i8, ptr addrspace(4) %{{.*}}, i64 16
// CHECK: load i16, ptr addrspace(4) %{{.*}}, align 8, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
void test_get_workgroup_size(int d, global int *out)
{
switch (d) {
Expand Down
2 changes: 1 addition & 1 deletion flang/test/Driver/driver-help-hidden.f90
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@
! CHECK-NEXT: -L <dir> Add directory to library search path
! CHECK-NEXT: -march=<value> For a list of available architectures for the target use '-mcpu=help'
! CHECK-NEXT: -mcode-object-version=<value>
! CHECK-NEXT: Specify code object ABI version. Defaults to 4. (AMDGPU only)
! CHECK-NEXT: Specify code object ABI version. Defaults to 5. (AMDGPU only)
! CHECK-NEXT: -mcpu=<value> For a list of available CPUs for the target use '-mcpu=help'
! CHECK-NEXT: -mllvm=<arg> Alias for -mllvm
! CHECK-NEXT: -mllvm <value> Additional arguments to forward to LLVM's option processing
Expand Down
4 changes: 2 additions & 2 deletions flang/test/Driver/driver-help.f90
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@
! HELP-NEXT: -L <dir> Add directory to library search path
! HELP-NEXT: -march=<value> For a list of available architectures for the target use '-mcpu=help'
! HELP-NEXT: -mcode-object-version=<value>
! HELP-NEXT: Specify code object ABI version. Defaults to 4. (AMDGPU only)
! HELP-NEXT: Specify code object ABI version. Defaults to 5. (AMDGPU only)
! HELP-NEXT: -mcpu=<value> For a list of available CPUs for the target use '-mcpu=help'
! HELP-NEXT: -mllvm=<arg> Alias for -mllvm
! HELP-NEXT: -mllvm <value> Additional arguments to forward to LLVM's option processing
Expand Down Expand Up @@ -240,7 +240,7 @@
! HELP-FC1-NEXT: -I <dir> Add directory to the end of the list of include search paths
! HELP-FC1-NEXT: -load <dsopath> Load the named plugin (dynamic shared object)
! HELP-FC1-NEXT: -mcode-object-version=<value>
! HELP-FC1-NEXT: Specify code object ABI version. Defaults to 4. (AMDGPU only)
! HELP-FC1-NEXT: Specify code object ABI version. Defaults to 5. (AMDGPU only)
! HELP-FC1-NEXT: -menable-no-infs Allow optimization to assume there are no infinities.
! HELP-FC1-NEXT: -menable-no-nans Allow optimization to assume there are no NaNs.
! HELP-FC1-NEXT: -mframe-pointer=<value> Specify which frame pointers to retain.
Expand Down
15 changes: 7 additions & 8 deletions llvm/docs/AMDGPUUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1510,12 +1510,12 @@ The AMDGPU backend uses the following ELF header:

* ``ELFABIVERSION_AMDGPU_HSA_V4`` is used to specify the version of AMD HSA
runtime ABI for code object V4. Specify using the Clang option
``-mcode-object-version=4``. This is the default code object
version if not specified.
``-mcode-object-version=4``.

* ``ELFABIVERSION_AMDGPU_HSA_V5`` is used to specify the version of AMD HSA
runtime ABI for code object V5. Specify using the Clang option
``-mcode-object-version=5``.
``-mcode-object-version=5``. This is the default code object
version if not specified.

* ``ELFABIVERSION_AMDGPU_PAL`` is used to specify the version of AMD PAL
runtime ABI.
Expand Down Expand Up @@ -3949,6 +3949,10 @@ same *vendor-name*.
Code Object V4 Metadata
+++++++++++++++++++++++

. warning::
Code object V4 is not the default code object version emitted by this version
of LLVM.

Code object V4 metadata is the same as
:ref:`amdgpu-amdhsa-code-object-metadata-v3` with the changes and additions
defined in table :ref:`amdgpu-amdhsa-code-object-metadata-map-table-v4`.
Expand Down Expand Up @@ -3979,11 +3983,6 @@ defined in table :ref:`amdgpu-amdhsa-code-object-metadata-map-table-v4`.
Code Object V5 Metadata
+++++++++++++++++++++++

.. warning::
Code object V5 is not the default code object version emitted by this version
of LLVM.


Code object V5 metadata is the same as
:ref:`amdgpu-amdhsa-code-object-metadata-v4` with the changes defined in table
:ref:`amdgpu-amdhsa-code-object-metadata-map-table-v5`, table
Expand Down
2 changes: 2 additions & 0 deletions llvm/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,8 @@ Changes to the AMDGPU Backend

* Implemented :ref:`llvm.get.rounding <int_get_rounding>`

* The default :ref:`AMDHSA code object version <amdgpu-amdhsa-code-object-metadata-v5>` is now 5.

Changes to the ARM Backend
--------------------------

Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@

static llvm::cl::opt<unsigned> DefaultAMDHSACodeObjectVersion(
"amdhsa-code-object-version", llvm::cl::Hidden,
llvm::cl::init(llvm::AMDGPU::AMDHSA_COV4),
llvm::cl::init(llvm::AMDGPU::AMDHSA_COV5),
llvm::cl::desc("Set default AMDHSA Code Object Version (module flag "
"or asm directive still take priority if present)"));

Expand Down
2 changes: 1 addition & 1 deletion mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ SerializeToHsacoPass::translateToLLVMIR(llvm::LLVMContext &llvmContext) {

// This constant must always match the default code object ABI version
// of the AMDGPU backend.
addControlConstant("__oclc_ABI_version", 400, 32);
addControlConstant("__oclc_ABI_version", 500, 32);
}

// Determine libraries we need to link - order matters due to dependencies
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ class ROCDLDialectLLVMIRTranslationInterface
if (!llvmFunc->hasFnAttribute("amdgpu-flat-work-group-size")) {
llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1,256");
}
llvmFunc->addFnAttr("amdgpu-implicitarg-num-bytes", "256");
}
// Override flat-work-group-size
// TODO: update clients to rocdl.flat_work_group_size instead,
Expand Down
2 changes: 1 addition & 1 deletion mlir/test/Target/LLVMIR/rocdl.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,7 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 {
llvm.return %source5 : i32
}

// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" }
// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-implicitarg-num-bytes"="256" }
// CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
// CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"
// CHECK-DAG: ![[$RANGE]] = !{i32 0, i32 64}
Expand Down

0 comments on commit 082f87c

Please sign in to comment.