Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,10 @@ BUILTIN(__builtin_amdgcn_raw_buffer_load_b64, "V2UiQbiiIi", "n")
BUILTIN(__builtin_amdgcn_raw_buffer_load_b96, "V3UiQbiiIi", "n")
BUILTIN(__builtin_amdgcn_raw_buffer_load_b128, "V4UiQbiiIi", "n")

// Raw buffer-to-LDS load whose buffer resource is passed as a vector of four
// unsigned ints ("V4Ui") instead of a buffer-resource value ("Qb"). Only
// available when the "vmem-to-lds-load-insts" target feature is enabled.
TARGET_BUILTIN(__builtin_amdgcn_raw_buffer_load_lds, "vV4Uiv*3IUiiiIiIi", "t", "vmem-to-lds-load-insts")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't want the non-pointer form spreading, every user should migrate to using real pointers

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would say that's a difficult tradeoff. I'd expect the integer version to work more reliably and the pointer one to generate better code, as that's usually how int2ptr hackery works out.

Would you prefer the integer intrinsic getting more use via the builtin or people continuing to use assembly? My understanding is that CK are using the compiler where they can and bypassing it where they can't, which rather suggests our compiler is not totally meeting their use cases.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Found some context: are we partway through replacing v4i32 with an addrspace(8) ptr, and do we want to keep expanding uses of the v4i32 form alongside the "new" one for a while, to avoid breaking as much library code?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

... I think we need to go update a document saying the v4i32 forms of the buffer intrinsics are deprecated in favor of the ptr addrspace(8) forms

... and then go fix LLPC

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, the previous ptrtoint hackery is entirely encapsulated by __builtin_amdgcn_make_buffer_rsrc(ptr, i16, i32, i32)

// Remaining buffer-to-LDS load builtins: the raw form taking a "Qb" buffer
// resource, and the struct forms taking either a "V4Ui" or a "Qb" resource.
// The struct prototypes carry one more int operand than the raw ones. All of
// them require the "vmem-to-lds-load-insts" target feature.
TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_load_lds, "vQbv*3IUiiiIiIi", "t", "vmem-to-lds-load-insts")
TARGET_BUILTIN(__builtin_amdgcn_struct_buffer_load_lds, "vV4Uiv*3IUiiiiIiIi", "t", "vmem-to-lds-load-insts")
TARGET_BUILTIN(__builtin_amdgcn_struct_ptr_buffer_load_lds, "vQbv*3IUiiiiIiIi", "t", "vmem-to-lds-load-insts")

//===----------------------------------------------------------------------===//
// Ballot builtins.
Expand Down
5 changes: 4 additions & 1 deletion clang/lib/Sema/SemaAMDGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,11 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
Builtin::evaluateRequiredTargetFeatures("gfx950-insts", CallerFeatureMap);

switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_global_load_lds:
case AMDGPU::BI__builtin_amdgcn_raw_buffer_load_lds:
case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_load_lds:
case AMDGPU::BI__builtin_amdgcn_global_load_lds: {
case AMDGPU::BI__builtin_amdgcn_struct_ptr_buffer_load_lds:
case AMDGPU::BI__builtin_amdgcn_struct_buffer_load_lds: {
constexpr const int SizeIdx = 2;
llvm::APSInt Size;
Expr *ArgExpr = TheCall->getArg(SizeIdx);
Expand Down
29 changes: 29 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load-lds.cl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,17 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -emit-llvm -o - %s | FileCheck %s

typedef unsigned int v4u32 __attribute__((ext_vector_type(4)));

// CHECK-LABEL: @test_amdgcn_raw_buffer_load_lds(
// CHECK-NEXT: entry:
// CHECK-NEXT: tail call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> [[RSRC:%.*]], ptr addrspace(3) [[LDS:%.*]], i32 1, i32 [[OFFSET:%.*]], i32 [[SOFFSET:%.*]], i32 2, i32 3)
// CHECK-NEXT: ret void
//
// Calls the vector-resource raw buffer-to-LDS builtin with the literal
// operands 1, 2 and 3 and runtime offset/soffset values. The CHECK lines for
// this function expect a single call to @llvm.amdgcn.raw.buffer.load.lds
// carrying those same constants.
void test_amdgcn_raw_buffer_load_lds(v4u32 rsrc, __local void * lds, int offset, int soffset) {
__builtin_amdgcn_raw_buffer_load_lds(rsrc, lds, 1, offset, soffset, 2, 3);
}

// CHECK-LABEL: @test_amdgcn_raw_ptr_buffer_load_lds(
// CHECK-NEXT: entry:
// CHECK-NEXT: tail call void @llvm.amdgcn.raw.ptr.buffer.load.lds(ptr addrspace(8) [[RSRC:%.*]], ptr addrspace(3) [[LDS:%.*]], i32 1, i32 [[OFFSET:%.*]], i32 [[SOFFSET:%.*]], i32 2, i32 3)
Expand All @@ -10,3 +21,21 @@
// Calls the buffer-resource-typed raw buffer-to-LDS builtin with the literal
// operands 1, 2 and 3 and runtime offset/soffset values; the resource is
// passed as __amdgpu_buffer_rsrc_t rather than a 4 x unsigned int vector.
void test_amdgcn_raw_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void * lds, int offset, int soffset) {
__builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 1, offset, soffset, 2, 3);
}

// CHECK-LABEL: @test_amdgcn_struct_buffer_load_lds(
// CHECK-NEXT: entry:
// CHECK-NEXT: tail call void @llvm.amdgcn.struct.buffer.load.lds(<4 x i32> [[RSRC:%.*]], ptr addrspace(3) [[LDS:%.*]], i32 4, i32 [[VINDEX:%.*]], i32 [[VOFFSET:%.*]], i32 [[SOFFSET:%.*]], i32 2, i32 3)
// CHECK-NEXT: ret void
//
// Calls the vector-resource struct buffer-to-LDS builtin with the literal
// operands 4, 2 and 3 and runtime vindex/voffset/soffset values. The CHECK
// lines for this function expect a call to @llvm.amdgcn.struct.buffer.load.lds
// with those constants.
// NOTE(review): the `size` parameter is never referenced in the body; the
// third call operand is the literal 4.
void test_amdgcn_struct_buffer_load_lds(v4u32 rsrc, __local void * lds, int size, int vindex, int voffset, int soffset) {
__builtin_amdgcn_struct_buffer_load_lds(rsrc, lds, 4, vindex, voffset, soffset, 2, 3);
}

// CHECK-LABEL: @test_amdgcn_struct_ptr_buffer_load_lds(
// CHECK-NEXT: entry:
// CHECK-NEXT: tail call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) [[RSRC:%.*]], ptr addrspace(3) [[LDS:%.*]], i32 4, i32 [[VINDEX:%.*]], i32 [[VOFFSET:%.*]], i32 [[SOFFSET:%.*]], i32 2, i32 3)
// CHECK-NEXT: ret void
//
// Calls the buffer-resource-typed struct buffer-to-LDS builtin with the
// literal operands 4, 2 and 3 and runtime vindex/voffset/soffset values. The
// CHECK lines for this function expect a call to
// @llvm.amdgcn.struct.ptr.buffer.load.lds with those constants.
// NOTE(review): the `size` parameter is never referenced in the body; the
// third call operand is the literal 4.
void test_amdgcn_struct_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void * lds, int size, int vindex, int voffset, int soffset) {
__builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, 4, vindex, voffset, soffset, 2, 3);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tahiti -S -verify -o - %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu bonaire -S -verify -o - %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu carrizo -S -verify -o - %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -S -verify -o - %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -verify -o - %s
// REQUIRES: amdgpu-registered-target

typedef unsigned int v4u32 __attribute__((ext_vector_type(4)));

// Negative test: on every target CPU listed in this file's RUN lines, an
// otherwise well-formed call to the vector-resource raw buffer-to-LDS builtin
// must be rejected because the "vmem-to-lds-load-insts" feature is missing.
// The -verify directive must stay on the same line as the call.
void test_amdgcn_raw_buffer_load_lds(v4u32 rsrc, __local void* lds, int offset, int soffset, int x) {
__builtin_amdgcn_raw_buffer_load_lds(rsrc, lds, 4, offset, soffset, 0, 0); //expected-error{{needs target feature vmem-to-lds-load-insts}}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,32 @@
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 -S -verify=gfx950,expected -o - %s
// REQUIRES: amdgpu-registered-target

typedef unsigned int v4u32 __attribute__((ext_vector_type(4)));

// Negative tests for the vector-resource raw buffer-to-LDS builtin. The
// resource is declared as v4u32 so that it matches the builtin's 4 x unsigned
// int resource operand, the same way the other tests of this builtin declare
// it. Each call below violates exactly one constraint:
//   - a non-constant value in each of the three constant-integer positions,
//   - a constant size (3) that is not in the accepted set, with the note text
//     depending on the -verify prefix in use.
// Every -verify directive must stay on the same line as the call it checks.
void test_amdgcn_raw_buffer_load_lds(v4u32 rsrc, __local void* lds, int offset, int soffset, int x) {
  __builtin_amdgcn_raw_buffer_load_lds(rsrc, lds, x, offset, soffset, 0, 0); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_load_lds' must be a constant integer}}
  __builtin_amdgcn_raw_buffer_load_lds(rsrc, lds, 4, offset, soffset, x, 0); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_load_lds' must be a constant integer}}
  __builtin_amdgcn_raw_buffer_load_lds(rsrc, lds, 4, offset, soffset, 0, x); //expected-error{{argument to '__builtin_amdgcn_raw_buffer_load_lds' must be a constant integer}}
  __builtin_amdgcn_raw_buffer_load_lds(rsrc, lds, 3, offset, soffset, 0, 0); //expected-error{{invalid size value}} gfx950-note{{size must be 1, 2, 4, 12 or 16}} gfx90a-note{{size must be 1, 2, or 4}}
}

// Negative tests for the buffer-resource-typed raw buffer-to-LDS builtin.
// The first three calls pass the runtime value `x` in each of the three
// positions that must be constant integers; the last passes the constant
// size 3, which is rejected with a prefix-specific note listing the accepted
// sizes. Every -verify directive must stay on the line of the call it checks.
void test_amdgcn_raw_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void* lds, int offset, int soffset, int x) {
__builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, x, offset, soffset, 0, 0); //expected-error{{argument to '__builtin_amdgcn_raw_ptr_buffer_load_lds' must be a constant integer}}
__builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 4, offset, soffset, x, 0); //expected-error{{argument to '__builtin_amdgcn_raw_ptr_buffer_load_lds' must be a constant integer}}
__builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 4, offset, soffset, 0, x); //expected-error{{argument to '__builtin_amdgcn_raw_ptr_buffer_load_lds' must be a constant integer}}
__builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 3, offset, soffset, 0, 0); //expected-error{{invalid size value}} gfx950-note{{size must be 1, 2, 4, 12 or 16}} gfx90a-note{{size must be 1, 2, or 4}}
}

// Negative tests for the vector-resource struct buffer-to-LDS builtin.
// The first three calls pass the runtime value `x` in each of the three
// positions that must be constant integers; the last passes the constant
// size 3, which is rejected with a prefix-specific note listing the accepted
// sizes. Every -verify directive must stay on the line of the call it checks.
void test_amdgcn_struct_buffer_load_lds(v4u32 rsrc, __local void* lds, int index, int offset, int soffset, int x) {
__builtin_amdgcn_struct_buffer_load_lds(rsrc, lds, x, index, offset, soffset, 0, 0); //expected-error{{argument to '__builtin_amdgcn_struct_buffer_load_lds' must be a constant integer}}
__builtin_amdgcn_struct_buffer_load_lds(rsrc, lds, 4, index, offset, soffset, x, 0); //expected-error{{argument to '__builtin_amdgcn_struct_buffer_load_lds' must be a constant integer}}
__builtin_amdgcn_struct_buffer_load_lds(rsrc, lds, 4, index, offset, soffset, 0, x); //expected-error{{argument to '__builtin_amdgcn_struct_buffer_load_lds' must be a constant integer}}
__builtin_amdgcn_struct_buffer_load_lds(rsrc, lds, 3, index, offset, soffset, 0, 0); //expected-error{{invalid size value}} gfx950-note{{size must be 1, 2, 4, 12 or 16}} gfx90a-note{{size must be 1, 2, or 4}}
}

// Negative tests for the buffer-resource-typed struct buffer-to-LDS builtin.
// The first three calls pass the runtime value `x` in each of the three
// positions that must be constant integers; the last passes the constant
// size 3, which is rejected with a prefix-specific note listing the accepted
// sizes. Every -verify directive must stay on the line of the call it checks.
void test_amdgcn_struct_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void* lds, int index, int offset, int soffset, int x) {
__builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, x, index, offset, soffset, 0, 0); //expected-error{{argument to '__builtin_amdgcn_struct_ptr_buffer_load_lds' must be a constant integer}}
__builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, 4, index, offset, soffset, x, 0); //expected-error{{argument to '__builtin_amdgcn_struct_ptr_buffer_load_lds' must be a constant integer}}
__builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, 4, index, offset, soffset, 0, x); //expected-error{{argument to '__builtin_amdgcn_struct_ptr_buffer_load_lds' must be a constant integer}}
__builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, 3, index, offset, soffset, 0, 0); //expected-error{{invalid size value}} gfx950-note{{size must be 1, 2, 4, 12 or 16}} gfx90a-note{{size must be 1, 2, or 4}}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tahiti -S -verify -o - %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu bonaire -S -verify -o - %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu carrizo -S -verify -o - %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -S -verify -o - %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -verify -o - %s
// REQUIRES: amdgpu-registered-target

typedef unsigned int v4u32 __attribute__((ext_vector_type(4)));

void test_amdgcn_struct_buffer_load_lds(v4u32 rsrc, __local void* lds, int index, int offset, int soffset, int x) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we really need three files for this?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd love to say no, since they're almost identical and share the first N lines, but if I put them in a single file clang bails on the first error and thus the lit test fails (as the later three diagnostics are not emitted). I don't understand that behaviour but after an hour or so iterating random guesswork and looking for hints in other tests I thought I'd post the review like this. Either it ships or someone knows how to make clang+lit do the sane thing here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

> but if I put them in a single file clang bails on the first error and thus the lit test fails

That should not be the case. We have other test cases where we have many error checks in one file, such as clang/test/SemaOpenCL/builtins-amdgcn-error-f16.cl.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or see the other test cases in this PR. I'm not claiming multiple test lines in one test never works, I'm claiming merging these does not work. Please feel free to apply the patch and try it. I don't want to stall the intrinsics on debugging through clang's verify implementation quirks.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should just work

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. I am irritated that it does not.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you will need to put everything in one function

__builtin_amdgcn_struct_buffer_load_lds(rsrc, lds, 4, index, offset, soffset, 0, 0); //expected-error{{needs target feature vmem-to-lds-load-insts}}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tahiti -S -verify -o - %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu bonaire -S -verify -o - %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu carrizo -S -verify -o - %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -S -verify -o - %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -verify -o - %s
// REQUIRES: amdgpu-registered-target

// Negative test: on every target CPU listed in this file's RUN lines, an
// otherwise well-formed call to the buffer-resource-typed struct
// buffer-to-LDS builtin must be rejected because the
// "vmem-to-lds-load-insts" feature is missing. The -verify directive must
// stay on the same line as the call.
void test_amdgcn_struct_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void* lds, int index, int offset, int soffset, int x) {
__builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, 4, index, offset, soffset, 0, 0); //expected-error{{needs target feature vmem-to-lds-load-insts}}
}
12 changes: 9 additions & 3 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -1861,7 +1861,9 @@ def int_amdgcn_struct_tbuffer_store : DefaultAttrsIntrinsic <
ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<1>;

class AMDGPURawBufferLoadLDS : Intrinsic <
class AMDGPURawBufferLoadLDS :
ClangBuiltin<"__builtin_amdgcn_raw_buffer_load_lds">,
Intrinsic <
[],
[llvm_v4i32_ty, // rsrc(SGPR)
LLVMQualPointerType<3>, // LDS base offset
Expand Down Expand Up @@ -1904,7 +1906,9 @@ class AMDGPURawPtrBufferLoadLDS :
ImmArg<ArgIndex<6>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_ptr_buffer_load_lds : AMDGPURawPtrBufferLoadLDS;

class AMDGPUStructBufferLoadLDS : Intrinsic <
class AMDGPUStructBufferLoadLDS :
ClangBuiltin<"__builtin_amdgcn_struct_buffer_load_lds">,
Intrinsic <
[],
[llvm_v4i32_ty, // rsrc(SGPR)
LLVMQualPointerType<3>, // LDS base offset
Expand All @@ -1924,7 +1928,9 @@ class AMDGPUStructBufferLoadLDS : Intrinsic <
ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS;

class AMDGPUStructPtrBufferLoadLDS : Intrinsic <
class AMDGPUStructPtrBufferLoadLDS :
ClangBuiltin<"__builtin_amdgcn_struct_ptr_buffer_load_lds">,
Intrinsic <
[],
[AMDGPUBufferRsrcTy, // rsrc(SGPR)
LLVMQualPointerType<3>, // LDS base offset
Expand Down
Loading