Skip to content

Commit

Permalink
AMDGPU: Future-proof {raw,struct}.buffer.atomic intrinsics
Browse files Browse the repository at this point in the history
Summary:
The ISA is really supposed to support 64-bit atomics as well,
so the data type should be overloaded rather than fixed to i32.

Mesa doesn't use these atomics yet; in fact, I noticed this
issue while trying to use the atomics from Mesa.

Change-Id: I77f58317a085a0d3eb933cc7e99308c48a19f83e

Reviewers: tpr

Subscribers: kzhuravl, jvesely, wdng, yaxunl, dstuttard, t-tye, jfb, llvm-commits

Differential Revision: https://reviews.llvm.org/D52291

llvm-svn: 343978
  • Loading branch information
nhaehnle committed Oct 8, 2018
1 parent 46c91fd commit ea36cd5
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 76 deletions.
20 changes: 10 additions & 10 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Expand Up @@ -879,8 +879,8 @@ def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore;
def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore;

class AMDGPURawBufferAtomic : Intrinsic <
[llvm_i32_ty],
[llvm_i32_ty, // vdata(VGPR)
[llvm_anyint_ty],
[LLVMMatchType<0>, // vdata(VGPR)
llvm_v4i32_ty, // rsrc(SGPR)
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
Expand All @@ -898,9 +898,9 @@ def int_amdgcn_raw_buffer_atomic_and : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
[llvm_i32_ty],
[llvm_i32_ty, // src(VGPR)
llvm_i32_ty, // cmp(VGPR)
[llvm_anyint_ty],
[LLVMMatchType<0>, // src(VGPR)
LLVMMatchType<0>, // cmp(VGPR)
llvm_v4i32_ty, // rsrc(SGPR)
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling)
Expand All @@ -909,8 +909,8 @@ def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
AMDGPURsrcIntrinsic<2, 0>;

class AMDGPUStructBufferAtomic : Intrinsic <
[llvm_i32_ty],
[llvm_i32_ty, // vdata(VGPR)
[llvm_anyint_ty],
[LLVMMatchType<0>, // vdata(VGPR)
llvm_v4i32_ty, // rsrc(SGPR)
llvm_i32_ty, // vindex(VGPR)
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
Expand All @@ -929,9 +929,9 @@ def int_amdgcn_struct_buffer_atomic_and : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
[llvm_i32_ty],
[llvm_i32_ty, // src(VGPR)
llvm_i32_ty, // cmp(VGPR)
[llvm_anyint_ty],
[LLVMMatchType<0>, // src(VGPR)
LLVMMatchType<0>, // cmp(VGPR)
llvm_v4i32_ty, // rsrc(SGPR)
llvm_i32_ty, // vindex(VGPR)
llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling)
Expand Down
62 changes: 31 additions & 31 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll
Expand Up @@ -15,12 +15,12 @@
;CHECK: buffer_atomic_swap v0, off, s[0:3], 0{{$}}
define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %voffset) {
main_body:
%o1 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
%o3 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap(i32 %o1, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
%o1 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
%o3 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %o1, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
%off5 = add i32 %voffset, 42
%o5 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap(i32 %o3, <4 x i32> %rsrc, i32 %off5, i32 0, i32 0)
%o6 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap(i32 %o5, <4 x i32> %rsrc, i32 4, i32 8188, i32 0)
%unused = call i32 @llvm.amdgcn.raw.buffer.atomic.swap(i32 %o6, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
%o5 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %o3, <4 x i32> %rsrc, i32 %off5, i32 0, i32 0)
%o6 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %o5, <4 x i32> %rsrc, i32 4, i32 8188, i32 0)
%unused = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 %o6, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
%out = bitcast i32 %o6 to float
ret float %out
}
Expand All @@ -46,15 +46,15 @@ main_body:
;CHECK: buffer_atomic_xor v0, v1, s[0:3], 0 offen glc
define amdgpu_ps float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %voffset) {
main_body:
%t1 = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %data, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
%t2 = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %t1, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2)
%t3 = call i32 @llvm.amdgcn.raw.buffer.atomic.smin(i32 %t2, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
%t4 = call i32 @llvm.amdgcn.raw.buffer.atomic.umin(i32 %t3, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2)
%t5 = call i32 @llvm.amdgcn.raw.buffer.atomic.smax(i32 %t4, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
%t6 = call i32 @llvm.amdgcn.raw.buffer.atomic.umax(i32 %t5, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2)
%t7 = call i32 @llvm.amdgcn.raw.buffer.atomic.and(i32 %t6, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
%t8 = call i32 @llvm.amdgcn.raw.buffer.atomic.or(i32 %t7, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2)
%t9 = call i32 @llvm.amdgcn.raw.buffer.atomic.xor(i32 %t8, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
%t1 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %data, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
%t2 = call i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32 %t1, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2)
%t3 = call i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32 %t2, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
%t4 = call i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32 %t3, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2)
%t5 = call i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32 %t4, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
%t6 = call i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32 %t5, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2)
%t7 = call i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32 %t6, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
%t8 = call i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32 %t7, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2)
%t9 = call i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32 %t8, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
%out = bitcast i32 %t9 to float
ret float %out
}
Expand All @@ -75,18 +75,18 @@ main_body:
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:4 glc
define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) {
main_body:
%o1 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
%o3 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
%o1 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
%o3 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0)
%ofs.5 = add i32 %voffset, 44
%o5 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %ofs.5, i32 0, i32 0)
%o6 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 4, i32 8188, i32 0)
%o5 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %ofs.5, i32 0, i32 0)
%o6 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 4, i32 8188, i32 0)

; Detecting the no-return variant doesn't work right now because of how the
; intrinsic is replaced by an instruction that feeds into an EXTRACT_SUBREG.
; Since there probably isn't a reasonable use-case of cmpswap that discards
; the return value, that seems okay.
;
; %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap(i32 %o6, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
; %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %o6, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
%out = bitcast i32 %o6 to float
ret float %out
}
Expand All @@ -95,21 +95,21 @@ main_body:
;CHECK: buffer_atomic_add v0,
define amdgpu_ps float @test4() {
main_body:
%v = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> undef, i32 4, i32 0, i32 0)
%v = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 1, <4 x i32> undef, i32 4, i32 0, i32 0)
%v.float = bitcast i32 %v to float
ret float %v.float
}

declare i32 @llvm.amdgcn.raw.buffer.atomic.swap(i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.smin(i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.umin(i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.smax(i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.umax(i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.and(i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.or(i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.xor(i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32, i32, <4 x i32>, i32, i32, i32) #0

attributes #0 = { nounwind }
70 changes: 35 additions & 35 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll
Expand Up @@ -19,14 +19,14 @@
;CHECK: buffer_atomic_swap v0, {{v[0-9]+}}, s[0:3], 0 idxen{{$}}
define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) {
main_body:
%o1 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%o2 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %o1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
%o3 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %o2, <4 x i32> %rsrc, i32 0, i32 %voffset, i32 0, i32 0)
%o4 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %o3, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 0, i32 0)
%o1 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%o2 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 %o1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
%o3 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 %o2, <4 x i32> %rsrc, i32 0, i32 %voffset, i32 0, i32 0)
%o4 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 %o3, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 0, i32 0)
%ofs.5 = add i32 %voffset, 42
%o5 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %o4, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i32 0, i32 0)
%o6 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %o5, <4 x i32> %rsrc, i32 0, i32 4, i32 8188, i32 0)
%unused = call i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32 %o6, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%o5 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 %o4, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i32 0, i32 0)
%o6 = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 %o5, <4 x i32> %rsrc, i32 0, i32 4, i32 8188, i32 0)
%unused = call i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32 %o6, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%out = bitcast i32 %o6 to float
ret float %out
}
Expand All @@ -52,15 +52,15 @@ main_body:
;CHECK: buffer_atomic_xor v0, v1, s[0:3], 0 idxen glc
define amdgpu_ps float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) {
main_body:
%t1 = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
%t2 = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %t1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2)
%t3 = call i32 @llvm.amdgcn.struct.buffer.atomic.smin(i32 %t2, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
%t4 = call i32 @llvm.amdgcn.struct.buffer.atomic.umin(i32 %t3, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2)
%t5 = call i32 @llvm.amdgcn.struct.buffer.atomic.smax(i32 %t4, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
%t6 = call i32 @llvm.amdgcn.struct.buffer.atomic.umax(i32 %t5, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2)
%t7 = call i32 @llvm.amdgcn.struct.buffer.atomic.and(i32 %t6, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
%t8 = call i32 @llvm.amdgcn.struct.buffer.atomic.or(i32 %t7, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2)
%t9 = call i32 @llvm.amdgcn.struct.buffer.atomic.xor(i32 %t8, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
%t1 = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
%t2 = call i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32 %t1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2)
%t3 = call i32 @llvm.amdgcn.struct.buffer.atomic.smin.i32(i32 %t2, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
%t4 = call i32 @llvm.amdgcn.struct.buffer.atomic.umin.i32(i32 %t3, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2)
%t5 = call i32 @llvm.amdgcn.struct.buffer.atomic.smax.i32(i32 %t4, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
%t6 = call i32 @llvm.amdgcn.struct.buffer.atomic.umax.i32(i32 %t5, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2)
%t7 = call i32 @llvm.amdgcn.struct.buffer.atomic.and.i32(i32 %t6, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
%t8 = call i32 @llvm.amdgcn.struct.buffer.atomic.or.i32(i32 %t7, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2)
%t9 = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 %t8, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
%out = bitcast i32 %t9 to float
ret float %out
}
Expand All @@ -85,20 +85,20 @@ main_body:
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], [[SOFS]] idxen offset:4 glc
define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) {
main_body:
%o1 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%o2 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
%o3 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %o2, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %voffset, i32 0, i32 0)
%o4 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 0, i32 0)
%o1 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%o2 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
%o3 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %o2, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %voffset, i32 0, i32 0)
%o4 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 0, i32 0)
%offs.5 = add i32 %voffset, 44
%o5 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %o4, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %offs.5, i32 0, i32 0)
%o6 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 4, i32 8188, i32 0)
%o5 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %o4, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %offs.5, i32 0, i32 0)
%o6 = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 4, i32 8188, i32 0)

; Detecting the no-return variant doesn't work right now because of how the
; intrinsic is replaced by an instruction that feeds into an EXTRACT_SUBREG.
; Since there probably isn't a reasonable use-case of cmpswap that discards
; the return value, that seems okay.
;
; %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32 %o6, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
; %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32 %o6, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
%out = bitcast i32 %o6 to float
ret float %out
}
Expand All @@ -107,21 +107,21 @@ main_body:
;CHECK: buffer_atomic_add v0,
define amdgpu_ps float @test4() {
main_body:
%v = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> undef, i32 0, i32 4, i32 0, i32 0)
%v = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 1, <4 x i32> undef, i32 0, i32 4, i32 0, i32 0)
%v.float = bitcast i32 %v to float
ret float %v.float
}

declare i32 @llvm.amdgcn.struct.buffer.atomic.swap(i32, <4 x i32>, i32, i32, i32, i32) #0
declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32) #0
declare i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32, i32) #0
declare i32 @llvm.amdgcn.struct.buffer.atomic.smin(i32, <4 x i32>, i32, i32, i32, i32) #0
declare i32 @llvm.amdgcn.struct.buffer.atomic.umin(i32, <4 x i32>, i32, i32, i32, i32) #0
declare i32 @llvm.amdgcn.struct.buffer.atomic.smax(i32, <4 x i32>, i32, i32, i32, i32) #0
declare i32 @llvm.amdgcn.struct.buffer.atomic.umax(i32, <4 x i32>, i32, i32, i32, i32) #0
declare i32 @llvm.amdgcn.struct.buffer.atomic.and(i32, <4 x i32>, i32, i32, i32, i32) #0
declare i32 @llvm.amdgcn.struct.buffer.atomic.or(i32, <4 x i32>, i32, i32, i32, i32) #0
declare i32 @llvm.amdgcn.struct.buffer.atomic.xor(i32, <4 x i32>, i32, i32, i32, i32) #0
declare i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i32, i32) #0
declare i32 @llvm.amdgcn.struct.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i32, i32) #0
declare i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32, i32) #0
declare i32 @llvm.amdgcn.struct.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i32, i32) #0
declare i32 @llvm.amdgcn.struct.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i32, i32) #0
declare i32 @llvm.amdgcn.struct.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i32, i32) #0
declare i32 @llvm.amdgcn.struct.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i32, i32) #0
declare i32 @llvm.amdgcn.struct.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i32, i32) #0
declare i32 @llvm.amdgcn.struct.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i32, i32) #0
declare i32 @llvm.amdgcn.struct.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i32, i32) #0
declare i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i32, i32) #0
declare i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32, i32, <4 x i32>, i32, i32, i32, i32) #0

attributes #0 = { nounwind }

0 comments on commit ea36cd5

Please sign in to comment.