Skip to content

Commit

Permalink
[AMDGPU] Extend f32 support for llvm.amdgcn.update.dpp intrinsic
Browse files Browse the repository at this point in the history
This avoids the bitcast noise that would otherwise be
required to extend the atomic optimizer's DPP support
to floating-point operations in D156301.

Reviewed By: arsenm, #amdgpu

Differential Revision: https://reviews.llvm.org/D156647
  • Loading branch information
Pravin Jagtap authored and Pravin Jagtap committed Aug 17, 2023
1 parent 81827f8 commit af5fd14
Show file tree
Hide file tree
Showing 3 changed files with 130 additions and 8 deletions.
2 changes: 1 addition & 1 deletion llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2177,7 +2177,7 @@ def int_amdgcn_mov_dpp :
// v_mov_b32 <dest> <old>
// v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
def int_amdgcn_update_dpp :
Intrinsic<[llvm_anyint_ty],
Intrinsic<[llvm_any_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty,
llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn,
Expand Down
7 changes: 5 additions & 2 deletions llvm/lib/Target/AMDGPU/VOP1Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1203,15 +1203,18 @@ def : GCNPat <
(as_i1timm $bound_ctrl))
>;

def : GCNPat <
(i32 (int_amdgcn_update_dpp i32:$old, i32:$src, timm:$dpp_ctrl,
// Selection pattern for llvm.amdgcn.update.dpp, parameterized on the value
// type `vt` shared by the result, $old and $src. The intrinsic is selected to
// V_MOV_B32_dpp with $old and $src as VGPR_32 operands; dpp_ctrl, row_mask and
// bank_mask are forwarded through as_i32timm and bound_ctrl through as_i1timm.
class UpdateDPPPat<ValueType vt> : GCNPat <
(vt (int_amdgcn_update_dpp vt:$old, vt:$src, timm:$dpp_ctrl,
timm:$row_mask, timm:$bank_mask,
timm:$bound_ctrl)),
(V_MOV_B32_dpp VGPR_32:$old, VGPR_32:$src, (as_i32timm $dpp_ctrl),
(as_i32timm $row_mask), (as_i32timm $bank_mask),
(as_i1timm $bound_ctrl))
>;

def : UpdateDPPPat<i32>;
def : UpdateDPPPat<f32>;

} // End OtherPredicates = [isGFX8Plus]

let OtherPredicates = [isGFX8Plus] in {
Expand Down
129 changes: 124 additions & 5 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
; GFX8-NOOPT: s_nop 1
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 0) #0
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) #0
store i32 %tmp0, ptr addrspace(1) %out
ret void
}
Expand All @@ -24,7 +24,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
; GFX8-NOOPT: s_nop 1
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_bc(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 2, i32 1, i32 1, i1 1) #0
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 2, i32 1, i32 1, i1 true) #0
store i32 %tmp0, ptr addrspace(1) %out
ret void
}
Expand Down Expand Up @@ -63,7 +63,7 @@ define amdgpu_kernel void @update_dpp64_test(ptr addrspace(1) %arg, i64 %in1, i6
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
%load = load i64, ptr addrspace(1) %gep
%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 1, i32 1, i32 1, i1 0) #0
%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 1, i32 1, i32 1, i1 false) #0
store i64 %tmp0, ptr addrspace(1) %gep
ret void
}
Expand All @@ -83,7 +83,7 @@ define amdgpu_kernel void @update_dpp64_imm_old_test(ptr addrspace(1) %arg, i64
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
%load = load i64, ptr addrspace(1) %gep
%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 123451234512345, i64 %load, i32 1, i32 1, i32 1, i1 0) #0
%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 123451234512345, i64 %load, i32 1, i32 1, i32 1, i1 false) #0
store i64 %tmp0, ptr addrspace(1) %gep
ret void
}
Expand All @@ -98,14 +98,133 @@ define amdgpu_kernel void @update_dpp64_imm_old_test(ptr addrspace(1) %arg, i64
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
define amdgpu_kernel void @update_dpp64_imm_src_test(ptr addrspace(1) %out, i64 %in1) {
%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 123451234512345, i32 1, i32 1, i32 1, i1 0) #0
%tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 123451234512345, i32 1, i32 1, i32 1, i1 false) #0
store i64 %tmp0, ptr addrspace(1) %out
ret void
}

; Baseline float variant of the update.dpp test. Both float kernel arguments
; are expected to be copied from SGPRs into VGPRs and then fed to
; v_mov_b32_dpp. Immediates used here are dpp_ctrl=1, row_mask=1, bank_mask=1
; and bound_ctrl=false, so the expected instruction carries
; quad_perm:[1,0,0,0] and no bound_ctrl modifier.
; GCN-LABEL: {{^}}dpp_test_f32:
; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
define amdgpu_kernel void @dpp_test_f32(ptr addrspace(1) %out, float %in1, float %in2) {
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 1, i32 1, i32 1, i1 false)
store float %tmp0, ptr addrspace(1) %out
ret void
}

; Float update.dpp with every immediate set to zero (dpp_ctrl=0, row_mask=0,
; bank_mask=0, bound_ctrl=false). The expected instruction uses
; quad_perm:[0,0,0,0] with both masks 0x0 and no bound_ctrl modifier.
; GCN-LABEL: {{^}}dpp_test_f32_imm_comb1:
; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0{{$}}
define amdgpu_kernel void @dpp_test_f32_imm_comb1(ptr addrspace(1) %out, float %in1, float %in2) {
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 0, i32 0, i32 0, i1 false)
store float %tmp0, ptr addrspace(1) %out
ret void
}

; Float update.dpp with dpp_ctrl=3, row_mask=3, bank_mask=3 and
; bound_ctrl=false. The expected instruction uses quad_perm:[3,0,0,0] with
; both masks 0x3 and no bound_ctrl modifier.
; GCN-LABEL: {{^}}dpp_test_f32_imm_comb2:
; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3{{$}}
define amdgpu_kernel void @dpp_test_f32_imm_comb2(ptr addrspace(1) %out, float %in1, float %in2) {
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 3, i32 3, i32 3, i1 false)
store float %tmp0, ptr addrspace(1) %out
ret void
}

; Float update.dpp with three distinct immediates and bound_ctrl enabled
; (dpp_ctrl=1, row_mask=2, bank_mask=3, bound_ctrl=true). The expected
; instruction uses quad_perm:[1,0,0,0], row_mask 0x2, bank_mask 0x3 and
; carries the bound_ctrl modifier.
; GCN-LABEL: {{^}}dpp_test_f32_imm_comb3:
; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_f32_imm_comb3(ptr addrspace(1) %out, float %in1, float %in2) {
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 1, i32 2, i32 3, i1 true)
store float %tmp0, ptr addrspace(1) %out
ret void
}

; Float update.dpp with dpp_ctrl=4, row_mask=3, bank_mask=2 and
; bound_ctrl=true. A dpp_ctrl of 4 is expected to print as
; quad_perm:[0,1,0,0], i.e. the second lane selector is the one that is set.
; GCN-LABEL: {{^}}dpp_test_f32_imm_comb4:
; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_f32_imm_comb4(ptr addrspace(1) %out, float %in1, float %in2) {
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 4, i32 3, i32 2, i1 true)
store float %tmp0, ptr addrspace(1) %out
ret void
}

; Float update.dpp with mask immediates wider than one hex digit
; (dpp_ctrl=63, row_mask=62, bank_mask=61, bound_ctrl=true). The expected
; output shows quad_perm:[3,3,3,0] and only the low four bits of each mask
; (62 -> 0xe, 61 -> 0xd).
; GCN-LABEL: {{^}}dpp_test_f32_imm_comb5:
; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_f32_imm_comb5(ptr addrspace(1) %out, float %in1, float %in2) {
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 62, i32 61, i1 true)
store float %tmp0, ptr addrspace(1) %out
ret void
}

; Float update.dpp with all three integer immediates set to 63 and
; bound_ctrl=true. The expected output shows quad_perm:[3,3,3,0] and both
; masks as 0xf (the low four bits of 63).
; GCN-LABEL: {{^}}dpp_test_f32_imm_comb6:
; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_f32_imm_comb6(ptr addrspace(1) %out, float %in1, float %in2) {
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 63, i32 63, i1 true)
store float %tmp0, ptr addrspace(1) %out
ret void
}


; Float update.dpp with all three integer immediates set to 64 and
; bound_ctrl=true. The expected output shows quad_perm:[0,0,0,1] (only the
; last lane selector set) and both masks as 0x0, since 64 has no bits set in
; its low four bits.
; GCN-LABEL: {{^}}dpp_test_f32_imm_comb7:
; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_f32_imm_comb7(ptr addrspace(1) %out, float %in1, float %in2) {
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 64, i32 64, i32 64, i1 true)
store float %tmp0, ptr addrspace(1) %out
ret void
}

; Float update.dpp with dpp_ctrl=31, row_mask=63, bank_mask=128 and
; bound_ctrl=true. The expected output shows quad_perm:[3,3,1,0], row_mask
; 0xf (low four bits of 63) and bank_mask 0x0 (low four bits of 128).
; GCN-LABEL: {{^}}dpp_test_f32_imm_comb8:
; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
; GFX8-OPT: s_mov
; GFX8-OPT: s_mov
; GFX8-NOOPT: s_nop 1
; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp_test_f32_imm_comb8(ptr addrspace(1) %out, float %in1, float %in2) {
%tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 31, i32 63, i32 128, i1 true)
store float %tmp0, ptr addrspace(1) %out
ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()
declare void @llvm.amdgcn.s.barrier()
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0
declare float @llvm.amdgcn.update.dpp.f32(float, float, i32, i32, i32, i1) #0
declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32, i32, i32, i1) #0

attributes #0 = { nounwind readnone convergent }

0 comments on commit af5fd14

Please sign in to comment.