AMDGPU: select llvm.amdgcn.update.dpp for f32 as well as i32.

Summary of the hunks below:
 * IntrinsicsAMDGPU.td: the overloaded result/operand type of
   int_amdgcn_update_dpp changes from llvm_anyint_ty to llvm_any_ty.
 * VOP1Instructions.td: the anonymous i32-only selection pattern becomes the
   class UpdateDPPPat, parameterized by value type `vt`, and is instantiated
   for i32 and f32. Both instantiations select V_MOV_B32_dpp.
 * llvm.amdgcn.update.dpp.ll: existing calls spell the i1 bound_ctrl operand
   as false/true instead of 0/1, and f32 tests are added. The added tests use
   dpp_ctrl/row_mask/bank_mask immediates up to 128; the expected assembly
   shows row_mask/bank_mask printed as the low 4 bits of the immediate.

NOTE(review): this patch was recovered from a whitespace-collapsed copy. The
indentation of the TableGen context lines is reconstructed and should be
confirmed against the target tree before applying.

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index a9c03fa483e7c..e9b13c3adcbaa 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2177,7 +2177,7 @@ def int_amdgcn_mov_dpp :
 // v_mov_b32
 // v_mov_b32
 def int_amdgcn_update_dpp :
-  Intrinsic<[llvm_anyint_ty],
+  Intrinsic<[llvm_any_ty],
             [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty,
             llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
             [IntrNoMem, IntrConvergent, IntrWillReturn,
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 6275daee14426..cf47b2e1cd2cf 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1203,8 +1203,8 @@ def : GCNPat <
                  (as_i1timm $bound_ctrl))
 >;
 
-def : GCNPat <
-  (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, timm:$dpp_ctrl,
+class UpdateDPPPat<ValueType vt> : GCNPat <
+  (vt (int_amdgcn_update_dpp vt:$old, vt:$src, timm:$dpp_ctrl,
                       timm:$row_mask, timm:$bank_mask,
                       timm:$bound_ctrl)),
   (V_MOV_B32_dpp VGPR_32:$old, VGPR_32:$src, (as_i32timm $dpp_ctrl),
@@ -1212,6 +1212,9 @@ def : GCNPat <
                  (as_i1timm $bound_ctrl))
 >;
 
+def : UpdateDPPPat<i32>;
+def : UpdateDPPPat<f32>;
+
 } // End OtherPredicates = [isGFX8Plus]
 
 let OtherPredicates = [isGFX8Plus] in {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
index 8472271a89e89..e1dd299629472 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -11,7 +11,7 @@
 ; GFX8-NOOPT: s_nop 1
 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
 define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
-  %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 0) #0
+  %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) #0
   store i32 %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -24,7 +24,7 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
 ; GFX8-NOOPT: s_nop 1
 ; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[2,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1{{$}}
 define amdgpu_kernel void @dpp_test_bc(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
-  %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 2, i32 1, i32 1, i1 1) #0
+  %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 2, i32 1, i32 1, i1 true) #0
   store i32 %tmp0, ptr addrspace(1) %out
   ret void
 }
@@ -63,7 +63,7 @@ define amdgpu_kernel void @update_dpp64_test(ptr addrspace(1) %arg, i64 %in1, i6
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
   %load = load i64, ptr addrspace(1) %gep
-  %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 1, i32 1, i32 1, i1 0) #0
+  %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 %load, i32 1, i32 1, i32 1, i1 false) #0
   store i64 %tmp0, ptr addrspace(1) %gep
   ret void
 }
@@ -83,7 +83,7 @@ define amdgpu_kernel void @update_dpp64_imm_old_test(ptr addrspace(1) %arg, i64
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
   %load = load i64, ptr addrspace(1) %gep
-  %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 123451234512345, i64 %load, i32 1, i32 1, i32 1, i1 0) #0
+  %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 123451234512345, i64 %load, i32 1, i32 1, i32 1, i1 false) #0
   store i64 %tmp0, ptr addrspace(1) %gep
   ret void
 }
@@ -98,14 +98,133 @@ define amdgpu_kernel void @update_dpp64_imm_old_test(ptr addrspace(1) %arg, i64
 ; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_LO]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
 ; GCN-NOOPT-DAG: v_mov_b32_dpp v{{[0-9]+}}, v[[SRC_HI]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
 define amdgpu_kernel void @update_dpp64_imm_src_test(ptr addrspace(1) %out, i64 %in1) {
-  %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 123451234512345, i32 1, i32 1, i32 1, i1 0) #0
+  %tmp0 = call i64 @llvm.amdgcn.update.dpp.i64(i64 %in1, i64 123451234512345, i32 1, i32 1, i32 1, i1 false) #0
   store i64 %tmp0, ptr addrspace(1) %out
   ret void
 }
 
+; GCN-LABEL: {{^}}dpp_test_f32:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1{{$}}
+define amdgpu_kernel void @dpp_test_f32(ptr addrspace(1) %out, float %in1, float %in2) {
+  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 1, i32 1, i32 1, i1 false)
+  store float %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_f32_imm_comb1:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,0] row_mask:0x0 bank_mask:0x0{{$}}
+define amdgpu_kernel void @dpp_test_f32_imm_comb1(ptr addrspace(1) %out, float %in1, float %in2) {
+  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 0, i32 0, i32 0, i1 false)
+  store float %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_f32_imm_comb2:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,0,0,0] row_mask:0x3 bank_mask:0x3{{$}}
+define amdgpu_kernel void @dpp_test_f32_imm_comb2(ptr addrspace(1) %out, float %in1, float %in2) {
+  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 3, i32 3, i32 3, i1 false)
+  store float %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_f32_imm_comb3:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[1,0,0,0] row_mask:0x2 bank_mask:0x3 bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_f32_imm_comb3(ptr addrspace(1) %out, float %in1, float %in2) {
+  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 1, i32 2, i32 3, i1 true)
+  store float %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_f32_imm_comb4:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,1,0,0] row_mask:0x3 bank_mask:0x2 bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_f32_imm_comb4(ptr addrspace(1) %out, float %in1, float %in2) {
+  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 4, i32 3, i32 2, i1 true)
+  store float %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_f32_imm_comb5:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xe bank_mask:0xd bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_f32_imm_comb5(ptr addrspace(1) %out, float %in1, float %in2) {
+  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 62, i32 61, i1 true)
+  store float %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_f32_imm_comb6:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,3,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_f32_imm_comb6(ptr addrspace(1) %out, float %in1, float %in2) {
+  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 63, i32 63, i32 63, i1 true)
+  store float %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}dpp_test_f32_imm_comb7:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[0,0,0,1] row_mask:0x0 bank_mask:0x0 bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_f32_imm_comb7(ptr addrspace(1) %out, float %in1, float %in2) {
+  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 64, i32 64, i32 64, i1 true)
+  store float %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}dpp_test_f32_imm_comb8:
+; GCN: v_mov_b32_e32 [[DST:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}}
+; GFX8-OPT: s_mov
+; GFX8-OPT: s_mov
+; GFX8-NOOPT: s_nop 1
+; GCN: v_mov_b32_dpp [[DST]], [[SRC]] quad_perm:[3,3,1,0] row_mask:0xf bank_mask:0x0 bound_ctrl:1{{$}}
+define amdgpu_kernel void @dpp_test_f32_imm_comb8(ptr addrspace(1) %out, float %in1, float %in2) {
+  %tmp0 = call float @llvm.amdgcn.update.dpp.f32(float %in1, float %in2, i32 31, i32 63, i32 128, i1 true)
+  store float %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x()
 declare void @llvm.amdgcn.s.barrier()
 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0
+declare float @llvm.amdgcn.update.dpp.f32(float, float, i32, i32, i32, i1) #0
 declare i64 @llvm.amdgcn.update.dpp.i64(i64, i64, i32, i32, i32, i1) #0
 
 attributes #0 = { nounwind readnone convergent }