[AMDGPU] Expose RTZ version of f16 interpolation #86614

ruiling · 2024-03-26T01:53:35Z

No description provided.

llvmbot · 2024-03-26T01:54:05Z

@llvm/pr-subscribers-llvm-ir
@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-amdgpu

Author: Ruiling, Song (ruiling)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/86614.diff

5 Files Affected:

(modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+18)
(modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+5-1)
(modified) llvm/lib/Target/AMDGPU/VINTERPInstructions.td (+6)
(modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll (+30)
(modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll (+30)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index bda3b066b77636..38bcb680815e33 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2066,6 +2066,24 @@ def int_amdgcn_interp_inreg_p2_f16 :
             [IntrNoMem, IntrSpeculatable,
              ImmArg<ArgIndex<3>>]>;
 
+// llvm.amdgcn.interp.inreg.p10.rtz.f16 <p>, <i>, <p0>, <high>
+// fp16 interpolation intrinsic, but round-toward-zero.
+// high selects whether high or low 16-bits are used for p and p0 operands
+def int_amdgcn_interp_inreg_p10_rtz_f16:
+  DefaultAttrsIntrinsic<[llvm_float_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
+            [IntrNoMem, IntrSpeculatable,
+             ImmArg<ArgIndex<3>>]>;
+
+// llvm.amdgcn.interp.inreg.p2.rtz.f16 <p>, <j>, <tmp>, <high>
+// fp16 interpolation intrinsic, but round-toward-zero.
+// high selects whether high or low 16-bits are used for p operand
+def int_amdgcn_interp_inreg_p2_rtz_f16 :
+  DefaultAttrsIntrinsic<[llvm_half_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
+            [IntrNoMem, IntrSpeculatable,
+             ImmArg<ArgIndex<3>>]>;
+
 // Deprecated: use llvm.amdgcn.live.mask instead.
 def int_amdgcn_ps_live : DefaultAttrsIntrinsic <
   [llvm_i1_ty],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index a42e95f140ce99..92f1c463380781 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3135,6 +3135,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     case Intrinsic::amdgcn_interp_inreg_p2:
     case Intrinsic::amdgcn_interp_inreg_p10_f16:
     case Intrinsic::amdgcn_interp_inreg_p2_f16:
+    case Intrinsic::amdgcn_interp_inreg_p10_rtz_f16:
+    case Intrinsic::amdgcn_interp_inreg_p2_rtz_f16:
       applyDefaultMapping(OpdMapper);
       return;
     case Intrinsic::amdgcn_permlane16:
@@ -4778,7 +4780,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     case Intrinsic::amdgcn_interp_inreg_p10:
     case Intrinsic::amdgcn_interp_inreg_p2:
     case Intrinsic::amdgcn_interp_inreg_p10_f16:
-    case Intrinsic::amdgcn_interp_inreg_p2_f16: {
+    case Intrinsic::amdgcn_interp_inreg_p2_f16:
+    case Intrinsic::amdgcn_interp_inreg_p10_rtz_f16:
+    case Intrinsic::amdgcn_interp_inreg_p2_rtz_f16: {
       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
diff --git a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
index 77063e2b70f66c..0109ebf61305dd 100644
--- a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
@@ -173,6 +173,12 @@ defm : VInterpF16Pat<int_amdgcn_interp_inreg_p10_f16,
 defm : VInterpF16Pat<int_amdgcn_interp_inreg_p2_f16,
                      V_INTERP_P2_F16_F32_inreg, f16,
                      [VINTERPModsHi, VINTERPMods, VINTERPMods]>;
+defm : VInterpF16Pat<int_amdgcn_interp_inreg_p10_rtz_f16,
+                     V_INTERP_P10_RTZ_F16_F32_inreg, f32,
+                     [VINTERPModsHi, VINTERPMods, VINTERPModsHi]>;
+defm : VInterpF16Pat<int_amdgcn_interp_inreg_p2_rtz_f16,
+                     V_INTERP_P2_RTZ_F16_F32_inreg, f16,
+                     [VINTERPModsHi, VINTERPMods, VINTERPMods]>;
 
 //===----------------------------------------------------------------------===//
 // VINTERP Real Instructions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
index 623360f6b1d9c5..f3c49b3fdaa34a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
@@ -147,6 +147,34 @@ main_body:
   ret half %res
 }
 
+define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+; GCN-LABEL: v_interp_rtz_f16:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 s3, exec_lo
+; GCN-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    lds_param_load v1, attr0.x wait_vdst:15
+; GCN-NEXT:    s_mov_b32 exec_lo, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GCN-NEXT:    v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
+; GCN-NEXT:    v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
+; GCN-NEXT:    v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
+; GCN-NEXT:    v_add_f16_e32 v0, v3, v0
+; GCN-NEXT:    ; return to shader part epilog
+main_body:
+  %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
+  %l_p0 = call float @llvm.amdgcn.interp.inreg.p10.rtz.f16(float %p0, float %i, float %p0, i1 0)
+  %l_p1 = call half @llvm.amdgcn.interp.inreg.p2.rtz.f16(float %p0, float %j, float %l_p0, i1 0)
+  %h_p0 = call float @llvm.amdgcn.interp.inreg.p10.rtz.f16(float %p0, float %i, float %p0, i1 1)
+  %h_p1 = call half @llvm.amdgcn.interp.inreg.p2.rtz.f16(float %p0, float %j, float %h_p0, i1 1)
+  %res = fadd half %l_p1, %h_p1
+  ret half %res
+}
+
 define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 {
 ; GCN-LABEL: v_interp_f16_imm_params:
 ; GCN:       ; %bb.0: ; %main_body
@@ -172,6 +200,8 @@ declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #0
 declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #0
 declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1) #0
 declare half @llvm.amdgcn.interp.inreg.p2.f16(float, float, float, i1) #0
+declare float @llvm.amdgcn.interp.inreg.p10.rtz.f16(float, float, float, i1) #0
+declare half @llvm.amdgcn.interp.inreg.p2.rtz.f16(float, float, float, i1) #0
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 declare void @llvm.amdgcn.exp.f16(i32, i32, float, float, float, float, i1, i1) #0
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
index 429528e9091d13..ae4c80e11a1c85 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
@@ -147,6 +147,34 @@ main_body:
   ret half %res
 }
 
+define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+; GCN-LABEL: v_interp_rtz_f16:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_mov_b32 s3, exec_lo
+; GCN-NEXT:    s_wqm_b32 exec_lo, exec_lo
+; GCN-NEXT:    s_mov_b32 m0, s2
+; GCN-NEXT:    lds_param_load v1, attr0.x wait_vdst:15
+; GCN-NEXT:    s_mov_b32 exec_lo, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v2, s1
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GCN-NEXT:    v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
+; GCN-NEXT:    v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
+; GCN-NEXT:    v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
+; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GCN-NEXT:    v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
+; GCN-NEXT:    v_add_f16_e32 v0, v3, v0
+; GCN-NEXT:    ; return to shader part epilog
+main_body:
+  %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
+  %l_p0 = call float @llvm.amdgcn.interp.inreg.p10.rtz.f16(float %p0, float %i, float %p0, i1 0)
+  %l_p1 = call half @llvm.amdgcn.interp.inreg.p2.rtz.f16(float %p0, float %j, float %l_p0, i1 0)
+  %h_p0 = call float @llvm.amdgcn.interp.inreg.p10.rtz.f16(float %p0, float %i, float %p0, i1 1)
+  %h_p1 = call half @llvm.amdgcn.interp.inreg.p2.rtz.f16(float %p0, float %j, float %h_p0, i1 1)
+  %res = fadd half %l_p1, %h_p1
+  ret half %res
+}
+
 define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 {
 ; GCN-LABEL: v_interp_f16_imm_params:
 ; GCN:       ; %bb.0: ; %main_body
@@ -172,6 +200,8 @@ declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #0
 declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #0
 declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1) #0
 declare half @llvm.amdgcn.interp.inreg.p2.f16(float, float, float, i1) #0
+declare float @llvm.amdgcn.interp.inreg.p10.rtz.f16(float, float, float, i1) #0
+declare half @llvm.amdgcn.interp.inreg.p2.rtz.f16(float, float, float, i1) #0
 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 declare void @llvm.amdgcn.exp.f16(i32, i32, float, float, float, float, i1, i1) #0

llvm/lib/Target/AMDGPU/VINTERPInstructions.td

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

ruiling requested review from jayfoad and nhaehnle March 26, 2024 01:53

llvmbot added backend:AMDGPU llvm:globalisel llvm:ir labels Mar 26, 2024

ruiling mentioned this pull request Mar 26, 2024

[AMDGPU] Use RTZ for newer fp16 interp instructions #86235

Closed

arsenm reviewed Mar 26, 2024

View reviewed changes

llvm/lib/Target/AMDGPU/VINTERPInstructions.td Show resolved Hide resolved

arsenm reviewed Mar 26, 2024

View reviewed changes

llvm/include/llvm/IR/IntrinsicsAMDGPU.td Outdated Show resolved Hide resolved

ruiling force-pushed the expose-fp16-interp-rtz branch from f2a7275 to f2ff04e Compare March 27, 2024 00:40

arsenm requested a review from piotrAMD March 28, 2024 11:10

[AMDGPU] Expose RTZ version of f16 interpolation for gfx11+

6c267c7

ruiling force-pushed the expose-fp16-interp-rtz branch from f2ff04e to 6c267c7 Compare March 29, 2024 01:17

arsenm approved these changes Mar 29, 2024

View reviewed changes

ruiling merged commit 216b5e9 into llvm:main Apr 1, 2024
4 checks passed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[AMDGPU] Expose RTZ version of f16 interpolation #86614

[AMDGPU] Expose RTZ version of f16 interpolation #86614

ruiling commented Mar 26, 2024

llvmbot commented Mar 26, 2024 •

edited

[AMDGPU] Expose RTZ version of f16 interpolation #86614

[AMDGPU] Expose RTZ version of f16 interpolation #86614

Conversation

ruiling commented Mar 26, 2024

llvmbot commented Mar 26, 2024 • edited

llvmbot commented Mar 26, 2024 •

edited