From 7761690487314a26fb9838bfdfc69448373ce918 Mon Sep 17 00:00:00 2001 From: Ruiling Song Date: Fri, 22 Mar 2024 10:59:12 +0800 Subject: [PATCH] [AMDGPU] Use RTZ for newer fp16 interp instructions They need to be run under RTZ like their predecessors to work correctly. --- llvm/lib/Target/AMDGPU/SIModeRegister.cpp | 2 ++ llvm/lib/Target/AMDGPU/VINTERPInstructions.td | 2 ++ .../CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll | 6 +++++- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll | 6 +++++- 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp index c01b1266a5530..b7e27e4ea3738 100644 --- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp +++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp @@ -170,6 +170,8 @@ Status SIModeRegister::getInstructionMode(MachineInstr &MI, case AMDGPU::V_INTERP_P1LL_F16: case AMDGPU::V_INTERP_P1LV_F16: case AMDGPU::V_INTERP_P2_F16: + case AMDGPU::V_INTERP_P10_F16_F32_inreg: + case AMDGPU::V_INTERP_P2_F16_F32_inreg: // f16 interpolation instructions need double precision round to zero return Status(FP_ROUND_MODE_DP(3), FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_ZERO)); diff --git a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td index 77063e2b70f66..e04d686257ad9 100644 --- a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td @@ -110,10 +110,12 @@ let SubtargetPredicate = HasVINTERPEncoding in { let Uses = [M0, EXEC, MODE] in { def V_INTERP_P10_F32_inreg : VINTERP_Pseudo <"v_interp_p10_f32", VOP3_VINTERP_F32>; def V_INTERP_P2_F32_inreg : VINTERP_Pseudo <"v_interp_p2_f32", VOP3_VINTERP_F32>; +let FPDPRounding = 1 in { def V_INTERP_P10_F16_F32_inreg : VINTERP_Pseudo <"v_interp_p10_f16_f32", VOP3_VINTERP_F16<[f32, f32, f32, f32]>>; def V_INTERP_P2_F16_F32_inreg : VINTERP_Pseudo <"v_interp_p2_f16_f32", VOP3_VINTERP_F16<[f16, f32, f32, f32]>>; +} // End FPDPRounding = 1 } // Uses = [M0, EXEC, MODE] let Uses = [M0, EXEC] in { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll index 623360f6b1d9c..b0603c622b80e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll @@ -129,12 +129,14 @@ define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m ; GCN-NEXT: s_mov_b32 exec_lo, s3 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0 ; GCN-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 ; GCN-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GCN-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GCN-NEXT: v_add_f16_e32 v0, v3, v0 ; GCN-NEXT: ; return to shader part epilog main_body: @@ -152,9 +154,11 @@ define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) # ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GCN-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7 ; GCN-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7 +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GCN-NEXT: v_add_f16_e32 v0, v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll index 429528e9091d1..cf99b83e961b7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll @@ -129,12 +129,14 @@ define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m ; GCN-NEXT: s_mov_b32 exec_lo, s3 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0 ; GCN-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 ; GCN-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GCN-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GCN-NEXT: v_add_f16_e32 v0, v3, v0 ; GCN-NEXT: ; return to shader part epilog main_body: @@ -152,9 +154,11 @@ define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) # ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 3 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GCN-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7 ; GCN-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7 +; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GCN-NEXT: v_add_f16_e32 v0, v1, v0