[AMDGPU] Select V_CVT_PK_[IU]16_F32 for pairs of f32->i16 conversions on GFX11+

On GFX11+, LowerOperation returns f32->i16 FP_TO_SINT/FP_TO_UINT nodes
unchanged (i.e. treats them as legal) so that a build_vector of two such
conversions can be matched to V_CVT_PK_[IU]16_F32. VALU and SALU fallback
patterns cover a lone f32->i16 conversion.

NOTE(review): the template arguments in the TableGen lines below
(UniformUnaryFrag<...>, VOP3_Profile<...>, and the
VOP3_CVT_SR_FP16_TiedInput_Profile class header) had been stripped from this
copy of the patch and are reconstructed here; leading indentation is restored
by convention. Confirm the unchanged context lines against the target tree
before applying.

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 875278a3b4f97..3d9350e4cf435 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6841,6 +6841,11 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return splitTernaryVectorOp(Op, DAG);
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:
+    if (AMDGPU::isGFX11Plus(*Subtarget) && Op.getValueType() == MVT::i16 &&
+        Op.getOperand(0).getValueType() == MVT::f32) {
+      // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32.
+      return Op;
+    }
     return LowerFP_TO_INT(Op, DAG);
   case ISD::SHL:
   case ISD::SRA:
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 1931e0be15152..936eb545563ae 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -469,6 +469,14 @@ let SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE],
 } // End SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE]
   // SchedRW = [WriteSFPU], isReMaterializable = 1
 
+let SubtargetPredicate = HasSALUFloatInsts, AddedComplexity = 9 in {
+  // Fallback patterns for f32->i16 conversion.
+  def : GCNPat<(i16 (UniformUnaryFrag<fp_to_sint> f32:$src0)),
+               (S_CVT_I32_F32 $src0)>;
+  def : GCNPat<(i16 (UniformUnaryFrag<fp_to_uint> f32:$src0)),
+               (S_CVT_U32_F32 $src0)>;
+}
+
 let hasSideEffects = 1 in {
   let has_sdst = 0 in {
     let Uses = [M0] in {
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 872bde501cd2d..e1b22c6804544 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1721,6 +1721,28 @@ let SubtargetPredicate = isGFX11Plus in {
   defm V_MINMAX_I32 : VOP3Inst<"v_minmax_i32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
   defm V_CVT_PK_I16_F32 : VOP3Inst<"v_cvt_pk_i16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>;
   defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>;
+
+  def : GCNPat<(v2i16 (build_vector (i16 (fp_to_sint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+                                    (i16 (fp_to_sint (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)))))),
+               (V_CVT_PK_I16_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1)>;
+  def : GCNPat<(v2i16 (build_vector (i16 (fp_to_uint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+                                    (i16 (fp_to_uint (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)))))),
+               (V_CVT_PK_U16_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1)>;
+
+  // Fallback patterns for f32->i16 conversion. These are only required because
+  // f32->i16 has to be legal so that we can select V_CVT_PK_[IU]16_F32 above.
+  let True16Predicate = UseRealTrue16Insts in {
+    def : GCNPat<(i16 (fp_to_sint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+                 (EXTRACT_SUBREG (V_CVT_I32_F32_e64 $src0_modifiers, $src0), lo16)>;
+    def : GCNPat<(i16 (fp_to_uint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+                 (EXTRACT_SUBREG (V_CVT_U32_F32_e64 $src0_modifiers, $src0), lo16)>;
+  }
+  let True16Predicate = NotUseRealTrue16Insts in {
+    def : GCNPat<(i16 (fp_to_sint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+                 (V_CVT_I32_F32_e64 $src0_modifiers, $src0)>;
+    def : GCNPat<(i16 (fp_to_uint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+                 (V_CVT_U32_F32_e64 $src0_modifiers, $src0)>;
+  }
 } // End SubtargetPredicate = isGFX11Plus
 
 class VOP3_CVT_SR_FP16_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_F1632_FP8BF8_TiedInput_Profile<P> {
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 28d7e6916e519..c9b2c8c08b41c 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -35513,55 +35513,24 @@ define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) { ; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11TRUE16-LABEL: v_fptosi_v2bf16_to_v2i16: -; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11FAKE16-LABEL: v_fptosi_v2bf16_to_v2i16: -; GFX11FAKE16: ; %bb.0: -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 -; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX1250TRUE16-LABEL: v_fptosi_v2bf16_to_v2i16: -; GFX1250TRUE16: ; %bb.0: -; GFX1250TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX1250TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0 -; 
GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1250TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX1250TRUE16-NEXT: s_set_pc_i64 s[30:31] +; GFX11-LABEL: v_fptosi_v2bf16_to_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cvt_pk_i16_f32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250FAKE16-LABEL: v_fptosi_v2bf16_to_v2i16: -; GFX1250FAKE16: ; %bb.0: -; GFX1250FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; GFX1250FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 -; GFX1250FAKE16-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-LABEL: v_fptosi_v2bf16_to_v2i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_i16_f32 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi <2 x bfloat> %x to <2 x i16> ret <2 x i16> %op } @@ -35655,61 +35624,27 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) { ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11TRUE16-LABEL: v_fptosi_v3bf16_to_v3i16: -; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; 
GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11FAKE16-LABEL: v_fptosi_v3bf16_to_v3i16: -; GFX11FAKE16: ; %bb.0: -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 -; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX1250TRUE16-LABEL: v_fptosi_v3bf16_to_v3i16: -; GFX1250TRUE16: ; %bb.0: -; GFX1250TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX1250TRUE16-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX1250TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX1250TRUE16-NEXT: s_set_pc_i64 s[30:31] +; GFX11-LABEL: v_fptosi_v3bf16_to_v3i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v2, 
0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_pk_i16_f32 v0, v0, v2 +; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250FAKE16-LABEL: v_fptosi_v3bf16_to_v3i16: -; GFX1250FAKE16: ; %bb.0: -; GFX1250FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250FAKE16-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX1250FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX1250FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 -; GFX1250FAKE16-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-LABEL: v_fptosi_v3bf16_to_v3i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_i16_f32 v0, v0, v2 +; GFX1250-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi <3 x bfloat> %x to <3 x i16> ret <3 x i16> %op } @@ -35827,77 +35762,29 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) { ; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11TRUE16-LABEL: v_fptosi_v4bf16_to_v4i16: -; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 
0xffff0000, v1 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l -; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11FAKE16-LABEL: v_fptosi_v4bf16_to_v4i16: -; GFX11FAKE16: ; %bb.0: -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 -; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 -; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX1250TRUE16-LABEL: v_fptosi_v4bf16_to_v4i16: -; GFX1250TRUE16: ; %bb.0: -; GFX1250TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX1250TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX1250TRUE16-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: 
v_dual_lshlrev_b32 v0, 16, v0 -; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX1250TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l -; GFX1250TRUE16-NEXT: s_set_pc_i64 s[30:31] +; GFX11-LABEL: v_fptosi_v4bf16_to_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_pk_i16_f32 v0, v0, v3 +; GFX11-NEXT: v_cvt_pk_i16_f32 v1, v1, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GFX1250FAKE16-LABEL: v_fptosi_v4bf16_to_v4i16: -; GFX1250FAKE16: ; %bb.0: -; GFX1250FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250FAKE16-NEXT: v_dual_lshlrev_b32 v2, 16, v1 :: v_dual_lshlrev_b32 v3, 16, v0 -; GFX1250FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX1250FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; 
GFX1250FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 -; GFX1250FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 -; GFX1250FAKE16-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-LABEL: v_fptosi_v4bf16_to_v4i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_i16_f32 v0, v0, v3 +; GFX1250-NEXT: v_cvt_pk_i16_f32 v1, v1, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi <4 x bfloat> %x to <4 x i16> ret <4 x i16> %op } diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll index a2cd6d28e96cb..0af603b4ccf5f 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -1241,27 +1241,16 @@ define amdgpu_kernel void @fp_to_sint_f32_i16(ptr addrspace(1) %out, float %in) ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: fp_to_sint_f32_i16: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_cvt_i32_f32_e32 v1, s2 -; GFX11-SDAG-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: fp_to_sint_f32_i16: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cvt_i32_f32_e32 v0, s2 -; GFX11-GISEL-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: 
fp_to_sint_f32_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cvt_i32_f32_e32 v0, s2 +; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_endpgm ; ; EG-LABEL: fp_to_sint_f32_i16: ; EG: ; %bb.0: @@ -1321,13 +1310,10 @@ define amdgpu_kernel void @fp_to_sint_v2f32_to_v2i16(ptr addrspace(1) %out, <2 x ; GFX11-SDAG-LABEL: fp_to_sint_v2f32_to_v2i16: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_cvt_i32_f32_e32 v0, s3 -; GFX11-SDAG-NEXT: v_cvt_i32_f32_e32 v2, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-SDAG-NEXT: global_store_b32 v1, v2, s[0:1] +; GFX11-SDAG-NEXT: v_cvt_pk_i16_f32 v1, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: fp_to_sint_v2f32_to_v2i16: @@ -1396,13 +1382,10 @@ define amdgpu_kernel void @fp_to_sint_f32_to_v2i16(ptr addrspace(1) %out, float ; GFX11-SDAG-LABEL: fp_to_sint_f32_to_v2i16: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_cvt_i32_f32_e32 v0, s3 -; GFX11-SDAG-NEXT: v_cvt_i32_f32_e32 v2, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-SDAG-NEXT: global_store_b32 v1, v2, s[0:1] +; GFX11-SDAG-NEXT: v_cvt_pk_i16_f32 v1, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: fp_to_sint_f32_to_v2i16: @@ -1444,5 +1427,3 @@ define amdgpu_kernel void @fp_to_sint_f32_to_v2i16(ptr 
addrspace(1) %out, float attributes #0 = { nounwind } attributes #1 = { nounwind readnone } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX11: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll index 32f80ff6c22f8..165ba24babf6b 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll @@ -1021,27 +1021,16 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i16(ptr addrspace(1) %out, float %i ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: fp_to_uint_f32_to_i16: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_clause 0x1 -; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v1, s2 -; GFX11-SDAG-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: fp_to_uint_f32_to_i16: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_clause 0x1 -; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cvt_u32_f32_e32 v0, s2 -; GFX11-GISEL-NEXT: global_store_b16 v1, v0, s[0:1] -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: fp_to_uint_f32_to_i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_cvt_u32_f32_e32 v0, s2 +; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-NEXT: s_endpgm ; ; EG-LABEL: fp_to_uint_f32_to_i16: ; EG: ; %bb.0: @@ -1099,13 +1088,10 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i16(ptr addrspace(1) %out, <2 x ; GFX11-SDAG-LABEL: 
fp_to_uint_v2f32_to_v2i16: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v0, s3 -; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v2, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-SDAG-NEXT: global_store_b32 v1, v2, s[0:1] +; GFX11-SDAG-NEXT: v_cvt_pk_u16_f32 v1, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: fp_to_uint_v2f32_to_v2i16: @@ -1172,13 +1158,10 @@ define amdgpu_kernel void @fp_to_uint_f32_to_v2i16(ptr addrspace(1) %out, float ; GFX11-SDAG-LABEL: fp_to_uint_f32_to_v2i16: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v0, s3 -; GFX11-SDAG-NEXT: v_cvt_u32_f32_e32 v2, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-SDAG-NEXT: global_store_b32 v1, v2, s[0:1] +; GFX11-SDAG-NEXT: v_cvt_pk_u16_f32 v1, s2, s3 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: fp_to_uint_f32_to_v2i16: @@ -1219,5 +1202,3 @@ define amdgpu_kernel void @fp_to_uint_f32_to_v2i16(ptr addrspace(1) %out, float attributes #0 = { nounwind } attributes #1 = { nounwind readnone } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX11: {{.*}}