diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index b6e697d34c3d3..fba5d9de28306 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2201,6 +2201,8 @@ def : GCNPat <
 }
 
 foreach fp16vt = [f16, bf16] in {
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
 def : GCNPat <
   (fcopysign fp16vt:$src0, fp16vt:$src1),
   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
@@ -2231,6 +2233,45 @@ def : GCNPat <
   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
                  (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
 >;
+}
+// Real True16: 16-bit operands are placed into one half of a 32-bit register
+// (other half undefined) so V_BFI_B32 can be used, and 16-bit results are
+// extracted from the half that the BFI mask selects.
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+  (fcopysign fp16vt:$src0, fp16vt:$src1),
+  (EXTRACT_SUBREG (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)),
+                  (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16),
+                  (REG_SEQUENCE VGPR_32, $src1, lo16, (i16 (IMPLICIT_DEF)), hi16)), lo16)
+>;
+
+def : GCNPat <
+  (fcopysign f32:$src0, fp16vt:$src1),
+  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
+                 (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16))
+>;
+
+def : GCNPat <
+  (fcopysign f64:$src0, fp16vt:$src1),
+  (REG_SEQUENCE VReg_64,
+    (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
+    (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
+                   (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16)), sub1)
+>;
+
+def : GCNPat <
+  (fcopysign fp16vt:$src0, f32:$src1),
+  (EXTRACT_SUBREG (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fff0000)),
+                  (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src0, hi16), $src1), hi16)
+>;
+
+def : GCNPat <
+  (fcopysign fp16vt:$src0, f64:$src1),
+  (EXTRACT_SUBREG (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)),
+                  (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16),
+                  (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1))), lo16)
+>;
+}
 } // End foreach fp16vt = [f16, bf16]
 
 
@@ -3154,6 +3195,11 @@ def : GCNPat<
   (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
   (COPY VSrc_b16:$src)
 >;
+
+def : GCNPat <
+  (i1 (DivergentUnaryFrag<trunc> i16:$a)),
+  (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
+>;
 }
 
 let True16Predicate = UseRealTrue16Insts in {
@@ -3171,6 +3217,11 @@ def : GCNPat<
   (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
   (REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16)
 >;
+
+def : GCNPat <
+  (i1 (DivergentUnaryFrag<trunc> i16:$a)),
+  (V_CMP_EQ_U16_t16_e64 (i32 0), (V_AND_B16_t16_e64 (i32 0), (i16 1), (i32 0), $a), (i32 0), (i16 1), (i32 0))
+>;
 }
 
 def : GCNPat <
@@ -3199,11 +3250,6 @@ def : GCNPat <
   (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
 >;
 
-def : GCNPat <
-  (i1 (DivergentUnaryFrag<trunc> i16:$a)),
-  (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
->;
-
 def IMMBitSelConst : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(1ULL << N->getZExtValue(), SDLoc(N),
                                    MVT::i32);
@@ -3807,7 +3853,8 @@ def : GCNPat <
   (v2i16 (S_PACK_HL_B32_B16 SReg_32:$src0, SReg_32:$src1))
 >;
 
-
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
 def : GCNPat <
   (v2f16 (scalar_to_vector f16:$src0)),
   (COPY $src0)
@@ -3827,6 +3874,31 @@ def : GCNPat <
   (v4f16 (scalar_to_vector f16:$src0)),
   (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
 >;
+}
+
+// Real True16: the scalar occupies lo16; every other part of the vector is
+// left undefined. The 64-bit (v4*) results are built in a VReg_64.
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+  (v2f16 (scalar_to_vector f16:$src0)),
+  (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16)
+>;
+
+def : GCNPat <
+  (v2i16 (scalar_to_vector i16:$src0)),
+  (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16)
+>;
+
+def : GCNPat <
+  (v4i16 (scalar_to_vector i16:$src0)),
+  (REG_SEQUENCE VReg_64, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1)
+>;
+
+def : GCNPat <
+  (v4f16 (scalar_to_vector f16:$src0)),
+  (REG_SEQUENCE VReg_64, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1)
+>;
+}
 
 def : GCNPat <
   (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask,
diff --git
a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 49fe1eed9c514..44c719f3635c8 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -40442,11 +40442,11 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo ; GFX11TRUE16-LABEL: v_vselect_v2bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l +; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v1 +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.h ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, vcc_lo ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v3.h, v2.h, s0 @@ -42871,16 +42871,16 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo ; GFX11TRUE16-LABEL: v_vselect_v4bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v2.l +; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l +; GFX11TRUE16-NEXT: v_and_b16 v1.l, 1, v1.l +; GFX11TRUE16-NEXT: v_and_b16 v1.h, 1, v3.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v1 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v3 +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 1, v1.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 1, v1.h ; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v5.l, vcc_lo ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v4.l, s0 @@ -43195,28 +43195,28 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GFX11TRUE16-LABEL: v_vselect_v8bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l +; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l +; GFX11TRUE16-NEXT: v_and_b16 v1.l, 1, v3.l +; GFX11TRUE16-NEXT: v_and_b16 v1.h, 1, v5.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3 -; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v6 -; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v4 -; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v5 -; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v7 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v2 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v0 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v1 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v3 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v4 -; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v13.l, v9.l, s4 +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 1, v1.l +; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v6.l +; 
GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v4.l +; GFX11TRUE16-NEXT: v_and_b16 v1.l, 1, v2.l +; GFX11TRUE16-NEXT: v_and_b16 v2.l, 1, v7.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 1, v1.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 1, v0.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 1, v0.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 1, v1.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 1, v2.l +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v12.l, v8.l, s0 ; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v15.l, v11.l, s2 ; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v14.l, v10.l, s3 -; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v12.l, v8.l, s0 +; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v13.l, v9.l, s4 ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v12.h, v8.h, vcc_lo ; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v13.h, v9.h, s1 ; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v14.h, v10.h, s5 @@ -43872,38 +43872,38 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 
s2, 1, v2 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v5 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v7 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v6 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v9 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 1, v8 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v11 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v12 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v13 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v10 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v15 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v14 +; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l +; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l +; GFX11TRUE16-NEXT: v_and_b16 v1.l, 1, v3.l +; GFX11TRUE16-NEXT: v_and_b16 v1.h, 1, v2.l +; GFX11TRUE16-NEXT: v_and_b16 v2.l, 1, v5.l +; GFX11TRUE16-NEXT: v_and_b16 v2.h, 1, v4.l +; GFX11TRUE16-NEXT: v_and_b16 v3.l, 1, v7.l +; GFX11TRUE16-NEXT: v_and_b16 v3.h, 1, v6.l +; GFX11TRUE16-NEXT: v_and_b16 v4.l, 1, v9.l +; GFX11TRUE16-NEXT: v_and_b16 v4.h, 1, v8.l +; GFX11TRUE16-NEXT: v_and_b16 v5.l, 1, v11.l +; GFX11TRUE16-NEXT: v_and_b16 v5.h, 1, v10.l +; GFX11TRUE16-NEXT: v_and_b16 v6.l, 1, v13.l +; GFX11TRUE16-NEXT: v_and_b16 v6.h, 1, v12.l +; GFX11TRUE16-NEXT: v_and_b16 v7.l, 1, v15.l +; GFX11TRUE16-NEXT: v_and_b16 v7.h, 1, v14.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 1, v1.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 1, v1.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 1, v2.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 1, v2.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 1, v3.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 1, v3.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 1, v4.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 1, v4.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 1, v5.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 1, v6.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 1, v6.l +; GFX11TRUE16-NEXT: 
v_cmp_eq_u16_e64 s12, 1, v5.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 1, v7.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 1, v7.h ; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v30.l, v22.l, s10 ; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v30.h, v22.h, s11 ; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v29.l, v21.l, s12 @@ -45512,149 +45512,149 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX11TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 ; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:68 ; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:72 -; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:124 -; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:128 -; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:64 -; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:60 -; GFX11TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:120 -; GFX11TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:56 -; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:116 -; GFX11TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:52 -; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:112 -; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:48 -; GFX11TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:108 -; GFX11TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:44 -; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:104 -; GFX11TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:40 -; GFX11TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:100 -; GFX11TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:36 -; GFX11TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:96 -; GFX11TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:32 -; GFX11TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:92 -; GFX11TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:28 -; GFX11TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:88 -; GFX11TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:24 -; GFX11TRUE16-NEXT: 
scratch_load_b32 v80, off, s32 offset:84 -; GFX11TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:20 -; GFX11TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:76 +; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:76 +; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:124 +; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:128 +; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:64 +; GFX11TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:60 +; GFX11TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:120 +; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:56 +; GFX11TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:116 +; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:52 +; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:112 +; GFX11TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:48 +; GFX11TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:108 +; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:44 +; GFX11TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:104 +; GFX11TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:40 +; GFX11TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:100 +; GFX11TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:36 +; GFX11TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:96 +; GFX11TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:32 +; GFX11TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:92 +; GFX11TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:28 +; GFX11TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:88 +; GFX11TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:24 +; GFX11TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:84 +; GFX11TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:20 ; GFX11TRUE16-NEXT: scratch_load_b32 v83, off, s32 offset:80 ; GFX11TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:16 ; GFX11TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:12 ; GFX11TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:8 ; GFX11TRUE16-NEXT: 
scratch_load_b32 v87, off, s32 offset:4 -; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16 -; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11TRUE16-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX11TRUE16-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX11TRUE16-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX11TRUE16-NEXT: v_and_b32_e32 v8, 1, v8 -; GFX11TRUE16-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX11TRUE16-NEXT: v_and_b32_e32 v10, 1, v10 -; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 1, v13 -; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 1, v12 -; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14 -; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 1, v17 -; GFX11TRUE16-NEXT: v_and_b32_e32 v19, 1, v19 -; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v18 -; GFX11TRUE16-NEXT: v_and_b32_e32 v21, 1, v21 -; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 1, v20 -; GFX11TRUE16-NEXT: v_and_b32_e32 v23, 1, v23 -; GFX11TRUE16-NEXT: v_and_b32_e32 v22, 1, v22 -; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 1, v25 -; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 1, v24 -; GFX11TRUE16-NEXT: v_and_b32_e32 v27, 1, v27 -; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 1, v26 -; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 1, v29 -; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 1, v28 -; GFX11TRUE16-NEXT: v_and_b32_e32 v30, 1, v30 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s16, 1, v16 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v3 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 1, v2 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 1, v5 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 1, v4 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 1, v7 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 1, v6 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 1, v9 -; GFX11TRUE16-NEXT: 
v_cmp_eq_u32_e64 s8, 1, v8 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 1, v11 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 1, v10 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 1, v13 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 1, v12 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v15 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s14, 1, v14 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v17 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s17, 1, v19 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s18, 1, v18 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s19, 1, v21 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s20, 1, v20 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s21, 1, v23 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s22, 1, v22 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s23, 1, v25 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s24, 1, v24 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s25, 1, v27 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s26, 1, v30 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s27, 1, v28 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s28, 1, v29 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s29, 1, v26 +; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l +; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l +; GFX11TRUE16-NEXT: v_and_b16 v1.l, 1, v3.l +; GFX11TRUE16-NEXT: v_and_b16 v1.h, 1, v2.l +; GFX11TRUE16-NEXT: v_and_b16 v2.l, 1, v5.l +; GFX11TRUE16-NEXT: v_and_b16 v2.h, 1, v4.l +; GFX11TRUE16-NEXT: v_and_b16 v3.l, 1, v7.l +; GFX11TRUE16-NEXT: v_and_b16 v3.h, 1, v6.l +; GFX11TRUE16-NEXT: v_and_b16 v4.l, 1, v9.l +; GFX11TRUE16-NEXT: v_and_b16 v4.h, 1, v8.l +; GFX11TRUE16-NEXT: v_and_b16 v5.l, 1, v11.l +; GFX11TRUE16-NEXT: v_and_b16 v5.h, 1, v10.l +; GFX11TRUE16-NEXT: v_and_b16 v6.l, 1, v13.l +; GFX11TRUE16-NEXT: v_and_b16 v6.h, 1, v12.l +; GFX11TRUE16-NEXT: v_and_b16 v7.l, 1, v15.l +; GFX11TRUE16-NEXT: v_and_b16 v7.h, 1, v14.l +; GFX11TRUE16-NEXT: v_and_b16 v8.l, 1, v17.l +; GFX11TRUE16-NEXT: v_and_b16 v8.h, 1, v16.l +; GFX11TRUE16-NEXT: v_and_b16 v9.l, 1, v19.l +; GFX11TRUE16-NEXT: v_and_b16 v9.h, 1, v18.l +; GFX11TRUE16-NEXT: v_and_b16 v10.l, 1, v21.l +; 
GFX11TRUE16-NEXT: v_and_b16 v10.h, 1, v20.l +; GFX11TRUE16-NEXT: v_and_b16 v11.l, 1, v23.l +; GFX11TRUE16-NEXT: v_and_b16 v11.h, 1, v22.l +; GFX11TRUE16-NEXT: v_and_b16 v12.l, 1, v25.l +; GFX11TRUE16-NEXT: v_and_b16 v12.h, 1, v24.l +; GFX11TRUE16-NEXT: v_and_b16 v13.l, 1, v27.l +; GFX11TRUE16-NEXT: v_and_b16 v13.h, 1, v26.l +; GFX11TRUE16-NEXT: v_and_b16 v14.l, 1, v29.l +; GFX11TRUE16-NEXT: v_and_b16 v14.h, 1, v28.l +; GFX11TRUE16-NEXT: v_and_b16 v15.l, 1, v30.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 1, v1.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 1, v1.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 1, v2.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 1, v2.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 1, v3.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 1, v3.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 1, v4.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 1, v4.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 1, v5.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 1, v5.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 1, v6.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 1, v6.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 1, v7.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 1, v7.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 1, v8.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 1, v8.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 1, v9.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 1, v9.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 1, v10.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 1, v10.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 1, v11.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 1, v11.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s23, 1, v12.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 1, v12.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 1, v13.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 1, v15.l +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s27, 1, v14.h +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 1, v14.l 
+; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s29, 1, v13.h ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(32) -; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v31 -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v35.l, v36.l, s26 +; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v31.l ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(26) -; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v34.l, v37.l, s27 -; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v34.h, v37.h, s28 -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(24) -; GFX11TRUE16-NEXT: v_cndmask_b16 v13.l, v38.l, v39.l, s29 -; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v38.h, v39.h, s25 -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(22) -; GFX11TRUE16-NEXT: v_cndmask_b16 v12.l, v48.l, v49.l, s24 -; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v48.h, v49.h, s23 -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(20) -; GFX11TRUE16-NEXT: v_cndmask_b16 v11.l, v50.l, v51.l, s22 -; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v50.h, v51.h, s21 -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11TRUE16-NEXT: v_cndmask_b16 v10.l, v52.l, v53.l, s20 -; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v52.h, v53.h, s19 -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11TRUE16-NEXT: v_cndmask_b16 v9.l, v54.l, v55.l, s18 -; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v54.h, v55.h, s17 -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11TRUE16-NEXT: v_cndmask_b16 v8.l, v64.l, v65.l, s16 -; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v64.h, v65.h, s15 -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11TRUE16-NEXT: v_cndmask_b16 v7.l, v66.l, v67.l, s14 -; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v66.h, v67.h, s13 -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(10) -; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v68.l, v69.l, s12 -; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v68.h, v69.h, s11 -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v70.l, v71.l, s10 -; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v70.h, v71.h, s9 -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v80.l, v81.l, s8 -; 
GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v80.h, v81.h, s7 +; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v36.l, v37.l, s26 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(25) +; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v35.l, v38.l, s27 +; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v35.h, v38.h, s28 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(23) +; GFX11TRUE16-NEXT: v_cndmask_b16 v13.l, v39.l, v48.l, s29 +; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v39.h, v48.h, s25 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(21) +; GFX11TRUE16-NEXT: v_cndmask_b16 v12.l, v49.l, v50.l, s24 +; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v49.h, v50.h, s23 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(19) +; GFX11TRUE16-NEXT: v_cndmask_b16 v11.l, v51.l, v52.l, s22 +; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v51.h, v52.h, s21 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(17) +; GFX11TRUE16-NEXT: v_cndmask_b16 v10.l, v53.l, v54.l, s20 +; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v53.h, v54.h, s19 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(15) +; GFX11TRUE16-NEXT: v_cndmask_b16 v9.l, v55.l, v64.l, s18 +; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v55.h, v64.h, s17 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(13) +; GFX11TRUE16-NEXT: v_cndmask_b16 v8.l, v65.l, v66.l, s16 +; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v65.h, v66.h, s15 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(11) +; GFX11TRUE16-NEXT: v_cndmask_b16 v7.l, v67.l, v68.l, s14 +; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v67.h, v68.h, s13 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(9) +; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v69.l, v70.l, s12 +; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v69.h, v70.h, s11 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v71.l, v80.l, s10 +; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v71.h, v80.h, s9 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(5) +; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v81.l, v82.l, s8 +; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v81.h, v82.h, s7 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(3) ; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v83.l, v84.l, s6 ; GFX11TRUE16-NEXT: s_waitcnt 
vmcnt(2) -; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v82.l, v85.l, s4 +; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v34.l, v85.l, s4 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v33.l, v86.l, s2 ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v32.l, v87.l, s0 -; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v16 +; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.h ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v32.h, v87.h, vcc_lo ; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v33.h, v86.h, s1 -; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v82.h, v85.h, s3 +; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v34.h, v85.h, s3 ; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v83.h, v84.h, s5 -; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v35.h, v36.h, s0 +; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v36.h, v37.h, s0 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_vselect_v32bf16: diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index d374ed072cdc6..fc3c476d0ab2e 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -689,9 +689,10 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: scratch_store_b16 off, v0, off offset:4 dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v0, off, off offset:2 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v3, off, off offset:2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v0, off, off ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 
1a3c8febea865..6a898fa799f3e 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -106,11 +106,19 @@ define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) { ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, s16 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_copysign_bf16_s_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_copysign_bf16_s_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_copysign_bf16_s_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, s0 +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) ret bfloat %op } @@ -160,11 +168,19 @@ define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) { ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s16, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_copysign_s_bf16_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_copysign_s_bf16_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0 +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_copysign_s_bf16_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_bfi_b32 v0, 
0x7fff, s0, v0 +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) ret bfloat %op } @@ -215,13 +231,22 @@ define bfloat @v_copysign_bf16_f32(bfloat %mag, float %sign.f32) { ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_copysign_bf16_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_copysign_bf16_f32: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, v1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_copysign_bf16_f32: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %sign = fptrunc float %sign.f32 to bfloat %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) ret bfloat %op @@ -273,13 +298,22 @@ define bfloat @v_copysign_bf16_f64(bfloat %mag, double %sign.f64) { ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_copysign_bf16_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_copysign_bf16_f64: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, v2 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_copysign_bf16_f64: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %sign = fptrunc double %sign.f64 to bfloat %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) ret bfloat %op @@ -387,15 +421,26 @@ define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_copysign_bf16_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11TRUE16-LABEL: s_copysign_bf16_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0 +; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11TRUE16-NEXT: ; return to shader part epilog +; +; GFX11FAKE16-LABEL: s_copysign_bf16_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: v_mov_b32_e32 v0, s1 +; GFX11FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 +; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11FAKE16-NEXT: ; return to shader part epilog %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) %cast = bitcast bfloat %op to i16 %zext = zext i16 %cast to i32 @@ -452,15 +497,25 @@ define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_copysign_bf16_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_lshrrev_b32_e64 v0, 16, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11TRUE16-LABEL: s_copysign_bf16_f32: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, s1 +; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11TRUE16-NEXT: ; return to shader part epilog +; +; GFX11FAKE16-LABEL: s_copysign_bf16_f32: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: v_lshrrev_b32_e64 v0, 16, s1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 +; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11FAKE16-NEXT: ; return to shader part epilog %sign = fptrunc 
float %sign.f32 to bfloat %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) %cast = bitcast bfloat %op to i16 @@ -518,15 +573,25 @@ define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign. ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_copysign_bf16_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_lshrrev_b32_e64 v0, 16, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11TRUE16-LABEL: s_copysign_bf16_f64: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, s2 +; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11TRUE16-NEXT: ; return to shader part epilog +; +; GFX11FAKE16-LABEL: s_copysign_bf16_f64: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: v_lshrrev_b32_e64 v0, 16, s2 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 +; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11FAKE16-NEXT: ; return to shader part epilog %sign = fptrunc double %sign.f64 to bfloat %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) %cast = bitcast bfloat %op to i16 @@ -584,15 +649,26 @@ define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; 
GFX11-LABEL: s_copysign_bf16_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11TRUE16-LABEL: s_copysign_bf16_f16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0 +; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11TRUE16-NEXT: ; return to shader part epilog +; +; GFX11FAKE16-LABEL: s_copysign_bf16_f16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: v_mov_b32_e32 v0, s1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 +; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11FAKE16-NEXT: ; return to shader part epilog %sign = bitcast half %sign.f16 to bfloat %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) %cast = bitcast bfloat %op to i16 @@ -641,13 +717,21 @@ define float @v_copysign_f32_bf16(float %mag, bfloat %sign.bf16) { ; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_copysign_f32_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_copysign_f32_bf16: 
+; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v1.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1 +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_copysign_f32_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1 +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %sign = fpext bfloat %sign.bf16 to float %op = call float @llvm.copysign.f32(float %mag, float %sign) ret float %op @@ -697,13 +781,21 @@ define amdgpu_ps i32 @s_copysign_f32_bf16(float inreg %mag, bfloat inreg %sign.b ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_copysign_f32_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11TRUE16-LABEL: s_copysign_f32_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11TRUE16-NEXT: ; return to shader part epilog +; +; GFX11FAKE16-LABEL: s_copysign_f32_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: v_lshlrev_b32_e64 v0, 16, s1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 +; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11FAKE16-NEXT: ; return to shader part epilog %sign = fpext bfloat %sign.bf16 
to float %op = call float @llvm.copysign.f32(float %mag, float %sign) %cast = bitcast float %op to i32 @@ -823,15 +915,26 @@ define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_copysign_f16_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11TRUE16-LABEL: s_copysign_f16_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0 +; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11TRUE16-NEXT: ; return to shader part epilog +; +; GFX11FAKE16-LABEL: s_copysign_f16_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: v_mov_b32_e32 v0, s1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 +; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11FAKE16-NEXT: ; return to shader part epilog %sign = bitcast bfloat %sign.bf16 to half %op = call half @llvm.copysign.f16(half %mag, half %sign) %cast = bitcast half %op to i16 @@ -880,13 +983,21 @@ define double @v_copysign_f64_bf16(double %mag, bfloat %sign.bf16) { ; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_copysign_f64_bf16: 
-; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_copysign_f64_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v2.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2 +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_copysign_f64_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2 +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %sign = fpext bfloat %sign.bf16 to double %op = call double @llvm.copysign.f64(double %mag, double %sign) ret double %op @@ -936,13 +1047,21 @@ define amdgpu_ps <2 x i32> @s_copysign_f64_bf16(double inreg %mag, bfloat inreg ; GFX10-NEXT: v_readfirstlane_b32 s1, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_copysign_f64_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11TRUE16-LABEL: s_copysign_f64_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11TRUE16-NEXT: ; return to shader part epilog +; +; GFX11FAKE16-LABEL: s_copysign_f64_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: 
v_lshlrev_b32_e64 v0, 16, s2 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 +; GFX11FAKE16-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11FAKE16-NEXT: ; return to shader part epilog %sign = fpext bfloat %sign.bf16 to double %op = call double @llvm.copysign.f64(double %mag, double %sign) %cast = bitcast double %op to <2 x i32> @@ -3424,13 +3543,21 @@ define amdgpu_ps i32 @s_copysign_out_f32_mag_f32_sign_bf16(float inreg %mag, bfl ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_copysign_out_f32_mag_f32_sign_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11TRUE16-LABEL: s_copysign_out_f32_mag_f32_sign_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11TRUE16-NEXT: ; return to shader part epilog +; +; GFX11FAKE16-LABEL: s_copysign_out_f32_mag_f32_sign_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: v_lshlrev_b32_e64 v0, 16, s1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 +; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11FAKE16-NEXT: ; return to shader part epilog %sign.ext = fpext bfloat %sign to float %out = call float @llvm.copysign.f32(float %mag, float %sign.ext) %cast = bitcast float %out to i32 @@ -3481,13 +3608,21 @@ define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_f64_sign_bf16(double inreg %m ; GFX10-NEXT: v_readfirstlane_b32 s1, v0 ; 
GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_copysign_out_f64_mag_f64_sign_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11TRUE16-LABEL: s_copysign_out_f64_mag_f64_sign_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11TRUE16-NEXT: ; return to shader part epilog +; +; GFX11FAKE16-LABEL: s_copysign_out_f64_mag_f64_sign_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: v_lshlrev_b32_e64 v0, 16, s2 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 +; GFX11FAKE16-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11FAKE16-NEXT: ; return to shader part epilog %sign.ext = fpext bfloat %sign to double %out = call double @llvm.copysign.f64(double %mag, double %sign.ext) %cast = bitcast double %out to <2 x i32> @@ -3540,13 +3675,21 @@ define amdgpu_ps i16 @s_copysign_out_bf16_mag_bf16_sign_f32(bfloat inreg %mag, f ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_copysign_out_bf16_mag_bf16_sign_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_lshrrev_b32_e64 v0, 16, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11TRUE16-LABEL: s_copysign_out_bf16_mag_bf16_sign_f32: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0 +; GFX11TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, s1 +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11TRUE16-NEXT: ; return to shader part epilog +; +; GFX11FAKE16-LABEL: s_copysign_out_bf16_mag_bf16_sign_f32: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: v_lshrrev_b32_e64 v0, 16, s1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 +; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11FAKE16-NEXT: ; return to shader part epilog %sign.trunc = fptrunc float %sign to bfloat %out = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign.trunc) %cast = bitcast bfloat %out to i16 @@ -3599,13 +3742,21 @@ define amdgpu_ps i16 @s_copysign_out_bf16_mag_bf16_sign_f64(bfloat inreg %mag, d ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_copysign_out_bf16_mag_bf16_sign_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_lshrrev_b32_e64 v0, 16, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11TRUE16-LABEL: s_copysign_out_bf16_mag_bf16_sign_f64: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, s2 +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11TRUE16-NEXT: ; return to shader part epilog +; +; GFX11FAKE16-LABEL: s_copysign_out_bf16_mag_bf16_sign_f64: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: v_lshrrev_b32_e64 v0, 16, s2 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 +; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11FAKE16-NEXT: 
; return to shader part epilog %sign.trunc = fptrunc double %sign to bfloat %out = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign.trunc) %cast = bitcast bfloat %out to i16 @@ -3682,23 +3833,41 @@ define amdgpu_ps i16 @s_copysign_out_bf16_mag_f32_sign_bf16(float inreg %mag, bf ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_copysign_out_bf16_mag_f32_sign_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_bfe_u32 s2, s0, 0x10010 -; GFX11-NEXT: v_cmp_u_f32_e64 s3, s0, s0 -; GFX11-NEXT: s_add_i32 s2, s2, s0 -; GFX11-NEXT: s_bitset1_b32 s0, 22 -; GFX11-NEXT: s_addk_i32 s2, 0x7fff -; GFX11-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-NEXT: s_and_b32 s3, s3, exec_lo -; GFX11-NEXT: s_cselect_b32 s0, s0, s2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11TRUE16-LABEL: s_copysign_out_bf16_mag_f32_sign_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_bfe_u32 s2, s0, 0x10010 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e64 s3, s0, s0 +; GFX11TRUE16-NEXT: s_add_i32 s2, s2, s0 +; GFX11TRUE16-NEXT: s_bitset1_b32 s0, 22 +; GFX11TRUE16-NEXT: s_addk_i32 s2, 0x7fff +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 +; GFX11TRUE16-NEXT: s_and_b32 s3, s3, exec_lo +; GFX11TRUE16-NEXT: s_cselect_b32 s0, s0, s2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11TRUE16-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11TRUE16-NEXT: ; return to shader part epilog +; +; GFX11FAKE16-LABEL: 
s_copysign_out_bf16_mag_f32_sign_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_bfe_u32 s2, s0, 0x10010 +; GFX11FAKE16-NEXT: v_cmp_u_f32_e64 s3, s0, s0 +; GFX11FAKE16-NEXT: s_add_i32 s2, s2, s0 +; GFX11FAKE16-NEXT: s_bitset1_b32 s0, 22 +; GFX11FAKE16-NEXT: s_addk_i32 s2, 0x7fff +; GFX11FAKE16-NEXT: v_mov_b32_e32 v0, s1 +; GFX11FAKE16-NEXT: s_and_b32 s3, s3, exec_lo +; GFX11FAKE16-NEXT: s_cselect_b32 s0, s0, s2 +; GFX11FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11FAKE16-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11FAKE16-NEXT: ; return to shader part epilog %mag.trunc = fptrunc float %mag to bfloat %out = call bfloat @llvm.copysign.bf16(bfloat %mag.trunc, bfloat %sign) %cast = bitcast bfloat %out to i16 @@ -3829,14 +3998,10 @@ define <2 x float> @v_copysign_out_v2f32_mag_v2f32_sign_v2bf16(<2 x float> %mag, ; GFX11TRUE16-LABEL: v_copysign_out_v2f32_mag_v2f32_sign_v2bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v3 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v3 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_copysign_out_v2f32_mag_v2f32_sign_v2bf16: @@ -3907,14 +4072,10 @@ define <2 x double> 
@v_copysign_out_v2f64_mag_v2f64_sign_v2bf16(<2 x double> %ma ; GFX11TRUE16-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v5 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.h, v4.l ; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v4 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v5 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16: @@ -4700,18 +4861,31 @@ define amdgpu_ps <2 x i32> @s_copysign_out_v2f32_mag_v2f32_sign_v2bf16(<2 x floa ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s2 -; GFX11-NEXT: v_lshlrev_b32_e64 v1, 16, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: v_readfirstlane_b32 s1, v1 -; GFX11-NEXT: ; return to shader part epilog +; GFX11TRUE16-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s3 +; GFX11TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 +; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11TRUE16-NEXT: ; return to shader part epilog +; +; GFX11FAKE16-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e64 v0, 16, s2 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e64 v1, 16, s3 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 +; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11FAKE16-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11FAKE16-NEXT: ; return to shader part epilog %sign.ext = fpext <2 x bfloat> %sign to <2 x float> %out = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign.ext) %cast = bitcast <2 x float> %out to <2 x i32> @@ -4784,19 +4958,32 @@ define amdgpu_ps <4 x i32> @s_copysign_out_v2f64_mag_v2f64_sign_v2bf16(<2 x doub ; GFX10-NEXT: v_readfirstlane_b32 s3, v1 ; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s4 -; GFX11-NEXT: s_lshr_b32 s4, s4, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e64 v1, 16, s4 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1 -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s3, v1 -; GFX11-NEXT: ; return to shader part epilog +; GFX11TRUE16-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_lshr_b32 s5, s4, 16 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s4 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, s5 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 +; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11TRUE16-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11TRUE16-NEXT: ; return to shader part epilog +; +; GFX11FAKE16-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: v_lshlrev_b32_e64 v0, 16, s4 +; GFX11FAKE16-NEXT: s_lshr_b32 s4, s4, 16 +; GFX11FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e64 v1, 16, s4 +; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1 +; GFX11FAKE16-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11FAKE16-NEXT: ; return to shader part epilog %sign.ext = fpext <2 x bfloat> %sign to <2 x double> %out = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign.ext) %cast = bitcast <2 x double> %out to <4 x i32> @@ -5576,17 +5763,12 @@ define <3 x float> @v_copysign_out_v3f32_mag_v3f32_sign_v3bf16(<3 x float> %mag, ; GFX11TRUE16-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: 
v_mov_b16_e32 v5.l, v3.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v4 -; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v5 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.h, v4.l ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v3 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v4 +; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v5 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3bf16: @@ -5667,17 +5849,12 @@ define <3 x double> @v_copysign_out_v3f64_mag_v3f64_sign_v3bf16(<3 x double> %ma ; GFX11TRUE16-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v7 -; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v8 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.h, v6.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l ; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, 
v3, v6 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v7 +; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v8 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16: @@ -6523,19 +6700,15 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64(<3 x bfloat> %m ; GFX11TRUE16-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v3, v2 -; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fff, v0, v5 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, v5 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, v4 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l +; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff0000, v1, v3 +; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0x7fff0000, v2, v7 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64: @@ -6733,22 +6906,13 @@ define <4 x float> @v_copysign_out_v4f32_mag_v4f32_sign_v4bf16(<4 x float> %mag, ; GFX11TRUE16-LABEL: 
v_copysign_out_v4f32_mag_v4f32_sign_v4bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v6 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.h, v5.l ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v4 +; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v5 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v6 ; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v7 -; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v5 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4bf16: @@ -6844,22 +7008,13 @@ define <4 x double> @v_copysign_out_v4f64_mag_v4f64_sign_v4bf16(<4 x double> %ma ; GFX11TRUE16-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v9.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; 
GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.h, v8.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l +; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v8 +; GFX11TRUE16-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v9 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v10 ; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v11 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v8 -; GFX11TRUE16-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v9 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4bf16: diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index ba4fe3685458d..574c1042859aa 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -43,13 +43,22 @@ define amdgpu_ps i16 @s_copysign_f16(half inreg %mag, half inreg %sign) { ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_copysign_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: s_copysign_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0 +; GFX11-TRUE16-NEXT: 
v_readfirstlane_b32 s0, v0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_copysign_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %out = call half @llvm.copysign.f16(half %mag, half %sign) %cast = bitcast half %out to i16 ret i16 %cast @@ -677,13 +686,21 @@ define float @v_copysign_out_f32_mag_f32_sign_f16(float %mag, half %sign) { ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_copysign_out_f32_mag_f32_sign_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_copysign_out_f32_mag_f32_sign_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_copysign_out_f32_mag_f32_sign_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %sign.ext = fpext half %sign to float %out = call float @llvm.copysign.f32(float %mag, float %sign.ext) ret float %out @@ -713,13 +730,21 @@ define double @v_copysign_out_f64_mag_f64_sign_f16(double %mag, half %sign) { ; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 ; GFX9-NEXT: s_setpc_b64 
s[30:31] ; -; GFX11-LABEL: v_copysign_out_f64_mag_f64_sign_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_copysign_out_f64_mag_f64_sign_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_copysign_out_f64_mag_f64_sign_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %sign.ext = fpext half %sign to double %out = call double @llvm.copysign.f64(double %mag, double %sign.ext) ret double %out @@ -751,13 +776,22 @@ define half @v_copysign_out_f16_mag_f16_sign_f32(half %mag, float %sign) { ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_copysign_out_f16_mag_f16_sign_f32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; 
GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_copysign_out_f16_mag_f16_sign_f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %sign.trunc = fptrunc float %sign to half %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc) ret half %out @@ -789,13 +823,22 @@ define half @v_copysign_out_f16_mag_f16_sign_f64(half %mag, double %sign) { ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_copysign_out_f16_mag_f16_sign_f64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_copysign_out_f16_mag_f16_sign_f64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %sign.trunc = fptrunc double %sign to half %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc) ret half %out @@ -988,52 +1031,100 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) { ; GFX9-NEXT: 
v_bfi_b32 v0, s4, v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_copysign_out_f16_mag_f64_sign_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; GFX11-NEXT: v_bfe_u32 v1, v1, 20, 11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_sub_nc_u32_e32 v4, 0x3f1, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xfffffc10, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_or_b32 v0, 0xffe, v3, v0 -; GFX11-NEXT: v_med3_i32 v3, v4, 0, 13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v4, 0x1000, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, v3, v5 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v4 -; GFX11-NEXT: v_lshl_or_b32 v4, v1, 12, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v3, v5, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v4, 7, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4 -; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4 -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo -; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX11-NEXT: v_dual_mov_b32 
v4, 0x7e00 :: v_dual_add_nc_u32 v3, v3, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v4, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_copysign_out_f16_mag_f64_sign_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v1, 20, 11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v4, 0x3f1, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffffc10, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xffe, v3, v0 +; GFX11-TRUE16-NEXT: v_med3_i32 v3, v4, 0, 13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x1000, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v3, v5 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v1, 12, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 7, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 2, v3 +; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, 0x7e00 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, 0x7c00, v5 :: v_dual_add_nc_u32 v3, v3, v4 +; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_copysign_out_f16_mag_f64_sign_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v1, 20, 11 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v4, 0x3f1, v1 +; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffffc10, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, 
vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffe, v3, v0 +; GFX11-FAKE16-NEXT: v_med3_i32 v3, v4, 0, 13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x1000, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v3, v5 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v4 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v1, 12, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 7, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 2, v3 +; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, 0x7e00 :: v_dual_add_nc_u32 v3, v3, v4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v1 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %mag.trunc = fptrunc double %mag to half %result = call half @llvm.copysign.f16(half %mag.trunc, half %sign) ret half %result @@ -1188,59 +1279,115 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, hal ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_copysign_out_f16_mag_f64_sign_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b32 s3, s1, 0x1ff -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s0, s3, s0 -; GFX11-NEXT: s_cmp_lg_u32 s0, 0 -; GFX11-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-NEXT: s_bfe_u32 s0, s1, 0xb0014 -; GFX11-NEXT: s_lshr_b32 s1, s1, 8 -; GFX11-NEXT: s_sub_i32 s3, 0x3f1, s0 -; GFX11-NEXT: s_and_b32 s1, s1, 0xffe -; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13 -; GFX11-NEXT: v_readfirstlane_b32 s3, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s4, v1 -; GFX11-NEXT: s_or_b32 s1, s1, s3 -; GFX11-NEXT: s_or_b32 s3, s1, 0x1000 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_lshr_b32 s5, s3, s4 -; GFX11-NEXT: s_lshl_b32 s4, s5, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u32 s4, s3 -; GFX11-NEXT: s_cselect_b32 s3, 1, 0 -; GFX11-NEXT: s_addk_i32 s0, 0xfc10 -; GFX11-NEXT: s_or_b32 s3, s5, s3 -; GFX11-NEXT: s_lshl_b32 s4, s0, 12 -; GFX11-NEXT: s_or_b32 s4, s1, s4 -; GFX11-NEXT: s_cmp_lt_i32 s0, 1 -; GFX11-NEXT: s_cselect_b32 s3, s3, s4 -; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s4, s3, 7 -; GFX11-NEXT: s_cmp_gt_i32 s4, 5 -; GFX11-NEXT: s_cselect_b32 s5, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s4, 3 -; GFX11-NEXT: s_cselect_b32 s4, 1, 0 -; GFX11-NEXT: s_lshr_b32 s3, s3, 2 -; GFX11-NEXT: s_or_b32 s4, s4, s5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s3, s3, s4 -; GFX11-NEXT: s_cmp_lt_i32 s0, 31 -; GFX11-NEXT: s_movk_i32 s4, 0x7e00 -; GFX11-NEXT: s_cselect_b32 s3, s3, 0x7c00 -; GFX11-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-NEXT: s_cselect_b32 s1, s4, 0x7c00 -; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x40f -; GFX11-NEXT: s_cselect_b32 s0, s1, s3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_and_b32 s3, s1, 0x1ff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_or_b32 s0, s3, s0 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-TRUE16-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-TRUE16-NEXT: s_bfe_u32 s0, s1, 0xb0014 +; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 8 +; GFX11-TRUE16-NEXT: s_sub_i32 s3, 0x3f1, s0 +; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffe +; GFX11-TRUE16-NEXT: v_med3_i32 v1, s3, 0, 13 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s1, 0x1000 +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, s4 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s5, s4 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s4, s3 +; GFX11-TRUE16-NEXT: s_cselect_b32 s3, 1, 0 +; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0xfc10 +; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s3 +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s0, 12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_or_b32 s4, s1, s4 +; GFX11-TRUE16-NEXT: s_cmp_lt_i32 s0, 1 +; GFX11-TRUE16-NEXT: s_cselect_b32 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_cmp_gt_i32 s4, 5 +; GFX11-TRUE16-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 3 +; GFX11-TRUE16-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 2 +; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, s4 +; GFX11-TRUE16-NEXT: s_cmp_lt_i32 s0, 31 +; GFX11-TRUE16-NEXT: s_movk_i32 s4, 0x7e00 +; GFX11-TRUE16-NEXT: s_cselect_b32 s3, s3, 0x7c00 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-TRUE16-NEXT: s_cselect_b32 s1, s4, 0x7c00 +; GFX11-TRUE16-NEXT: s_cmpk_eq_i32 s0, 0x40f +; GFX11-TRUE16-NEXT: s_cselect_b32 s0, s1, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_and_b32 s3, s1, 0x1ff +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_or_b32 s0, s3, s0 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s0, 0 +; 
GFX11-FAKE16-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-FAKE16-NEXT: s_bfe_u32 s0, s1, 0xb0014 +; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 8 +; GFX11-FAKE16-NEXT: s_sub_i32 s3, 0x3f1, s0 +; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffe +; GFX11-FAKE16-NEXT: v_med3_i32 v1, s3, 0, 13 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v1 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s3 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s1, 0x1000 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s3, s4 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s5, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s4, s3 +; GFX11-FAKE16-NEXT: s_cselect_b32 s3, 1, 0 +; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0xfc10 +; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s3 +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s0, 12 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s1, s4 +; GFX11-FAKE16-NEXT: s_cmp_lt_i32 s0, 1 +; GFX11-FAKE16-NEXT: s_cselect_b32 s3, s3, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 7 +; GFX11-FAKE16-NEXT: s_cmp_gt_i32 s4, 5 +; GFX11-FAKE16-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 3 +; GFX11-FAKE16-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 2 +; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, s4 +; GFX11-FAKE16-NEXT: s_cmp_lt_i32 s0, 31 +; GFX11-FAKE16-NEXT: s_movk_i32 s4, 0x7e00 +; GFX11-FAKE16-NEXT: s_cselect_b32 s3, s3, 0x7c00 +; GFX11-FAKE16-NEXT: 
s_cmp_lg_u32 s1, 0 +; GFX11-FAKE16-NEXT: s_cselect_b32 s1, s4, 0x7c00 +; GFX11-FAKE16-NEXT: s_cmpk_eq_i32 s0, 0x40f +; GFX11-FAKE16-NEXT: s_cselect_b32 s0, s1, s3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %mag.trunc = fptrunc double %mag to half %result = call half @llvm.copysign.f16(half %mag.trunc, half %sign) %cast = bitcast half %result to i16 @@ -2583,13 +2730,21 @@ define amdgpu_ps i32 @s_copysign_out_f32_mag_f32_sign_f16(float inreg %mag, half ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_copysign_out_f32_mag_f32_sign_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: s_copysign_out_f32_mag_f32_sign_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_copysign_out_f32_mag_f32_sign_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v0, 16, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %sign.ext = fpext half %sign to float %out = call float @llvm.copysign.f32(float %mag, float %sign.ext) %cast = bitcast float %out to i32 @@ 
-2624,13 +2779,21 @@ define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_f64_sign_f16(double inreg %ma ; GFX9-NEXT: v_readfirstlane_b32 s1, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_copysign_out_f64_mag_f64_sign_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: s_copysign_out_f64_mag_f64_sign_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_copysign_out_f64_mag_f64_sign_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v0, 16, s2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %sign.ext = fpext half %sign to double %out = call double @llvm.copysign.f64(double %mag, double %sign.ext) %cast = bitcast double %out to <2 x i32> @@ -2667,13 +2830,21 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f16_sign_f32(half inreg %mag, float ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_copysign_out_f16_mag_f16_sign_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_lshrrev_b32_e64 v0, 16, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: 
s_copysign_out_f16_mag_f16_sign_f32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, s1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f16_sign_f32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e64 v0, 16, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %sign.trunc = fptrunc float %sign to half %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc) %cast = bitcast half %out to i16 @@ -2710,13 +2881,21 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f16_sign_f64(half inreg %mag, doubl ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_copysign_out_f16_mag_f16_sign_f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_lshrrev_b32_e64 v0, 16, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f16_sign_f64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, s2 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f16_sign_f64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e64 v0, 16, s2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %sign.trunc = fptrunc double %sign to half %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc) %cast = bitcast half %out to i16 @@ -2756,8 +2935,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f32_sign_f16(float inreg %mag, half ; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f32_sign_f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, s1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-TRUE16-NEXT: ; return to shader part epilog ; @@ -2871,14 +3051,10 @@ define <2 x float> @v_copysign_out_v2f32_mag_v2f32_sign_v2f16(<2 x float> %mag, ; GFX11-TRUE16-LABEL: v_copysign_out_v2f32_mag_v2f32_sign_v2f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_copysign_out_v2f32_mag_v2f32_sign_v2f16: @@ -2931,14 +3107,10 @@ define <2 x double> 
@v_copysign_out_v2f64_mag_v2f64_sign_v2f16(<2 x double> %mag ; GFX11-TRUE16-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v4.l ; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2f16: @@ -3683,18 +3855,31 @@ define amdgpu_ps <2 x i32> @s_copysign_out_v2f32_mag_v2f32_sign_v2f16(<2 x float ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s2 -; GFX11-NEXT: v_lshlrev_b32_e64 v1, 16, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: v_readfirstlane_b32 s1, v1 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, s3 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s2, 16 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v0, 16, s2 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v1, 16, s3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0 +; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %sign.ext = fpext <2 x half> %sign to <2 x float> %out = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign.ext) %cast = bitcast <2 x float> %out to <2 x i32> @@ -3743,19 +3928,32 @@ define amdgpu_ps <4 x i32> @s_copysign_out_v2f64_mag_v2f64_sign_v2f16(<2 x doubl ; GFX9-NEXT: v_readfirstlane_b32 s3, v1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s4 -; GFX11-NEXT: s_lshr_b32 s4, s4, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b32_e64 v1, 16, s4 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1 -; GFX11-NEXT: 
v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s3, v1 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s4, 16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v0, 16, s4 +; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s4, 16 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v1, 16, s4 +; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %sign.ext = fpext <2 x half> %sign to <2 x double> %out = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign.ext) %cast = bitcast <2 x double> %out to <4 x i32> @@ -4473,17 +4671,12 @@ define <3 x float> @v_copysign_out_v3f32_mag_v3f32_sign_v3f16(<3 x float> %mag, ; GFX11-TRUE16-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v4 -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v4.l ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v4 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3f16: @@ -4543,17 +4736,12 @@ define <3 x double> @v_copysign_out_v3f64_mag_v3f64_sign_v3f16(<3 x double> %mag ; GFX11-TRUE16-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v7 -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v6.l +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v8.h, v7.l ; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v7 +; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v8 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16: @@ -5402,19 +5590,15 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f16_sign_v3f64(<3 x half> %mag, <3 ; GFX11-TRUE16-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v3, v2 -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fff, v0, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, v4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v3.l +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff0000, v1, v3 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff0000, v2, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f64: @@ -5574,22 +5758,13 @@ define <4 x float> 
@v_copysign_out_v4f32_mag_v4f32_sign_v4f16(<4 x float> %mag, ; GFX11-TRUE16-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v5.l ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v4 +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v6 ; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v7 -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v5 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4f16: @@ -5660,22 +5835,13 @@ define <4 x double> @v_copysign_out_v4f64_mag_v4f64_sign_v4f16(<4 x double> %mag ; GFX11-TRUE16-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v9.h -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v8 +; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v9 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v10 ; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v8 -; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v9 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll index 61f5b73033f5e..0a2e758f7cf21 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -1001,7 +1001,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s3 +; GFX11-NEXT: v_mov_b16_e32 v0.h, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1096,7 +1096,7 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: 
v_lshlrev_b32_e64 v0, 16, s3 +; GFX11-NEXT: v_mov_b16_e32 v0.h, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll index 5f75a2f29a026..8b5c34d97e50e 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll @@ -374,7 +374,7 @@ define amdgpu_kernel void @s_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i ; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s6 +; GFX11-NEXT: v_mov_b16_e32 v0.h, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s0 @@ -972,8 +972,9 @@ define double @v_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i32], double ; GFX11-LABEL: v_test_copysign_f64_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, v10 :: v_dual_lshlrev_b32 v1, 16, v20 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b16_e32 v1.h, v20.l +; GFX11-NEXT: v_mov_b32_e32 v0, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v11, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %sign.ext = fpext half %sign to double diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.gfx11plus.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.gfx11plus.ll new file mode 100644 index 0000000000000..2b055c42c57c0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.gfx11plus.ll @@ -0,0 +1,82 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -stop-after=amdgpu-isel | FileCheck %s --check-prefixes=GFX11,GFX11-REAL16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 
-mattr=-real-true16 -stop-after=amdgpu-isel | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16 + +; Make sure no "vgpr32 = copy vgpr16" is generated in true16 mode + +define amdgpu_kernel void @f_copy_sign (ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %out) { + ; GFX11-REAL16-LABEL: name: f_copy_sign + ; GFX11-REAL16: bb.0.entry: + ; GFX11-REAL16-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 + ; GFX11-REAL16-NEXT: {{ $}} + ; GFX11-REAL16-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GFX11-REAL16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GFX11-REAL16-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s128) from %ir.a.kernarg.offset, align 4, addrspace 4) + ; GFX11-REAL16-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset + 16, align 4, addrspace 4) + ; GFX11-REAL16-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX11-REAL16-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 + ; GFX11-REAL16-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3 + ; GFX11-REAL16-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2 + ; GFX11-REAL16-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1 + ; GFX11-REAL16-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0 + ; GFX11-REAL16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1 + ; GFX11-REAL16-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1 + ; GFX11-REAL16-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1 + ; GFX11-REAL16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit 
$exec + ; GFX11-REAL16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023 + ; GFX11-REAL16-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec + ; GFX11-REAL16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2 + ; GFX11-REAL16-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_AND_B32_e64_]], implicit $exec + ; GFX11-REAL16-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 killed [[REG_SEQUENCE]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s16) from %ir.in.gep1, addrspace 1) + ; GFX11-REAL16-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_1:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 killed [[REG_SEQUENCE1]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s16) from %ir.in.gep2, addrspace 1) + ; GFX11-REAL16-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX11-REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX11-REAL16-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_1]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-REAL16-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX11-REAL16-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX11-REAL16-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF2]], %subreg.hi16 + ; GFX11-REAL16-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 32767 + ; GFX11-REAL16-NEXT: [[V_BFI_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFI_B32_e64 killed [[S_MOV_B32_2]], killed [[REG_SEQUENCE4]], killed [[REG_SEQUENCE3]], implicit $exec + ; GFX11-REAL16-NEXT: [[COPY8:%[0-9]+]]:vgpr_16 = COPY [[V_BFI_B32_e64_]].lo16 + ; GFX11-REAL16-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 killed [[V_MOV_B32_e32_]], killed [[COPY8]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (store (s16) into %ir.3, addrspace 1) + ; GFX11-REAL16-NEXT: S_ENDPGM 0 + ; + ; 
GFX11-FAKE16-LABEL: name: f_copy_sign + ; GFX11-FAKE16: bb.0.entry: + ; GFX11-FAKE16-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s128) from %ir.a.kernarg.offset, align 4, addrspace 4) + ; GFX11-FAKE16-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset + 16, align 4, addrspace 4) + ; GFX11-FAKE16-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX11-FAKE16-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 + ; GFX11-FAKE16-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub3 + ; GFX11-FAKE16-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub2 + ; GFX11-FAKE16-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1 + ; GFX11-FAKE16-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0 + ; GFX11-FAKE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1 + ; GFX11-FAKE16-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1 + ; GFX11-FAKE16-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1 + ; GFX11-FAKE16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023 + ; GFX11-FAKE16-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec + ; GFX11-FAKE16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2 + ; 
GFX11-FAKE16-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_AND_B32_e64_]], implicit $exec + ; GFX11-FAKE16-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR killed [[REG_SEQUENCE]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s16) from %ir.in.gep1, addrspace 1) + ; GFX11-FAKE16-NEXT: [[GLOBAL_LOAD_USHORT_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR killed [[REG_SEQUENCE1]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s16) from %ir.in.gep2, addrspace 1) + ; GFX11-FAKE16-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 32767 + ; GFX11-FAKE16-NEXT: [[V_BFI_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFI_B32_e64 killed [[S_MOV_B32_2]], killed [[GLOBAL_LOAD_USHORT_SADDR]], killed [[GLOBAL_LOAD_USHORT_SADDR1]], implicit $exec + ; GFX11-FAKE16-NEXT: GLOBAL_STORE_SHORT_SADDR killed [[V_MOV_B32_e32_]], killed [[V_BFI_B32_e64_]], killed [[REG_SEQUENCE2]], 0, 0, implicit $exec :: (store (s16) into %ir.3, addrspace 1) + ; GFX11-FAKE16-NEXT: S_ENDPGM 0 +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %in.gep1 = getelementptr i32, ptr addrspace(1) %a, i32 %tid + %in.gep2 = getelementptr i32, ptr addrspace(1) %b, i32 %tid + %mag = load half, ptr addrspace(1) %in.gep1 + %sign = load half, ptr addrspace(1) %in.gep2 + %f16 = call half @llvm.copysign.f16(half %mag, half %sign) + store half %f16, ptr addrspace(1) %out + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll b/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll index 6bb7cdd40a360..0a4edd142d8db 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-offset-bug.ll @@ -166,7 +166,7 @@ define amdgpu_kernel void @load_i16_hi(ptr %arg, ptr %out) { ; GFX11-LABEL: load_i16_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_mov_b16_e32 v2.l, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_load_d16_hi_b16 v2, v[0:1] offset:8 @@ -272,7 +272,7 @@ define amdgpu_kernel void @load_half_hi(ptr %arg, ptr %out) { ; GFX11-LABEL: load_half_hi: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_mov_b16_e32 v2.l, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_load_d16_hi_b16 v2, v[0:1] offset:8 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll index 47910f5280bfc..04b036cafd81f 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll @@ -263,7 +263,8 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_v(ptr addrspace(5) %i ; ; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_v: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 1, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX11-NEXT: v_mov_b16_e32 v3.l, -1 ; GFX11-NEXT: scratch_load_d16_hi_u8 v3, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v3 @@ -301,7 +302,8 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_v(ptr addrspace(5) %i ; ; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_v: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 1, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0 +; GFX11-NEXT: v_mov_b16_e32 v3.l, -1 ; GFX11-NEXT: scratch_load_d16_hi_i8 v3, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v3 @@ -339,7 +341,8 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_v(ptr addrspace(5) %in, p ; ; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_v: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 2, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b16_e32 v3.l, -1 ; 
GFX11-NEXT: scratch_load_d16_hi_b16 v3, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b32 v[1:2], v3 @@ -696,7 +699,7 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_s(ptr addrspace(5) in ; ; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_s: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_mov_b32_e32 v2, -1 +; GFX11-NEXT: v_mov_b16_e32 v2.l, -1 ; GFX11-NEXT: s_add_i32 s0, s0, 1 ; GFX11-NEXT: scratch_load_d16_hi_u8 v2, off, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -735,7 +738,7 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_s(ptr addrspace(5) in ; ; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_s: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_mov_b32_e32 v2, -1 +; GFX11-NEXT: v_mov_b16_e32 v2.l, -1 ; GFX11-NEXT: s_add_i32 s0, s0, 1 ; GFX11-NEXT: scratch_load_d16_hi_i8 v2, off, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -774,7 +777,7 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_s(ptr addrspace(5) inreg ; ; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_s: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_mov_b32_e32 v2, -1 +; GFX11-NEXT: v_mov_b16_e32 v2.l, -1 ; GFX11-NEXT: s_add_i32 s0, s0, 2 ; GFX11-NEXT: scratch_load_d16_hi_b16 v2, off, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1165,8 +1168,9 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_svs(ptr addrspace(5) ; ; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_svs: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b16_e32 v3.l, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1 ; GFX11-NEXT: scratch_load_d16_hi_u8 v3, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1208,8 +1212,9 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_svs(ptr addrspace(5) ; ; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_svs: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_dual_mov_b32 v3, -1 
:: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b16_e32 v3.l, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1 ; GFX11-NEXT: scratch_load_d16_hi_i8 v3, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1251,8 +1256,9 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_svs(ptr addrspace(5) inre ; ; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_svs: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b16_e32 v3.l, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_add3_u32 v0, s0, v0, 2 ; GFX11-NEXT: scratch_load_d16_hi_b16 v3, v0, off ; GFX11-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll index 5674ae328406d..db08cb132a3d7 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll @@ -38,17 +38,29 @@ define <2 x i32> @fneg_xor_select_v2i32(<2 x i1> %cond, <2 x i32> %arg0, <2 x i3 ; GCN-NEXT: v_cndmask_b32_e64 v1, -v5, -v3, vcc ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: fneg_xor_select_v2i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, -v4, -v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, -v5, -v3, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: fneg_xor_select_v2i32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: 
v_and_b16 v0.l, 1, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 1, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, -v4, -v2, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, -v5, -v3, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: fneg_xor_select_v2i32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, -v4, -v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, -v5, -v3, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %select = select <2 x i1> %cond, <2 x i32> %arg0, <2 x i32> %arg1 %fneg = xor <2 x i32> %select, ret <2 x i32> %fneg @@ -131,19 +143,34 @@ define <2 x i64> @fneg_xor_select_v2i64(<2 x i1> %cond, <2 x i64> %arg0, <2 x i6 ; GCN-NEXT: v_cndmask_b32_e64 v3, -v9, -v5, s[4:5] ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: fneg_xor_select_v2i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v6, v2 :: v_dual_and_b32 v1, 1, v1 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, -v7, -v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v8, v4, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, -v9, -v5, s0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: 
fneg_xor_select_v2i64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, v8, v4, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, -v7, -v3, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v3, -v9, -v5, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: fneg_xor_select_v2i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, v6, v2 :: v_dual_and_b32 v1, 1, v1 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, -v7, -v3, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v8, v4, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, -v9, -v5, s0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %select = select <2 x i1> %cond, <2 x i64> %arg0, <2 x i64> %arg1 %fneg = xor <2 x i64> %select, ret <2 x i64> %fneg @@ -218,11 +245,11 @@ define <2 x i16> @fneg_xor_select_v2i16(<2 x i1> %cond, <2 x i16> %arg0, <2 x i1 ; GFX11-TRUE16-LABEL: fneg_xor_select_v2i16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.h ; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.h, v2.h, s0 @@ -755,24 +782,24 @@ define <2 x half> @select_fneg_select_v2f16(<2 x i1> %cond0, <2 x i1> %cond1, <2 ; GFX11-TRUE16-LABEL: select_fneg_select_v2f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-TRUE16-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.h, v5.h, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l +; GFX11-TRUE16-NEXT: v_xor_b32_e32 v1, 0x80008000, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v5.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_xor_b32_e32 v0, 0x80008000, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 
vcc_lo, 1, v0.h +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 1, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 1, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v5.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.l, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.h +; GFX11-TRUE16-NEXT: v_xor_b32_e32 v2, 0x80008000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v2.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v2.l, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: select_fneg_select_v2f16: @@ -861,24 +888,24 @@ define <2 x i16> @select_fneg_xor_select_v2i16(<2 x i1> %cond0, <2 x i1> %cond1, ; GFX11-TRUE16-LABEL: select_fneg_xor_select_v2i16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-TRUE16-NEXT: v_xor_b32_e32 v4, 0x80008000, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.h, v5.h, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l +; GFX11-TRUE16-NEXT: v_xor_b32_e32 v1, 0x80008000, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v5.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 
vcc_lo, 1, v0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_xor_b32_e32 v0, 0x80008000, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 1, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 1, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v5.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.l, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.h +; GFX11-TRUE16-NEXT: v_xor_b32_e32 v2, 0x80008000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v2.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v2.l, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: select_fneg_xor_select_v2i16: diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 20009aee6e7ff..c4a38dcd7b5f3 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -538,9 +538,11 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v3, v2 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB0_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, 0, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v3, v2 -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v5, v4 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.l, v4.l, vcc_lo ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB0_3 ; GFX11-TRUE16-NEXT: s_branch .LBB0_8 @@ -620,9 +622,10 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_ldexp_f32 v2, v3, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2 -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v2, v1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v2, v3 ; GFX11-TRUE16-NEXT: .LBB0_8: ; %Flow19 ; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v0.l ; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v1.l| @@ -767,11 +770,13 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s1, s0 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s1, s0 -; GFX1150-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, v0 -; GFX1150-TRUE16-NEXT: s_cselect_b32 s3, -1, 0 -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1150-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.l, v2.l, s3 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v3, v2 +; 
GFX1150-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.l, v2.l, s2 ; GFX1150-TRUE16-NEXT: s_cbranch_execz .LBB0_3 ; GFX1150-TRUE16-NEXT: s_branch .LBB0_8 ; GFX1150-TRUE16-NEXT: .LBB0_2: @@ -851,11 +856,12 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX1150-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 ; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l ; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v0 +; GFX1150-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v3 ; GFX1150-TRUE16-NEXT: .LBB0_8: ; %Flow19 ; GFX1150-TRUE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x7fff, v0 ; GFX1150-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v1.l @@ -1009,11 +1015,13 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s1, s0 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB0_2 ; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s1, s0 -; GFX1200-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, v0 -; GFX1200-TRUE16-NEXT: s_cselect_b32 s3, -1, 0 -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1200-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.l, v2.l, s3 +; GFX1200-TRUE16-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v3, v2 +; GFX1200-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.l, v2.l, s2 ; 
GFX1200-TRUE16-NEXT: s_cbranch_execz .LBB0_3 ; GFX1200-TRUE16-NEXT: s_branch .LBB0_8 ; GFX1200-TRUE16-NEXT: .LBB0_2: @@ -1096,11 +1104,12 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1 ; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l ; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v0 +; GFX1200-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v3 ; GFX1200-TRUE16-NEXT: .LBB0_8: ; %Flow19 ; GFX1200-TRUE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x7fff, v0 ; GFX1200-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v1.l @@ -5786,9 +5795,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v4, v3 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB9_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v5, v2 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.l, v2.l, vcc_lo ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_3 ; GFX11-TRUE16-NEXT: s_branch .LBB9_8 @@ -5868,36 +5879,39 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo ; GFX11-TRUE16-NEXT: v_ldexp_f32 v2, v3, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v3 ; GFX11-TRUE16-NEXT: .LBB9_8: ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v7, |v3.l| +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v6, |v3.l| ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v5, |v4.l| ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v7, v5 +; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB9_10 ; GFX11-TRUE16-NEXT: ; %bb.9: ; %frem.else20 -; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0x7fff, 0, v3 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v7, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v3.l, v6.l, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0x7fff, v8, v7 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v7.l, vcc_lo ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB9_11 ; GFX11-TRUE16-NEXT: s_branch .LBB9_16 ; GFX11-TRUE16-NEXT: .LBB9_10: -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7 ; GFX11-TRUE16-NEXT: .LBB9_11: ; %frem.compute19 -; GFX11-TRUE16-NEXT: 
v_frexp_mant_f32_e32 v6, v7 -; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v8, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v8, v6 +; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v6, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s2, v8 ; GFX11-TRUE16-NEXT: v_ldexp_f32 v7, v6, 11 ; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v6, v5 ; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v5, v5 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s2, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_ldexp_f32 v6, v6, 1 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v5 ; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, -1, v5 @@ -5963,9 +5977,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc_lo ; GFX11-TRUE16-NEXT: v_ldexp_f32 v5, v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v5.l, v5 -; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0x7fff, v5, v3 +; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0x7fff, v5, v6 ; GFX11-TRUE16-NEXT: .LBB9_16: ; %Flow54 ; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v1.l ; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v0.l| @@ -5976,7 +5991,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: 
v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, 0x7e00, v6.l, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, 0x7e00, v7.l, s2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] @@ -6218,11 +6233,13 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s6, s5 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s6, s5 -; GFX1150-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s4 -; GFX1150-TRUE16-NEXT: s_cselect_b32 s8, -1, 0 -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1150-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, v0.l, s8 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s7, -1, 0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0 +; GFX1150-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, v0.l, s7 ; GFX1150-TRUE16-NEXT: s_cbranch_execz .LBB9_3 ; GFX1150-TRUE16-NEXT: s_branch .LBB9_8 ; GFX1150-TRUE16-NEXT: .LBB9_2: @@ -6302,11 +6319,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 ; GFX1150-TRUE16-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, s4 ; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX1150-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, s4 +; GFX1150-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX1150-TRUE16-NEXT: .LBB9_8: ; GFX1150-TRUE16-NEXT: s_lshr_b32 s6, s4, 16 ; GFX1150-TRUE16-NEXT: s_lshr_b32 s5, s3, 16 @@ -6318,11 +6336,13 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s8, s7 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB9_10 ; GFX1150-TRUE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, s6 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s8, s7 -; GFX1150-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s6 -; GFX1150-TRUE16-NEXT: s_cselect_b32 s10, -1, 0 -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1150-TRUE16-NEXT: v_cndmask_b16 v1.l, s6, v1.l, s10 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v2, v1 +; GFX1150-TRUE16-NEXT: v_cndmask_b16 v1.l, s6, v1.l, s9 ; GFX1150-TRUE16-NEXT: s_cbranch_execz .LBB9_11 ; GFX1150-TRUE16-NEXT: s_branch .LBB9_16 ; GFX1150-TRUE16-NEXT: .LBB9_10: @@ -6402,11 +6422,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 ; GFX1150-TRUE16-NEXT: v_add_f32_e32 v2, v3, v2 ; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, s6 ; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, s6 +; GFX1150-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, 
v1, v2 ; GFX1150-TRUE16-NEXT: .LBB9_16: ; %Flow54 ; GFX1150-TRUE16-NEXT: s_cmp_lg_f16 s3, 0 ; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0 @@ -6682,11 +6703,13 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s6, s5 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB9_2 ; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s6, s5 -; GFX1200-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s4 -; GFX1200-TRUE16-NEXT: s_cselect_b32 s8, -1, 0 -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1200-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, v0.l, s8 +; GFX1200-TRUE16-NEXT: s_cselect_b32 s7, -1, 0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0 +; GFX1200-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, v0.l, s7 ; GFX1200-TRUE16-NEXT: s_cbranch_execz .LBB9_3 ; GFX1200-TRUE16-NEXT: s_branch .LBB9_8 ; GFX1200-TRUE16-NEXT: .LBB9_2: @@ -6769,11 +6792,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, s4 ; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, s4 +; GFX1200-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX1200-TRUE16-NEXT: .LBB9_8: ; GFX1200-TRUE16-NEXT: s_lshr_b32 s6, s4, 16 ; GFX1200-TRUE16-NEXT: s_lshr_b32 s5, s3, 16 @@ 
-6788,11 +6812,13 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s8, s7 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB9_10 ; GFX1200-TRUE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, s6 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s8, s7 -; GFX1200-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s6 -; GFX1200-TRUE16-NEXT: s_cselect_b32 s10, -1, 0 -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1200-TRUE16-NEXT: v_cndmask_b16 v1.l, s6, v1.l, s10 +; GFX1200-TRUE16-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v2, v1 +; GFX1200-TRUE16-NEXT: v_cndmask_b16 v1.l, s6, v1.l, s9 ; GFX1200-TRUE16-NEXT: s_cbranch_execz .LBB9_11 ; GFX1200-TRUE16-NEXT: s_branch .LBB9_16 ; GFX1200-TRUE16-NEXT: .LBB9_10: @@ -6876,11 +6902,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_add_f32_e32 v2, v3, v2 ; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, s6 ; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, s6 +; GFX1200-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, v2 ; GFX1200-TRUE16-NEXT: .LBB9_16: ; %Flow54 ; GFX1200-TRUE16-NEXT: s_cmp_lg_f16 s3, 0 ; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0 @@ -8949,9 +8976,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: 
v_cmp_ngt_f32_e32 vcc_lo, v6, v5 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, 0, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v7, v4 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.l, v4.l, vcc_lo ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_3 ; GFX11-TRUE16-NEXT: s_branch .LBB10_8 @@ -9031,9 +9060,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_ldexp_f32 v4, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v4.l, v4 -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v4, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v4, v5 ; GFX11-TRUE16-NEXT: .LBB10_8: ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 @@ -9044,9 +9074,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v9, v8 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_10 ; GFX11-TRUE16-NEXT: ; %bb.9: ; %frem.else20 -; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0x7fff, 0, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, 0 ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v9, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0x7fff, v10, v7 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v5.l, v7.l, vcc_lo ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_11 ; GFX11-TRUE16-NEXT: s_branch .LBB10_16 @@ -9126,9 +9158,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_ldexp_f32 v7, v8, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v7.l, v7 -; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0x7fff, v7, v5 +; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0x7fff, v7, v8 ; GFX11-TRUE16-NEXT: .LBB10_16: ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v10, |v1.l| ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v9, |v3.l| @@ -9136,9 +9169,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v10, v9 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_18 ; GFX11-TRUE16-NEXT: ; %bb.17: ; %frem.else53 -; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0x7fff, 0, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v10, v9 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0x7fff, v11, v8 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v1.l, v8.l, vcc_lo ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_19 ; GFX11-TRUE16-NEXT: s_branch .LBB10_24 @@ -9218,9 +9253,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_ldexp_f32 v8, v9, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v8.l, v8 -; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0x7fff, v8, v1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0x7fff, v8, v9 ; GFX11-TRUE16-NEXT: .LBB10_24: ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v3 @@ -9231,9 +9267,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v13, v12 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_26 ; GFX11-TRUE16-NEXT: ; %bb.25: ; %frem.else86 -; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0x7fff, 0, v9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, 0 ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v13, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0x7fff, v14, v11 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v9.l, v11.l, vcc_lo ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_27 ; GFX11-TRUE16-NEXT: s_branch .LBB10_32 @@ -9313,9 +9351,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc_lo ; GFX11-TRUE16-NEXT: v_ldexp_f32 v11, v12, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v11.l, v11 -; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0x7fff, v11, v9 +; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0x7fff, v11, v12 ; GFX11-TRUE16-NEXT: .LBB10_32: ; %Flow124 ; GFX11-TRUE16-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0, v2.l ; GFX11-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, 0x7c00, |v0.l| @@ -9778,11 +9817,13 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s8, s6 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v0.l, s5 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s8, s6 -; GFX1150-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s5 -; GFX1150-TRUE16-NEXT: s_cselect_b32 s10, -1, 0 -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1150-TRUE16-NEXT: v_cndmask_b16 v0.l, s5, v0.l, s10 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0 +; GFX1150-TRUE16-NEXT: v_cndmask_b16 v0.l, s5, v0.l, s9 ; GFX1150-TRUE16-NEXT: s_cbranch_execz .LBB10_3 ; GFX1150-TRUE16-NEXT: s_branch .LBB10_8 ; GFX1150-TRUE16-NEXT: .LBB10_2: @@ -9862,11 +9903,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v2 ; GFX1150-TRUE16-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, s5 ; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX1150-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, s5 +; GFX1150-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX1150-TRUE16-NEXT: .LBB10_8: ; GFX1150-TRUE16-NEXT: s_lshr_b32 s8, s5, 16 ; GFX1150-TRUE16-NEXT: s_lshr_b32 s6, s4, 16 @@ -9878,11 +9920,13 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_10 ; GFX1150-TRUE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, s8 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s10, s9 -; GFX1150-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s8 -; GFX1150-TRUE16-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1150-TRUE16-NEXT: v_cndmask_b16 v1.l, s8, v1.l, s12 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s11, -1, 0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v2, v1 +; GFX1150-TRUE16-NEXT: v_cndmask_b16 v1.l, s8, v1.l, s11 ; GFX1150-TRUE16-NEXT: s_cbranch_execz .LBB10_11 ; GFX1150-TRUE16-NEXT: s_branch .LBB10_16 ; GFX1150-TRUE16-NEXT: .LBB10_10: @@ -9962,11 +10006,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v3 ; GFX1150-TRUE16-NEXT: v_add_f32_e32 v2, v3, v2 ; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, s8 ; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, s8 +; GFX1150-TRUE16-NEXT: v_bfi_b32 v1, 
0x7fff, v1, v2 ; GFX1150-TRUE16-NEXT: .LBB10_16: ; GFX1150-TRUE16-NEXT: s_and_b32 s8, s7, 0x7fff ; GFX1150-TRUE16-NEXT: s_and_b32 s9, s2, 0x7fff @@ -9976,11 +10021,13 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_18 ; GFX1150-TRUE16-NEXT: ; %bb.17: ; %frem.else53 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, s7 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s10, s9 -; GFX1150-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, s7 -; GFX1150-TRUE16-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1150-TRUE16-NEXT: v_cndmask_b16 v2.l, s7, v2.l, s12 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s11, -1, 0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v3, v2 +; GFX1150-TRUE16-NEXT: v_cndmask_b16 v2.l, s7, v2.l, s11 ; GFX1150-TRUE16-NEXT: s_cbranch_execz .LBB10_19 ; GFX1150-TRUE16-NEXT: s_branch .LBB10_24 ; GFX1150-TRUE16-NEXT: .LBB10_18: @@ -10060,11 +10107,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v4 ; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, s7 ; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, s7 +; GFX1150-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v3 ; GFX1150-TRUE16-NEXT: .LBB10_24: ; GFX1150-TRUE16-NEXT: s_lshr_b32 s10, s7, 16 ; 
GFX1150-TRUE16-NEXT: s_lshr_b32 s9, s2, 16 @@ -10076,11 +10124,13 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s12, s11 ; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_26 ; GFX1150-TRUE16-NEXT: ; %bb.25: ; %frem.else86 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, s10 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s12, s11 -; GFX1150-TRUE16-NEXT: v_bfi_b32 v3, 0x7fff, 0, s10 -; GFX1150-TRUE16-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1150-TRUE16-NEXT: v_cndmask_b16 v3.l, s10, v3.l, s14 +; GFX1150-TRUE16-NEXT: s_cselect_b32 s13, -1, 0 +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: v_bfi_b32 v3, 0x7fff, v4, v3 +; GFX1150-TRUE16-NEXT: v_cndmask_b16 v3.l, s10, v3.l, s13 ; GFX1150-TRUE16-NEXT: s_cbranch_execz .LBB10_27 ; GFX1150-TRUE16-NEXT: s_branch .LBB10_32 ; GFX1150-TRUE16-NEXT: .LBB10_26: @@ -10160,11 +10210,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1150-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0, v5 ; GFX1150-TRUE16-NEXT: v_add_f32_e32 v4, v5, v4 ; GFX1150-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1150-TRUE16-NEXT: v_ldexp_f32 v3, v4, v3 +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v4.l, s10 ; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v3.l, v3 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_bfi_b32 v3, 0x7fff, v3, s10 +; GFX1150-TRUE16-NEXT: v_bfi_b32 v3, 0x7fff, v3, v4 ; GFX1150-TRUE16-NEXT: .LBB10_32: ; %Flow124 ; GFX1150-TRUE16-NEXT: s_cmp_lg_f16 s4, 0 ; GFX1150-TRUE16-NEXT: s_cselect_b32 s4, -1, 0 @@ -10674,11 +10725,13 @@ define 
amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s8, s6 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_2 ; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v0.l, s5 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s8, s6 -; GFX1200-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s5 -; GFX1200-TRUE16-NEXT: s_cselect_b32 s10, -1, 0 -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1200-TRUE16-NEXT: v_cndmask_b16 v0.l, s5, v0.l, s10 +; GFX1200-TRUE16-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0 +; GFX1200-TRUE16-NEXT: v_cndmask_b16 v0.l, s5, v0.l, s9 ; GFX1200-TRUE16-NEXT: s_cbranch_execz .LBB10_3 ; GFX1200-TRUE16-NEXT: s_branch .LBB10_8 ; GFX1200-TRUE16-NEXT: .LBB10_2: @@ -10761,11 +10814,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_add_f32_e32 v1, v2, v1 ; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, s5 ; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, s5 +; GFX1200-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX1200-TRUE16-NEXT: .LBB10_8: ; GFX1200-TRUE16-NEXT: s_lshr_b32 s8, s5, 16 ; GFX1200-TRUE16-NEXT: s_lshr_b32 s6, s4, 16 @@ -10780,11 +10834,13 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9 ; 
GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_10 ; GFX1200-TRUE16-NEXT: ; %bb.9: ; %frem.else20 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, s8 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s10, s9 -; GFX1200-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s8 -; GFX1200-TRUE16-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1200-TRUE16-NEXT: v_cndmask_b16 v1.l, s8, v1.l, s12 +; GFX1200-TRUE16-NEXT: s_cselect_b32 s11, -1, 0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v2, v1 +; GFX1200-TRUE16-NEXT: v_cndmask_b16 v1.l, s8, v1.l, s11 ; GFX1200-TRUE16-NEXT: s_cbranch_execz .LBB10_11 ; GFX1200-TRUE16-NEXT: s_branch .LBB10_16 ; GFX1200-TRUE16-NEXT: .LBB10_10: @@ -10868,11 +10924,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_add_f32_e32 v2, v3, v2 ; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, s8 ; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, s8 +; GFX1200-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, v2 ; GFX1200-TRUE16-NEXT: .LBB10_16: ; GFX1200-TRUE16-NEXT: s_and_b32 s8, s7, 0x7fff ; GFX1200-TRUE16-NEXT: s_and_b32 s9, s2, 0x7fff @@ -10884,12 +10941,14 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_18 ; GFX1200-TRUE16-NEXT: ; %bb.17: ; %frem.else53 +; 
GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, s7 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s10, s9 -; GFX1200-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, s7 -; GFX1200-TRUE16-NEXT: s_cselect_b32 s12, -1, 0 +; GFX1200-TRUE16-NEXT: s_cselect_b32 s11, -1, 0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v3, v2 ; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_cndmask_b16 v2.l, s7, v2.l, s12 +; GFX1200-TRUE16-NEXT: v_cndmask_b16 v2.l, s7, v2.l, s11 ; GFX1200-TRUE16-NEXT: s_cbranch_execz .LBB10_19 ; GFX1200-TRUE16-NEXT: s_branch .LBB10_24 ; GFX1200-TRUE16-NEXT: .LBB10_18: @@ -10973,11 +11032,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 ; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v2, v3, v2 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v3.l, s7 ; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, s7 +; GFX1200-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v3 ; GFX1200-TRUE16-NEXT: .LBB10_24: ; GFX1200-TRUE16-NEXT: s_lshr_b32 s10, s7, 16 ; GFX1200-TRUE16-NEXT: s_lshr_b32 s9, s2, 16 @@ -10992,11 +11052,13 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s12, s11 ; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_26 ; GFX1200-TRUE16-NEXT: ; %bb.25: ; %frem.else86 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v3.l, s10 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; 
GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s12, s11 -; GFX1200-TRUE16-NEXT: v_bfi_b32 v3, 0x7fff, 0, s10 -; GFX1200-TRUE16-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1200-TRUE16-NEXT: v_cndmask_b16 v3.l, s10, v3.l, s14 +; GFX1200-TRUE16-NEXT: s_cselect_b32 s13, -1, 0 +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: v_bfi_b32 v3, 0x7fff, v4, v3 +; GFX1200-TRUE16-NEXT: v_cndmask_b16 v3.l, s10, v3.l, s13 ; GFX1200-TRUE16-NEXT: s_cbranch_execz .LBB10_27 ; GFX1200-TRUE16-NEXT: s_branch .LBB10_32 ; GFX1200-TRUE16-NEXT: .LBB10_26: @@ -11080,11 +11142,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1200-TRUE16-NEXT: v_add_f32_e32 v4, v5, v4 ; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX1200-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-TRUE16-NEXT: v_ldexp_f32 v3, v4, v3 +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v4.l, s10 ; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v3.l, v3 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_bfi_b32 v3, 0x7fff, v3, s10 +; GFX1200-TRUE16-NEXT: v_bfi_b32 v3, 0x7fff, v3, v4 ; GFX1200-TRUE16-NEXT: .LBB10_32: ; %Flow124 ; GFX1200-TRUE16-NEXT: s_cmp_lg_f16 s4, 0 ; GFX1200-TRUE16-NEXT: s_cselect_b32 s4, -1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index 1602e31d6147c..4aba073d70cff 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -4393,21 +4393,37 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(ptr addrspace(1 ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: ; return to shader part epilog ; -; 
GFX11-LABEL: global_load_saddr_i16_d16hi_zero_hi: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v1 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: global_load_saddr_i16_d16hi_zero_hi: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-TRUE16-NEXT: ; return to shader part epilog ; -; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_zero_hi: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-SDAG-NEXT: ; return to shader part epilog +; GFX11-FAKE16-LABEL: global_load_saddr_i16_d16hi_zero_hi: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-TRUE16-LABEL: global_load_saddr_i16_d16hi_zero_hi: +; GFX12-SDAG-TRUE16: ; %bb.0: +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX12-SDAG-TRUE16-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] +; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-SDAG-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-FAKE16-LABEL: global_load_saddr_i16_d16hi_zero_hi: +; GFX12-SDAG-FAKE16: ; %bb.0: +; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-SDAG-FAKE16-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] +; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-SDAG-FAKE16-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-TRUE16-LABEL: 
global_load_saddr_i16_d16hi_zero_hi: ; GFX12-GISEL-TRUE16: ; %bb.0: @@ -4439,21 +4455,37 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(ptr a ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v1 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-TRUE16-NEXT: ; return to shader part epilog ; -; GFX12-SDAG-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-SDAG-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128 -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v1 -; GFX12-SDAG-NEXT: ; return to shader part epilog +; GFX11-FAKE16-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-FAKE16-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-TRUE16-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128: +; GFX12-SDAG-TRUE16: ; %bb.0: +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX12-SDAG-TRUE16-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128 +; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-SDAG-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-FAKE16-LABEL: 
global_load_saddr_i16_d16hi_zero_hi_immneg128: +; GFX12-SDAG-FAKE16: ; %bb.0: +; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-SDAG-FAKE16-NEXT: global_load_d16_hi_b16 v1, v0, s[2:3] offset:-128 +; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-SDAG-FAKE16-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128: ; GFX12-GISEL-TRUE16: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll index 7685f7384683b..492bd1b508bc6 100644 --- a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll @@ -231,27 +231,28 @@ define <2 x bfloat> @v_uitofp_v2i1_to_v2bf16(<2 x i1> %num) { ; GFX11-TRUE16-LABEL: v_uitofp_v2i1_to_v2bf16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | 
instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_uitofp_v2i1_to_v2bf16: @@ -287,31 +288,31 @@ define <2 x bfloat> @v_uitofp_v2i1_to_v2bf16(<2 x i1> %num) { ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo -; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) 
| instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX12-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l +; GFX12-TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_uitofp_v2i1_to_v2bf16: @@ -528,37 +529,37 @@ define 
<3 x bfloat> @v_uitofp_v3i1_to_v3bf16(<3 x i1> %num) { ; GFX11-TRUE16-LABEL: v_uitofp_v3i1_to_v3bf16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 1, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 1, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v1.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, 
v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_uitofp_v3i1_to_v3bf16: @@ -605,42 +606,42 @@ define <3 x bfloat> @v_uitofp_v3i1_to_v3bf16(<3 x i1> %num) { ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX12-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l +; GFX12-TRUE16-NEXT: v_and_b16 v0.h, 1, v2.l +; GFX12-TRUE16-NEXT: v_and_b16 v1.l, 1, v1.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; 
GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v1.l +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 ; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h +; 
GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_uitofp_v3i1_to_v3bf16: @@ -924,42 +925,42 @@ define <4 x bfloat> @v_uitofp_v4i1_to_v4bf16(<4 x i1> %num) { ; GFX11-TRUE16-LABEL: v_uitofp_v4i1_to_v4bf16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 1, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 1, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v2.l ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff ; 
GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v6, 0, 1.0, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v9, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v9, v0, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h ; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e32 v0, v5, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h @@ -1019,48 +1020,48 @@ define <4 x bfloat> @v_uitofp_v4i1_to_v4bf16(<4 x i1> %num) { ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX12-TRUE16-NEXT: v_and_b16 v0.h, 1, v3.l +; GFX12-TRUE16-NEXT: v_and_b16 v2.l, 1, v2.l +; GFX12-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h +; GFX12-TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v2.l ; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX12-TRUE16-NEXT: 
v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h ; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3 ; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v6, 0, 1.0, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v9, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v9, v0, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo @@ -1554,27 +1555,28 @@ define <2 x bfloat> @v_sitofp_v2i1_to_v2bf16(<2 x 
i1> %num) { ; GFX11-TRUE16-LABEL: v_sitofp_v2i1_to_v2bf16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, 
v0 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_sitofp_v2i1_to_v2bf16: @@ -1610,31 +1612,31 @@ define <2 x bfloat> @v_sitofp_v2i1_to_v2bf16(<2 x i1> %num) { ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo -; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX12-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l +; GFX12-TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, 
v1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo +; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_sitofp_v2i1_to_v2bf16: @@ -1853,37 +1855,37 @@ define <3 x bfloat> @v_sitofp_v3i1_to_v3bf16(<3 x i1> %num) { ; GFX11-TRUE16-LABEL: v_sitofp_v3i1_to_v3bf16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 1, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 1, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v1.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_sitofp_v3i1_to_v3bf16: @@ -1930,42 +1932,42 @@ define <3 x bfloat> @v_sitofp_v3i1_to_v3bf16(<3 x i1> %num) { ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX12-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l +; GFX12-TRUE16-NEXT: v_and_b16 v0.h, 1, v2.l +; GFX12-TRUE16-NEXT: v_and_b16 v1.l, 1, v1.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h +; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v1.l +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2 +; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-TRUE16-NEXT: 
v_bfe_u32 v4, v2, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 ; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_sitofp_v3i1_to_v3bf16: @@ -2252,42 +2254,42 @@ define <4 x bfloat> @v_sitofp_v4i1_to_v4bf16(<4 x i1> %num) { ; GFX11-TRUE16-LABEL: v_sitofp_v4i1_to_v4bf16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 1, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 1, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v2.l ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v6, 0, -1.0, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo ; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v9, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v9, v0, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h @@ -2347,48 +2349,48 @@ define <4 x bfloat> @v_sitofp_v4i1_to_v4bf16(<4 x i1> %num) { ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX12-TRUE16-NEXT: v_and_b16 v0.h, 1, v3.l +; GFX12-TRUE16-NEXT: v_and_b16 v2.l, 1, v2.l +; GFX12-TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l +; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h +; GFX12-TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v2.l ; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.l +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 1, v0.h ; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3 ; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v6, 0, -1.0, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff +; 
GFX12-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v9, v6, 0x7fff +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v9, v0, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll index 42671f9dd6747..b241b9b800d2a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll @@ -830,21 +830,22 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 { ; ; GFX11-TRUE16-LABEL: round_f16: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, s2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v1.l, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v0.h, s2, v0.l -; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e64 s0, |v0.h|, 0.5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0, 0x3c00, s0 -; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e64 s3, |v0.h|, 0.5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, 0, 0x3c00, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v2, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -991,32 +992,33 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 { ; ; GFX11-TRUE16-LABEL: round_v2f16: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16 -; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, s2 -; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, s3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_sub_f16_e32 v1.h, s2, v0.h -; GFX11-TRUE16-NEXT: v_sub_f16_e32 v1.l, s3, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e64 s1, |v1.h|, 0.5 -; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e64 s0, |v1.l|, 0.5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, 0, 0x3c00, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0, 0x3c00, s0 -; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, s2 -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, s3 +; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v1.l, s2, v0.l +; GFX11-TRUE16-NEXT: v_sub_f16_e32 v1.h, s3, v0.h ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e64 s6, |v1.l|, 0.5 +; GFX11-TRUE16-NEXT: v_cmp_ge_f16_e64 s7, |v1.h|, 0.5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, 0, 0x3c00, s6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, 0, 0x3c00, s7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v2, v1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.h, v0.h, v2.l ; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.h, v0.h, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l -; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/lround.ll b/llvm/test/CodeGen/AMDGPU/lround.ll index 
8036e32f90eb0..d8d8308f6cd8a 100644 --- a/llvm/test/CodeGen/AMDGPU/lround.ll +++ b/llvm/test/CodeGen/AMDGPU/lround.ll @@ -829,9 +829,7 @@ define half @intrinsic_fround_half(half %arg) { ; GFX11-SDAG-TRUE16-NEXT: v_cmp_ge_f16_e64 s0, |v1.l|, 0.5 ; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, 0, 0x3c00, s0 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0 ; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, v1.h, v0.l ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -952,12 +950,10 @@ define i32 @intrinsic_lround_i32_f16(half %arg) { ; GFX11-SDAG-TRUE16-NEXT: v_cmp_ge_f16_e64 s0, |v1.l|, 0.5 ; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, 0, 0x3c00, s0 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0 ; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, v1.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.gfx11plus.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.gfx11plus.ll new file mode 100644 index 0000000000000..61a13f54fae79 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.gfx11plus.ll @@ -0,0 +1,63 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py 
UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -stop-after=amdgpu-isel | FileCheck %s --check-prefixes=GFX11,GFX11-REAL16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -stop-after=amdgpu-isel | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16 + +; Make sure no "vgpr32 = copy vgpr16" is generated in true16 mode + +define amdgpu_kernel void @scalar_to_vector_i32 (ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %out) { + ; GFX11-REAL16-LABEL: name: scalar_to_vector_i32 + ; GFX11-REAL16: bb.0.entry: + ; GFX11-REAL16-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 + ; GFX11-REAL16-NEXT: {{ $}} + ; GFX11-REAL16-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GFX11-REAL16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GFX11-REAL16-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4) + ; GFX11-REAL16-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4) + ; GFX11-REAL16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX11-REAL16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023 + ; GFX11-REAL16-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec + ; GFX11-REAL16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2 + ; GFX11-REAL16-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_AND_B32_e64_]], implicit $exec + ; GFX11-REAL16-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[S_LOAD_DWORDX2_IMM]], killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s32) from %ir.in.gep1, addrspace 1) + ; GFX11-REAL16-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = 
COPY [[GLOBAL_LOAD_DWORD_SADDR]].hi16 + ; GFX11-REAL16-NEXT: [[V_MOV_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, 0, 0, implicit $exec + ; GFX11-REAL16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[COPY2]], %subreg.lo16, killed [[V_MOV_B16_t16_e64_]], %subreg.hi16 + ; GFX11-REAL16-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE]].lo16 + ; GFX11-REAL16-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[GLOBAL_LOAD_DWORD_SADDR]] + ; GFX11-REAL16-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY3]] + ; GFX11-REAL16-NEXT: [[S_PACK_HL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_HL_B32_B16 [[COPY4]], killed [[COPY5]] + ; GFX11-REAL16-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_PACK_HL_B32_B16_]], %subreg.sub0, [[S_PACK_HL_B32_B16_]], %subreg.sub1 + ; GFX11-REAL16-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] + ; GFX11-REAL16-NEXT: GLOBAL_STORE_DWORDX2_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY6]], killed [[S_LOAD_DWORDX2_IMM1]], 0, 0, implicit $exec :: (store (s64) into %ir.out.load, addrspace 1) + ; GFX11-REAL16-NEXT: S_ENDPGM 0 + ; + ; GFX11-FAKE16-LABEL: name: scalar_to_vector_i32 + ; GFX11-FAKE16: bb.0.entry: + ; GFX11-FAKE16-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.a.kernarg.offset, align 4, addrspace 4) + ; GFX11-FAKE16-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 52, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4) + ; GFX11-FAKE16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023 + ; GFX11-FAKE16-NEXT: 
[[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec + ; GFX11-FAKE16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2 + ; GFX11-FAKE16-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_AND_B32_e64_]], implicit $exec + ; GFX11-FAKE16-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[S_LOAD_DWORDX2_IMM]], killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s32) from %ir.in.gep1, addrspace 1) + ; GFX11-FAKE16-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 117834498 + ; GFX11-FAKE16-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 [[GLOBAL_LOAD_DWORD_SADDR]], [[GLOBAL_LOAD_DWORD_SADDR]], killed [[S_MOV_B32_2]], implicit $exec + ; GFX11-FAKE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[V_PERM_B32_e64_]], %subreg.sub0, [[V_PERM_B32_e64_]], %subreg.sub1 + ; GFX11-FAKE16-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] + ; GFX11-FAKE16-NEXT: GLOBAL_STORE_DWORDX2_SADDR killed [[V_MOV_B32_e32_]], killed [[COPY2]], killed [[S_LOAD_DWORDX2_IMM1]], 0, 0, implicit $exec :: (store (s64) into %ir.out.load, addrspace 1) + ; GFX11-FAKE16-NEXT: S_ENDPGM 0 +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %in.gep1 = getelementptr i32, ptr addrspace(1) %a, i32 %tid + %tmp1 = load i32, ptr addrspace(1) %in.gep1, align 4 + %bc = bitcast i32 %tmp1 to <2 x i16> + %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> poison, <4 x i32> + store <4 x i16> %tmp2, ptr addrspace(1) %out, align 8 + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll index 7d98f7f1706b2..65a99d0d097f9 100644 --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -137,25 +137,45 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; GFX10-NEXT: ds_write_b32 v1, v3 ; GFX10-NEXT: 
s_endpgm ; -; GFX11-LABEL: local_store_i55: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s3, s1, 0xffff -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: global_load_d16_hi_u8 v0, v0, s[4:5] offset:14 -; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v0, s3, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 -; GFX11-NEXT: ds_store_b8_d16_hi v1, v0 offset:6 -; GFX11-NEXT: ds_store_b16 v1, v2 offset:4 -; GFX11-NEXT: ds_store_b32 v1, v3 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: local_store_i55: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-TRUE16-NEXT: global_load_d16_hi_u8 v1, v0, s[4:5] offset:14 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_and_b32 s3, s1, 0xffff +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, s3, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x7fffff, v0 +; GFX11-TRUE16-NEXT: ds_store_b8_d16_hi v1, v0 offset:6 +; GFX11-TRUE16-NEXT: ds_store_b16 v1, v2 offset:4 +; GFX11-TRUE16-NEXT: ds_store_b32 v1, v3 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: local_store_i55: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: 
s_and_b32 s3, s1, 0xffff +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-FAKE16-NEXT: global_load_d16_hi_u8 v0, v0, s[4:5] offset:14 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, s3, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fffff, v0 +; GFX11-FAKE16-NEXT: ds_store_b8_d16_hi v1, v0 offset:6 +; GFX11-FAKE16-NEXT: ds_store_b16 v1, v2 offset:4 +; GFX11-FAKE16-NEXT: ds_store_b32 v1, v3 +; GFX11-FAKE16-NEXT: s_endpgm store i55 %arg, ptr addrspace(3) %ptr, align 8 ret void }