diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index d4210b8bc9a87..9654a6e1fbd5f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -4127,6 +4127,30 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
   return true;
 }
 
+/// Runs SelectVOP3PMadMixModsImpl on \p In with MVT::f16, then returns the
+/// collected source modifiers in \p SrcMods with SISrcMods::NEG toggled.
+/// Always reports a match.
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsNeg(SDValue In, SDValue &Src,
+                                                  SDValue &SrcMods) const {
+  unsigned ModBits = 0;
+  SelectVOP3PMadMixModsImpl(In, Src, ModBits, MVT::f16);
+  const unsigned Negated = ModBits ^ SISrcMods::NEG;
+  SrcMods = CurDAG->getTargetConstant(Negated, SDLoc(In), MVT::i32);
+  return true;
+}
+
+/// Runs SelectVOP3PMadMixModsImpl on \p In with MVT::bf16, then returns the
+/// collected source modifiers in \p SrcMods with SISrcMods::NEG toggled.
+/// Always reports a match.
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsNeg(SDValue In, SDValue &Src,
+                                                      SDValue &SrcMods) const {
+  unsigned ModBits = 0;
+  SelectVOP3PMadMixModsImpl(In, Src, ModBits, MVT::bf16);
+  const unsigned Negated = ModBits ^ SISrcMods::NEG;
+  SrcMods = CurDAG->getTargetConstant(Negated, SDLoc(In), MVT::i32);
+  return true;
+}
+
 // Match BITOP3 operation and return a number of matched instructions plus
 // truth table.
static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 4fa0d3f72e1c7..b122b5cd310b3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -272,6 +272,14 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   bool SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
                                  SDValue &SrcMods) const;
 
+  // f16 mad-mix modifier selector whose resulting modifiers have NEG toggled.
+  bool SelectVOP3PMadMixModsNeg(SDValue In, SDValue &Src,
+                                SDValue &SrcMods) const;
+
+  // bf16 mad-mix modifier selector whose resulting modifiers have NEG toggled.
+  bool SelectVOP3PMadMixBF16ModsNeg(SDValue In, SDValue &Src,
+                                    SDValue &SrcMods) const;
+
   bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
                     SDValue &Tbl) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index fb2cd04b364d7..af6d4ff319fd9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1704,6 +1704,8 @@ def VOP3PMadMixModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixModsExt">;
 def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;
 def VOP3PMadMixBF16ModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16ModsExt">;
 def VOP3PMadMixBF16Mods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16Mods">;
+def VOP3PMadMixModsNeg : ComplexPattern<untyped, 2, "SelectVOP3PMadMixModsNeg">;
+def VOP3PMadMixBF16ModsNeg : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16ModsNeg">;
 
 def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">;
 def VINTERPModsHi : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index f7279b664ed27..5ea8dbe9a1b7f 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -168,6 +168,7 @@ multiclass MadFmaMixPats {
   defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
   defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt);
+  defvar VOP3PMadMixModsNegPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsNeg, VOP3PMadMixModsNeg);
 
   // At least one of the operands needs to be an fpextend of an f16
   // for this to be worthwhile, so we need three patterns here.
// TODO: Could we use a predicate to inspect src1/2/3 instead?
@@ -190,6 +191,42 @@ multiclass MadFmaMixPats;
+  // fadd(ext(a), b): emitted as mix(a, K, b), with K = (i32 1) as the
+  // multiplier operand.
+  // NOTE(review): (i32 1) looks like the integer inline constant 1 -- the
+  // updated CHECK lines print it as `1`, not `1.0`. For a*K+b to equal the
+  // fadd, K has to be the f32 value 1.0; confirm, and if so use the FP32 one
+  // bit pattern in every fadd/fsub pattern here and regenerate the tests.
+  def : GCNPat <
+    (f32 (fadd (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
+               (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)))),
+    (mix_inst $src0_mods, $src0, (i32 0), (i32 1), $src1_mods, $src1,
+              DSTCLAMP.NONE)>;
+
+  // fmul(ext(a), b): emitted as mix(a, b, 0), i.e. with a zero addend.
+  def : GCNPat <
+    (f32 (fmul (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
+               (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)))),
+    (mix_inst $src0_mods, $src0, $src1_mods, $src1, (i32 0), (i32 0),
+              DSTCLAMP.NONE)>;
+
+  // fsub(ext(a), b): emitted as mix(a, K, -b). The Neg selector toggles NEG
+  // in the modifiers it returns for b.
+  def : GCNPat <
+    (f32 (fsub (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
+               (f32 (VOP3PMadMixModsNegPat f32:$src1, i32:$src1_mods)))),
+    (mix_inst $src0_mods, $src0, (i32 0), (i32 1), $src1_mods, $src1,
+              DSTCLAMP.NONE)>;
+
+  // fsub(a, ext(b)): the subtrahend is the extended operand, so negate the
+  // constant multiplier instead and emit mix(b, -K, a). Toggling NEG on a
+  // (the minuend) would compute b - a.
+  def : GCNPat <
+    (f32 (fsub (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_mods)),
+               (f32 (VOP3PMadMixModsExtPat VT:$src1, i32:$src1_mods)))),
+    (mix_inst $src1_mods, $src1, (i32 SRCMODS.NEG), (i32 1), $src0_mods, $src0,
+              DSTCLAMP.NONE)>;
+
   def : GCNPat <
     (AMDGPUclamp (build_vector
       (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)),
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll
index b2b433167fe4d..f9b63ef8e96e5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll
@@ -53,16 +53,14 @@ define amdgpu_vs float @test_f16_f32_add_ext_fma_mul(half %x, half %y, float %z,
 ; GFX10: ; %bb.0: ; %.entry
 ; GFX10-NEXT: v_mul_f16_e32 v3, v3, v4
 ; GFX10-NEXT: v_fmac_f16_e32 v3, v0, v1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v3
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_fma_mix_f32 v0, v3, 1, v2 op_sel_hi:[1,0,0]
 ; GFX10-NEXT: ; return to shader part epilog
 ;
 ; GFX10-CONTRACT-LABEL: test_f16_f32_add_ext_fma_mul:
 ; GFX10-CONTRACT: ; %bb.0: ; %.entry
 ;
GFX10-CONTRACT-NEXT: v_mul_f16_e32 v3, v3, v4 ; GFX10-CONTRACT-NEXT: v_fmac_f16_e32 v3, v0, v1 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v3, 1, v2 op_sel_hi:[1,0,0] ; GFX10-CONTRACT-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_f16_f32_add_ext_fma_mul: @@ -70,8 +68,7 @@ define amdgpu_vs float @test_f16_f32_add_ext_fma_mul(half %x, half %y, float %z, ; GFX10-DENORM-NEXT: v_mul_f16_e32 v3, v3, v4 ; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1 ; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v0, v3 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v0, 1, v2 op_sel_hi:[1,0,0] ; GFX10-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul half %u, %v @@ -129,16 +126,14 @@ define amdgpu_vs float @test_f16_f32_add_ext_fma_mul_rhs(float %x, half %y, half ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: v_mul_f16_e32 v3, v3, v4 ; GFX10-NEXT: v_fmac_f16_e32 v3, v1, v2 -; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_fma_mix_f32 v0, v3, 1, v0 op_sel_hi:[1,0,0] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10-CONTRACT-LABEL: test_f16_f32_add_ext_fma_mul_rhs: ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: v_mul_f16_e32 v3, v3, v4 ; GFX10-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v3, 1, v0 op_sel_hi:[1,0,0] ; GFX10-CONTRACT-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_f16_f32_add_ext_fma_mul_rhs: @@ -146,8 +141,7 @@ define amdgpu_vs float @test_f16_f32_add_ext_fma_mul_rhs(float %x, half %y, half ; GFX10-DENORM-NEXT: v_mul_f16_e32 v3, v3, v4 ; GFX10-DENORM-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-NEXT: v_add_f16_e32 v1, v1, v3 -; GFX10-DENORM-NEXT: 
v_cvt_f32_f16_e32 v1, v1 -; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v1, 1, v0 op_sel_hi:[1,0,0] ; GFX10-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul half %u, %v @@ -230,48 +224,36 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul(<4 x half> %x, <4 ; GFX9-DENORM-NEXT: v_pk_mul_f16 v9, v9, v11 ; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v0, v8 -; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v1, v9 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v1 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-DENORM-NEXT: v_add_f32_e32 v0, v2, v4 -; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v3, v5 -; GFX9-DENORM-NEXT: v_add_f32_e32 v2, v8, v6 -; GFX9-DENORM-NEXT: v_add_f32_e32 v3, v9, v7 +; GFX9-DENORM-NEXT: v_pk_add_f16 v2, v0, v8 +; GFX9-DENORM-NEXT: v_pk_add_f16 v3, v1, v9 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v2, 1, v4 op_sel_hi:[1,0,0] +; GFX9-DENORM-NEXT: v_mad_mix_f32 v1, v2, 1, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v3, 1, v6 op_sel_hi:[1,0,0] +; GFX9-DENORM-NEXT: v_mad_mix_f32 v3, v3, 1, v7 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: test_v4f16_v4f32_add_ext_fma_mul: ; GFX10: ; %bb.0: ; %.entry ; GFX10-NEXT: v_pk_mul_f16 v8, v8, v10 ; GFX10-NEXT: v_pk_mul_f16 v9, v9, v11 -; GFX10-NEXT: v_pk_fma_f16 v0, v0, v2, v8 -; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v9 -; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX10-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v8, v1 -; GFX10-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_add_f32_e32 
v0, v2, v4 -; GFX10-NEXT: v_add_f32_e32 v1, v3, v5 -; GFX10-NEXT: v_add_f32_e32 v2, v8, v6 -; GFX10-NEXT: v_add_f32_e32 v3, v9, v7 +; GFX10-NEXT: v_pk_fma_f16 v2, v0, v2, v8 +; GFX10-NEXT: v_pk_fma_f16 v3, v1, v3, v9 +; GFX10-NEXT: v_fma_mix_f32 v0, v2, 1, v4 op_sel_hi:[1,0,0] +; GFX10-NEXT: v_fma_mix_f32 v1, v2, 1, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-NEXT: v_fma_mix_f32 v2, v3, 1, v6 op_sel_hi:[1,0,0] +; GFX10-NEXT: v_fma_mix_f32 v3, v3, 1, v7 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10-CONTRACT-LABEL: test_v4f16_v4f32_add_ext_fma_mul: ; GFX10-CONTRACT: ; %bb.0: ; %.entry ; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v8, v8, v10 ; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v9, v9, v11 -; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v8 -; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v9 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v8, v1 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v2, v4 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v3, v5 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v8, v6 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v9, v7 +; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v2, v0, v2, v8 +; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v3, v1, v3, v9 +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v2, 1, v4 op_sel_hi:[1,0,0] +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v1, v2, 1, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v2, v3, 1, v6 op_sel_hi:[1,0,0] +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v3, v3, 1, v7 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX10-CONTRACT-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_v4f16_v4f32_add_ext_fma_mul: @@ -280,16 +262,12 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul(<4 x half> %x, <4 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, 
v0, v2 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v2, v9, v11 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 -; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v0, v8 -; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v1, v2 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v2, v4 -; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v3, v5 -; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v8, v6 -; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v9, v7 +; GFX10-DENORM-NEXT: v_pk_add_f16 v3, v0, v8 +; GFX10-DENORM-NEXT: v_pk_add_f16 v8, v1, v2 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v3, 1, v4 op_sel_hi:[1,0,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v1, v3, 1, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v8, 1, v6 op_sel_hi:[1,0,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, v8, 1, v7 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX10-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul <4 x half> %u, %v @@ -374,14 +352,10 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul_rhs(<4 x float> % ; GFX9-DENORM-NEXT: v_pk_mul_f16 v5, v5, v7 ; GFX9-DENORM-NEXT: v_pk_add_f16 v4, v4, v8 ; GFX9-DENORM-NEXT: v_pk_add_f16 v5, v5, v9 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v6, v4 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v5 -; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-DENORM-NEXT: v_add_f32_e32 v0, v0, v6 -; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX9-DENORM-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX9-DENORM-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v4, 1, v0 op_sel_hi:[1,0,0] +; GFX9-DENORM-NEXT: v_mad_mix_f32 v1, v4, 1, v1 op_sel:[1,0,0] 
op_sel_hi:[1,0,0] +; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v5, 1, v2 op_sel_hi:[1,0,0] +; GFX9-DENORM-NEXT: v_mad_mix_f32 v3, v5, 1, v3 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX9-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: test_v4f16_v4f32_add_ext_fma_mul_rhs: @@ -390,14 +364,10 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul_rhs(<4 x float> % ; GFX10-NEXT: v_pk_mul_f16 v9, v9, v11 ; GFX10-NEXT: v_pk_fma_f16 v4, v4, v6, v8 ; GFX10-NEXT: v_pk_fma_f16 v5, v5, v7, v9 -; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v4 -; GFX10-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v5 -; GFX10-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v6 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX10-NEXT: v_fma_mix_f32 v0, v4, 1, v0 op_sel_hi:[1,0,0] +; GFX10-NEXT: v_fma_mix_f32 v1, v4, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-NEXT: v_fma_mix_f32 v2, v5, 1, v2 op_sel_hi:[1,0,0] +; GFX10-NEXT: v_fma_mix_f32 v3, v5, 1, v3 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX10-CONTRACT-LABEL: test_v4f16_v4f32_add_ext_fma_mul_rhs: @@ -406,14 +376,10 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul_rhs(<4 x float> % ; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v9, v9, v11 ; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v4, v4, v6, v8 ; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v5, v5, v7, v9 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v6, v4 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v7, v5 -; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v6 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 
v2, v2, v7 -; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v4, 1, v0 op_sel_hi:[1,0,0] +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v1, v4, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v2, v5, 1, v2 op_sel_hi:[1,0,0] +; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v3, v5, 1, v3 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX10-CONTRACT-NEXT: ; return to shader part epilog ; ; GFX10-DENORM-LABEL: test_v4f16_v4f32_add_ext_fma_mul_rhs: @@ -424,14 +390,10 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul_rhs(<4 x float> % ; GFX10-DENORM-NEXT: v_pk_mul_f16 v5, v5, v7 ; GFX10-DENORM-NEXT: v_pk_add_f16 v4, v4, v8 ; GFX10-DENORM-NEXT: v_pk_add_f16 v5, v5, v6 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v6, v4 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v5 -; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v6 -; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v4, 1, v0 op_sel_hi:[1,0,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v1, v4, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v5, 1, v2 op_sel_hi:[1,0,0] +; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, v5, 1, v3 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX10-DENORM-NEXT: ; return to shader part epilog .entry: %a = fmul <4 x half> %u, %v diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll index 4d603f7487754..26f8e41c9351a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll @@ -49,21 +49,16 @@ define amdgpu_vs <5 x float> @test_5xf16_5xf32_add_ext_mul(<5 
x half> inreg %x, ; GFX9-FAST-DENORM-LABEL: test_5xf16_5xf32_add_ext_mul: ; GFX9-FAST-DENORM: ; %bb.0: ; %.entry ; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s0, v0 -; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v1, s1, v1 -; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s6, v3 -; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v1, s7, v4 -; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v2, s8, v5 -; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s9, v6 -; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v4, s10, v7 +; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v1, s0, v0 +; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v3, s1, v0 +; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v4, s2, v0 +; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v0, v1, 1, s6 op_sel_hi:[1,0,0] +; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v1, v1, 1, s7 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v2, v3, 1, s8 op_sel_hi:[1,0,0] +; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v3, v3, 1, s9 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v4, v4, 1, s10 op_sel_hi:[1,0,0] ; GFX9-FAST-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-FAST-DENORM-LABEL: test_5xf16_5xf32_add_ext_mul: @@ -94,23 +89,17 @@ define amdgpu_vs <6 x float> @test_6xf16_6xf32_add_ext_mul_rhs(<6 x half> inreg ; GFX9-FAST-DENORM-LABEL: test_6xf16_6xf32_add_ext_mul_rhs: ; GFX9-FAST-DENORM: ; %bb.0: ; %.entry ; GFX9-FAST-DENORM-NEXT: 
v_mov_b32_e32 v0, s3 -; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s0, v0 -; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v1, s1, v1 -; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v0 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, v1 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v2 -; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s6, v3 -; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v1, s7, v4 -; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v2, s8, v5 -; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s9, v6 -; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v4, s10, v7 -; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v5, s11, v8 +; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v1, s0, v0 +; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v3, s1, v0 +; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v5, s2, v0 +; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v0, v1, 1, s6 op_sel_hi:[1,0,0] +; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v1, v1, 1, s7 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v2, v3, 1, s8 op_sel_hi:[1,0,0] +; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v3, v3, 1, s9 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v4, v5, 1, s10 op_sel_hi:[1,0,0] +; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v5, v5, 1, s11 op_sel:[1,0,0] op_sel_hi:[1,0,0] ; GFX9-FAST-DENORM-NEXT: ; return to shader part epilog ; ; GFX10-FAST-DENORM-LABEL: test_6xf16_6xf32_add_ext_mul_rhs: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll index 
1aee6ab24eea0..ed80469052d17 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll @@ -118,9 +118,8 @@ define half @v_fdiv_f16(half %a, half %b) { ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v3, v0, v2, 0 op_sel_hi:[1,0,0] ; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1] ; GFX9-FLUSH-NEXT: v_mac_f32_e32 v3, v4, v2 ; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1] @@ -135,18 +134,17 @@ define half @v_fdiv_f16(half %a, half %b) { ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v3 -; GFX10-IEEE-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v3 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v3, v0, v2, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v1, v3, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, v0, 1, v4 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v2 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v1, v3, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, v0, 1, v4 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2 ; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v3 ; 
GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] @@ -157,7 +155,7 @@ define half @v_fdiv_f16(half %a, half %b) { ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v0 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v5, v0, v3, 0 op_sel_hi:[1,0,0] ; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v5, v4 ; GFX10-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v3 ; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v5, v4 @@ -172,10 +170,9 @@ define half @v_fdiv_f16(half %a, half %b) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v3, v3, v2 +; GFX11-NEXT: v_fma_mix_f32 v3, v0, v2, 0 op_sel_hi:[1,0,0] ; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1] ; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v2 ; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1] @@ -328,9 +325,8 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) { ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v3, v0, v2, 0 op_sel_hi:[1,0,0] ; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1] ; GFX9-FLUSH-NEXT: v_mac_f32_e32 v3, v4, v2 ; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1] @@ -345,18 +341,17 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) { ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2 -; 
GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v3 -; GFX10-IEEE-NEXT: v_add_f32_e32 v5, v6, v5 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v3 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v3, v0, v2, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v1, v3, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, v0, 1, v4 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v2 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v1, v3, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, v0, 1, v4 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2 ; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v3 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] @@ -367,7 +362,7 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) { ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v0 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v5, v0, v3, 0 op_sel_hi:[1,0,0] ; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v5, v4 ; GFX10-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v3 ; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v5, v4 @@ -382,10 +377,9 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) { ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v3, v3, v2 +; GFX11-NEXT: v_fma_mix_f32 v3, v0, v2, 0 op_sel_hi:[1,0,0] ; 
GFX11-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1] ; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v2 ; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1] @@ -942,31 +936,29 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) { ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v7, v5 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v7, v7 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v2 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v4, v0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v8, v2 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v4, v0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, v0, v2, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, v0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v5, v0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v6, v7 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v7 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v6, v7 -; GFX9-FLUSH-NEXT: v_and_b32_e32 v6, 0xff800000, v6 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v4, v6, v4 
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v7, v6, v4 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v4, v5, v3 +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v4, v3, v5 ; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -975,32 +967,30 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) { ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v8, v0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v3 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v9, v5 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v8, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v9, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v12, -v3, v10 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v13, -v4, v11 -; GFX10-IEEE-NEXT: v_add_f32_e32 v12, v12, v8 -; GFX10-IEEE-NEXT: v_add_f32_e32 v13, v13, v9 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v12, v12, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v13, v13, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v12, v10 -; GFX10-IEEE-NEXT: v_add_f32_e32 v11, v13, v11 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v10 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v4, -v4, v11 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 -; GFX10-IEEE-NEXT: v_add_f32_e32 v4, v4, v9 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v7 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; 
GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v4 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, v0, v3, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v5, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v8, -v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, v0, 1, v7 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v8, v0, 1, v8 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v8, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v8, v6 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v5, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v8, -v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, v0, 1, v7 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v8, v0, 1, v8 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v8, v4 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX10-IEEE-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v10 -; GFX10-IEEE-NEXT: v_add_f32_e32 v4, v4, v11 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX10-IEEE-NEXT: v_add_f32_e32 v4, v4, v6 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v3, v1, v0 @@ -1013,30 +1003,30 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) { ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v8, v0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 -; 
GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v9, v5 -; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v7, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v10, v8, v6 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v11, v9, v7 -; GFX10-FLUSH-NEXT: v_mad_f32 v12, -v3, v10, v8 -; GFX10-FLUSH-NEXT: v_mad_f32 v13, -v4, v11, v9 -; GFX10-FLUSH-NEXT: v_mac_f32_e32 v10, v12, v6 -; GFX10-FLUSH-NEXT: v_mac_f32_e32 v11, v13, v7 -; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v10, v8 -; GFX10-FLUSH-NEXT: v_mad_f32 v4, -v4, v11, v9 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v7 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v10, v7 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v6, v4 +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v9, v0, v5, 0 op_sel_hi:[1,0,0] +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v11, v0, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-FLUSH-NEXT: v_mad_f32 v12, -v3, v9, v8 +; GFX10-FLUSH-NEXT: v_mad_f32 v13, -v4, v11, v10 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v9, v12, v5 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v11, v13, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v8 +; GFX10-FLUSH-NEXT: v_mad_f32 v4, -v4, v11, v10 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v6 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v10 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9 ; GFX10-FLUSH-NEXT: v_add_f32_e32 v4, v4, v11 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v1, v0 -; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v4, v2, v5 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v4, v2, v7 ; GFX10-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -1045,25 +1035,24 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; 
GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 -; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v5 ; GFX11-NEXT: v_rcp_f32_e32 v4, v4 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v6, v6, v3 :: v_dual_mul_f32 v7, v7, v4 -; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX11-NEXT: v_dual_fmac_f32 v6, v8, v3 :: v_dual_fmac_f32 v7, v9, v4 -; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX11-NEXT: v_dual_mul_f32 v3, v8, v3 :: v_dual_mul_f32 v4, v9, v4 -; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX11-NEXT: v_dual_add_f32 v3, v3, v6 :: v_dual_and_b32 v4, 0xff800000, v4 -; GFX11-NEXT: v_add_f32_e32 v4, v4, v7 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_fma_mix_f32 v5, v0, v3, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v6, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v5, v0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_fmac_f32 v5, v7, v3 :: v_dual_fmac_f32 v6, v8, v4 +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v5, v0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v4, v8, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX11-NEXT: v_dual_add_f32 v4, v4, v6 :: v_dual_and_b32 v3, 0xff800000, v3 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX11-NEXT: v_div_fixup_f16 v0, v3, v1, v0 ; GFX11-NEXT: v_div_fixup_f16 v1, v4, v2, v5 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 @@ -1320,31 
+1309,29 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v0 -; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v7, v5 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, v3 -; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v7, v7 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v2 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v4, v0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v8, v2 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v4, v0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, v0, v2, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, v0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v5, v0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v6, v7 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v7 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v6, v7 -; GFX9-FLUSH-NEXT: v_and_b32_e32 v6, 0xff800000, v6 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v7, v6, v4 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v1, v7, v0 op_sel:[1,0,1] 
op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v4, v4, v7 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0 -; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v4, v5, v3 +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v4, v3, v5 ; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -1353,32 +1340,30 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v8, v0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v3 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v9, v5 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v8, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v9, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v12, -v3, v10 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v13, -v4, v11 -; GFX10-IEEE-NEXT: v_add_f32_e32 v12, v12, v8 -; GFX10-IEEE-NEXT: v_add_f32_e32 v13, v13, v9 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v12, v12, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v13, v13, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v12, v10 -; GFX10-IEEE-NEXT: v_add_f32_e32 v11, v13, v11 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v10 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v4, -v4, v11 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 -; GFX10-IEEE-NEXT: v_add_f32_e32 v4, v4, v9 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v7 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v4 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, v0, v3, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, v0, v4, 0 
op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v5, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v8, -v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, v0, 1, v7 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v8, v0, 1, v8 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v8, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v8, v6 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v5, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v8, -v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, v0, 1, v7 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v8, v0, 1, v8 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v8, v4 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX10-IEEE-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v10 -; GFX10-IEEE-NEXT: v_add_f32_e32 v4, v4, v11 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX10-IEEE-NEXT: v_add_f32_e32 v4, v4, v6 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v3, v1, v0 @@ -1391,30 +1376,30 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v8, v0 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v6, v3 -; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v9, v5 -; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v7, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v10, v8, v6 -; 
GFX10-FLUSH-NEXT: v_mul_f32_e32 v11, v9, v7 -; GFX10-FLUSH-NEXT: v_mad_f32 v12, -v3, v10, v8 -; GFX10-FLUSH-NEXT: v_mad_f32 v13, -v4, v11, v9 -; GFX10-FLUSH-NEXT: v_mac_f32_e32 v10, v12, v6 -; GFX10-FLUSH-NEXT: v_mac_f32_e32 v11, v13, v7 -; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v10, v8 -; GFX10-FLUSH-NEXT: v_mad_f32 v4, -v4, v11, v9 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v7 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v10, v7 +; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v6, v4 +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v9, v0, v5, 0 op_sel_hi:[1,0,0] +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v11, v0, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-FLUSH-NEXT: v_mad_f32 v12, -v3, v9, v8 +; GFX10-FLUSH-NEXT: v_mad_f32 v13, -v4, v11, v10 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v9, v12, v5 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v11, v13, v6 +; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v8 +; GFX10-FLUSH-NEXT: v_mad_f32 v4, -v4, v11, v10 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v6 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v10 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9 ; GFX10-FLUSH-NEXT: v_add_f32_e32 v4, v4, v11 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v1, v0 -; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v4, v2, v5 +; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v4, v2, v7 ; GFX10-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] ; @@ -1423,25 +1408,24 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v0 ; GFX11-NEXT: 
v_cvt_f32_f16_e32 v4, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 -; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v5 ; GFX11-NEXT: v_rcp_f32_e32 v4, v4 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v6, v6, v3 :: v_dual_mul_f32 v7, v7, v4 -; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX11-NEXT: v_dual_fmac_f32 v6, v8, v3 :: v_dual_fmac_f32 v7, v9, v4 -; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] -; GFX11-NEXT: v_dual_mul_f32 v3, v8, v3 :: v_dual_mul_f32 v4, v9, v4 -; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX11-NEXT: v_dual_add_f32 v3, v3, v6 :: v_dual_and_b32 v4, 0xff800000, v4 -; GFX11-NEXT: v_add_f32_e32 v4, v4, v7 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_fma_mix_f32 v5, v0, v3, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v6, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v5, v0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_fmac_f32 v5, v7, v3 :: v_dual_fmac_f32 v6, v8, v4 +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v5, v0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v4, v8, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX11-NEXT: v_dual_add_f32 v4, v4, v6 :: v_dual_and_b32 v3, 0xff800000, v3 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX11-NEXT: v_div_fixup_f16 v0, v3, v1, v0 ; GFX11-NEXT: v_div_fixup_f16 v1, v4, v2, v5 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 @@ -1631,23 +1615,22 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX9-FLUSH-NEXT: 
v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, 1.0, v1, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v5, v1 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, 1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v6, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1 +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v6, v5, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -1660,30 +1643,29 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 -; 
GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 -; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, 1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, 1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v4, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, 1.0, 1, v6 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, 1.0, 1, v7 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v3 +; GFX10-IEEE-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v4, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, 1.0, 1, v6 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, 1.0, 1, v7 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3 ; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, 
v2 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 @@ -1700,8 +1682,8 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v7, 1.0, v4, 0 op_sel_hi:[1,0,0] +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v8, 1.0, v5, 0 op_sel_hi:[1,0,0] ; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 ; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 ; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 @@ -1726,26 +1708,24 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3 -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2 -; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3 -; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2 +; GFX11-NEXT: 
v_fma_mix_f32 v4, 1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v5, 1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v4, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_fmac_f32 v4, v6, v2 :: v_dual_fmac_f32 v5, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v4, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_mul_f32 v3, v7, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX11-NEXT: v_dual_add_f32 v3, v3, v5 :: v_dual_and_b32 v2, 0xff800000, v2 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 ; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1934,23 +1914,22 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -1.0, v1, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v5, v1 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -1.0, v3, 0 op_sel_hi:[1,0,0] +; 
GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v6, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1 +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v6, v5, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0 @@ -1963,30 +1942,29 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, -1.0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 -; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 
v3, v3, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, -1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v4, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -1.0, 1, v6 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -1.0, 1, v7 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v3 +; GFX10-IEEE-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v4, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -1.0, 1, v6 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -1.0, 1, v7 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3 ; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 @@ -2003,8 +1981,8 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v7, -1.0, v4, 0 op_sel_hi:[1,0,0] +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v8, -1.0, v5, 0 
op_sel_hi:[1,0,0] ; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 ; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 ; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 @@ -2029,26 +2007,24 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3 -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2 -; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3 -; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2 +; GFX11-NEXT: v_fma_mix_f32 v4, -1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v5, -1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v4, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_fmac_f32 v4, v6, v2 :: v_dual_fmac_f32 v5, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v4, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_mul_f32 v3, v7, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX11-NEXT: v_dual_add_f32 v3, v3, v5 :: v_dual_and_b32 v2, 0xff800000, v2 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_add_f32_e32 v2, 
v2, v5 -; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 ; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2251,23 +2227,22 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, 1.0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v2 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v5, v4 -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v6, v7, v2 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v7, v4 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, 1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, 1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, 1.0, v4, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -|v0|, v7, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v7, v6, v4 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v5, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v7, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2 ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v7 
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 @@ -2278,38 +2253,37 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX10-IEEE-LABEL: v_rcp_v2f16_fabs: ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0 -; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-IEEE-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 -; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 -; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v4 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, 1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, 1.0, v4, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v5, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v8, -|v0|, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, 1.0, 1, 
v7 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v8, 1.0, 1, v8 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v8, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v8, v6 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v5, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v0, -|v0|, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, 1.0, 1, v7 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v0, 1.0, 1, v0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 -; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v0, v0, v6 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 -; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v0, v2, 1.0 +; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-FLUSH-LABEL: v_rcp_v2f16_fabs: @@ -2322,8 +2296,8 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v7, 1.0, v4, 0 op_sel_hi:[1,0,0] +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v8, 1.0, v5, 0 op_sel_hi:[1,0,0] ; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 ; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 ; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 @@ -2346,30 +2320,31 @@ define <2 x half> 
@v_rcp_v2f16_fabs(<2 x half> %x) { ; GFX11-LABEL: v_rcp_v2f16_fabs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v5, 1.0 ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX11-NEXT: v_rcp_f32_e32 v3, v3 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v6, v5, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1] ; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v3 ; GFX11-NEXT: v_rcp_f32_e32 v4, v4 -; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1] ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v5, v4 -; GFX11-NEXT: v_fma_mix_f32 v8, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v4 -; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX11-NEXT: v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v0, v0, v4 +; GFX11-NEXT: v_fma_mix_f32 v6, 1.0, v4, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v8, -|v0|, v6, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v6, v8, v4 +; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v6, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff800000, v0 -; GFX11-NEXT: v_dual_add_f32 v0, v0, v5 :: v_dual_and_b32 v3, 0xff800000, v3 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v6 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, 1.0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_mix_f32 v5, 1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v5, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v5, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v5, 1.0 op_sel_hi:[1,0,1] 
+; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2573,23 +2548,22 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, -1.0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v2 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v5, v4 -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v6, v7, v2 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v7, v4 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, -1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -1.0, v4, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -|v0|, v7, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v7, v6, v4 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v5, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v7, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2 ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5 +; 
GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v7 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 @@ -2600,38 +2574,37 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX10-IEEE-LABEL: v_neg_rcp_v2f16_fabs: ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, -1.0 -; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX10-IEEE-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 -; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 -; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 +; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v4 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, -1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -1.0, v4, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v5, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v8, -|v0|, v6, 0 op_sel:[1,0,0] 
op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -1.0, 1, v7 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v8, -1.0, 1, v8 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v8, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v8, v6 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v5, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v0, -|v0|, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -1.0, 1, v7 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v0, -1.0, 1, v0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 -; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX10-IEEE-NEXT: v_add_f32_e32 v0, v0, v6 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 +; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 -; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v0, v2, -1.0 +; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-FLUSH-LABEL: v_neg_rcp_v2f16_fabs: @@ -2644,8 +2617,8 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v7, -1.0, v4, 0 op_sel_hi:[1,0,0] +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v8, -1.0, v5, 0 op_sel_hi:[1,0,0] ; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 ; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 ; 
GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 @@ -2668,30 +2641,31 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) { ; GFX11-LABEL: v_neg_rcp_v2f16_fabs: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v5, -1.0 ; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX11-NEXT: v_rcp_f32_e32 v3, v3 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v6, v5, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1] ; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v3 ; GFX11-NEXT: v_rcp_f32_e32 v4, v4 -; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1] ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v5, v4 -; GFX11-NEXT: v_fma_mix_f32 v8, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v4 -; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX11-NEXT: v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v0, v0, v4 +; GFX11-NEXT: v_fma_mix_f32 v6, -1.0, v4, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v8, -|v0|, v6, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fmac_f32_e32 v6, v8, v4 +; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v6, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff800000, v0 -; GFX11-NEXT: v_dual_add_f32 v0, v0, v5 :: v_dual_and_b32 v3, 0xff800000, v3 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v6 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, -1.0 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_mix_f32 v5, -1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v5, -1.0 op_sel_hi:[1,0,1] +; 
GFX11-NEXT: v_fmac_f32_e32 v5, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v5, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3041,23 +3015,22 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0 ; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, 1.0, v1, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v5, v1 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, 1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v6, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1 +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v6, v5, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: 
v_mul_f32_e32 v3, v4, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0 @@ -3070,30 +3043,29 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8 -; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, 1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, 1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v4, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, 1.0, 1, v6 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, 1.0, 1, v7 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v2 +; 
GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v3 +; GFX10-IEEE-NEXT: v_add_f32_e32 v4, v6, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v5, v7, v5 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v4, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, 1.0, 1, v6 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, 1.0, 1, v7 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3 ; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 @@ -3110,8 +3082,8 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v7, 1.0, v4, 0 op_sel_hi:[1,0,0] +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v8, 1.0, v5, 0 op_sel_hi:[1,0,0] ; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 ; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 ; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 @@ -3136,26 +3108,24 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX11-NEXT: 
v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3 -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2 -; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3 -; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2 +; GFX11-NEXT: v_fma_mix_f32 v4, 1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v5, 1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v4, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_fmac_f32 v4, v6, v2 :: v_dual_fmac_f32 v5, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v4, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_mul_f32 v3, v7, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX11-NEXT: v_dual_add_f32 v3, v3, v5 :: v_dual_and_b32 v2, 0xff800000, v2 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 ; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3510,36 +3480,34 @@ define amdgpu_ps i16 @s_fdiv_f16(i16 inreg %a.arg, i16 inreg %b.arg) { ; GFX9-FLUSH-LABEL: s_fdiv_f16: ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0 -; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-FLUSH-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, 
v1, v0 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v3, -v2, v1, s0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v1, v3, v0 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v3, -v2, v1, s0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v2, s0, v0, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v2, v3, v0 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,1] ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v3, v0 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v2, s0 +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v1, s0 ; GFX9-FLUSH-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-FLUSH-NEXT: ; return to shader part epilog ; ; GFX10-IEEE-LABEL: s_fdiv_f16: ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, s0 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v1, v0 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v2, v1 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v4, -v0, v3 -; GFX10-IEEE-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v1 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v0, -v0, v3 -; GFX10-IEEE-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v0, v0 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v1, s0, v0, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v2, -s1, v1, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v2, s0, 1, v2 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v0 +; GFX10-IEEE-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v2, -s1, v1, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v2, s0, 1, v2 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v2, v0 ; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0xff800000, v0 -; GFX10-IEEE-NEXT: 
v_add_f32_e32 v0, v0, v3 +; GFX10-IEEE-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v0, s1, s0 ; GFX10-IEEE-NEXT: v_readfirstlane_b32 s0, v0 @@ -3550,7 +3518,7 @@ define amdgpu_ps i16 @s_fdiv_f16(i16 inreg %a.arg, i16 inreg %b.arg) { ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s1 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s0 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v1, v0 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v2, v1 +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v3, s0, v1, 0 op_sel_hi:[1,0,0] ; GFX10-FLUSH-NEXT: v_mad_f32 v4, -v0, v3, v2 ; GFX10-FLUSH-NEXT: v_mac_f32_e32 v3, v4, v1 ; GFX10-FLUSH-NEXT: v_mad_f32 v0, -v0, v3, v2 @@ -3565,10 +3533,9 @@ define amdgpu_ps i16 @s_fdiv_f16(i16 inreg %a.arg, i16 inreg %b.arg) { ; GFX11-LABEL: s_fdiv_f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s0 ; GFX11-NEXT: v_rcp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0 +; GFX11-NEXT: v_fma_mix_f32 v1, s0, v0, 0 op_sel_hi:[1,0,0] ; GFX11-NEXT: v_fma_mix_f32 v2, -s1, v1, s0 op_sel_hi:[1,0,1] ; GFX11-NEXT: v_fmac_f32_e32 v1, v2, v0 ; GFX11-NEXT: v_fma_mix_f32 v2, -s1, v1, s0 op_sel_hi:[1,0,1] @@ -3894,33 +3861,31 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) { ; GFX9-FLUSH-LABEL: s_fdiv_v2f16: ; GFX9-FLUSH: ; %bb.0: ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s1 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0 -; GFX9-FLUSH-NEXT: s_lshr_b32 s3, s1, 16 +; GFX9-FLUSH-NEXT: s_lshr_b32 s2, s1, 16 +; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s2 ; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v0, v0 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, s3 -; GFX9-FLUSH-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v0 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v2, v1, s0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v1, v4, v0 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v2, v1, s0 
op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v3, s0, v0, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v2, v3, s0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v3, v4, v0 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v2, v3, s0 op_sel_hi:[1,0,1] ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v4, v0 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s2 -; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-FLUSH-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v1, v4, s2 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v5, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v1, v4, s2 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3 -; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX9-FLUSH-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, s3, v1, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v3, v4, s3 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v5, v1 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v3, v4, s3 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v5, v1 +; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v4 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v2, s0 -; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, s2 +; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v3, s3 ; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-FLUSH-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-FLUSH-NEXT: ; return to shader part epilog @@ -3931,30 +3896,28 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) { ; 
GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s1 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s2 ; GFX10-IEEE-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, s0 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v5, s3 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v0 -; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v1 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v4, v2 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v5, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v8, -v0, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v1, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v8, v4 -; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v5 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v8, v2 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v3 -; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v8, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v0, -v0, v6 -; GFX10-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7 -; GFX10-IEEE-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX10-IEEE-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v0, v0 +; GFX10-IEEE-NEXT: v_rcp_f32_e32 v1, v1 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v2, s0, v0, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v3, s3, v1, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -s1, v2, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, -s2, v3, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, s0, 1, v4 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, s3, 1, v5 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v5, v1 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v4, v2 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -s1, v2, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, -s2, v3, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, s0, 1, v4 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, s3, 1, v5 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v4, v0 +; 
GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v5, v1 ; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0xff800000, v0 ; GFX10-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX10-IEEE-NEXT: v_add_f32_e32 v0, v0, v6 -; GFX10-IEEE-NEXT: v_add_f32_e32 v1, v1, v7 +; GFX10-IEEE-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-IEEE-NEXT: v_add_f32_e32 v1, v1, v3 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v0, s1, s0 @@ -3970,22 +3933,22 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) { ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s2 ; GFX10-FLUSH-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s0 -; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, s3 +; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, s3 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v0 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v1 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v6, v4, v2 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v5, v3 -; GFX10-FLUSH-NEXT: v_mad_f32 v8, -v0, v6, v4 -; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v1, v7, v5 -; GFX10-FLUSH-NEXT: v_mac_f32_e32 v6, v8, v2 +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v5, s0, v2, 0 op_sel_hi:[1,0,0] +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v7, s3, v3, 0 op_sel_hi:[1,0,0] +; GFX10-FLUSH-NEXT: v_mad_f32 v8, -v0, v5, v4 +; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v1, v7, v6 +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v5, v8, v2 ; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v3 -; GFX10-FLUSH-NEXT: v_mad_f32 v0, -v0, v6, v4 -; GFX10-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v5 +; GFX10-FLUSH-NEXT: v_mad_f32 v0, -v0, v5, v4 +; GFX10-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v6 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0 ; GFX10-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX10-FLUSH-NEXT: v_add_f32_e32 v0, v0, v6 +; GFX10-FLUSH-NEXT: v_add_f32_e32 v0, v0, v5 ; GFX10-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7 ; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0 ; 
GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -4001,12 +3964,11 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) { ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1 ; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2 ; GFX11-NEXT: s_lshr_b32 s3, s0, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s3 ; GFX11-NEXT: v_rcp_f32_e32 v0, v0 ; GFX11-NEXT: v_rcp_f32_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mul_f32 v2, v2, v0 :: v_dual_mul_f32 v3, v3, v1 +; GFX11-NEXT: v_fma_mix_f32 v2, s0, v0, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v3, s3, v1, 0 op_sel_hi:[1,0,0] ; GFX11-NEXT: v_fma_mix_f32 v4, -s1, v2, s0 op_sel_hi:[1,0,1] ; GFX11-NEXT: v_fma_mix_f32 v5, -s2, v3, s3 op_sel_hi:[1,0,1] ; GFX11-NEXT: v_dual_fmac_f32 v2, v4, v0 :: v_dual_fmac_f32 v3, v5, v1 @@ -4434,25 +4396,24 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0 ; GFX9-FLUSH-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, s0 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v5, v2 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v1, 
v6, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v6, v5, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v1, v6, -1.0 op_sel_hi:[1,0,1] ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v2 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v6, v3 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 @@ -4466,25 +4427,24 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX10-IEEE-NEXT: s_lshr_b32 s1, s0, 16 ; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v0, s0 ; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, s1 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] -; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1] -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3 -; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] -; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, -1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v4, -1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v5, -1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v2 +; GFX10-IEEE-NEXT: 
v_fmac_f32_e32 v5, v7, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v4, -1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v5, -1.0 op_sel_hi:[1,0,1] ; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3 ; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 @@ -4503,8 +4463,8 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v7, -1.0, v4, 0 op_sel_hi:[1,0,0] +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v8, -1.0, v5, 0 op_sel_hi:[1,0,0] ; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 ; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 ; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 @@ -4530,28 +4490,26 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) { ; GFX11-NEXT: s_lshr_b32 s1, s0, 16 ; GFX11-NEXT: v_sqrt_f16_e32 v0, s0 ; GFX11-NEXT: v_sqrt_f16_e32 v1, s1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 ; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3 -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 
op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2 -; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3 -; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2 +; GFX11-NEXT: v_fma_mix_f32 v4, -1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v5, -1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v4, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v5, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_fmac_f32 v4, v6, v2 :: v_dual_fmac_f32 v5, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v4, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v5, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_mul_f32 v3, v7, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX11-NEXT: v_dual_add_f32 v3, v3, v5 :: v_dual_and_b32 v2, 0xff800000, v2 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 ; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: v_readfirstlane_b32 s0, v0 @@ -5549,25 +5507,24 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1] -; 
GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2 -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v7, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, 1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, 1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v2 +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v7, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, 1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1] ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v6, v2 ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v7, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 @@ -5580,25 +5537,24 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX10-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1] -; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 
op_sel_hi:[1,0,1] -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3 -; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1] -; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, 1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, 1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v4, 1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v2 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v4, 1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1] ; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3 ; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0 @@ -5616,8 +5572,8 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v7, 1.0, v4, 0 op_sel_hi:[1,0,0] +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v8, 1.0, v5, 0 op_sel_hi:[1,0,0] ; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 ; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 ; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 @@ -5642,7 +5598,6 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; 
GFX11-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0 ; GFX11-NEXT: v_sqrt_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 @@ -5650,21 +5605,20 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) { ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, 1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3 -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, 1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2 -; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3 -; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2 +; GFX11-NEXT: v_fma_mix_f32 v4, 1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v5, 1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v4, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v5, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_fmac_f32 v4, v6, v2 :: v_dual_fmac_f32 v5, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v4, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v5, 1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_mul_f32 v3, v7, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX11-NEXT: v_dual_add_f32 v3, v3, v5 :: v_dual_and_b32 v2, 0xff800000, v2 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 ; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0 ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 
s[30:31] @@ -5863,25 +5817,24 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2 -; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v7, v3 -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1] -; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v2 +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v7, v3 +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1] +; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1] ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v6, v2 ; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v7, v3 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 @@ -5894,25 +5847,24 @@ 
define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, v0 ; GFX10-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1 ; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1] -; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1] -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2 -; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3 -; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1] -; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, -1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v2 +; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v3 +; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1] +; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1] ; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3 ; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2 ; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v4 +; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0 @@ -5930,8 +5882,8 @@ define <2 x half> 
@v_neg_rsq_v2f16(<2 x half> %a) { ; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5 +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v7, -1.0, v4, 0 op_sel_hi:[1,0,0] +; GFX10-FLUSH-NEXT: v_fma_mix_f32 v8, -1.0, v5, 0 op_sel_hi:[1,0,0] ; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6 ; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6 ; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4 @@ -5956,7 +5908,6 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0 ; GFX11-NEXT: v_sqrt_f16_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0 @@ -5964,21 +5915,20 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) { ; GFX11-NEXT: v_rcp_f32_e32 v2, v2 ; GFX11-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2 -; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3 -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2 -; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3 -; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1] -; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3 -; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2 +; GFX11-NEXT: v_fma_mix_f32 v4, -1.0, v2, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v5, -1.0, v3, 0 op_sel_hi:[1,0,0] +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v4, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v5, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_fmac_f32 v4, v6, v2 :: 
v_dual_fmac_f32 v5, v7, v3 +; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v4, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v5, -1.0 op_sel_hi:[1,0,1] +; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_mul_f32 v3, v7, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX11-NEXT: v_dual_add_f32 v3, v3, v5 :: v_dual_and_b32 v2, 0xff800000, v2 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 ; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0 ; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0 ; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 9e240238c1066..f8bfe2bf1e2f1 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -625,26 +625,22 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_cbranch_vccnz .LBB3_7 ; GFX908-NEXT: ; %bb.6: ; %bb51 ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 -; GFX908-NEXT: v_cvt_f32_f16_sdwa v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX908-NEXT: v_cvt_f32_f16_e32 v21, v21 -; GFX908-NEXT: v_cvt_f32_f16_sdwa v23, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX908-NEXT: v_cvt_f32_f16_e32 v20, v20 -; GFX908-NEXT: v_add_f32_e32 v24, v18, v12 -; GFX908-NEXT: v_add_f32_e32 v25, v19, v13 -; GFX908-NEXT: v_add_f32_e32 v26, 0, v12 -; GFX908-NEXT: v_add_f32_e32 v27, 0, v13 -; GFX908-NEXT: v_add_f32_e32 v15, v22, v15 -; GFX908-NEXT: v_add_f32_e32 v14, v21, v14 -; GFX908-NEXT: v_add_f32_e32 v13, v23, v13 -; GFX908-NEXT: v_add_f32_e32 v12, v20, v12 -; GFX908-NEXT: v_add_f32_e32 v5, v5, v25 -; GFX908-NEXT: v_add_f32_e32 v4, v4, v24 -; GFX908-NEXT: v_add_f32_e32 v7, v7, v27 -; 
GFX908-NEXT: v_add_f32_e32 v6, v6, v26 -; GFX908-NEXT: v_add_f32_e32 v8, v8, v14 +; GFX908-NEXT: v_add_f32_e32 v22, v18, v12 +; GFX908-NEXT: v_add_f32_e32 v23, v19, v13 +; GFX908-NEXT: v_add_f32_e32 v24, 0, v12 +; GFX908-NEXT: v_add_f32_e32 v25, 0, v13 +; GFX908-NEXT: v_fma_mix_f32 v14, v21, 1, v14 op_sel_hi:[1,0,0] +; GFX908-NEXT: v_fma_mix_f32 v15, v21, 1, v15 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX908-NEXT: v_fma_mix_f32 v12, v20, 1, v12 op_sel_hi:[1,0,0] +; GFX908-NEXT: v_fma_mix_f32 v13, v20, 1, v13 op_sel:[1,0,0] op_sel_hi:[1,0,0] +; GFX908-NEXT: v_add_f32_e32 v5, v5, v23 +; GFX908-NEXT: v_add_f32_e32 v4, v4, v22 +; GFX908-NEXT: v_add_f32_e32 v7, v7, v25 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v24 ; GFX908-NEXT: v_add_f32_e32 v9, v9, v15 -; GFX908-NEXT: v_add_f32_e32 v10, v10, v12 +; GFX908-NEXT: v_add_f32_e32 v8, v8, v14 ; GFX908-NEXT: v_add_f32_e32 v11, v11, v13 +; GFX908-NEXT: v_add_f32_e32 v10, v10, v12 ; GFX908-NEXT: s_branch .LBB3_4 ; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 ; GFX908-NEXT: s_mov_b64 s[22:23], s[18:19] diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll index 210e09fd9169a..fe6a75d4896ca 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -89,10 +89,9 @@ define amdgpu_kernel void @v_fdiv_f16( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX9-NEXT: v_rcp_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX9-NEXT: v_mad_mix_f32 v4, v1, v3, 0 op_sel_hi:[1,0,0] ; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1] ; GFX9-NEXT: v_mac_f32_e32 v4, v5, v3 ; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1] @@ -115,16 +114,16 @@ define amdgpu_kernel void @v_fdiv_f16( ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) 
-; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX10-NEXT: v_rcp_f32_e32 v4, v3 -; GFX10-NEXT: v_mul_f32_e32 v6, v5, v4 -; GFX10-NEXT: v_mad_f32 v7, -v3, v6, v5 -; GFX10-NEXT: v_mac_f32_e32 v6, v7, v4 -; GFX10-NEXT: v_mad_f32 v3, -v3, v6, v5 +; GFX10-NEXT: v_fma_mix_f32 v5, v1, v4, 0 op_sel_hi:[1,0,0] +; GFX10-NEXT: v_mad_f32 v7, -v3, v5, v6 +; GFX10-NEXT: v_mac_f32_e32 v5, v7, v4 +; GFX10-NEXT: v_mad_f32 v3, -v3, v5, v6 ; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-NEXT: v_div_fixup_f16 v1, v3, v2, v1 ; GFX10-NEXT: global_store_short v0, v1, s[0:1] @@ -143,23 +142,22 @@ define amdgpu_kernel void @v_fdiv_f16( ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] glc dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l ; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v4, v6 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, v4, v3, 0 op_sel_hi:[1,0,0] +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v6, v5, v4 op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v4, v7, v3 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, 
-v5, v4, v6 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v5, v7, v3 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v4, -v6, v5, v4 op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.h, v1.l, v0.l @@ -179,12 +177,11 @@ define amdgpu_kernel void @v_fdiv_f16( ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v3 +; GFX11-FAKE16-NEXT: v_fma_mix_f32 v4, v1, v3, 0 op_sel_hi:[1,0,0] ; GFX11-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1] ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v4, v5, v3 diff --git a/llvm/test/CodeGen/AMDGPU/fpext-free.ll b/llvm/test/CodeGen/AMDGPU/fpext-free.ll index b88cb210c91e8..428b80ebf396e 100644 --- a/llvm/test/CodeGen/AMDGPU/fpext-free.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext-free.ll @@ -14,18 +14,16 @@ define float @fadd_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 { ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l 
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, v0, 1, v2 op_sel_hi:[1,0,0] ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: fadd_fpext_fmul_f16_to_f32: ; GFX11-FAKE16: ; %bb.0: ; %entry ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_mix_f32 v0, v0, 1, v2 op_sel_hi:[1,0,0] ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-F32FLUSH-LABEL: fadd_fpext_fmul_f16_to_f32: @@ -118,18 +116,16 @@ define float @fadd_fpext_fmul_f16_to_f32_commute(half %x, half %y, float %z) #0 ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, v0, 1, v2 op_sel_hi:[1,0,0] ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: fadd_fpext_fmul_f16_to_f32_commute: ; GFX11-FAKE16: ; %bb.0: ; %entry ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX11-FAKE16-NEXT: v_fma_mix_f32 v0, v0, 1, v2 op_sel_hi:[1,0,0] ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-F32FLUSH-LABEL: fadd_fpext_fmul_f16_to_f32_commute: @@ -380,12 +376,10 @@ define float @fadd_fpext_fmuladd_f16_to_f32(float %x, half %y, half %z, half %u, ; GFX11-TRUE16-LABEL: fadd_fpext_fmuladd_f16_to_f32: ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mul_f16_e32 v1.h, v3.l, v4.l +; GFX11-TRUE16-NEXT: v_mul_f16_e32 v3.l, v3.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v1.h, v1.l, v2.l -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v3.l, v1.l, v2.l +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, v3, 1, v0 op_sel_hi:[1,0,0] ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: fadd_fpext_fmuladd_f16_to_f32: @@ -394,9 +388,7 @@ define float @fadd_fpext_fmuladd_f16_to_f32(float %x, half %y, half %z, half %u, ; GFX11-FAKE16-NEXT: v_mul_f16_e32 v3, v3, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_fmac_f16_e32 v3, v1, v2 -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_fma_mix_f32 v0, v3, 1, v0 op_sel_hi:[1,0,0] ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-F32FLUSH-LABEL: fadd_fpext_fmuladd_f16_to_f32: @@ -426,12 +418,10 @@ define float @fadd_fpext_fma_f16_to_f32(float %x, half %y, half %z, half %u, hal ; GFX11-TRUE16-LABEL: fadd_fpext_fma_f16_to_f32: ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mul_f16_e32 v1.h, v3.l, v4.l +; GFX11-TRUE16-NEXT: v_mul_f16_e32 v3.l, v3.l, v4.l ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v1.h, v1.l, v2.l -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v3.l, v1.l, v2.l +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, v3, 1, v0 op_sel_hi:[1,0,0] ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: fadd_fpext_fma_f16_to_f32: @@ -440,9 +430,7 @@ define float @fadd_fpext_fma_f16_to_f32(float %x, half %y, half %z, half %u, hal ; GFX11-FAKE16-NEXT: v_mul_f16_e32 v3, v3, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_fmac_f16_e32 v3, v1, v2 -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_fma_mix_f32 v0, v3, 1, v0 op_sel_hi:[1,0,0] ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-F32FLUSH-LABEL: fadd_fpext_fma_f16_to_f32: @@ -472,12 +460,10 @@ define float @fadd_fpext_fma_f16_to_f32_commute(float %x, half %y, half %z, half ; GFX11-TRUE16-LABEL: fadd_fpext_fma_f16_to_f32_commute: ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mul_f16_e32 v1.h, v3.l, v4.l +; GFX11-TRUE16-NEXT: v_mul_f16_e32 v3.l, v3.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v1.h, v1.l, v2.l -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v3.l, v1.l, v2.l +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, v3, 1, v0 op_sel_hi:[1,0,0] ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: fadd_fpext_fma_f16_to_f32_commute: @@ -486,9 
+472,7 @@ define float @fadd_fpext_fma_f16_to_f32_commute(float %x, half %y, half %z, half ; GFX11-FAKE16-NEXT: v_mul_f16_e32 v3, v3, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_fmac_f16_e32 v3, v1, v2 -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX11-FAKE16-NEXT: v_fma_mix_f32 v0, v3, 1, v0 op_sel_hi:[1,0,0] ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-F32FLUSH-LABEL: fadd_fpext_fma_f16_to_f32_commute: @@ -521,18 +505,16 @@ define float @fsub_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 { ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, v0, 1, -v2 op_sel_hi:[1,0,0] ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: fsub_fpext_fmul_f16_to_f32: ; GFX11-FAKE16: ; %bb.0: ; %entry ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_mix_f32 v0, v0, 1, -v2 op_sel_hi:[1,0,0] ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-F32FLUSH-LABEL: fsub_fpext_fmul_f16_to_f32: @@ -568,18 +550,16 @@ define float @fsub_fpext_fmul_f16_to_f32_commute(float %x, half %y, half %z) #0 ; GFX11-F32DENORM-TRUE16: ; %bb.0: ; %entry ; GFX11-F32DENORM-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX11-F32DENORM-TRUE16-NEXT: v_mul_f16_e32 v1.l, v1.l, v2.l -; GFX11-F32DENORM-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-F32DENORM-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l -; GFX11-F32DENORM-TRUE16-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-F32DENORM-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-F32DENORM-TRUE16-NEXT: v_fma_mix_f32 v0, -v0, 1, v1 op_sel_hi:[0,0,1] ; GFX11-F32DENORM-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-F32DENORM-FAKE16-LABEL: fsub_fpext_fmul_f16_to_f32_commute: ; GFX11-F32DENORM-FAKE16: ; %bb.0: ; %entry ; GFX11-F32DENORM-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-F32DENORM-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v2 -; GFX11-F32DENORM-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-F32DENORM-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX11-F32DENORM-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-F32DENORM-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-F32DENORM-FAKE16-NEXT: v_fma_mix_f32 v0, -v0, 1, v1 op_sel_hi:[0,0,1] ; GFX11-F32DENORM-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-F32FLUSH-LABEL: fsub_fpext_fmul_f16_to_f32_commute: @@ -609,18 +589,16 @@ define float @fsub_fpext_fneg_fmul_f16_to_f32(half %x, half %y, float %z) #0 { ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, -v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, v0, 1, -v2 op_sel_hi:[1,0,0] ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: fsub_fpext_fneg_fmul_f16_to_f32: ; GFX11-FAKE16: ; %bb.0: ; %entry ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, v0, 
-v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_mix_f32 v0, v0, 1, -v2 op_sel_hi:[1,0,0] ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-F32FLUSH-LABEL: fsub_fpext_fneg_fmul_f16_to_f32: @@ -651,18 +629,16 @@ define float @fsub_fneg_fpext_fmul_f16_to_f32(half %x, half %y, float %z) #0 { ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, -v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, v0, 1, -v2 op_sel_hi:[1,0,0] ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: fsub_fneg_fpext_fmul_f16_to_f32: ; GFX11-FAKE16: ; %bb.0: ; %entry ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, v0, -v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_fma_mix_f32 v0, v0, 1, -v2 op_sel_hi:[1,0,0] ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-F32FLUSH-LABEL: fsub_fneg_fpext_fmul_f16_to_f32: @@ -738,12 +714,10 @@ define float @fsub_fpext_muladd_mul_f16_to_f32(half %x, half %y, float %z, half ; GFX11-TRUE16-LABEL: fsub_fpext_muladd_mul_f16_to_f32: ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.h, v3.l, v4.l +; GFX11-TRUE16-NEXT: v_mul_f16_e32 v3.l, v3.l, v4.l ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v0.h, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v3.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, v3, 1, -v2 op_sel_hi:[1,0,0] ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: fsub_fpext_muladd_mul_f16_to_f32: @@ -752,19 +726,25 @@ define float @fsub_fpext_muladd_mul_f16_to_f32(half %x, half %y, float %z, half ; GFX11-FAKE16-NEXT: v_mul_f16_e32 v3, v3, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_fmac_f16_e32 v3, v0, v1 -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-FAKE16-NEXT: v_fma_mix_f32 v0, v3, 1, -v2 op_sel_hi:[1,0,0] ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX89-LABEL: fsub_fpext_muladd_mul_f16_to_f32: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_mul_f16_e32 v3, v3, v4 -; GFX89-NEXT: v_fma_f16 v0, v0, v1, v3 -; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX89-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX89-NEXT: s_setpc_b64 s[30:31] +; GFX9-F32FLUSH-LABEL: fsub_fpext_muladd_mul_f16_to_f32: +; GFX9-F32FLUSH: ; %bb.0: ; %entry +; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32FLUSH-NEXT: v_mul_f16_e32 v3, v3, v4 +; GFX9-F32FLUSH-NEXT: v_fma_f16 v0, v0, v1, v3 +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, v0, 1, -v2 op_sel_hi:[1,0,0] +; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32DENORM-LABEL: fsub_fpext_muladd_mul_f16_to_f32: +; GFX9-F32DENORM: ; %bb.0: ; %entry +; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v3, v3, v4 +; GFX9-F32DENORM-NEXT: 
v_fma_f16 v0, v0, v1, v3 +; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul half %u, %v %fma = call half @llvm.fmuladd.f16(half %x, half %y, half %mul) @@ -824,12 +804,10 @@ define float @fsub_fpext_muladd_mul_f16_to_f32_commute(float %x, half %y, half % ; GFX11-TRUE16-LABEL: fsub_fpext_muladd_mul_f16_to_f32_commute: ; GFX11-TRUE16: ; %bb.0: ; %entry ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mul_f16_e32 v1.h, v3.l, v4.l +; GFX11-TRUE16-NEXT: v_mul_f16_e32 v3.l, v3.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v1.h, v1.l, v2.l -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v3.l, v1.l, v2.l +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, -v0, 1, v3 op_sel_hi:[0,0,1] ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: fsub_fpext_muladd_mul_f16_to_f32_commute: @@ -838,19 +816,25 @@ define float @fsub_fpext_muladd_mul_f16_to_f32_commute(float %x, half %y, half % ; GFX11-FAKE16-NEXT: v_mul_f16_e32 v3, v3, v4 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_fmac_f16_e32 v3, v1, v2 -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_fma_mix_f32 v0, -v0, 1, v3 op_sel_hi:[0,0,1] ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; GFX89-LABEL: fsub_fpext_muladd_mul_f16_to_f32_commute: -; GFX89: ; %bb.0: ; %entry -; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX89-NEXT: v_mul_f16_e32 v3, v3, v4 -; GFX89-NEXT: v_fma_f16 v1, v1, v2, v3 -; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX89-NEXT: v_sub_f32_e32 
v0, v0, v1 -; GFX89-NEXT: s_setpc_b64 s[30:31] +; GFX9-F32FLUSH-LABEL: fsub_fpext_muladd_mul_f16_to_f32_commute: +; GFX9-F32FLUSH: ; %bb.0: ; %entry +; GFX9-F32FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32FLUSH-NEXT: v_mul_f16_e32 v3, v3, v4 +; GFX9-F32FLUSH-NEXT: v_fma_f16 v1, v1, v2, v3 +; GFX9-F32FLUSH-NEXT: v_mad_mix_f32 v0, -v0, 1, v1 op_sel_hi:[0,0,1] +; GFX9-F32FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-F32DENORM-LABEL: fsub_fpext_muladd_mul_f16_to_f32_commute: +; GFX9-F32DENORM: ; %bb.0: ; %entry +; GFX9-F32DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-F32DENORM-NEXT: v_mul_f16_e32 v3, v3, v4 +; GFX9-F32DENORM-NEXT: v_fma_f16 v1, v1, v2, v3 +; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX9-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX9-F32DENORM-NEXT: s_setpc_b64 s[30:31] entry: %mul = fmul half %u, %v %fma = call half @llvm.fmuladd.f16(half %y, half %z, half %mul) diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index c4a38dcd7b5f3..1032656301220 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -1380,18 +1380,16 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:8 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX9-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] -; GFX9-NEXT: v_mac_f32_e32 v3, v5, v4 -; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] -; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-NEXT: v_mad_mix_f32 v4, v1, v3, 0 
op_sel_hi:[1,0,0] +; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX9-NEXT: v_mac_f32_e32 v4, v5, v3 +; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX9-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3 @@ -1410,17 +1408,17 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX10-NEXT: v_rcp_f32_e32 v5, v4 -; GFX10-NEXT: v_mul_f32_e32 v6, v3, v5 -; GFX10-NEXT: v_mad_f32 v7, -v4, v6, v3 -; GFX10-NEXT: v_mac_f32_e32 v6, v7, v5 -; GFX10-NEXT: v_mad_f32 v3, -v4, v6, v3 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX10-NEXT: v_rcp_f32_e32 v4, v3 +; GFX10-NEXT: v_fma_mix_f32 v5, v1, v4, 0 op_sel_hi:[1,0,0] +; GFX10-NEXT: v_mad_f32 v7, -v3, v5, v6 +; GFX10-NEXT: v_mac_f32_e32 v5, v7, v4 +; GFX10-NEXT: v_mad_f32 v3, -v3, v5, v6 +; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3 @@ -1439,24 +1437,23 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] ; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(0) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, v4, v3, 0 op_sel_hi:[1,0,0] +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v6, v5, v4 op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v5, v7, v3 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v4, -v6, v5, v4 op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l @@ -1476,23 +1473,21 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: 
global_load_u16 v1, v0, s[2:3] ; GFX11-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-FAKE16-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX11-FAKE16-NEXT: v_fma_mix_f32 v4, v1, v3, 0 op_sel_hi:[1,0,0] +; GFX11-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1] ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4 -; GFX11-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v4, v5, v3 +; GFX11-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1] ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1 @@ -1513,23 +1508,22 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] ; GFX1150-TRUE16-NEXT: global_load_d16_b16 
v1, v2, s[4:5] offset:8 ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v1.l +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v3, v3 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, v4, v3, 0 op_sel_hi:[1,0,0] ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v6, v5, v4 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v5, v7, v3 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v4, -v6, v5, v4 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1150-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 ; 
GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l @@ -1551,22 +1545,20 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1150-FAKE16-NEXT: s_clause 0x1 ; GFX1150-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX1150-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 -; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v3, v3 +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v4, v1, v3, 0 op_sel_hi:[1,0,0] ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] -; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4 +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v4, v5, v3 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] -; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX1150-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1150-FAKE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3 ; 
GFX1150-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1 @@ -1589,23 +1581,22 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] ; GFX1200-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 ; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1 -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l ; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v1.l +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v3, v3 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, v4, v3, 0 op_sel_hi:[1,0,0] ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v6, v5, v4 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v5, v7, v3 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v4, -v6, v5, v4 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; 
GFX1200-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 ; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l @@ -1627,22 +1618,20 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1200-FAKE16-NEXT: s_clause 0x1 ; GFX1200-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX1200-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 -; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x1 -; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v3, v3 +; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v4, v1, v3, 0 op_sel_hi:[1,0,0] ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] -; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4 +; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v4, v5, v3 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] -; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX1200-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3 +; 
GFX1200-FAKE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1 @@ -1787,18 +1776,16 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:8 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX9-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] -; GFX9-NEXT: v_mac_f32_e32 v3, v5, v4 -; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] -; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX9-NEXT: v_rcp_f32_e32 v3, v3 +; GFX9-NEXT: v_mad_mix_f32 v4, v1, v3, 0 op_sel_hi:[1,0,0] +; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX9-NEXT: v_mac_f32_e32 v4, v5, v3 +; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX9-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX9-NEXT: v_trunc_f16_e32 v3, v3 @@ -1817,17 +1804,17 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] offset:8 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2 -; GFX10-NEXT: 
v_rcp_f32_e32 v5, v4 -; GFX10-NEXT: v_mul_f32_e32 v6, v3, v5 -; GFX10-NEXT: v_mad_f32 v7, -v4, v6, v3 -; GFX10-NEXT: v_mac_f32_e32 v6, v7, v5 -; GFX10-NEXT: v_mad_f32 v3, -v4, v6, v3 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2 +; GFX10-NEXT: v_rcp_f32_e32 v4, v3 +; GFX10-NEXT: v_fma_mix_f32 v5, v1, v4, 0 op_sel_hi:[1,0,0] +; GFX10-NEXT: v_mad_f32 v7, -v3, v5, v6 +; GFX10-NEXT: v_mac_f32_e32 v5, v7, v4 +; GFX10-NEXT: v_mad_f32 v3, -v3, v5, v6 +; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4 ; GFX10-NEXT: v_and_b32_e32 v3, 0xff800000, v3 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v6 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1 ; GFX10-NEXT: v_trunc_f16_e32 v3, v3 @@ -1846,24 +1833,23 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] ; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, v4, v3, 0 op_sel_hi:[1,0,0] +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v6, v5, v4 op_sel_hi:[1,0,1] ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v5, v7, v3 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v4, -v6, v5, v4 op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l @@ -1883,23 +1869,21 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX11-FAKE16-NEXT: s_clause 0x1 ; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX11-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_rcp_f32_e32 v4, v4 +; GFX11-FAKE16-NEXT: v_rcp_f32_e32 v3, v3 ; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX11-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX11-FAKE16-NEXT: v_fma_mix_f32 v4, v1, v3, 0 op_sel_hi:[1,0,0] +; GFX11-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1] ; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4 -; GFX11-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] +; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v4, v5, v3 +; GFX11-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1] ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 +; GFX11-FAKE16-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1 @@ -1920,23 +1904,22 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] ; GFX1150-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v1.l +; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v3, v3 +; GFX1150-TRUE16-NEXT: 
v_fma_mix_f32 v5, v4, v3, 0 op_sel_hi:[1,0,0] ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v6, v5, v4 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v5, v7, v3 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v4, -v6, v5, v4 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1150-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 ; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l @@ -1958,22 +1941,20 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX1150-FAKE16-NEXT: s_clause 0x1 ; GFX1150-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX1150-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 -; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v3, v3 +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v4, 
v1, v3, 0 op_sel_hi:[1,0,0] ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] -; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4 +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v4, v5, v3 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] -; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX1150-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1150-FAKE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX1150-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1 @@ -1996,23 +1977,22 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] ; GFX1200-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 ; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1 -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l ; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1200-TRUE16-NEXT: 
v_cvt_f32_f16_e32 v3, v1.l +; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v3, v3 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, v4, v3, 0 op_sel_hi:[1,0,0] ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v6, v5, v4 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v5, v7, v3 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v4, -v6, v5, v4 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v4, v3 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1200-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v3, v5 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 ; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l @@ -2034,22 +2014,20 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX1200-FAKE16-NEXT: s_clause 0x1 ; GFX1200-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] ; GFX1200-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8 -; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x1 -; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1 ; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2 +; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v2 ; GFX1200-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v3, v3 +; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v4, v1, v3, 0 op_sel_hi:[1,0,0] ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] -; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4 +; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v4, v5, v3 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1] -; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1] +; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v3, v5, v3 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX1200-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1200-FAKE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX1200-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4 ; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index b67a1c513c49f..31f1c502f3b61 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -13656,21 +13656,17 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX1250-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX1250-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0 ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v7, v5 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-TRUE16-NEXT: v_add_f32_e32 v5, v5, v6 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l -; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0 +; GFX1250-TRUE16-NEXT: v_fma_mix_f32_bf16 v5, v5, 1, v2 op_sel_hi:[1,0,1] ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0 ; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v6 -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-TRUE16-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -13693,7 +13689,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX1250-FAKE16: ; %bb.0: ; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -13709,14 +13705,12 @@ define bfloat 
@global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v7, v5 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1250-FAKE16-NEXT: v_fma_mix_f32_bf16 v5, v5, 1, v2 op_sel_hi:[1,0,1] ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0 -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 @@ -14253,21 +14247,17 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX1250-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX1250-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0 ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v7, v5 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-TRUE16-NEXT: v_add_f32_e32 v5, v5, 
v6 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l -; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0 +; GFX1250-TRUE16-NEXT: v_fma_mix_f32_bf16 v5, v5, 1, v2 op_sel_hi:[1,0,1] ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0 ; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v6 -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-TRUE16-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -14292,13 +14282,13 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1] ; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-FAKE16-NEXT: v_dual_lshlrev_b32 v2, 16, v2 :: v_dual_bitop2_b32 v3, 3, v4 bitop3:0x40 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX1250-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -14306,14 +14296,12 @@ define bfloat 
@global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v7, v5 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1250-FAKE16-NEXT: v_fma_mix_f32_bf16 v5, v5, 1, v2 op_sel_hi:[1,0,1] ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0 -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 @@ -14868,21 +14856,17 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX1250-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX1250-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0 ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v7, v5 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-TRUE16-NEXT: v_add_f32_e32 v5, v5, 
v6 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l -; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0 +; GFX1250-TRUE16-NEXT: v_fma_mix_f32_bf16 v5, v5, 1, v2 op_sel_hi:[1,0,1] ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0 ; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v6 -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-TRUE16-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -14906,16 +14890,15 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-FAKE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800) -; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 ; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX1250-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -14923,14 +14906,12 @@ define bfloat 
@global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v7, v5 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1250-FAKE16-NEXT: v_fma_mix_f32_bf16 v5, v5, 1, v2 op_sel_hi:[1,0,1] ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0 -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 @@ -15487,17 +15468,14 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX1250-TRUE16-NEXT: v_add_f32_e32 v4, v4, v7 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_fma_mix_f32_bf16 v4, v4, 1, v2 op_sel_hi:[1,0,1] ; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l ; 
GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l ; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v3, v7 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -15521,7 +15499,7 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX1250-FAKE16: ; %bb.0: ; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -15536,13 +15514,12 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX1250-FAKE16-NEXT: v_add_f32_e32 v4, v4, v2 -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_fma_mix_f32_bf16 v4, v4, 1, v2 op_sel_hi:[1,0,1] ; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0 -; GFX1250-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 @@ 
-16066,17 +16043,14 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX1250-TRUE16-NEXT: v_add_f32_e32 v4, v4, v7 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_fma_mix_f32_bf16 v4, v4, 1, v2 op_sel_hi:[1,0,1] ; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l ; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v3, v7 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -16102,36 +16076,35 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1] ; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-FAKE16-NEXT: v_dual_lshlrev_b32 v6, 16, v2 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 -; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v4, 3, v4 bitop3:0x40 -; GFX1250-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX1250-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX1250-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_not_b32_e32 v6, v4 ; GFX1250-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX1250-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v2, v2, s0 -; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX1250-FAKE16-NEXT: v_fma_mix_f32_bf16 v4, v4, 1, v2 op_sel_hi:[1,0,1] +; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX1250-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 -; 
GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 ; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 -; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -16663,17 +16636,14 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX1250-TRUE16-NEXT: v_add_f32_e32 v4, v4, v7 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_fma_mix_f32_bf16 v4, v4, 1, v2 op_sel_hi:[1,0,1] ; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l ; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v3, v7 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV ; 
GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -16698,39 +16668,37 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-FAKE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800) -; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 -; GFX1250-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 -; GFX1250-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-FAKE16-NEXT: v_not_b32_e32 v6, v4 ; GFX1250-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX1250-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-FAKE16-NEXT: 
v_cvt_pk_bf16_f32 v2, v2, s0 -; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX1250-FAKE16-NEXT: v_fma_mix_f32_bf16 v4, v4, 1, v2 op_sel_hi:[1,0,1] +; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX1250-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 -; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 ; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 -; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -17254,17 +17222,13 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v3 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX1250-TRUE16-NEXT: 
v_add_f32_e32 v4, v4, v3 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, 0xffff0000, v5, v3 +; GFX1250-TRUE16-NEXT: v_fma_mix_f32_bf16 v3, v5, 1, v2 op_sel_hi:[1,0,1] +; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l +; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, 0xffff0000, v5, v4 ; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 @@ -17287,19 +17251,16 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX1250-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v3 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX1250-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_fma_mix_f32_bf16 v3, v5, 1, v2 op_sel_hi:[1,0,1] ; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 
0xffff, v3 -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_and_or_b32 v4, 0xffff0000, v5, v3 ; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 @@ -17739,17 +17700,14 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX1250-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-TRUE16-NEXT: v_add_f32_e32 v4, v4, v3 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l -; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0 -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l -; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, 0xffff0000, v5, v3 +; GFX1250-TRUE16-NEXT: v_fma_mix_f32_bf16 v3, v5, 1, v2 op_sel_hi:[1,0,1] +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, 0xffff0000, v5, v4 ; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0 @@ -17772,28 +17730,26 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX1250-FAKE16: ; %bb.0: ; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX1250-FAKE16-NEXT: 
v_lshlrev_b32_e32 v4, 16, v2 +; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off offset:2046 ; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX1250-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v2, v2, s0 +; GFX1250-FAKE16-NEXT: v_fma_mix_f32_bf16 v3, v5, 1, v2 op_sel_hi:[1,0,1] ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX1250-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2 +; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_or_b32 v4, 0xffff0000, v5, v3 ; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 -; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[4:5], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5 ; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 -; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v3 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -18218,21 +18174,17 @@ define bfloat 
@global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX1250-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX1250-TRUE16-NEXT: .LBB62_1: ; %atomicrmw.start ; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0 ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v7, v5 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-TRUE16-NEXT: v_add_f32_e32 v5, v5, v6 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l -; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0 +; GFX1250-TRUE16-NEXT: v_fma_mix_f32_bf16 v5, v5, 1, v2 op_sel_hi:[1,0,1] ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0 ; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v6 -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-TRUE16-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -18257,13 +18209,13 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1] ; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-FAKE16-NEXT: v_dual_lshlrev_b32 v2, 
16, v2 :: v_dual_bitop2_b32 v3, 3, v4 bitop3:0x40 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v4 ; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX1250-FAKE16-NEXT: .LBB62_1: ; %atomicrmw.start ; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -18271,14 +18223,12 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v7, v5 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7 -; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1250-FAKE16-NEXT: v_fma_mix_f32_bf16 v5, v5, 1, v2 op_sel_hi:[1,0,1] ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2 ; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0 -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5 ; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 @@ -18839,17 +18789,14 @@ define void 
@global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l -; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX1250-TRUE16-NEXT: v_add_f32_e32 v4, v4, v7 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_fma_mix_f32_bf16 v4, v4, 1, v2 op_sel_hi:[1,0,1] ; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0 -; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l ; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v3, v7 +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -18875,36 +18822,35 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1] ; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-FAKE16-NEXT: v_dual_lshlrev_b32 v6, 16, v2 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 -; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v4, 3, v4 bitop3:0x40 -; GFX1250-FAKE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX1250-FAKE16-NEXT: v_not_b32_e32 v5, v5 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v4 +; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_not_b32_e32 v6, v4 ; GFX1250-FAKE16-NEXT: .LBB63_1: ; %atomicrmw.start ; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX1250-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v2, v2, s0 -; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX1250-FAKE16-NEXT: v_fma_mix_f32_bf16 v4, v4, 1, v2 op_sel_hi:[1,0,1] +; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2 -; GFX1250-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX1250-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v3, v4 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-FAKE16-NEXT: v_and_or_b32 v4, v5, v6, v4 ; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0 ; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 -; GFX1250-FAKE16-NEXT: 
global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 ; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0 -; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4 ; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll index c96ba754c0811..b01cef5231481 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll @@ -389,10 +389,9 @@ define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_denormals_fmulfadd(bfloat % ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v2 :: v_dual_mul_f32 v0, v0, v1 -; GFX1250-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, 0 op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v2, 1, v0 op_sel_hi:[1,0,0] ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext bfloat %src0 to float %src1.ext = fpext bfloat %src1 to float @@ -407,9 +406,8 @@ define float @v_mad_mix_f32_bf16lo_bf16lo_f32_denormals_fmulfadd(bfloat %src0, b ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 -; 
GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, 0 op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %src0.ext = fpext bfloat %src0 to float @@ -616,6 +614,644 @@ entry: ret void } +define float @v_mad_mix_f32_bf16lo_add_bf16lo(bfloat %src0, bfloat %src1) { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_add_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, 1, v1 op_sel_hi:[1,0,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %result = fadd float %src0.ext, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_bf16hi_add_bf16hi_int(i32 %src0, i32 %src1) { +; GFX1250-LABEL: v_mad_mix_f32_bf16hi_add_bf16hi_int: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, 1, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.hi = lshr i32 %src0, 16 + %src1.hi = lshr i32 %src1, 16 + %src0.i16 = trunc i32 %src0.hi to i16 + %src1.i16 = trunc i32 %src1.hi to i16 + %src0.fp16 = bitcast i16 %src0.i16 to bfloat + %src1.fp16 = bitcast i16 %src1.i16 to bfloat + %src0.ext = fpext bfloat %src0.fp16 to float + %src1.ext = fpext bfloat %src1.fp16 to float + %result = fadd float %src0.ext, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_bf16hi_and_bf16hi_elt(<2 x bfloat> %src0, <2 x bfloat> %src1) { +; GFX1250-LABEL: v_mad_mix_f32_bf16hi_and_bf16hi_elt: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, 1, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1250-NEXT: s_set_pc_i64 
s[30:31] + %src0.hi = extractelement <2 x bfloat> %src0, i32 1 + %src1.hi = extractelement <2 x bfloat> %src1, i32 1 + %src0.ext = fpext bfloat %src0.hi to float + %src1.ext = fpext bfloat %src1.hi to float + %result = fadd float %src0.ext, %src1.ext + ret float %result +} + +define <2 x float> @v_mad_mix_v2f32_cvt_add(<2 x bfloat> %src0, <2 x bfloat> %src1) { +; GFX1250-LABEL: v_mad_mix_v2f32_cvt_add: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v4, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[2:3], v[4:5] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext <2 x bfloat> %src0 to <2 x float> + %src1.ext = fpext <2 x bfloat> %src1 to <2 x float> + %result = fadd <2 x float> %src0.ext, %src1.ext + ret <2 x float> %result +} + +define <2 x float> @v_mad_mix_v2f32_shuffle_cvt_add(<2 x bfloat> %src0, <2 x bfloat> %src1) { +; GFX1250-LABEL: v_mad_mix_v2f32_shuffle_cvt_add: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v3, 16, v0 :: v_dual_lshlrev_b32 v4, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[2:3], v[4:5] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.shuf = shufflevector <2 x bfloat> %src0, <2 x bfloat> poison, <2 x i32> + %src1.shuf = shufflevector <2 x bfloat> %src1, <2 x bfloat> poison, <2 x i32> + %src0.ext = fpext <2 x bfloat> %src0.shuf to <2 x float> + %src1.ext = fpext <2 x bfloat> %src1.shuf to <2 x float> + %result = fadd <2 x float> %src0.ext, %src1.ext + ret <2 x float> %result +} + +define float @v_mad_mix_f32_negbf16lo_add_bf16lo(bfloat %src0, 
bfloat %src1) { +; GFX1250-LABEL: v_mad_mix_f32_negbf16lo_add_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v1, 1, -v0 op_sel_hi:[1,0,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %src0.ext.neg = fneg float %src0.ext + %result = fadd float %src0.ext.neg, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_absbf16lo_add_bf16lo(bfloat %src0, bfloat %src1) { +; GFX1250-LABEL: v_mad_mix_f32_absbf16lo_add_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, 1, v1 op_sel_hi:[1,0,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext) + %result = fadd float %src0.ext.abs, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_negabsbf16lo_add_bf16lo(bfloat %src0, bfloat %src1) { +; GFX1250-LABEL: v_mad_mix_f32_negabsbf16lo_add_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v1, 1, -|v0| op_sel_hi:[1,0,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext) + %src0.ext.neg.abs = fneg float %src0.ext.abs + %result = fadd float %src0.ext.neg.abs, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_bf16lo_add_f32(bfloat %src0, float %src1) { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_add_f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, 1, v1 op_sel_hi:[1,0,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat 
%src0 to float + %result = fadd float %src0.ext, %src1 + ret float %result +} + +define float @v_mad_mix_f32_bf16lo_add_negf32(bfloat %src0, float %src1) { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_add_negf32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, 1, -v1 op_sel_hi:[1,0,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.neg = fneg float %src1 + %result = fadd float %src0.ext, %src1.neg + ret float %result +} + +define float @v_mad_mix_f32_bf16lo_add_absf32(bfloat %src0, float %src1) { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_add_absf32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, 1, |v1| op_sel_hi:[1,0,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.abs = call float @llvm.fabs.f32(float %src1) + %result = fadd float %src0.ext, %src1.abs + ret float %result +} + +define float @v_mad_mix_f32_bf16lo_add_negabsf32(bfloat %src0, float %src1) { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_add_negabsf32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, 1, -|v1| op_sel_hi:[1,0,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.abs = call float @llvm.fabs.f32(float %src1) + %src1.neg.abs = fneg float %src1.abs + %result = fadd float %src0.ext, %src1.neg.abs + ret float %result +} + +define float @no_mix_simple_cvt_add(float %src0, float %src1) { +; GFX1250-LABEL: no_mix_simple_cvt_add: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %result = fadd float %src0, %src1 + ret float %result +} + +define float @no_mix_simple_fabs_cvt_add(float %src0, 
float %src1) { +; GFX1250-LABEL: no_mix_simple_fabs_cvt_add: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_add_f32_e64 v0, |v0|, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.fabs = call float @llvm.fabs.f32(float %src0) + %result = fadd float %src0.fabs, %src1 + ret float %result +} + +define float @v_mad_mix_clamp_f32_bf16hi_add_bf16hi_elt(<2 x bfloat> %src0, <2 x bfloat> %src1) { +; GFX1250-LABEL: v_mad_mix_clamp_f32_bf16hi_add_bf16hi_elt: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, 1, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] clamp +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.hi = extractelement <2 x bfloat> %src0, i32 1 + %src1.hi = extractelement <2 x bfloat> %src1, i32 1 + %src0.ext = fpext bfloat %src0.hi to float + %src1.ext = fpext bfloat %src1.hi to float + %result = fadd float %src0.ext, %src1.ext + %max = call float @llvm.maxnum.f32(float %result, float 0.0) + %clamp = call float @llvm.minnum.f32(float %max, float 1.0) + ret float %clamp +} + +define float @v_mad_mix_f32_negprecvtbf16lo_add_bf16lo(i32 %src0.arg, bfloat %src1) { +; GFX1250-LABEL: v_mad_mix_f32_negprecvtbf16lo_add_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v1, 1, -v0 op_sel_hi:[1,0,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat> + %src0 = extractelement <2 x bfloat> %src0.arg.bc, i32 0 + %src0.neg = fneg bfloat %src0 + %src0.ext = fpext bfloat %src0.neg to float + %src1.ext = fpext bfloat %src1 to float + %result = fadd float %src0.ext, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_precvtnegbf16hi_abs_add_bf16lo(i32 %src0.arg, bfloat %src1) { +; GFX1250-LABEL: v_mad_mix_f32_precvtnegbf16hi_abs_add_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, 1, v1 op_sel_hi:[1,0,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat> + %src0 = extractelement <2 x bfloat> %src0.arg.bc, i32 1 + %src0.neg = fneg bfloat %src0 + %src0.ext = fpext bfloat %src0.neg to float + %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext) + %src1.ext = fpext bfloat %src1 to float + %result = fadd float %src0.ext.abs, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_precvtabsbf16hi_add_bf16lo(i32 %src0.arg, bfloat %src1) { +; GFX1250-LABEL: v_mad_mix_f32_precvtabsbf16hi_add_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat> + %src0 = extractelement <2 x bfloat> %src0.arg.bc, i32 1 + %src0.abs = call bfloat @llvm.fabs.bf16(bfloat %src0) + %src0.ext = fpext bfloat %src0.abs to float + %src1.ext = fpext bfloat %src1 to float + %result = fadd float %src0.ext, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_preextractfneg_bf16hi_add_bf16lo(i32 %src0.arg, bfloat %src1) { +; GFX1250-LABEL: v_mad_mix_f32_preextractfneg_bf16hi_add_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, -v0, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat> + %fneg = fneg <2 x bfloat> %src0.arg.bc + %src0 = extractelement <2 x bfloat> %fneg, i32 1 + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext 
bfloat %src1 to float + %result = fadd float %src0.ext, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_preextractfabs_bf16hi_add_bf16lo(i32 %src0.arg, bfloat %src1) { +; GFX1250-LABEL: v_mad_mix_f32_preextractfabs_bf16hi_add_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat> + %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %src0.arg.bc) + %src0 = extractelement <2 x bfloat> %fabs, i32 1 + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %result = fadd float %src0.ext, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_preextractfabsfneg_bf16hi_add_bf16lo(i32 %src0.arg, bfloat %src1) { +; GFX1250-LABEL: v_mad_mix_f32_preextractfabsfneg_bf16hi_add_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, -|v0|, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat> + %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %src0.arg.bc) + %fneg.fabs = fneg <2 x bfloat> %fabs + %src0 = extractelement <2 x bfloat> %fneg.fabs, i32 1 + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %result = fadd float %src0.ext, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_bf16lo_mul_bf16lo(bfloat %src0, bfloat %src1) { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_mul_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, 0 op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %result = 
fmul float %src0.ext, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_bf16hi_mul_bf16hi_int(i32 %src0, i32 %src1) { +; GFX1250-LABEL: v_mad_mix_f32_bf16hi_mul_bf16hi_int: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, 0 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.hi = lshr i32 %src0, 16 + %src1.hi = lshr i32 %src1, 16 + %src0.i16 = trunc i32 %src0.hi to i16 + %src1.i16 = trunc i32 %src1.hi to i16 + %src0.fp16 = bitcast i16 %src0.i16 to bfloat + %src1.fp16 = bitcast i16 %src1.i16 to bfloat + %src0.ext = fpext bfloat %src0.fp16 to float + %src1.ext = fpext bfloat %src1.fp16 to float + %result = fmul float %src0.ext, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_bf16hi_mul_bf16hi_elt(<2 x bfloat> %src0, <2 x bfloat> %src1) { +; GFX1250-LABEL: v_mad_mix_f32_bf16hi_mul_bf16hi_elt: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, 0 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.hi = extractelement <2 x bfloat> %src0, i32 1 + %src1.hi = extractelement <2 x bfloat> %src1, i32 1 + %src0.ext = fpext bfloat %src0.hi to float + %src1.ext = fpext bfloat %src1.hi to float + %result = fmul float %src0.ext, %src1.ext + ret float %result +} + +define <2 x float> @v_mad_mix_v2f32_cvt_mul(<2 x bfloat> %src0, <2 x bfloat> %src1) { +; GFX1250-LABEL: v_mad_mix_v2f32_cvt_mul: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v4, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_mul_f32 v[0:1], v[2:3], v[4:5] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + 
%src0.ext = fpext <2 x bfloat> %src0 to <2 x float> + %src1.ext = fpext <2 x bfloat> %src1 to <2 x float> + %result = fmul <2 x float> %src0.ext, %src1.ext + ret <2 x float> %result +} + +define <2 x float> @v_mad_mix_v2f32_shuffle_cvt_mul(<2 x bfloat> %src0, <2 x bfloat> %src1) { +; GFX1250-LABEL: v_mad_mix_v2f32_shuffle_cvt_mul: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v3, 16, v0 :: v_dual_lshlrev_b32 v4, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_mul_f32 v[0:1], v[2:3], v[4:5] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.shuf = shufflevector <2 x bfloat> %src0, <2 x bfloat> poison, <2 x i32> + %src1.shuf = shufflevector <2 x bfloat> %src1, <2 x bfloat> poison, <2 x i32> + %src0.ext = fpext <2 x bfloat> %src0.shuf to <2 x float> + %src1.ext = fpext <2 x bfloat> %src1.shuf to <2 x float> + %result = fmul <2 x float> %src0.ext, %src1.ext + ret <2 x float> %result +} + +define float @v_mad_mix_f32_negbf16lo_mul_bf16lo(bfloat %src0, bfloat %src1) { +; GFX1250-LABEL: v_mad_mix_f32_negbf16lo_mul_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, -v0, v1, 0 op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %src0.ext.neg = fneg float %src0.ext + %result = fmul float %src0.ext.neg, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_absbf16lo_mul_bf16lo(bfloat %src0, bfloat %src1) { +; GFX1250-LABEL: v_mad_mix_f32_absbf16lo_mul_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, v1, 0 op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = 
fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext) + %result = fmul float %src0.ext.abs, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_negabsbf16lo_mul_bf16lo(bfloat %src0, bfloat %src1) { +; GFX1250-LABEL: v_mad_mix_f32_negabsbf16lo_mul_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, -|v0|, v1, 0 op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext) + %src0.ext.neg.abs = fneg float %src0.ext.abs + %result = fmul float %src0.ext.neg.abs, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_bf16lo_mul_f32(bfloat %src0, float %src1) { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_mul_f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, 0 op_sel_hi:[1,0,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %result = fmul float %src0.ext, %src1 + ret float %result +} + +define float @v_mad_mix_f32_bf16lo_mul_negf32(bfloat %src0, float %src1) { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_mul_negf32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, -v1, 0 op_sel_hi:[1,0,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.neg = fneg float %src1 + %result = fmul float %src0.ext, %src1.neg + ret float %result +} + +define float @v_mad_mix_f32_bf16lo_mul_absf32(bfloat %src0, float %src1) { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_mul_absf32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, |v1|, 
0 op_sel_hi:[1,0,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.abs = call float @llvm.fabs.f32(float %src1) + %result = fmul float %src0.ext, %src1.abs + ret float %result +} + +define float @v_mad_mix_f32_bf16lo_mul_negabsf32(bfloat %src0, float %src1) { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_mul_negabsf32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, -|v1|, 0 op_sel_hi:[1,0,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.abs = call float @llvm.fabs.f32(float %src1) + %src1.neg.abs = fneg float %src1.abs + %result = fmul float %src0.ext, %src1.neg.abs + ret float %result +} + +define float @no_mix_simple_cvt_mul(float %src0, float %src1) { +; GFX1250-LABEL: no_mix_simple_cvt_mul: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %result = fmul float %src0, %src1 + ret float %result +} + +define float @no_mix_simple_fabs_cvt_mul(float %src0, float %src1) { +; GFX1250-LABEL: no_mix_simple_fabs_cvt_mul: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.fabs = call float @llvm.fabs.f32(float %src0) + %result = fmul float %src0.fabs, %src1 + ret float %result +} + +define float @v_mad_mix_clamp_f32_bf16hi_mul_bf16hi_elt(<2 x bfloat> %src0, <2 x bfloat> %src1) { +; GFX1250-LABEL: v_mad_mix_clamp_f32_bf16hi_mul_bf16hi_elt: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, 0 op_sel:[1,1,0] op_sel_hi:[1,1,0] clamp +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.hi = extractelement <2 x bfloat> %src0, i32 1 + %src1.hi = extractelement 
<2 x bfloat> %src1, i32 1 + %src0.ext = fpext bfloat %src0.hi to float + %src1.ext = fpext bfloat %src1.hi to float + %result = fmul float %src0.ext, %src1.ext + %max = call float @llvm.maxnum.f32(float %result, float 0.0) + %clamp = call float @llvm.minnum.f32(float %max, float 1.0) + ret float %clamp +} + +define float @v_mad_mix_f32_negprecvtbf16lo_mul_bf16lo(i32 %src0.arg, bfloat %src1) { +; GFX1250-LABEL: v_mad_mix_f32_negprecvtbf16lo_mul_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, -v0, v1, 0 op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat> + %src0 = extractelement <2 x bfloat> %src0.arg.bc, i32 0 + %src0.neg = fneg bfloat %src0 + %src0.ext = fpext bfloat %src0.neg to float + %src1.ext = fpext bfloat %src1 to float + %result = fmul float %src0.ext, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_precvtnegbf16hi_abs_mul_bf16lo(i32 %src0.arg, bfloat %src1) { +; GFX1250-LABEL: v_mad_mix_f32_precvtnegbf16hi_abs_mul_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, v1, 0 op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat> + %src0 = extractelement <2 x bfloat> %src0.arg.bc, i32 1 + %src0.neg = fneg bfloat %src0 + %src0.ext = fpext bfloat %src0.neg to float + %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext) + %src1.ext = fpext bfloat %src1 to float + %result = fmul float %src0.ext.abs, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_precvtabsbf16hi_mul_bf16lo(i32 %src0.arg, bfloat %src1) { +; GFX1250-LABEL: 
v_mad_mix_f32_precvtabsbf16hi_mul_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat> + %src0 = extractelement <2 x bfloat> %src0.arg.bc, i32 1 + %src0.abs = call bfloat @llvm.fabs.bf16(bfloat %src0) + %src0.ext = fpext bfloat %src0.abs to float + %src1.ext = fpext bfloat %src1 to float + %result = fmul float %src0.ext, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_preextractfneg_bf16hi_mul_bf16lo(i32 %src0.arg, bfloat %src1) { +; GFX1250-LABEL: v_mad_mix_f32_preextractfneg_bf16hi_mul_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, -v0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat> + %fneg = fneg <2 x bfloat> %src0.arg.bc + %src0 = extractelement <2 x bfloat> %fneg, i32 1 + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %result = fmul float %src0.ext, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_preextractfabs_bf16hi_mul_bf16lo(i32 %src0.arg, bfloat %src1) { +; GFX1250-LABEL: v_mad_mix_f32_preextractfabs_bf16hi_mul_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat> + %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %src0.arg.bc) + %src0 = extractelement <2 x bfloat> %fabs, i32 1 + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %result = fmul float %src0.ext, %src1.ext + ret float %result +} + +define float 
@v_mad_mix_f32_preextractfabsfneg_bf16hi_mul_bf16lo(i32 %src0.arg, bfloat %src1) { +; GFX1250-LABEL: v_mad_mix_f32_preextractfabsfneg_bf16hi_mul_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, -|v0|, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat> + %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %src0.arg.bc) + %fneg.fabs = fneg <2 x bfloat> %fabs + %src0 = extractelement <2 x bfloat> %fneg.fabs, i32 1 + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %result = fmul float %src0.ext, %src1.ext + ret float %result +} + declare bfloat @llvm.fabs.bf16(bfloat) #2 declare <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat>) #2 declare float @llvm.fabs.f32(float) #2 diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll index a4878539b1c74..5eb1a4649ff98 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll @@ -1866,27 +1866,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals(half %src0, half %src1, fl } define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half %src1, half %src2) #1 { -; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: -; SDAG-GFX1100-TRUE16: ; %bb.0: -; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l -; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l -; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX1100-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 -; SDAG-GFX1100-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 -; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; SDAG-GFX1100-FAKE16-LABEL: 
v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: -; SDAG-GFX1100-FAKE16: ; %bb.0: -; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-GFX1100-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-GFX1100-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX1100-FAKE16-NEXT: v_mul_f32_e32 v0, v0, v1 -; SDAG-GFX1100-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 -; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, 0 op_sel_hi:[1,1,0] +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-NEXT: v_fma_mix_f32 v0, v2, 1, v0 op_sel_hi:[1,0,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: ; GFX900: ; %bb.0: @@ -1901,11 +1887,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, ; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX906-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX906-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX906-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX906-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, 0 op_sel_hi:[1,1,0] +; GFX906-NEXT: v_fma_mix_f32 v0, v2, 1, v0 op_sel_hi:[1,0,0] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: @@ -1935,17 +1918,6 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, ; SDAG-CI-NEXT: v_add_f32_e32 v0, v0, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: -; 
GISEL-GFX1100: ; %bb.0: -; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX1100-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-GFX1100-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-GFX1100-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 -; GISEL-GFX1100-NEXT: v_add_f32_e32 v0, v0, v2 -; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1964,25 +1936,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, } define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half %src1, float %src2) #1 { -; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: -; SDAG-GFX1100-TRUE16: ; %bb.0: -; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l -; SDAG-GFX1100-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX1100-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 -; SDAG-GFX1100-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 -; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: -; SDAG-GFX1100-FAKE16: ; %bb.0: -; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SDAG-GFX1100-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX1100-FAKE16-NEXT: v_mul_f32_e32 v0, v0, v1 -; SDAG-GFX1100-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 -; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX1100-LABEL: 
v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, 0 op_sel_hi:[1,1,0] +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: ; GFX900: ; %bb.0: @@ -1996,9 +1956,7 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half ; GFX906-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX906-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX906-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, 0 op_sel_hi:[1,1,0] ; GFX906-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; @@ -2027,16 +1985,6 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half ; SDAG-CI-NEXT: v_add_f32_e32 v0, v0, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: -; GISEL-GFX1100: ; %bb.0: -; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-GFX1100-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GISEL-GFX1100-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 -; GISEL-GFX1100-NEXT: v_add_f32_e32 v0, v0, v2 -; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] -; ; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2658,6 +2606,2915 @@ define float @v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo(i32 %src0.arg, ret float %result } +define float @v_mad_mix_f32_f16lo_add_f16lo(half %src0, half %src1) { +; GFX1100-LABEL: v_mad_mix_f32_f16lo_add_f16lo: +; GFX1100: ; 
%bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, 1, v1 op_sel_hi:[1,0,1] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_f32_f16lo_add_f16lo: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16lo_add_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, 1, v1 op_sel_hi:[1,0,1] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_add_f16lo: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX9GEN-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16lo_add_f16lo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_add_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_add_f32_e32 v0, v0, v1 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_add_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.ext = fpext half %src0 to float + %src1.ext = fpext half %src1 to float + %result = fadd float %src0.ext, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_f16hi_add_f16hi(i32 %src0, i32 %src1) { +; GFX1100-LABEL: 
v_mad_mix_f32_f16hi_add_f16hi: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, 1, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_f32_f16hi_add_f16hi: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16hi_add_f16hi: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, 1, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_f32_f16hi_add_f16hi: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9GEN-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16hi_add_f16hi: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_f16hi_add_f16hi: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_add_f32_e32 v0, v0, v1 +; CI-NEXT: s_setpc_b64 s[30:31] + %src0.hi = lshr i32 %src0, 16 + 
%src1.hi = lshr i32 %src1, 16 + %src0.i16 = trunc i32 %src0.hi to i16 + %src1.i16 = trunc i32 %src1.hi to i16 + %src0.fp16 = bitcast i16 %src0.i16 to half + %src1.fp16 = bitcast i16 %src1.i16 to half + %src0.ext = fpext half %src0.fp16 to float + %src1.ext = fpext half %src1.fp16 to float + %result = fadd float %src0.ext, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_f16hi_add_f16hi_elt(<2 x half> %src0, <2 x half> %src1) { +; GFX1100-LABEL: v_mad_mix_f32_f16hi_add_f16hi_elt: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, 1, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_f32_f16hi_add_f16hi_elt: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16hi_add_f16hi_elt: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, 1, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_f32_f16hi_add_f16hi_elt: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9GEN-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16hi_add_f16hi_elt: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_f16hi_add_f16hi_elt: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_add_f32_e32 v0, v1, v3 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16hi_add_f16hi_elt: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GISEL-CI-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.hi = extractelement <2 x half> %src0, i32 1 + %src1.hi = extractelement <2 x half> %src1, i32 1 + %src0.ext = fpext half %src0.hi to float + %src1.ext = fpext half %src1.hi to float + %result = fadd float %src0.ext, %src1.ext + ret float %result +} + +define <2 x float> @v_mad_mix_v2f32_cvt_add(<2 x half> %src0, <2 x half> %src1) { +; GFX1100-LABEL: v_mad_mix_v2f32_cvt_add: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v2, v0, 1, v1 op_sel_hi:[1,0,1] +; GFX1100-NEXT: v_fma_mix_f32 v1, v0, 1, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-NEXT: v_mov_b32_e32 v0, v2 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX900-LABEL: v_mad_mix_v2f32_cvt_add: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-GFX900-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-GFX900-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-GFX900-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SDAG-GFX900-NEXT: v_add_f32_e32 v1, v2, v3 +; SDAG-GFX900-NEXT: v_add_f32_e32 v0, v0, v4 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_v2f32_cvt_add: +; GFX906: ; %bb.0: +; GFX906-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v2, v0, 1, v1 op_sel_hi:[1,0,1] +; GFX906-NEXT: v_fma_mix_f32 v1, v0, 1, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] +; GFX906-NEXT: v_mov_b32_e32 v0, v2 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9GEN-LABEL: v_mad_mix_v2f32_cvt_add: +; SDAG-GFX9GEN: ; %bb.0: +; SDAG-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SDAG-GFX9GEN-NEXT: v_add_f32_e32 v1, v2, v3 +; SDAG-GFX9GEN-NEXT: v_add_f32_e32 v0, v0, v4 +; SDAG-GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VI-LABEL: v_mad_mix_v2f32_cvt_add: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SDAG-VI-NEXT: v_add_f32_e32 v1, v2, v3 +; SDAG-VI-NEXT: v_add_f32_e32 v0, v0, v4 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_v2f32_cvt_add: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_add_f32_e32 v0, v0, v2 +; SDAG-CI-NEXT: v_add_f32_e32 v1, v1, v3 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_v2f32_cvt_add: +; GISEL-GFX900: ; 
%bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GISEL-GFX900-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-GFX900-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GISEL-GFX900-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-GFX900-NEXT: v_add_f32_e32 v0, v2, v0 +; GISEL-GFX900-NEXT: v_add_f32_e32 v1, v3, v1 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9GEN-LABEL: v_mad_mix_v2f32_cvt_add: +; GISEL-GFX9GEN: ; %bb.0: +; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-GFX9GEN-NEXT: v_add_f32_e32 v0, v2, v0 +; GISEL-GFX9GEN-NEXT: v_add_f32_e32 v1, v3, v1 +; GISEL-GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_v2f32_cvt_add: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_add_f32_e32 v0, v2, v0 +; GISEL-VI-NEXT: v_add_f32_e32 v1, v3, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_v2f32_cvt_add: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_add_f32_e32 v0, v0, v2 +; GISEL-CI-NEXT: v_add_f32_e32 v1, v1, v3 +; 
GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.ext = fpext <2 x half> %src0 to <2 x float> + %src1.ext = fpext <2 x half> %src1 to <2 x float> + %result = fadd <2 x float> %src0.ext, %src1.ext + ret <2 x float> %result +} + +define <2 x float> @v_mad_mix_v2f32_shuffle_cvt_add(<2 x half> %src0, <2 x half> %src1) { +; GFX1100-LABEL: v_mad_mix_v2f32_shuffle_cvt_add: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v2, v0, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX1100-NEXT: v_fma_mix_f32 v1, v0, 1, v1 op_sel:[0,0,1] op_sel_hi:[1,0,1] +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-NEXT: v_mov_b32_e32 v0, v2 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_v2f32_shuffle_cvt_add: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX900-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX900-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX900-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_v2f32_shuffle_cvt_add: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v2, v0, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX906-NEXT: v_fma_mix_f32 v1, v0, 1, v1 op_sel:[0,0,1] op_sel_hi:[1,0,1] +; GFX906-NEXT: v_mov_b32_e32 v0, v2 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_v2f32_shuffle_cvt_add: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 +; GFX9GEN-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX9GEN-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_v2f32_shuffle_cvt_add: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_add_f32_e32 v0, v2, v0 +; VI-NEXT: v_add_f32_e32 v1, v3, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_v2f32_shuffle_cvt_add: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SDAG-CI-NEXT: v_add_f32_e32 v0, v1, v2 +; SDAG-CI-NEXT: v_add_f32_e32 v1, v4, v3 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_v2f32_shuffle_cvt_add: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GISEL-CI-NEXT: v_add_f32_e32 v0, v1, v0 +; GISEL-CI-NEXT: v_add_f32_e32 v1, v4, v2 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.shuf = shufflevector <2 x half> %src0, <2 x half> poison, <2 x i32> + %src1.shuf = shufflevector <2 x half> %src1, <2 x half> poison, <2 x i32> + %src0.ext = fpext <2 x half> %src0.shuf to <2 x float> + %src1.ext = fpext <2 x half> %src1.shuf to <2 x float> + %result = fadd <2 x float> %src0.ext, %src1.ext + ret <2 x float> %result +} + +define float 
@v_mad_mix_f32_negf16lo_add_f16lo(half %src0, half %src1) { +; SDAG-GFX1100-LABEL: v_mad_mix_f32_negf16lo_add_f16lo: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v1, 1, -v0 op_sel_hi:[1,0,1] +; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX900-LABEL: v_mad_mix_f32_negf16lo_add_f16lo: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-GFX900-NEXT: v_sub_f32_e32 v0, v1, v0 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX906-LABEL: v_mad_mix_f32_negf16lo_add_f16lo: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v1, 1, -v0 op_sel_hi:[1,0,1] +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9GEN-LABEL: v_mad_mix_f32_negf16lo_add_f16lo: +; SDAG-GFX9GEN: ; %bb.0: +; SDAG-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-GFX9GEN-NEXT: v_sub_f32_e32 v0, v1, v0 +; SDAG-GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VI-LABEL: v_mad_mix_f32_negf16lo_add_f16lo: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_sub_f32_e32 v0, v1, v0 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_negf16lo_add_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_sub_f32_e32 v0, v1, v0 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX1100-LABEL: v_mad_mix_f32_negf16lo_add_f16lo: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, -v0, 1, v1 op_sel_hi:[1,0,1] +; GISEL-GFX1100-NEXT: s_setpc_b64 
s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_f32_negf16lo_add_f16lo: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GISEL-GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-GFX900-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_f32_negf16lo_add_f16lo: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, -v0, 1, v1 op_sel_hi:[1,0,1] +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_negf16lo_add_f16lo: +; GISEL-GFX9GEN: ; %bb.0: +; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-GFX9GEN-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_f32_negf16lo_add_f16lo: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_negf16lo_add_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.ext = fpext half %src0 to float + %src1.ext = fpext half %src1 to float + %src0.ext.neg = fneg float %src0.ext + %result = fadd float %src0.ext.neg, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_absf16lo_add_f16lo(half %src0, half %src1) { +; GFX1100-LABEL: v_mad_mix_f32_absf16lo_add_f16lo: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, 1, v1 
op_sel_hi:[1,0,1] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_f32_absf16lo_add_f16lo: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX900-NEXT: v_add_f32_e64 v0, |v0|, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_absf16lo_add_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, 1, v1 op_sel_hi:[1,0,1] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_f32_absf16lo_add_f16lo: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX9GEN-NEXT: v_add_f32_e64 v0, |v0|, v1 +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_absf16lo_add_f16lo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_add_f32_e64 v0, |v0|, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_absf16lo_add_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_add_f32_e64 v0, |v0|, v1 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_absf16lo_add_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_add_f32_e64 v0, |v0|, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.ext = fpext half %src0 to float + %src1.ext = fpext half %src1 to float + %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext) + %result = fadd float %src0.ext.abs, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_negabsf16lo_add_f16lo(half %src0, half %src1) { +; SDAG-GFX1100-LABEL: 
v_mad_mix_f32_negabsf16lo_add_f16lo: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v1, 1, -|v0| op_sel_hi:[1,0,1] +; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_f32_negabsf16lo_add_f16lo: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX900-NEXT: v_sub_f32_e64 v0, v1, |v0| +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX906-LABEL: v_mad_mix_f32_negabsf16lo_add_f16lo: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v1, 1, -|v0| op_sel_hi:[1,0,1] +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_f32_negabsf16lo_add_f16lo: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX9GEN-NEXT: v_sub_f32_e64 v0, v1, |v0| +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_negabsf16lo_add_f16lo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_sub_f32_e64 v0, v1, |v0| +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_negabsf16lo_add_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_sub_f32_e64 v0, v1, |v0| +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX1100-LABEL: v_mad_mix_f32_negabsf16lo_add_f16lo: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-GFX1100-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_sub_f32_e64 v0, v1, |v0| +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; 
GISEL-GFX906-LABEL: v_mad_mix_f32_negabsf16lo_add_f16lo: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-GFX906-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-GFX906-NEXT: v_sub_f32_e64 v0, v1, |v0| +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_negabsf16lo_add_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_sub_f32_e64 v0, v1, |v0| +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.ext = fpext half %src0 to float + %src1.ext = fpext half %src1 to float + %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext) + %src0.ext.neg.abs = fneg float %src0.ext.abs + %result = fadd float %src0.ext.neg.abs, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_f16lo_add_f32(half %src0, float %src1) { +; GFX1100-LABEL: v_mad_mix_f32_f16lo_add_f32: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, 1, v1 op_sel_hi:[1,0,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_f32_f16lo_add_f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16lo_add_f32: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, 1, v1 op_sel_hi:[1,0,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_add_f32: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9GEN-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16lo_add_f32: +; VI: ; %bb.0: +; VI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_add_f32: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_add_f32_e32 v0, v0, v1 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_add_f32: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.ext = fpext half %src0 to float + %result = fadd float %src0.ext, %src1 + ret float %result +} + +define float @v_mad_mix_f32_f16lo_add_negf32(half %src0, float %src1) { +; SDAG-GFX1100-LABEL: v_mad_mix_f32_f16lo_add_negf32: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v0, 1, -v1 op_sel_hi:[1,0,0] +; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_f32_f16lo_add_negf32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX906-LABEL: v_mad_mix_f32_f16lo_add_negf32: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, 1, -v1 op_sel_hi:[1,0,0] +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_add_negf32: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9GEN-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16lo_add_negf32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_sub_f32_e32 v0, v0, v1 +; VI-NEXT: 
s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_add_negf32: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_sub_f32_e32 v0, v0, v1 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16lo_add_negf32: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_sub_f32_e32 v0, v0, v1 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_f32_f16lo_add_negf32: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-GFX906-NEXT: v_sub_f32_e32 v0, v0, v1 +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_add_negf32: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_sub_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.ext = fpext half %src0 to float + %src1.neg = fneg float %src1 + %result = fadd float %src0.ext, %src1.neg + ret float %result +} + +define float @v_mad_mix_f32_f16lo_add_absf32(half %src0, float %src1) { +; GFX1100-LABEL: v_mad_mix_f32_f16lo_add_absf32: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, 1, |v1| op_sel_hi:[1,0,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_f32_f16lo_add_absf32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX900-NEXT: v_add_f32_e64 v0, v0, |v1| +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16lo_add_absf32: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, 1, |v1| 
op_sel_hi:[1,0,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_add_absf32: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9GEN-NEXT: v_add_f32_e64 v0, v0, |v1| +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16lo_add_absf32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_add_f32_e64 v0, v0, |v1| +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_add_absf32: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_add_f32_e64 v0, v0, |v1| +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_add_absf32: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_add_f32_e64 v0, v0, |v1| +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.ext = fpext half %src0 to float + %src1.abs = call float @llvm.fabs.f32(float %src1) + %result = fadd float %src0.ext, %src1.abs + ret float %result +} + +define float @v_mad_mix_f32_f16lo_add_negabsf32(half %src0, float %src1) { +; SDAG-GFX1100-LABEL: v_mad_mix_f32_f16lo_add_negabsf32: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v0, 1, -|v1| op_sel_hi:[1,0,0] +; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_f32_f16lo_add_negabsf32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX900-NEXT: v_sub_f32_e64 v0, v0, |v1| +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX906-LABEL: v_mad_mix_f32_f16lo_add_negabsf32: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v0, 1, -|v1| op_sel_hi:[1,0,0] +; SDAG-GFX906-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_add_negabsf32: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9GEN-NEXT: v_sub_f32_e64 v0, v0, |v1| +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16lo_add_negabsf32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_sub_f32_e64 v0, v0, |v1| +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_add_negabsf32: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_sub_f32_e64 v0, v0, |v1| +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16lo_add_negabsf32: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_sub_f32_e64 v0, v0, |v1| +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_f32_f16lo_add_negabsf32: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-GFX906-NEXT: v_sub_f32_e64 v0, v0, |v1| +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_add_negabsf32: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_sub_f32_e64 v0, v0, |v1| +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.ext = fpext half %src0 to float + %src1.abs = call float @llvm.fabs.f32(float %src1) + %src1.neg.abs = fneg float %src1.abs + %result = fadd float %src0.ext, %src1.neg.abs + ret float %result +} + +define float @no_mix_simple_cvt_add(float %src0, float %src1) { +; GFX1100-LABEL: no_mix_simple_cvt_add: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: no_mix_simple_cvt_add: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: no_mix_simple_cvt_add: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: no_mix_simple_cvt_add: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: no_mix_simple_cvt_add: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: no_mix_simple_cvt_add: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_add_f32_e32 v0, v0, v1 +; CI-NEXT: s_setpc_b64 s[30:31] + %result = fadd float %src0, %src1 + ret float %result +} + +define float @no_mix_simple_fabs_cvt_add(float %src0, float %src1) { +; GFX1100-LABEL: no_mix_simple_fabs_cvt_add: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_add_f32_e64 v0, |v0|, v1 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: no_mix_simple_fabs_cvt_add: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f32_e64 v0, |v0|, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: no_mix_simple_fabs_cvt_add: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_add_f32_e64 v0, |v0|, v1 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: no_mix_simple_fabs_cvt_add: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_add_f32_e64 v0, |v0|, v1 +; GFX9GEN-NEXT: s_setpc_b64 
s[30:31] +; +; VI-LABEL: no_mix_simple_fabs_cvt_add: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_f32_e64 v0, |v0|, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: no_mix_simple_fabs_cvt_add: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_add_f32_e64 v0, |v0|, v1 +; CI-NEXT: s_setpc_b64 s[30:31] + %src0.fabs = call float @llvm.fabs.f32(float %src0) + %result = fadd float %src0.fabs, %src1 + ret float %result +} + +define float @v_mad_mix_clamp_f32_f16hi_add_f16hi(<2 x half> %src0, <2 x half> %src1) { +; GFX1100-LABEL: v_mad_mix_clamp_f32_f16hi_add_f16hi: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, 1, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] clamp +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_clamp_f32_f16hi_add_f16hi: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_add_f32_e64 v0, v0, v1 clamp +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_clamp_f32_f16hi_add_f16hi: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, 1, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1] clamp +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_clamp_f32_f16hi_add_f16hi: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9GEN-NEXT: v_add_f32_e64 v0, v0, v1 clamp +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_clamp_f32_f16hi_add_f16hi: +; VI: ; %bb.0: +; VI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_add_f32_e64 v0, v0, v1 clamp +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_clamp_f32_f16hi_add_f16hi: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_add_f32_e64 v0, v1, v3 clamp +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_clamp_f32_f16hi_add_f16hi: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GISEL-CI-NEXT: v_add_f32_e64 v0, v0, v1 clamp +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.hi = extractelement <2 x half> %src0, i32 1 + %src1.hi = extractelement <2 x half> %src1, i32 1 + %src0.ext = fpext half %src0.hi to float + %src1.ext = fpext half %src1.hi to float + %result = fadd float %src0.ext, %src1.ext + %max = call float @llvm.maxnum.f32(float %result, float 0.0) + %clamp = call float @llvm.minnum.f32(float %max, float 1.0) + ret float %clamp +} + +define float @v_mad_mix_f32_negprecvtf16lo_add_f16lo(i32 %src0.arg, half %src1) { +; SDAG-GFX1100-LABEL: v_mad_mix_f32_negprecvtf16lo_add_f16lo: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v1, 1, -v0 op_sel_hi:[1,0,1] +; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX900-LABEL: v_mad_mix_f32_negprecvtf16lo_add_f16lo: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-GFX900-NEXT: v_sub_f32_e32 v0, v1, v0 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX906-LABEL: v_mad_mix_f32_negprecvtf16lo_add_f16lo: +; SDAG-GFX906: ; %bb.0: +; 
SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v1, 1, -v0 op_sel_hi:[1,0,1] +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9GEN-LABEL: v_mad_mix_f32_negprecvtf16lo_add_f16lo: +; SDAG-GFX9GEN: ; %bb.0: +; SDAG-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-GFX9GEN-NEXT: v_sub_f32_e32 v0, v1, v0 +; SDAG-GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VI-LABEL: v_mad_mix_f32_negprecvtf16lo_add_f16lo: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_sub_f32_e32 v0, v1, v0 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_negprecvtf16lo_add_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_sub_f32_e32 v0, v1, v0 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX1100-LABEL: v_mad_mix_f32_negprecvtf16lo_add_f16lo: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, -v0, 1, v1 op_sel_hi:[1,0,1] +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_f32_negprecvtf16lo_add_f16lo: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GISEL-GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-GFX900-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_f32_negprecvtf16lo_add_f16lo: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, -v0, 1, v1 op_sel_hi:[1,0,1] +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9GEN-LABEL: 
v_mad_mix_f32_negprecvtf16lo_add_f16lo: +; GISEL-GFX9GEN: ; %bb.0: +; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-GFX9GEN-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_f32_negprecvtf16lo_add_f16lo: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_negprecvtf16lo_add_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> + %src0 = extractelement <2 x half> %src0.arg.bc, i32 0 + %src0.neg = fneg half %src0 + %src0.ext = fpext half %src0.neg to float + %src1.ext = fpext half %src1 to float + %result = fadd float %src0.ext, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_precvtnegf16hi_abs_add_f16lo(i32 %src0.arg, half %src1) { +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_add_f16lo: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.h +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, |v0|, 1, v1 op_sel_hi:[1,0,1] +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_add_f16lo: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; 
SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v0, |v0|, 1, v1 op_sel_hi:[1,0,1] +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_add_f16lo: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX900-NEXT: v_add_f32_e64 v0, |v0|, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_add_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: s_mov_b32 s4, 0x8000 +; GFX906-NEXT: v_xor_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, 1, v1 op_sel_hi:[1,0,1] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_add_f16lo: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX9GEN-NEXT: v_add_f32_e64 v0, |v0|, v1 +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_add_f16lo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_add_f32_e64 v0, |v0|, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_add_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SDAG-CI-NEXT: v_add_f32_e32 v0, v0, v1 +; 
SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX1100-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_add_f16lo: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, 1, v1 op_sel_hi:[1,0,1] +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_add_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_add_f32_e64 v0, |v0|, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> + %src0 = extractelement <2 x half> %src0.arg.bc, i32 1 + %src0.neg = fneg half %src0 + %src0.ext = fpext half %src0.neg to float + %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext) + %src1.ext = fpext half %src1 to float + %result = fadd float %src0.ext.abs, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_precvtabsf16hi_add_f16lo(i32 %src0.arg, half %src1) { +; GFX1100-LABEL: v_mad_mix_f32_precvtabsf16hi_add_f16lo: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_f32_precvtabsf16hi_add_f16lo: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_precvtabsf16hi_add_f16lo: +; GFX906: ; %bb.0: 
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_f32_precvtabsf16hi_add_f16lo: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX9GEN-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_precvtabsf16hi_add_f16lo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_add_f32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_precvtabsf16hi_add_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SDAG-CI-NEXT: v_add_f32_e32 v0, v0, v1 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_precvtabsf16hi_add_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> + %src0 = extractelement <2 x half> %src0.arg.bc, i32 1 + %src0.abs = call half @llvm.fabs.f16(half %src0) + %src0.ext = fpext half %src0.abs to float + %src1.ext = fpext half %src1 to float + %result = fadd float %src0.ext, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_preextractfneg_f16hi_add_f16lo(i32 %src0.arg, half %src1) { +; SDAG-GFX1100-LABEL: v_mad_mix_f32_preextractfneg_f16hi_add_f16lo: +; 
SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v1, 1, -v0 op_sel:[0,0,1] op_sel_hi:[1,0,1] +; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX900-LABEL: v_mad_mix_f32_preextractfneg_f16hi_add_f16lo: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-GFX900-NEXT: v_sub_f32_e32 v0, v1, v0 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX906-LABEL: v_mad_mix_f32_preextractfneg_f16hi_add_f16lo: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v1, 1, -v0 op_sel:[0,0,1] op_sel_hi:[1,0,1] +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9GEN-LABEL: v_mad_mix_f32_preextractfneg_f16hi_add_f16lo: +; SDAG-GFX9GEN: ; %bb.0: +; SDAG-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-GFX9GEN-NEXT: v_sub_f32_e32 v0, v1, v0 +; SDAG-GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_add_f16lo: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_sub_f32_e32 v0, v1, v0 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_add_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SDAG-CI-NEXT: v_add_f32_e32 v0, v0, v1 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX1100-LABEL: 
v_mad_mix_f32_preextractfneg_f16hi_add_f16lo: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, -v0, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_f32_preextractfneg_f16hi_add_f16lo: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GISEL-GFX900-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-GFX900-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_f32_preextractfneg_f16hi_add_f16lo: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, -v0, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_preextractfneg_f16hi_add_f16lo: +; GISEL-GFX9GEN: ; %bb.0: +; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9GEN-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-GFX9GEN-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_add_f16lo: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_add_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> + %fneg = fneg <2 x half> %src0.arg.bc + %src0 = extractelement <2 x half> %fneg, i32 1 + %src0.ext = fpext half %src0 to float + %src1.ext = fpext half %src1 to float + %result = fadd float %src0.ext, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_preextractfabs_f16hi_add_f16lo(i32 %src0.arg, half %src1) { +; GFX1100-LABEL: v_mad_mix_f32_preextractfabs_f16hi_add_f16lo: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX900-LABEL: v_mad_mix_f32_preextractfabs_f16hi_add_f16lo: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-GFX900-NEXT: v_add_f32_e32 v0, v0, v1 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_preextractfabs_f16hi_add_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9GEN-LABEL: v_mad_mix_f32_preextractfabs_f16hi_add_f16lo: +; SDAG-GFX9GEN: ; %bb.0: +; SDAG-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-GFX9GEN-NEXT: v_add_f32_e32 v0, v0, v1 +; SDAG-GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; 
SDAG-VI-LABEL: v_mad_mix_f32_preextractfabs_f16hi_add_f16lo: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_add_f32_e32 v0, v0, v1 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_preextractfabs_f16hi_add_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SDAG-CI-NEXT: v_add_f32_e32 v0, v0, v1 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_f32_preextractfabs_f16hi_add_f16lo: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GISEL-GFX900-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-GFX900-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_preextractfabs_f16hi_add_f16lo: +; GISEL-GFX9GEN: ; %bb.0: +; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9GEN-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-GFX9GEN-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_f32_preextractfabs_f16hi_add_f16lo: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-VI-NEXT: s_setpc_b64 
s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_preextractfabs_f16hi_add_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %src0.arg.bc) + %src0 = extractelement <2 x half> %fabs, i32 1 + %src0.ext = fpext half %src0 to float + %src1.ext = fpext half %src1 to float + %result = fadd float %src0.ext, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_preextractfabsfneg_f16hi_add_f16lo(i32 %src0.arg, half %src1) { +; SDAG-GFX1100-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_add_f16lo: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v1, 1, -|v0| op_sel:[0,0,1] op_sel_hi:[1,0,1] +; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX900-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_add_f16lo: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-GFX900-NEXT: v_sub_f32_e32 v0, v1, v0 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX906-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_add_f16lo: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mix_f32 v0, v1, 1, -|v0| op_sel:[0,0,1] op_sel_hi:[1,0,1] +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9GEN-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_add_f16lo: +; SDAG-GFX9GEN: ; %bb.0: +; SDAG-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9GEN-NEXT: 
v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-GFX9GEN-NEXT: v_sub_f32_e32 v0, v1, v0 +; SDAG-GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VI-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_add_f16lo: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_sub_f32_e32 v0, v1, v0 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_add_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; SDAG-CI-NEXT: v_add_f32_e32 v0, v0, v1 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX1100-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_add_f16lo: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, -|v0|, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_add_f16lo: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_or_b32_e32 v0, 0x80008000, v0 +; GISEL-GFX900-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-GFX900-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_add_f16lo: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_fma_mix_f32 v0, -|v0|, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,1] +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9GEN-LABEL: 
v_mad_mix_f32_preextractfabsfneg_f16hi_add_f16lo: +; GISEL-GFX9GEN: ; %bb.0: +; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9GEN-NEXT: v_or_b32_e32 v0, 0x80008000, v0 +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-GFX9GEN-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_add_f16lo: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_or_b32_e32 v0, 0x80008000, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_add_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_or_b32_e32 v0, 0x80008000, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_add_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %src0.arg.bc) + %fneg.fabs = fneg <2 x half> %fabs + %src0 = extractelement <2 x half> %fneg.fabs, i32 1 + %src0.ext = fpext half %src0 to float + %src1.ext = fpext half %src1 to float + %result = fadd float %src0.ext, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_f16lo_mul_f16lo(half %src0, half %src1) { +; GFX1100-LABEL: v_mad_mix_f32_f16lo_mul_f16lo: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, 0 op_sel_hi:[1,1,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_f32_f16lo_mul_f16lo: +; GFX900: ; 
%bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16lo_mul_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, 0 op_sel_hi:[1,1,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_mul_f16lo: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX9GEN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16lo_mul_f16lo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_mul_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_mul_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.ext = fpext half %src0 to float + %src1.ext = fpext half %src1 to float + %result = fmul float %src0.ext, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_f16hi_mul_f16hi_int(i32 %src0, i32 %src1) { +; GFX1100-LABEL: v_mad_mix_f32_f16hi_mul_f16hi_int: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, 0 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX900-LABEL: v_mad_mix_f32_f16hi_mul_f16hi_int: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16hi_mul_f16hi_int: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, 0 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_f32_f16hi_mul_f16hi_int: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9GEN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16hi_mul_f16hi_int: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: v_mad_mix_f32_f16hi_mul_f16hi_int: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; CI-NEXT: s_setpc_b64 s[30:31] + %src0.hi = lshr i32 %src0, 16 + %src1.hi = lshr i32 %src1, 16 + %src0.i16 = trunc i32 %src0.hi to i16 + %src1.i16 = trunc i32 %src1.hi to i16 + %src0.fp16 = bitcast i16 %src0.i16 to half + %src1.fp16 = bitcast i16 %src1.i16 to half + %src0.ext 
= fpext half %src0.fp16 to float + %src1.ext = fpext half %src1.fp16 to float + %result = fmul float %src0.ext, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_f16hi_mul_f16hi_elt(<2 x half> %src0, <2 x half> %src1) { +; GFX1100-LABEL: v_mad_mix_f32_f16hi_mul_f16hi_elt: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, 0 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_f32_f16hi_mul_f16hi_elt: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16hi_mul_f16hi_elt: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, 0 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_f32_f16hi_mul_f16hi_elt: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9GEN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16hi_mul_f16hi_elt: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_f16hi_mul_f16hi_elt: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mul_f32_e32 v0, v1, v3 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16hi_mul_f16hi_elt: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GISEL-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.hi = extractelement <2 x half> %src0, i32 1 + %src1.hi = extractelement <2 x half> %src1, i32 1 + %src0.ext = fpext half %src0.hi to float + %src1.ext = fpext half %src1.hi to float + %result = fmul float %src0.ext, %src1.ext + ret float %result +} + +define <2 x float> @v_mad_mix_v2f32_cvt_mul(<2 x half> %src0, <2 x half> %src1) { +; GFX1100-LABEL: v_mad_mix_v2f32_cvt_mul: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v2, v0, v1, 0 op_sel_hi:[1,1,0] +; GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, 0 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-NEXT: v_mov_b32_e32 v0, v2 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX900-LABEL: v_mad_mix_v2f32_cvt_mul: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-GFX900-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-GFX900-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-GFX900-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SDAG-GFX900-NEXT: v_mul_f32_e32 v1, v2, v3 +; SDAG-GFX900-NEXT: v_mul_f32_e32 v0, v0, v4 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_v2f32_cvt_mul: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, 0 op_sel_hi:[1,1,0] +; GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, 0 op_sel:[1,1,0] op_sel_hi:[1,1,0] +; GFX906-NEXT: v_mov_b32_e32 v0, 
v2 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9GEN-LABEL: v_mad_mix_v2f32_cvt_mul: +; SDAG-GFX9GEN: ; %bb.0: +; SDAG-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SDAG-GFX9GEN-NEXT: v_mul_f32_e32 v1, v2, v3 +; SDAG-GFX9GEN-NEXT: v_mul_f32_e32 v0, v0, v4 +; SDAG-GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VI-LABEL: v_mad_mix_v2f32_cvt_mul: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v1 +; SDAG-VI-NEXT: v_mul_f32_e32 v1, v2, v3 +; SDAG-VI-NEXT: v_mul_f32_e32 v0, v0, v4 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_v2f32_cvt_mul: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v2 +; SDAG-CI-NEXT: v_mul_f32_e32 v1, v1, v3 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_v2f32_cvt_mul: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GISEL-GFX900-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 
+; GISEL-GFX900-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GISEL-GFX900-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-GFX900-NEXT: v_mul_f32_e32 v0, v2, v0 +; GISEL-GFX900-NEXT: v_mul_f32_e32 v1, v3, v1 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9GEN-LABEL: v_mad_mix_v2f32_cvt_mul: +; GISEL-GFX9GEN: ; %bb.0: +; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-GFX9GEN-NEXT: v_mul_f32_e32 v0, v2, v0 +; GISEL-GFX9GEN-NEXT: v_mul_f32_e32 v1, v3, v1 +; GISEL-GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_v2f32_cvt_mul: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_mul_f32_e32 v0, v2, v0 +; GISEL-VI-NEXT: v_mul_f32_e32 v1, v3, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_v2f32_cvt_mul: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GISEL-CI-NEXT: v_mul_f32_e32 v0, v0, v2 +; GISEL-CI-NEXT: v_mul_f32_e32 v1, v1, v3 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.ext = fpext <2 x half> %src0 to <2 x float> + %src1.ext = fpext <2 x half> %src1 to <2 x float> + %result = fmul <2 x float> %src0.ext, %src1.ext + ret <2 x float> %result 
+} + +define <2 x float> @v_mad_mix_v2f32_shuffle_cvt_mul(<2 x half> %src0, <2 x half> %src1) { +; GFX1100-LABEL: v_mad_mix_v2f32_shuffle_cvt_mul: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v2, v0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,1,0] +; GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, 0 op_sel:[0,1,0] op_sel_hi:[1,1,0] +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-NEXT: v_mov_b32_e32 v0, v2 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_v2f32_shuffle_cvt_mul: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX900-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX900-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_mul_f32_e32 v0, v2, v0 +; GFX900-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_v2f32_shuffle_cvt_mul: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v2, v0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,1,0] +; GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, 0 op_sel:[0,1,0] op_sel_hi:[1,1,0] +; GFX906-NEXT: v_mov_b32_e32 v0, v2 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_v2f32_shuffle_cvt_mul: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9GEN-NEXT: v_mul_f32_e32 v0, v2, v0 +; GFX9GEN-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_v2f32_shuffle_cvt_mul: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v3, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_mul_f32_e32 v0, v2, v0 +; VI-NEXT: v_mul_f32_e32 v1, v3, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_v2f32_shuffle_cvt_mul: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SDAG-CI-NEXT: v_mul_f32_e32 v0, v1, v2 +; SDAG-CI-NEXT: v_mul_f32_e32 v1, v4, v3 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_v2f32_shuffle_cvt_mul: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v2 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v3 +; GISEL-CI-NEXT: v_mul_f32_e32 v0, v1, v0 +; GISEL-CI-NEXT: v_mul_f32_e32 v1, v4, v2 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.shuf = shufflevector <2 x half> %src0, <2 x half> poison, <2 x i32> <i32 1, i32 0> + %src1.shuf = shufflevector <2 x half> %src1, <2 x half> poison, <2 x i32> <i32 0, i32 1> + %src0.ext = fpext <2 x half> %src0.shuf to <2 x float> + %src1.ext = fpext <2 x half> %src1.shuf to <2 x float> + %result = fmul <2 x float> %src0.ext, %src1.ext + ret <2 x float> %result +} + +define float @v_mad_mix_f32_negf16lo_mul_f16lo(half %src0, half %src1) { +; GFX1100-LABEL: v_mad_mix_f32_negf16lo_mul_f16lo: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, 
-v0, v1, 0 op_sel_hi:[1,1,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX900-LABEL: v_mad_mix_f32_negf16lo_mul_f16lo: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-GFX900-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SDAG-GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_negf16lo_mul_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, -v0, v1, 0 op_sel_hi:[1,1,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9GEN-LABEL: v_mad_mix_f32_negf16lo_mul_f16lo: +; SDAG-GFX9GEN: ; %bb.0: +; SDAG-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SDAG-GFX9GEN-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VI-LABEL: v_mad_mix_f32_negf16lo_mul_f16lo: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SDAG-VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_negf16lo_mul_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mul_f32_e64 v0, -v0, v1 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_f32_negf16lo_mul_f16lo: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GISEL-GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_negf16lo_mul_f16lo: +; GISEL-GFX9GEN: ; %bb.0: +; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e64 v0, 
-v0 +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-GFX9GEN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_f32_negf16lo_mul_f16lo: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_negf16lo_mul_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.ext = fpext half %src0 to float + %src1.ext = fpext half %src1 to float + %src0.ext.neg = fneg float %src0.ext + %result = fmul float %src0.ext.neg, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_absf16lo_mul_f16lo(half %src0, half %src1) { +; GFX1100-LABEL: v_mad_mix_f32_absf16lo_mul_f16lo: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, v1, 0 op_sel_hi:[1,1,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_f32_absf16lo_mul_f16lo: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX900-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_absf16lo_mul_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, v1, 0 op_sel_hi:[1,1,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_f32_absf16lo_mul_f16lo: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; 
GFX9GEN-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_absf16lo_mul_f16lo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_absf16lo_mul_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_absf16lo_mul_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.ext = fpext half %src0 to float + %src1.ext = fpext half %src1 to float + %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext) + %result = fmul float %src0.ext.abs, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_negabsf16lo_mul_f16lo(half %src0, half %src1) { +; GFX1100-LABEL: v_mad_mix_f32_negabsf16lo_mul_f16lo: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, -|v0|, v1, 0 op_sel_hi:[1,1,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_f32_negabsf16lo_mul_f16lo: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX900-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_negabsf16lo_mul_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, -|v0|, v1, 0 op_sel_hi:[1,1,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_f32_negabsf16lo_mul_f16lo: +; GFX9GEN: ; 
%bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX9GEN-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_negabsf16lo_mul_f16lo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_negabsf16lo_mul_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_negabsf16lo_mul_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_mul_f32_e64 v0, -|v0|, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.ext = fpext half %src0 to float + %src1.ext = fpext half %src1 to float + %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext) + %src0.ext.neg.abs = fneg float %src0.ext.abs + %result = fmul float %src0.ext.neg.abs, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_f16lo_mul_f32(half %src0, float %src1) { +; GFX1100-LABEL: v_mad_mix_f32_f16lo_mul_f32: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, 0 op_sel_hi:[1,0,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_f32_f16lo_mul_f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16lo_mul_f32: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, 
0 op_sel_hi:[1,0,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_mul_f32: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9GEN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16lo_mul_f32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_mul_f32: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_mul_f32: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.ext = fpext half %src0 to float + %result = fmul float %src0.ext, %src1 + ret float %result +} + +define float @v_mad_mix_f32_f16lo_mul_negf32(half %src0, float %src1) { +; GFX1100-LABEL: v_mad_mix_f32_f16lo_mul_negf32: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, -v1, 0 op_sel_hi:[1,0,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_f32_f16lo_mul_negf32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX900-NEXT: v_mul_f32_e64 v0, v0, -v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16lo_mul_negf32: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, -v1, 0 op_sel_hi:[1,0,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_mul_negf32: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9GEN-NEXT: v_mul_f32_e64 v0, v0, -v1 +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16lo_mul_negf32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_mul_f32_e64 v0, v0, -v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_mul_negf32: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mul_f32_e64 v0, v0, -v1 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_mul_negf32: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_mul_f32_e64 v0, v0, -v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.ext = fpext half %src0 to float + %src1.neg = fneg float %src1 + %result = fmul float %src0.ext, %src1.neg + ret float %result +} + +define float @v_mad_mix_f32_f16lo_mul_absf32(half %src0, float %src1) { +; GFX1100-LABEL: v_mad_mix_f32_f16lo_mul_absf32: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, |v1|, 0 op_sel_hi:[1,0,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_f32_f16lo_mul_absf32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX900-NEXT: v_mul_f32_e64 v0, v0, |v1| +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16lo_mul_absf32: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, |v1|, 0 op_sel_hi:[1,0,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_mul_absf32: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9GEN-NEXT: v_mul_f32_e64 v0, v0, |v1| +; GFX9GEN-NEXT: s_setpc_b64 
s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16lo_mul_absf32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_mul_f32_e64 v0, v0, |v1| +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_mul_absf32: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mul_f32_e64 v0, v0, |v1| +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_mul_absf32: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_mul_f32_e64 v0, v0, |v1| +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.ext = fpext half %src0 to float + %src1.abs = call float @llvm.fabs.f32(float %src1) + %result = fmul float %src0.ext, %src1.abs + ret float %result +} + +define float @v_mad_mix_f32_f16lo_mul_negabsf32(half %src0, float %src1) { +; GFX1100-LABEL: v_mad_mix_f32_f16lo_mul_negabsf32: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, -|v1|, 0 op_sel_hi:[1,0,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_f32_f16lo_mul_negabsf32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX900-NEXT: v_mul_f32_e64 v0, v0, -|v1| +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_f16lo_mul_negabsf32: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, -|v1|, 0 op_sel_hi:[1,0,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_f32_f16lo_mul_negabsf32: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9GEN-NEXT: v_mul_f32_e64 v0, v0, -|v1| +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_f16lo_mul_negabsf32: +; VI: ; %bb.0: +; 
VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; VI-NEXT: v_mul_f32_e64 v0, v0, -|v1| +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_mul_negabsf32: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mul_f32_e64 v0, v0, -|v1| +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_mul_negabsf32: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_mul_f32_e64 v0, v0, -|v1| +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.ext = fpext half %src0 to float + %src1.abs = call float @llvm.fabs.f32(float %src1) + %src1.neg.abs = fneg float %src1.abs + %result = fmul float %src0.ext, %src1.neg.abs + ret float %result +} + +define float @no_mix_simple_cvt_mul(float %src0, float %src1) { +; GFX1100-LABEL: no_mix_simple_cvt_mul: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: no_mix_simple_cvt_mul: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: no_mix_simple_cvt_mul: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: no_mix_simple_cvt_mul: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: no_mix_simple_cvt_mul: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: no_mix_simple_cvt_mul: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; CI-NEXT: s_setpc_b64 s[30:31] + %result = fmul float %src0, %src1 + ret float %result +} + +define float @no_mix_simple_fabs_cvt_mul(float %src0, float %src1) { +; GFX1100-LABEL: no_mix_simple_fabs_cvt_mul: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: no_mix_simple_fabs_cvt_mul: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: no_mix_simple_fabs_cvt_mul: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: no_mix_simple_fabs_cvt_mul: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: no_mix_simple_fabs_cvt_mul: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: no_mix_simple_fabs_cvt_mul: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; CI-NEXT: s_setpc_b64 s[30:31] + %src0.fabs = call float @llvm.fabs.f32(float %src0) + %result = fmul float %src0.fabs, %src1 + ret float %result +} + +define float @v_mad_mix_clamp_f32_f16hi_mul_f16hi_elt(<2 x half> %src0, <2 x half> %src1) { +; GFX1100-LABEL: v_mad_mix_clamp_f32_f16hi_mul_f16hi_elt: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, 0 op_sel:[1,1,0] op_sel_hi:[1,1,0] clamp +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_clamp_f32_f16hi_mul_f16hi_elt: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX900-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_mul_f32_e64 v0, v0, v1 clamp +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_clamp_f32_f16hi_mul_f16hi_elt: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, 0 op_sel:[1,1,0] op_sel_hi:[1,1,0] clamp +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_clamp_f32_f16hi_mul_f16hi_elt: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9GEN-NEXT: v_mul_f32_e64 v0, v0, v1 clamp +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_clamp_f32_f16hi_mul_f16hi_elt: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_mul_f32_e64 v0, v0, v1 clamp +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_clamp_f32_f16hi_mul_f16hi_elt: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_mul_f32_e64 v0, v1, v3 clamp +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_clamp_f32_f16hi_mul_f16hi_elt: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; GISEL-CI-NEXT: v_mul_f32_e64 v0, v0, v1 clamp +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.hi = extractelement <2 x half> %src0, i32 1 + %src1.hi = extractelement <2 x half> %src1, i32 1 + %src0.ext = fpext half 
%src0.hi to float + %src1.ext = fpext half %src1.hi to float + %result = fmul float %src0.ext, %src1.ext + %max = call float @llvm.maxnum.f32(float %result, float 0.0) + %clamp = call float @llvm.minnum.f32(float %max, float 1.0) + ret float %clamp +} + +define float @v_mad_mix_f32_negprecvtf16lo_mul_f16lo(i32 %src0.arg, half %src1) { +; GFX1100-LABEL: v_mad_mix_f32_negprecvtf16lo_mul_f16lo: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, -v0, v1, 0 op_sel_hi:[1,1,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_f32_negprecvtf16lo_mul_f16lo: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_negprecvtf16lo_mul_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, -v0, v1, 0 op_sel_hi:[1,1,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_f32_negprecvtf16lo_mul_f16lo: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX9GEN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_negprecvtf16lo_mul_f16lo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_negprecvtf16lo_mul_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; 
GISEL-CI-LABEL: v_mad_mix_f32_negprecvtf16lo_mul_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> + %src0 = extractelement <2 x half> %src0.arg.bc, i32 0 + %src0.neg = fneg half %src0 + %src0.ext = fpext half %src0.neg to float + %src1.ext = fpext half %src1 to float + %result = fmul float %src0.ext, %src1.ext + ret float %result +} + +define float @v_fma_mix_f32_precvtnegf16hi_abs_mul_f16lo(i32 %src0.arg, half %src1) { +; SDAG-GFX1100-TRUE16-LABEL: v_fma_mix_f32_precvtnegf16hi_abs_mul_f16lo: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.h +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, |v0|, v1, 0 op_sel_hi:[1,1,0] +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_fma_mix_f32_precvtnegf16hi_abs_mul_f16lo: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mix_f32 v0, |v0|, v1, 0 op_sel_hi:[1,1,0] +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_fma_mix_f32_precvtnegf16hi_abs_mul_f16lo: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX900-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX906-LABEL: v_fma_mix_f32_precvtnegf16hi_abs_mul_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: s_mov_b32 s4, 0x8000 +; GFX906-NEXT: v_xor_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, v1, 0 op_sel_hi:[1,1,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_fma_mix_f32_precvtnegf16hi_abs_mul_f16lo: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX9GEN-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_fma_mix_f32_precvtnegf16hi_abs_mul_f16lo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_fma_mix_f32_precvtnegf16hi_abs_mul_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX1100-LABEL: v_fma_mix_f32_precvtnegf16hi_abs_mul_f16lo: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, v1, 0 op_sel_hi:[1,1,0] +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_fma_mix_f32_precvtnegf16hi_abs_mul_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_mul_f32_e64 v0, |v0|, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> + %src0 = extractelement <2 x half> %src0.arg.bc, i32 1 + %src0.neg = fneg half %src0 + %src0.ext = fpext half %src0.neg to float + %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext) + %src1.ext = fpext half %src1 to float + %result = fmul float %src0.ext.abs, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_precvtabsf16hi_mul_f16lo(i32 %src0.arg, half %src1) { +; GFX1100-LABEL: v_mad_mix_f32_precvtabsf16hi_mul_f16lo: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,1,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_mad_mix_f32_precvtabsf16hi_mul_f16lo: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_precvtabsf16hi_mul_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,1,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX9GEN-LABEL: v_mad_mix_f32_precvtabsf16hi_mul_f16lo: +; GFX9GEN: ; %bb.0: +; GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX9GEN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_mad_mix_f32_precvtabsf16hi_mul_f16lo: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; VI-NEXT: v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_precvtabsf16hi_mul_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_precvtabsf16hi_mul_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> + %src0 = extractelement <2 x half> %src0.arg.bc, i32 1 + %src0.abs = call half @llvm.fabs.f16(half %src0) + %src0.ext = fpext half %src0.abs to float + %src1.ext = fpext half %src1 to float + %result = fmul float %src0.ext, %src1.ext + ret float %result +} + +define float @v_mad_mix_f32_preextractfneg_f16hi_mul_f16lo(i32 %src0.arg, half %src1) { +; GFX1100-LABEL: v_mad_mix_f32_preextractfneg_f16hi_mul_f16lo: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, -v0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,1,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX900-LABEL: v_mad_mix_f32_preextractfneg_f16hi_mul_f16lo: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_mad_mix_f32_preextractfneg_f16hi_mul_f16lo: +; 
GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, -v0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,1,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9GEN-LABEL: v_mad_mix_f32_preextractfneg_f16hi_mul_f16lo: +; SDAG-GFX9GEN: ; %bb.0: +; SDAG-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-GFX9GEN-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_mul_f16lo: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_mul_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_f32_preextractfneg_f16hi_mul_f16lo: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GISEL-GFX900-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_preextractfneg_f16hi_mul_f16lo: +; GISEL-GFX9GEN: ; %bb.0: +; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9GEN-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-GFX9GEN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_mul_f16lo: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_mul_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> + %fneg = fneg <2 x half> %src0.arg.bc + %src0 = extractelement <2 x half> %fneg, i32 1 + %src0.ext = fpext half %src0 to float + %src1.ext = fpext half %src1 to float + %result = fmul float %src0.ext, %src1.ext + ret float %result +} + +define float @v_fma_mix_f32_preextractfabs_f16hi_mul_f16lo(i32 %src0.arg, half %src1) { +; GFX1100-LABEL: v_fma_mix_f32_preextractfabs_f16hi_mul_f16lo: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,1,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX900-LABEL: v_fma_mix_f32_preextractfabs_f16hi_mul_f16lo: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-GFX900-NEXT: v_mul_f32_e32 v0, v0, 
v1 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_fma_mix_f32_preextractfabs_f16hi_mul_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, |v0|, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,1,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9GEN-LABEL: v_fma_mix_f32_preextractfabs_f16hi_mul_f16lo: +; SDAG-GFX9GEN: ; %bb.0: +; SDAG-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-GFX9GEN-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VI-LABEL: v_fma_mix_f32_preextractfabs_f16hi_mul_f16lo: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_fma_mix_f32_preextractfabs_f16hi_mul_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_fma_mix_f32_preextractfabs_f16hi_mul_f16lo: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GISEL-GFX900-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9GEN-LABEL: v_fma_mix_f32_preextractfabs_f16hi_mul_f16lo: +; GISEL-GFX9GEN: ; %bb.0: +; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GISEL-GFX9GEN-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-GFX9GEN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_fma_mix_f32_preextractfabs_f16hi_mul_f16lo: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_fma_mix_f32_preextractfabs_f16hi_mul_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %src0.arg.bc) + %src0 = extractelement <2 x half> %fabs, i32 1 + %src0.ext = fpext half %src0 to float + %src1.ext = fpext half %src1 to float + %result = fmul float %src0.ext, %src1.ext + ret float %result +} + +define float @v_fma_mix_f32_preextractfabsfneg_f16hi_mul_f16lo(i32 %src0.arg, half %src1) { +; GFX1100-LABEL: v_fma_mix_f32_preextractfabsfneg_f16hi_mul_f16lo: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: v_fma_mix_f32 v0, -|v0|, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,1,0] +; GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX900-LABEL: v_fma_mix_f32_preextractfabsfneg_f16hi_mul_f16lo: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: 
v_cvt_f32_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: v_fma_mix_f32_preextractfabsfneg_f16hi_mul_f16lo: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: v_fma_mix_f32 v0, -|v0|, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,1,0] +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX9GEN-LABEL: v_fma_mix_f32_preextractfabsfneg_f16hi_mul_f16lo: +; SDAG-GFX9GEN: ; %bb.0: +; SDAG-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-GFX9GEN-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-VI-LABEL: v_fma_mix_f32_preextractfabsfneg_f16hi_mul_f16lo: +; SDAG-VI: ; %bb.0: +; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v0, -|v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-VI-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-CI-LABEL: v_fma_mix_f32_preextractfabsfneg_f16hi_mul_f16lo: +; SDAG-CI: ; %bb.0: +; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; SDAG-CI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_fma_mix_f32_preextractfabsfneg_f16hi_mul_f16lo: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_or_b32_e32 v0, 0x80008000, v0 +; GISEL-GFX900-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; 
GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX9GEN-LABEL: v_fma_mix_f32_preextractfabsfneg_f16hi_mul_f16lo: +; GISEL-GFX9GEN: ; %bb.0: +; GISEL-GFX9GEN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX9GEN-NEXT: v_or_b32_e32 v0, 0x80008000, v0 +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-GFX9GEN-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-GFX9GEN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-GFX9GEN-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-VI-LABEL: v_fma_mix_f32_preextractfabsfneg_f16hi_mul_f16lo: +; GISEL-VI: ; %bb.0: +; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-VI-NEXT: v_or_b32_e32 v0, 0x80008000, v0 +; GISEL-VI-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GISEL-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-VI-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-CI-LABEL: v_fma_mix_f32_preextractfabsfneg_f16hi_mul_f16lo: +; GISEL-CI: ; %bb.0: +; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-CI-NEXT: v_or_b32_e32 v0, 0x80008000, v0 +; GISEL-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GISEL-CI-NEXT: v_mul_f32_e32 v0, v0, v1 +; GISEL-CI-NEXT: s_setpc_b64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x half> + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %src0.arg.bc) + %fneg.fabs = fneg <2 x half> %fabs + %src0 = extractelement <2 x half> %fneg.fabs, i32 1 + %src0.ext = fpext half %src0 to float + %src1.ext = fpext half %src1 to float + %result = fmul float %src0.ext, %src1.ext + ret float %result +} + + declare half @llvm.fabs.f16(half) #2 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #2 declare float @llvm.fabs.f32(float) #2