Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4127,6 +4127,24 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
return true;
}

// Complex-pattern selector for a mad-mix/fma-mix source operand that must be
// used with inverted sign. SelectVOP3PMadMixModsImpl is asked (with MVT::f16)
// to pick Src and the source-modifier bits; the NEG bit is then toggled and
// the result is returned in SrcMods as an i32 target constant.
// Always returns true, i.e. the selector never rejects an operand.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsNeg(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
unsigned Mods = 0;
SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you implement the globalisel equivalent

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A GlobalISel equivalent for this pattern is not necessary. In SelectionDAG, DAGCombiner folds (fadd y, (fneg x)) into (fsub y, x), before the main instruction selection matching occurs. An alternative could be to modify the DAGCombiner to avoid folding fadd (fneg x) into fsub when the target has FMA support and at least one of the operands is an fpextend
?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That sounds more like another missing combine in globalisel

// XOR rather than OR: if the helper already set NEG, the two negations cancel
// and the bit ends up cleared.
Mods ^= SISrcMods::NEG;
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
}

// BF16 flavour of the sign-inverting mad-mix/fma-mix source selector.
// SelectVOP3PMadMixModsImpl is invoked with MVT::bf16 to choose Src and the
// source-modifier bits; NEG is then flipped and the modifiers are handed back
// through SrcMods as an i32 target constant. The selector always succeeds.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsNeg(SDValue In, SDValue &Src,
                                                      SDValue &SrcMods) const {
  unsigned ModBits = 0;
  SelectVOP3PMadMixModsImpl(In, Src, ModBits, MVT::bf16);

  // Flip (do not just set) NEG so a negation already recorded in ModBits
  // cancels out instead of being kept.
  const unsigned Negated = ModBits ^ SISrcMods::NEG;
  const SDLoc DL(In);
  SrcMods = CurDAG->getTargetConstant(Negated, DL, MVT::i32);
  return true;
}

// Match BITOP3 operation and return a number of matched instructions plus
// truth table.
static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,12 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
SDValue &SrcMods) const;

// Selects Src and its source modifiers for an f16 mad-mix operand and toggles
// the NEG modifier bit in the returned SrcMods. Always returns true.
bool SelectVOP3PMadMixModsNeg(SDValue In, SDValue &Src,
SDValue &SrcMods) const;

// Selects Src and its source modifiers for a bf16 mad-mix operand and toggles
// the NEG modifier bit in the returned SrcMods. Always returns true.
bool SelectVOP3PMadMixBF16ModsNeg(SDValue In, SDValue &Src,
SDValue &SrcMods) const;

bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
SDValue &Tbl) const;

Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -1704,6 +1704,8 @@ def VOP3PMadMixModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixModsExt">;
def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;
def VOP3PMadMixBF16ModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16ModsExt">;
def VOP3PMadMixBF16Mods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16Mods">;
// Sign-inverting variants: the named C++ selectors toggle the NEG bit in the
// source modifiers they return (f16 and bf16 flavours respectively).
def VOP3PMadMixModsNeg : ComplexPattern<untyped, 2, "SelectVOP3PMadMixModsNeg">;
def VOP3PMadMixBF16ModsNeg : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16ModsNeg">;

def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">;
def VINTERPModsHi : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">;
Expand Down
25 changes: 25 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP3PInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
ValueType vecVT = v2f16> {
defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt);
// Operand selector whose modifiers have NEG toggled; bf16 or f16 flavour is
// chosen from VT.
defvar VOP3PMadMixModsNegPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsNeg, VOP3PMadMixModsNeg);
// At least one of the operands needs to be an fpextend of an f16
// for this to be worthwhile, so we need three patterns here.
// TODO: Could we use a predicate to inspect src1/2/3 instead?
Expand All @@ -190,6 +191,30 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
DSTCLAMP.NONE)>;

// fadd (ext a), b  ==>  mix(a * 1.0 + b).
// The multiplicand must be the f32 bit pattern of 1.0 (CONST.FP32_ONE,
// 0x3f800000); a plain integer 1 would be read as the f32 denormal 0x00000001.
def : GCNPat <
(f32 (fadd (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
(f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)))),
(mix_inst $src0_mods, $src0, (i32 0), (i32 CONST.FP32_ONE), $src1_mods, $src1,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This needs to use the f32 value 1.0 i.e. 3f800000, not just "1". Same below.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can refer to this in patterns as CONST.FP32_ONE.

DSTCLAMP.NONE)>;

// fmul (ext a), b  ==>  mix(a * b + 0).
// NOTE(review): the addend is +0.0, so a product of -0.0 would presumably come
// out as +0.0 — confirm whether this pattern needs an nsz guard or a -0.0
// addend.
def : GCNPat <
(f32 (fmul (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
(f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)))),
(mix_inst $src0_mods, $src0, $src1_mods, $src1, (i32 0), (i32 0),
DSTCLAMP.NONE)>;

// fsub (ext a), b  ==>  mix(a * 1.0 + (-b)).
// The Neg selector toggles NEG on b's modifiers, turning the subtraction into
// an addition. The multiplicand is the f32 bit pattern of 1.0
// (CONST.FP32_ONE); a plain integer 1 would be read as an f32 denormal.
def : GCNPat <
(f32 (fsub (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
(f32 (VOP3PMadMixModsNegPat f32:$src1, i32:$src1_mods)))),
(mix_inst $src0_mods, $src0, (i32 0), (i32 CONST.FP32_ONE), $src1_mods, $src1,
DSTCLAMP.NONE)>;

// fsub a, (ext b)  ==>  mix(b * -1.0 + a).
// The extended operand is the subtrahend, so it is the one that has to be
// negated; multiplying it by -1.0 (CONST.FP32_NEG_ONE) does that exactly.
// Toggling NEG on the minuend instead would compute b - a, the negation of
// the requested a - b.
def : GCNPat <
(f32 (fsub (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_mods)),
(f32 (VOP3PMadMixModsExtPat VT:$src1, i32:$src1_mods)))),
(mix_inst $src1_mods, $src1, (i32 0), (i32 CONST.FP32_NEG_ONE), $src0_mods, $src0,
DSTCLAMP.NONE)>;

def : GCNPat <
(AMDGPUclamp (build_vector
(VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)),
Expand Down
130 changes: 46 additions & 84 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll
Original file line number Diff line number Diff line change
Expand Up @@ -53,25 +53,22 @@ define amdgpu_vs float @test_f16_f32_add_ext_fma_mul(half %x, half %y, float %z,
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-NEXT: v_fmac_f16_e32 v3, v0, v1
; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-NEXT: v_fma_mix_f32 v0, v3, 1, v2 op_sel_hi:[1,0,0]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_f16_f32_add_ext_fma_mul:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-CONTRACT-NEXT: v_fmac_f16_e32 v3, v0, v1
; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v3, 1, v2 op_sel_hi:[1,0,0]
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_f16_f32_add_ext_fma_mul:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v0, v3
; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v0, 1, v2 op_sel_hi:[1,0,0]
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
%a = fmul half %u, %v
Expand Down Expand Up @@ -129,25 +126,22 @@ define amdgpu_vs float @test_f16_f32_add_ext_fma_mul_rhs(float %x, half %y, half
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-NEXT: v_fmac_f16_e32 v3, v1, v2
; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v3
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: v_fma_mix_f32 v0, v3, 1, v0 op_sel_hi:[1,0,0]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_f16_f32_add_ext_fma_mul_rhs:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2
; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v1, v3
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v3, 1, v0 op_sel_hi:[1,0,0]
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_f16_f32_add_ext_fma_mul_rhs:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-DENORM-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX10-DENORM-NEXT: v_add_f16_e32 v1, v1, v3
; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v1, 1, v0 op_sel_hi:[1,0,0]
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
%a = fmul half %u, %v
Expand Down Expand Up @@ -230,48 +224,36 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul(<4 x half> %x, <4
; GFX9-DENORM-NEXT: v_pk_mul_f16 v9, v9, v11
; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v0, v8
; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v1, v9
; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v1
; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-DENORM-NEXT: v_add_f32_e32 v0, v2, v4
; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v3, v5
; GFX9-DENORM-NEXT: v_add_f32_e32 v2, v8, v6
; GFX9-DENORM-NEXT: v_add_f32_e32 v3, v9, v7
; GFX9-DENORM-NEXT: v_pk_add_f16 v2, v0, v8
; GFX9-DENORM-NEXT: v_pk_add_f16 v3, v1, v9
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v2, 1, v4 op_sel_hi:[1,0,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v1, v2, 1, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v3, 1, v6 op_sel_hi:[1,0,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v3, v3, 1, v7 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: test_v4f16_v4f32_add_ext_fma_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: v_pk_mul_f16 v8, v8, v10
; GFX10-NEXT: v_pk_mul_f16 v9, v9, v11
; GFX10-NEXT: v_pk_fma_f16 v0, v0, v2, v8
; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v9
; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_cvt_f32_f16_e32 v8, v1
; GFX10-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_add_f32_e32 v0, v2, v4
; GFX10-NEXT: v_add_f32_e32 v1, v3, v5
; GFX10-NEXT: v_add_f32_e32 v2, v8, v6
; GFX10-NEXT: v_add_f32_e32 v3, v9, v7
; GFX10-NEXT: v_pk_fma_f16 v2, v0, v2, v8
; GFX10-NEXT: v_pk_fma_f16 v3, v1, v3, v9
; GFX10-NEXT: v_fma_mix_f32 v0, v2, 1, v4 op_sel_hi:[1,0,0]
; GFX10-NEXT: v_fma_mix_f32 v1, v2, 1, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-NEXT: v_fma_mix_f32 v2, v3, 1, v6 op_sel_hi:[1,0,0]
; GFX10-NEXT: v_fma_mix_f32 v3, v3, 1, v7 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_v4f16_v4f32_add_ext_fma_mul:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v8, v8, v10
; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v9, v9, v11
; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v8
; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v9
; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v8, v1
; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v2, v4
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v3, v5
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v8, v6
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v9, v7
; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v2, v0, v2, v8
; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v3, v1, v3, v9
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v2, 1, v4 op_sel_hi:[1,0,0]
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v1, v2, 1, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v2, v3, 1, v6 op_sel_hi:[1,0,0]
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v3, v3, 1, v7 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_v4f16_v4f32_add_ext_fma_mul:
Expand All @@ -280,16 +262,12 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul(<4 x half> %x, <4
; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX10-DENORM-NEXT: v_pk_mul_f16 v2, v9, v11
; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v0, v8
; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v1, v2
; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v1
; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v2, v4
; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v3, v5
; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v8, v6
; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v9, v7
; GFX10-DENORM-NEXT: v_pk_add_f16 v3, v0, v8
; GFX10-DENORM-NEXT: v_pk_add_f16 v8, v1, v2
; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v3, 1, v4 op_sel_hi:[1,0,0]
; GFX10-DENORM-NEXT: v_fma_mix_f32 v1, v3, 1, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v8, 1, v6 op_sel_hi:[1,0,0]
; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, v8, 1, v7 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
%a = fmul <4 x half> %u, %v
Expand Down Expand Up @@ -374,14 +352,10 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul_rhs(<4 x float> %
; GFX9-DENORM-NEXT: v_pk_mul_f16 v5, v5, v7
; GFX9-DENORM-NEXT: v_pk_add_f16 v4, v4, v8
; GFX9-DENORM-NEXT: v_pk_add_f16 v5, v5, v9
; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v6, v4
; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v5
; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-DENORM-NEXT: v_add_f32_e32 v0, v0, v6
; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v4
; GFX9-DENORM-NEXT: v_add_f32_e32 v2, v2, v7
; GFX9-DENORM-NEXT: v_add_f32_e32 v3, v3, v5
; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v4, 1, v0 op_sel_hi:[1,0,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v1, v4, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v5, 1, v2 op_sel_hi:[1,0,0]
; GFX9-DENORM-NEXT: v_mad_mix_f32 v3, v5, 1, v3 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: test_v4f16_v4f32_add_ext_fma_mul_rhs:
Expand All @@ -390,14 +364,10 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul_rhs(<4 x float> %
; GFX10-NEXT: v_pk_mul_f16 v9, v9, v11
; GFX10-NEXT: v_pk_fma_f16 v4, v4, v6, v8
; GFX10-NEXT: v_pk_fma_f16 v5, v5, v7, v9
; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v4
; GFX10-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v5
; GFX10-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_add_f32_e32 v0, v0, v6
; GFX10-NEXT: v_add_f32_e32 v1, v1, v4
; GFX10-NEXT: v_add_f32_e32 v2, v2, v7
; GFX10-NEXT: v_add_f32_e32 v3, v3, v5
; GFX10-NEXT: v_fma_mix_f32 v0, v4, 1, v0 op_sel_hi:[1,0,0]
; GFX10-NEXT: v_fma_mix_f32 v1, v4, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-NEXT: v_fma_mix_f32 v2, v5, 1, v2 op_sel_hi:[1,0,0]
; GFX10-NEXT: v_fma_mix_f32 v3, v5, 1, v3 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_v4f16_v4f32_add_ext_fma_mul_rhs:
Expand All @@ -406,14 +376,10 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul_rhs(<4 x float> %
; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v9, v9, v11
; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v4, v4, v6, v8
; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v5, v5, v7, v9
; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v6, v4
; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v7, v5
; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v6
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v1, v4
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v2, v7
; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v3, v5
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v4, 1, v0 op_sel_hi:[1,0,0]
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v1, v4, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v2, v5, 1, v2 op_sel_hi:[1,0,0]
; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v3, v5, 1, v3 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_v4f16_v4f32_add_ext_fma_mul_rhs:
Expand All @@ -424,14 +390,10 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul_rhs(<4 x float> %
; GFX10-DENORM-NEXT: v_pk_mul_f16 v5, v5, v7
; GFX10-DENORM-NEXT: v_pk_add_f16 v4, v4, v8
; GFX10-DENORM-NEXT: v_pk_add_f16 v5, v5, v6
; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v6, v4
; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v5
; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v6
; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v4
; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v2, v7
; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v3, v5
; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v4, 1, v0 op_sel_hi:[1,0,0]
; GFX10-DENORM-NEXT: v_fma_mix_f32 v1, v4, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v5, 1, v2 op_sel_hi:[1,0,0]
; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, v5, 1, v3 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
%a = fmul <4 x half> %u, %v
Expand Down
Loading