-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[LLVM] Combine v_cvt_f32_f16 and v_add_f32/v_mul_f32 into v_fma_mix_f32 #160151
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Acim Maravic (Acim-Maravic). Changes: Patch is 377.86 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/160151.diff 14 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index d4210b8bc9a87..9654a6e1fbd5f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -4127,6 +4127,24 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
return true;
}
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsNeg(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ unsigned Mods = 0;
+ SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16);
+ Mods ^= SISrcMods::NEG;
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsNeg(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ unsigned Mods = 0;
+ SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16);
+ Mods ^= SISrcMods::NEG;
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
// Match BITOP3 operation and return a number of matched instructions plus
// truth table.
static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 4fa0d3f72e1c7..b122b5cd310b3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -272,6 +272,12 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
SDValue &SrcMods) const;
+ bool SelectVOP3PMadMixModsNeg(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const;
+
+ bool SelectVOP3PMadMixBF16ModsNeg(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const;
+
bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
SDValue &Tbl) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index fb2cd04b364d7..af6d4ff319fd9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1704,6 +1704,8 @@ def VOP3PMadMixModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixModsExt">;
def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;
def VOP3PMadMixBF16ModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16ModsExt">;
def VOP3PMadMixBF16Mods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16Mods">;
+def VOP3PMadMixModsNeg : ComplexPattern<untyped, 2, "SelectVOP3PMadMixModsNeg">;
+def VOP3PMadMixBF16ModsNeg : ComplexPattern<untyped, 2, "SelectVOP3PMadMixBF16ModsNeg">;
def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">;
def VINTERPModsHi : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index f7279b664ed27..5ea8dbe9a1b7f 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -168,6 +168,7 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
ValueType vecVT = v2f16> {
defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt);
+ defvar VOP3PMadMixModsNegPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsNeg, VOP3PMadMixModsNeg);
// At least one of the operands needs to be an fpextend of an f16
// for this to be worthwhile, so we need three patterns here.
// TODO: Could we use a predicate to inspect src1/2/3 instead?
@@ -190,6 +191,30 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
DSTCLAMP.NONE)>;
+ def : GCNPat <
+ (f32 (fadd (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)))),
+ (mix_inst $src0_mods, $src0, (i32 0), (i32 1), $src1_mods, $src1,
+ DSTCLAMP.NONE)>;
+
+ def : GCNPat <
+ (f32 (fmul (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)))),
+ (mix_inst $src0_mods, $src0, $src1_mods, $src1, (i32 0), (i32 0),
+ DSTCLAMP.NONE)>;
+
+ def : GCNPat <
+ (f32 (fsub (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsNegPat f32:$src1, i32:$src1_mods)))),
+ (mix_inst $src0_mods, $src0, (i32 0), (i32 1), $src1_mods, $src1,
+ DSTCLAMP.NONE)>;
+
+ def : GCNPat <
+ (f32 (fsub (f32 (VOP3PMadMixModsNegPat f32:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsExtPat VT:$src1, i32:$src1_mods)))),
+ (mix_inst $src0_mods, $src0, (i32 0), (i32 1), $src1_mods, $src1,
+ DSTCLAMP.NONE)>;
+
def : GCNPat <
(AMDGPUclamp (build_vector
(VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)),
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll
index b2b433167fe4d..f9b63ef8e96e5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-fma.ll
@@ -53,16 +53,14 @@ define amdgpu_vs float @test_f16_f32_add_ext_fma_mul(half %x, half %y, float %z,
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-NEXT: v_fmac_f16_e32 v3, v0, v1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v3
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_fma_mix_f32 v0, v3, 1, v2 op_sel_hi:[1,0,0]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_f16_f32_add_ext_fma_mul:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-CONTRACT-NEXT: v_fmac_f16_e32 v3, v0, v1
-; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v0, v3
-; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v3, 1, v2 op_sel_hi:[1,0,0]
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_f16_f32_add_ext_fma_mul:
@@ -70,8 +68,7 @@ define amdgpu_vs float @test_f16_f32_add_ext_fma_mul(half %x, half %y, float %z,
; GFX10-DENORM-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-DENORM-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX10-DENORM-NEXT: v_add_f16_e32 v0, v0, v3
-; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v0, 1, v2 op_sel_hi:[1,0,0]
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
%a = fmul half %u, %v
@@ -129,16 +126,14 @@ define amdgpu_vs float @test_f16_f32_add_ext_fma_mul_rhs(float %x, half %y, half
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v3
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_fma_mix_f32 v0, v3, 1, v0 op_sel_hi:[1,0,0]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_f16_f32_add_ext_fma_mul_rhs:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v1, v3
-; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v3, 1, v0 op_sel_hi:[1,0,0]
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_f16_f32_add_ext_fma_mul_rhs:
@@ -146,8 +141,7 @@ define amdgpu_vs float @test_f16_f32_add_ext_fma_mul_rhs(float %x, half %y, half
; GFX10-DENORM-NEXT: v_mul_f16_e32 v3, v3, v4
; GFX10-DENORM-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX10-DENORM-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v1, 1, v0 op_sel_hi:[1,0,0]
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
%a = fmul half %u, %v
@@ -230,48 +224,36 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul(<4 x half> %x, <4
; GFX9-DENORM-NEXT: v_pk_mul_f16 v9, v9, v11
; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX9-DENORM-NEXT: v_pk_add_f16 v0, v0, v8
-; GFX9-DENORM-NEXT: v_pk_add_f16 v1, v1, v9
-; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v1
-; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-DENORM-NEXT: v_add_f32_e32 v0, v2, v4
-; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v3, v5
-; GFX9-DENORM-NEXT: v_add_f32_e32 v2, v8, v6
-; GFX9-DENORM-NEXT: v_add_f32_e32 v3, v9, v7
+; GFX9-DENORM-NEXT: v_pk_add_f16 v2, v0, v8
+; GFX9-DENORM-NEXT: v_pk_add_f16 v3, v1, v9
+; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v2, 1, v4 op_sel_hi:[1,0,0]
+; GFX9-DENORM-NEXT: v_mad_mix_f32 v1, v2, 1, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v3, 1, v6 op_sel_hi:[1,0,0]
+; GFX9-DENORM-NEXT: v_mad_mix_f32 v3, v3, 1, v7 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: test_v4f16_v4f32_add_ext_fma_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: v_pk_mul_f16 v8, v8, v10
; GFX10-NEXT: v_pk_mul_f16 v9, v9, v11
-; GFX10-NEXT: v_pk_fma_f16 v0, v0, v2, v8
-; GFX10-NEXT: v_pk_fma_f16 v1, v1, v3, v9
-; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v8, v1
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_add_f32_e32 v0, v2, v4
-; GFX10-NEXT: v_add_f32_e32 v1, v3, v5
-; GFX10-NEXT: v_add_f32_e32 v2, v8, v6
-; GFX10-NEXT: v_add_f32_e32 v3, v9, v7
+; GFX10-NEXT: v_pk_fma_f16 v2, v0, v2, v8
+; GFX10-NEXT: v_pk_fma_f16 v3, v1, v3, v9
+; GFX10-NEXT: v_fma_mix_f32 v0, v2, 1, v4 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_fma_mix_f32 v1, v2, 1, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_fma_mix_f32 v2, v3, 1, v6 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_fma_mix_f32 v3, v3, 1, v7 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_v4f16_v4f32_add_ext_fma_mul:
; GFX10-CONTRACT: ; %bb.0: ; %.entry
; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v8, v8, v10
; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v9, v9, v11
-; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v8
-; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v3, v9
-; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v8, v1
-; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v2, v4
-; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v3, v5
-; GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v8, v6
-; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v9, v7
+; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v2, v0, v2, v8
+; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v3, v1, v3, v9
+; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v2, 1, v4 op_sel_hi:[1,0,0]
+; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v1, v2, 1, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v2, v3, 1, v6 op_sel_hi:[1,0,0]
+; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v3, v3, 1, v7 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_v4f16_v4f32_add_ext_fma_mul:
@@ -280,16 +262,12 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul(<4 x half> %x, <4
; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX10-DENORM-NEXT: v_pk_mul_f16 v2, v9, v11
; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v0, v8
-; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v1, v2
-; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v1
-; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v2, v4
-; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v3, v5
-; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v8, v6
-; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v9, v7
+; GFX10-DENORM-NEXT: v_pk_add_f16 v3, v0, v8
+; GFX10-DENORM-NEXT: v_pk_add_f16 v8, v1, v2
+; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v3, 1, v4 op_sel_hi:[1,0,0]
+; GFX10-DENORM-NEXT: v_fma_mix_f32 v1, v3, 1, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v8, 1, v6 op_sel_hi:[1,0,0]
+; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, v8, 1, v7 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
%a = fmul <4 x half> %u, %v
@@ -374,14 +352,10 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul_rhs(<4 x float> %
; GFX9-DENORM-NEXT: v_pk_mul_f16 v5, v5, v7
; GFX9-DENORM-NEXT: v_pk_add_f16 v4, v4, v8
; GFX9-DENORM-NEXT: v_pk_add_f16 v5, v5, v9
-; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v6, v4
-; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v5
-; GFX9-DENORM-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-DENORM-NEXT: v_add_f32_e32 v0, v0, v6
-; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX9-DENORM-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX9-DENORM-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-DENORM-NEXT: v_mad_mix_f32 v0, v4, 1, v0 op_sel_hi:[1,0,0]
+; GFX9-DENORM-NEXT: v_mad_mix_f32 v1, v4, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-DENORM-NEXT: v_mad_mix_f32 v2, v5, 1, v2 op_sel_hi:[1,0,0]
+; GFX9-DENORM-NEXT: v_mad_mix_f32 v3, v5, 1, v3 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: test_v4f16_v4f32_add_ext_fma_mul_rhs:
@@ -390,14 +364,10 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul_rhs(<4 x float> %
; GFX10-NEXT: v_pk_mul_f16 v9, v9, v11
; GFX10-NEXT: v_pk_fma_f16 v4, v4, v6, v8
; GFX10-NEXT: v_pk_fma_f16 v5, v5, v7, v9
-; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v4
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v5
-; GFX10-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v6
-; GFX10-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX10-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX10-NEXT: v_fma_mix_f32 v0, v4, 1, v0 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_fma_mix_f32 v1, v4, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_fma_mix_f32 v2, v5, 1, v2 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_fma_mix_f32 v3, v5, 1, v3 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX10-CONTRACT-LABEL: test_v4f16_v4f32_add_ext_fma_mul_rhs:
@@ -406,14 +376,10 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul_rhs(<4 x float> %
; GFX10-CONTRACT-NEXT: v_pk_mul_f16 v9, v9, v11
; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v4, v4, v6, v8
; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v5, v5, v7, v9
-; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v6, v4
-; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_e32 v7, v5
-; GFX10-CONTRACT-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-CONTRACT-NEXT: v_add_f32_e32 v0, v0, v6
-; GFX10-CONTRACT-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX10-CONTRACT-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-CONTRACT-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v0, v4, 1, v0 op_sel_hi:[1,0,0]
+; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v1, v4, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v2, v5, 1, v2 op_sel_hi:[1,0,0]
+; GFX10-CONTRACT-NEXT: v_fma_mix_f32 v3, v5, 1, v3 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-CONTRACT-NEXT: ; return to shader part epilog
;
; GFX10-DENORM-LABEL: test_v4f16_v4f32_add_ext_fma_mul_rhs:
@@ -424,14 +390,10 @@ define amdgpu_vs <4 x float> @test_v4f16_v4f32_add_ext_fma_mul_rhs(<4 x float> %
; GFX10-DENORM-NEXT: v_pk_mul_f16 v5, v5, v7
; GFX10-DENORM-NEXT: v_pk_add_f16 v4, v4, v8
; GFX10-DENORM-NEXT: v_pk_add_f16 v5, v5, v6
-; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v6, v4
-; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v5
-; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-DENORM-NEXT: v_add_f32_e32 v0, v0, v6
-; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX10-DENORM-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-DENORM-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX10-DENORM-NEXT: v_fma_mix_f32 v0, v4, 1, v0 op_sel_hi:[1,0,0]
+; GFX10-DENORM-NEXT: v_fma_mix_f32 v1, v4, 1, v1 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX10-DENORM-NEXT: v_fma_mix_f32 v2, v5, 1, v2 op_sel_hi:[1,0,0]
+; GFX10-DENORM-NEXT: v_fma_mix_f32 v3, v5, 1, v3 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-DENORM-NEXT: ; return to shader part epilog
.entry:
%a = fmul <4 x half> %u, %v
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
index 4d603f7487754..26f8e41c9351a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
@@ -49,21 +49,16 @@ define amdgpu_vs <5 x float> @test_5xf16_5xf32_add_ext_mul(<5 x half> inreg %x,
; GFX9-FAST-DENORM-LABEL: test_5xf16_5xf32_add_ext_mul:
; GFX9-FAST-DENORM: ; %bb.0: ; %.entry
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s0, v0
-; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v1, s1, v1
-; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, v1
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s6, v3
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v1, s7, v4
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v2, s8, v5
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s9, v6
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v4, s10, v7
+; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v1, s0, v0
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v3, s1, v0
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v4, s2, v0
+; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v0, v1, 1, s6 op_sel_hi:[1,0,0]
+; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v1, v1, 1, s7 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v2, v3, 1, s8 op_sel_hi:[1,0,0]
+; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v3, v3, 1, s9 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FAST-DENORM-NEXT: v_mad_mix_f32 v4, v4, 1, s10 op_sel_hi:[1,0,0]
; GFX9-FAST-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-FAST-DENORM-LABEL: test_5x...
[truncated]
|
Complex pattern for BF16 does not work for GlobalISel. |
def : GCNPat < | ||
(f32 (fadd (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)), | ||
(f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)))), | ||
(mix_inst $src0_mods, $src0, (i32 0), (i32 1), $src1_mods, $src1, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This needs to use the f32 value 1.0, i.e. the bit pattern 0x3f800000, not just the integer "1". The same applies to the patterns below.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can refer to this in patterns as CONST.FP32_ONE.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you rephrase the title? It makes it sound like this is a machine combine when it isn't.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsNeg(SDValue In, SDValue &Src, | ||
SDValue &SrcMods) const { | ||
unsigned Mods = 0; | ||
SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you implement the GlobalISel equivalent?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A GlobalISel equivalent for this pattern is not necessary. In SelectionDAG, DAGCombiner folds (fadd y, (fneg x)) into (fsub y, x) before the main instruction selection matching occurs. An alternative could be to modify the DAGCombiner to avoid folding fadd (fneg x) into fsub when the target has FMA support and at least one of the operands is an fpextend?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That sounds more like another missing combine in globalisel
; GFX9-F32DENORM-NEXT: v_cvt_f32_f16_e32 v0, v0 | ||
; GFX9-F32DENORM-NEXT: v_sub_f32_e32 v0, v0, v2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why didn't this generate v_mad_mix_f32?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
v_mad_mix_f32 doesn't support denormals; this is the IEEE mode run line.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are there any tests for when both operands of an add/sub/mul are extended?
bump |
No description provided.