Skip to content

Commit

Permalink
[AArch64] Add patterns for scalar FMUL, FMULX
Browse files Browse the repository at this point in the history
Scalar FMUL, FMULX instructions perform better or the same compared to indexed
FMUL, FMULX.

For example, the Arm Cortex-A55 Software Optimization Guide lists the following
instructions with a throughput of 2 IPC:
 - "FP multiply" FMUL
 - "ASIMD FP multiply" FMULX

whereas it lists the following with a throughput of 1 IPC:
 - "ASIMD FP multiply, by element" FMUL, FMULX

The Arm Cortex-A510 Software Optimization Guide, however, does not separately
list "by element" variants of the "ASIMD FP multiply" instructions, which are
listed with the same throughput as the non-ASIMD ones.

Fixes #60817.

Differential Revision: https://reviews.llvm.org/D153207
  • Loading branch information
overmighty authored and davemgreen committed Jun 30, 2023
1 parent 0446bfc commit ea045b9
Show file tree
Hide file tree
Showing 9 changed files with 267 additions and 39 deletions.
25 changes: 24 additions & 1 deletion llvm/lib/Target/AArch64/AArch64InstrFormats.td
Original file line number Diff line number Diff line change
Expand Up @@ -8427,9 +8427,9 @@ multiclass SIMDThreeSameVectorFMLIndex<bit U, bits<4> opc, string asm,
V128, v4f32, v8f16, OpNode>;
}

let mayRaiseFPException = 1, Uses = [FPCR] in
multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
let mayRaiseFPException = 1, Uses = [FPCR] in {
let Predicates = [HasNEON, HasFullFP16] in {
def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b00, opc,
V64, V64,
Expand Down Expand Up @@ -8532,6 +8532,29 @@ multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
let Inst{11} = idx{0};
let Inst{21} = 0;
}
} // mayRaiseFPException = 1, Uses = [FPCR]

// Scalar f16 OpNode whose first operand is lane 0 of a v8f16 register and
// whose second operand is an arbitrary lane ($idx) of another v8f16 register.
// Selects the scalar-by-element instruction (NAME # v1i16_indexed); lane 0 of
// $Rn is supplied as the hsub subregister of the vector register.
let Predicates = [HasNEON, HasFullFP16] in {
def : Pat<(f16 (OpNode
(f16 (vector_extract (v8f16 V128:$Rn), (i64 0))),
(f16 (vector_extract (v8f16 V128:$Rm), VectorIndexH:$idx)))),
(!cast<Instruction>(NAME # v1i16_indexed)
(EXTRACT_SUBREG V128:$Rn, hsub), V128:$Rm, VectorIndexH:$idx)>;
}

// f32 and f64 counterparts of the lane-0 pattern: the first operand is lane 0
// of a vector register (passed as its ssub/dsub subregister) and the second is
// lane $idx of another vector register. Both select the scalar-by-element
// instruction for the corresponding element size.
let Predicates = [HasNEON] in {
// f32: lane 0 of a v4f32 times lane $idx of a v4f32.
def : Pat<(f32 (OpNode
(f32 (vector_extract (v4f32 V128:$Rn), (i64 0))),
(f32 (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx)))),
(!cast<Instruction>(NAME # v1i32_indexed)
(EXTRACT_SUBREG V128:$Rn, ssub), V128:$Rm, VectorIndexS:$idx)>;

// f64: lane 0 of a v2f64 times lane $idx of a v2f64.
def : Pat<(f64 (OpNode
(f64 (vector_extract (v2f64 V128:$Rn), (i64 0))),
(f64 (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx)))),
(!cast<Instruction>(NAME # v1i64_indexed)
(EXTRACT_SUBREG V128:$Rn, dsub), V128:$Rm, VectorIndexD:$idx)>;
}
}

multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
Expand Down
31 changes: 31 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -4443,6 +4443,33 @@ defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", any_fmul>;
}
defm FSUB : TwoOperandFPData<0b0011, "fsub", any_fsub>;

// Patterns that select the plain scalar form of a multiply when the second
// operand is lane 0 of a 128-bit vector register. Lane 0 is passed to the
// scalar instruction as the hsub/ssub/dsub subregister of $Rm, so no
// by-element (indexed) instruction is needed.
//
//   inst             - base instruction name, e.g. "FMUL" or "FMULX".
//   inst_f16_suffix  - suffix appended to `inst` to name the f16 instruction.
//   inst_f32_suffix  - suffix appended to `inst` to name the f32 instruction.
//   inst_f64_suffix  - suffix appended to `inst` to name the f64 instruction.
//   OpNode           - the DAG operator being matched.
//   preds            - predicates required by all three patterns (default: none).
multiclass FMULScalarFromIndexedLane0Patterns<string inst,
string inst_f16_suffix,
string inst_f32_suffix,
string inst_f64_suffix,
SDPatternOperator OpNode,
list<Predicate> preds = []> {
// The f16 pattern additionally requires HasFullFP16 on top of `preds`.
let Predicates = !listconcat(preds, [HasFullFP16]) in {
def : Pat<(f16 (OpNode (f16 FPR16:$Rn),
(f16 (vector_extract (v8f16 V128:$Rm), (i64 0))))),
(!cast<Instruction>(inst # inst_f16_suffix)
FPR16:$Rn, (EXTRACT_SUBREG V128:$Rm, hsub))>;
}
// The f32 and f64 patterns are guarded by `preds` only.
let Predicates = preds in {
def : Pat<(f32 (OpNode (f32 FPR32:$Rn),
(f32 (vector_extract (v4f32 V128:$Rm), (i64 0))))),
(!cast<Instruction>(inst # inst_f32_suffix)
FPR32:$Rn, (EXTRACT_SUBREG V128:$Rm, ssub))>;
def : Pat<(f64 (OpNode (f64 FPR64:$Rn),
(f64 (vector_extract (v2f64 V128:$Rm), (i64 0))))),
(!cast<Instruction>(inst # inst_f64_suffix)
FPR64:$Rn, (EXTRACT_SUBREG V128:$Rm, dsub))>;
}
}

// Instantiate the lane-0 patterns for any_fmul, targeting FMULHrr, FMULSrr and
// FMULDrr. No extra predicates are passed, so only the f16 pattern is gated
// (by HasFullFP16, added inside the multiclass).
defm : FMULScalarFromIndexedLane0Patterns<"FMUL", "Hrr", "Srr", "Drr",
any_fmul>;

// Match reassociated forms of FNMUL.
def : Pat<(fmul (fneg FPR16:$a), (f16 FPR16:$b)),
(FNMULHrr FPR16:$a, FPR16:$b)>,
Expand Down Expand Up @@ -5248,6 +5275,10 @@ let Predicates = [HasRDM] in {
(SQRDMLSHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
}

// Instantiate the lane-0 patterns for the fmulx intrinsic, targeting FMULX16,
// FMULX32 and FMULX64. All three patterns require HasNEONorSME; the f16 one
// also requires HasFullFP16 (added inside the multiclass).
defm : FMULScalarFromIndexedLane0Patterns<"FMULX", "16", "32", "64",
int_aarch64_neon_fmulx,
[HasNEONorSME]>;

def : InstAlias<"cmls $dst, $src1, $src2",
(CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
def : InstAlias<"cmle $dst, $src1, $src2",
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ entry:
; CHECK-LABEL: %for.body
; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
; CHECK: fmla.d {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}[0]
; CHECK: fmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
Expand Down Expand Up @@ -59,7 +59,7 @@ entry:
; CHECK-LABEL: %for.body
; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
; CHECK: fmla.s {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}[0]
; CHECK: fmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AArch64/arm64-fml-combines.ll
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ entry:
; CHECK-LABEL: %for.body
; CHECK: fmls.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; CHECK: fmls.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
; CHECK: fmls.d {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}[0]
; CHECK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%indvars.iv.next = sub nuw nsw i64 %indvars.iv, 1
Expand Down Expand Up @@ -52,7 +52,7 @@ entry:
; CHECK-LABEL: %for.body
; CHECK: fmls.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; CHECK: fmls.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
; CHECK: fmls.s {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}[0]
; CHECK: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
Expand Down
43 changes: 42 additions & 1 deletion llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>)

declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>)

declare double @llvm.aarch64.neon.fmulx.f64(double, double)

declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32>, <2 x i32>, i32)
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32>, <4 x i32>, i32)
Expand Down Expand Up @@ -2066,6 +2068,19 @@ entry:
ret <4 x float> %vmulx2.i
}

; Scalar fmulx on element 0 of two <1 x double> operands: the checks expect
; the plain scalar form (fmulx d0, d0, d1).
define <1 x double> @test_vmulx_lane_f64(<1 x double> %a, <1 x double> %v) {
; CHECK-LABEL: test_vmulx_lane_f64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmulx d0, d0, d1
; CHECK-NEXT: ret
entry:
%vget_lane = extractelement <1 x double> %a, i64 0
%vget_lane3 = extractelement <1 x double> %v, i64 0
%vmulxd_f64.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %vget_lane, double %vget_lane3)
%vset_lane = insertelement <1 x double> poison, double %vmulxd_f64.i, i64 0
ret <1 x double> %vset_lane
}

define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) {
; CHECK-LABEL: test_vmulxq_lane_f64:
; CHECK: // %bb.0: // %entry
Expand Down Expand Up @@ -2100,6 +2115,19 @@ entry:
ret <4 x float> %vmulx2.i
}

; Scalar fmulx whose second operand is lane 1 of a <2 x double>: a non-zero
; lane, so the checks expect the by-element form (fmulx d0, d0, v1.d[1]).
define <1 x double> @test_vmulx_laneq_f64(<1 x double> %a, <2 x double> %v) {
; CHECK-LABEL: test_vmulx_laneq_f64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmulx d0, d0, v1.d[1]
; CHECK-NEXT: ret
entry:
%vget_lane = extractelement <1 x double> %a, i64 0
%vgetq_lane = extractelement <2 x double> %v, i64 1
%vmulxd_f64.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %vget_lane, double %vgetq_lane)
%vset_lane = insertelement <1 x double> poison, double %vmulxd_f64.i, i64 0
ret <1 x double> %vset_lane
}

define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) {
; CHECK-LABEL: test_vmulxq_laneq_f64:
; CHECK: // %bb.0: // %entry
Expand Down Expand Up @@ -3560,7 +3588,7 @@ entry:
define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) {
; CHECK-LABEL: test_vmul_laneq_f64_0:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmul d0, d0, v1.d[0]
; CHECK-NEXT: fmul d0, d0, d1
; CHECK-NEXT: ret
entry:
%0 = bitcast <1 x double> %a to <8 x i8>
Expand Down Expand Up @@ -3651,6 +3679,19 @@ entry:
ret <4 x float> %vmulx2.i
}

; Scalar fmulx whose second operand is lane 0 of a <2 x double>: the checks
; expect the plain scalar form (fmulx d0, d0, d1), not the by-element form.
define <1 x double> @test_vmulx_laneq_f64_0(<1 x double> %a, <2 x double> %v) {
; CHECK-LABEL: test_vmulx_laneq_f64_0:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmulx d0, d0, d1
; CHECK-NEXT: ret
entry:
%vget_lane = extractelement <1 x double> %a, i64 0
%vgetq_lane = extractelement <2 x double> %v, i64 0
%vmulxd_f64.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %vget_lane, double %vgetq_lane)
%vset_lane = insertelement <1 x double> poison, double %vmulxd_f64.i, i64 0
ret <1 x double> %vset_lane
}

define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
; CHECK-LABEL: test_vmulxq_laneq_f64_0:
; CHECK: // %bb.0: // %entry
Expand Down
112 changes: 93 additions & 19 deletions llvm/test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll
Original file line number Diff line number Diff line change
@@ -1,8 +1,19 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s

define float @test_fmul_lane_ss2S(float %a, <2 x float> %v) {
; CHECK-LABEL: test_fmul_lane_ss2S:
; fmul of a scalar float by lane 0 of a <2 x float>: the checks expect the
; plain scalar form (fmul s0, s0, s1).
define float @test_fmul_lane_ss2S_0(float %a, <2 x float> %v) {
; CHECK-LABEL: test_fmul_lane_ss2S_0:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmul s0, s0, s1
; CHECK-NEXT: ret
%tmp1 = extractelement <2 x float> %v, i32 0
%tmp2 = fmul float %a, %tmp1
ret float %tmp2
}

define float @test_fmul_lane_ss2S_1(float %a, <2 x float> %v) {
; CHECK-LABEL: test_fmul_lane_ss2S_1:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmul s0, s0, v1.s[1]
Expand All @@ -12,8 +23,8 @@ define float @test_fmul_lane_ss2S(float %a, <2 x float> %v) {
ret float %tmp2;
}

define float @test_fmul_lane_ss2S_swap(float %a, <2 x float> %v) {
; CHECK-LABEL: test_fmul_lane_ss2S_swap:
define float @test_fmul_lane_ss2S_1_swap(float %a, <2 x float> %v) {
; CHECK-LABEL: test_fmul_lane_ss2S_1_swap:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmul s0, s0, v1.s[1]
Expand All @@ -23,9 +34,18 @@ define float @test_fmul_lane_ss2S_swap(float %a, <2 x float> %v) {
ret float %tmp2;
}

; fmul of a scalar float by lane 0 of a <4 x float>: the checks expect the
; plain scalar form (fmul s0, s0, s1).
define float @test_fmul_lane_ss4S_0(float %a, <4 x float> %v) {
; CHECK-LABEL: test_fmul_lane_ss4S_0:
; CHECK: // %bb.0:
; CHECK-NEXT: fmul s0, s0, s1
; CHECK-NEXT: ret
%tmp1 = extractelement <4 x float> %v, i32 0
%tmp2 = fmul float %a, %tmp1
ret float %tmp2
}

define float @test_fmul_lane_ss4S(float %a, <4 x float> %v) {
; CHECK-LABEL: test_fmul_lane_ss4S:
define float @test_fmul_lane_ss4S_3(float %a, <4 x float> %v) {
; CHECK-LABEL: test_fmul_lane_ss4S_3:
; CHECK: // %bb.0:
; CHECK-NEXT: fmul s0, s0, v1.s[3]
; CHECK-NEXT: ret
Expand All @@ -34,8 +54,8 @@ define float @test_fmul_lane_ss4S(float %a, <4 x float> %v) {
ret float %tmp2;
}

define float @test_fmul_lane_ss4S_swap(float %a, <4 x float> %v) {
; CHECK-LABEL: test_fmul_lane_ss4S_swap:
define float @test_fmul_lane_ss4S_3_swap(float %a, <4 x float> %v) {
; CHECK-LABEL: test_fmul_lane_ss4S_3_swap:
; CHECK: // %bb.0:
; CHECK-NEXT: fmul s0, s0, v1.s[3]
; CHECK-NEXT: ret
Expand All @@ -56,9 +76,18 @@ define double @test_fmul_lane_ddD(double %a, <1 x double> %v) {
}


; fmul of a scalar double by lane 0 of a <2 x double>: the checks expect the
; plain scalar form (fmul d0, d0, d1).
define double @test_fmul_lane_dd2D_0(double %a, <2 x double> %v) {
; CHECK-LABEL: test_fmul_lane_dd2D_0:
; CHECK: // %bb.0:
; CHECK-NEXT: fmul d0, d0, d1
; CHECK-NEXT: ret
%tmp1 = extractelement <2 x double> %v, i32 0
%tmp2 = fmul double %a, %tmp1
ret double %tmp2
}

define double @test_fmul_lane_dd2D(double %a, <2 x double> %v) {
; CHECK-LABEL: test_fmul_lane_dd2D:
define double @test_fmul_lane_dd2D_1(double %a, <2 x double> %v) {
; CHECK-LABEL: test_fmul_lane_dd2D_1:
; CHECK: // %bb.0:
; CHECK-NEXT: fmul d0, d0, v1.d[1]
; CHECK-NEXT: ret
Expand All @@ -68,8 +97,8 @@ define double @test_fmul_lane_dd2D(double %a, <2 x double> %v) {
}


define double @test_fmul_lane_dd2D_swap(double %a, <2 x double> %v) {
; CHECK-LABEL: test_fmul_lane_dd2D_swap:
define double @test_fmul_lane_dd2D_1_swap(double %a, <2 x double> %v) {
; CHECK-LABEL: test_fmul_lane_dd2D_1_swap:
; CHECK: // %bb.0:
; CHECK-NEXT: fmul d0, d0, v1.d[1]
; CHECK-NEXT: ret
Expand All @@ -80,8 +109,19 @@ define double @test_fmul_lane_dd2D_swap(double %a, <2 x double> %v) {

declare float @llvm.aarch64.neon.fmulx.f32(float, float)

define float @test_fmulx_lane_f32(float %a, <2 x float> %v) {
; CHECK-LABEL: test_fmulx_lane_f32:
; fmulx of a scalar float by lane 0 of a <2 x float>: the checks expect the
; plain scalar form (fmulx s0, s0, s1).
define float @test_fmulx_lane_f32_0(float %a, <2 x float> %v) {
; CHECK-LABEL: test_fmulx_lane_f32_0:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmulx s0, s0, s1
; CHECK-NEXT: ret
%tmp1 = extractelement <2 x float> %v, i32 0
%tmp2 = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %tmp1)
ret float %tmp2;
}

define float @test_fmulx_lane_f32_1(float %a, <2 x float> %v) {
; CHECK-LABEL: test_fmulx_lane_f32_1:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmulx s0, s0, v1.s[1]
Expand All @@ -91,8 +131,18 @@ define float @test_fmulx_lane_f32(float %a, <2 x float> %v) {
ret float %tmp2;
}

define float @test_fmulx_laneq_f32(float %a, <4 x float> %v) {
; CHECK-LABEL: test_fmulx_laneq_f32:
; fmulx of a scalar float by lane 0 of a <4 x float>: the checks expect the
; plain scalar form (fmulx s0, s0, s1).
define float @test_fmulx_laneq_f32_0(float %a, <4 x float> %v) {
; CHECK-LABEL: test_fmulx_laneq_f32_0:
; CHECK: // %bb.0:
; CHECK-NEXT: fmulx s0, s0, s1
; CHECK-NEXT: ret
%tmp1 = extractelement <4 x float> %v, i32 0
%tmp2 = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %tmp1)
ret float %tmp2;
}

define float @test_fmulx_laneq_f32_3(float %a, <4 x float> %v) {
; CHECK-LABEL: test_fmulx_laneq_f32_3:
; CHECK: // %bb.0:
; CHECK-NEXT: fmulx s0, s0, v1.s[3]
; CHECK-NEXT: ret
Expand All @@ -101,8 +151,8 @@ define float @test_fmulx_laneq_f32(float %a, <4 x float> %v) {
ret float %tmp2;
}

define float @test_fmulx_laneq_f32_swap(float %a, <4 x float> %v) {
; CHECK-LABEL: test_fmulx_laneq_f32_swap:
define float @test_fmulx_laneq_f32_3_swap(float %a, <4 x float> %v) {
; CHECK-LABEL: test_fmulx_laneq_f32_3_swap:
; CHECK: // %bb.0:
; CHECK-NEXT: fmulx s0, s0, v1.s[3]
; CHECK-NEXT: ret
Expand All @@ -126,7 +176,7 @@ define double @test_fmulx_lane_f64(double %a, <1 x double> %v) {
define double @test_fmulx_laneq_f64_0(double %a, <2 x double> %v) {
; CHECK-LABEL: test_fmulx_laneq_f64_0:
; CHECK: // %bb.0:
; CHECK-NEXT: fmulx d0, d0, v1.d[0]
; CHECK-NEXT: fmulx d0, d0, d1
; CHECK-NEXT: ret
%tmp1 = extractelement <2 x double> %v, i32 0
%tmp2 = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %tmp1)
Expand Down Expand Up @@ -154,3 +204,27 @@ define double @test_fmulx_laneq_f64_1_swap(double %a, <2 x double> %v) {
ret double %tmp2;
}

; fmulx of lane 0 and lane 1 of the same <2 x float>: the checks expect a
; single by-element fmulx that reads lane 0 as s0 and lane 1 as v0.s[1].
define float @test_fmulx_horizontal_f32(<2 x float> %v) {
; CHECK-LABEL: test_fmulx_horizontal_f32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: fmulx s0, s0, v0.s[1]
; CHECK-NEXT: ret
entry:
%0 = extractelement <2 x float> %v, i32 0
%1 = extractelement <2 x float> %v, i32 1
%2 = call float @llvm.aarch64.neon.fmulx.f32(float %0, float %1)
ret float %2
}

; fmulx of lane 0 and lane 1 of the same <2 x double>: the checks expect a
; single by-element fmulx that reads lane 0 as d0 and lane 1 as v0.d[1].
define double @test_fmulx_horizontal_f64(<2 x double> %v) {
; CHECK-LABEL: test_fmulx_horizontal_f64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmulx d0, d0, v0.d[1]
; CHECK-NEXT: ret
entry:
%0 = extractelement <2 x double> %v, i32 0
%1 = extractelement <2 x double> %v, i32 1
%2 = call double @llvm.aarch64.neon.fmulx.f64(double %0, double %1)
ret double %2
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ define <2 x half> @complex_mul_v2f16(<2 x half> %a, <2 x half> %b) {
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov h3, v0.h[1]
; CHECK-NEXT: mov h2, v1.h[1]
; CHECK-NEXT: fmul h4, h2, v0.h[0]
; CHECK-NEXT: fmul h4, h0, v1.h[1]
; CHECK-NEXT: fnmul h2, h3, h2
; CHECK-NEXT: fmla h4, h3, v1.h[0]
; CHECK-NEXT: fmla h2, h0, v1.h[0]
Expand Down

0 comments on commit ea045b9

Please sign in to comment.