diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 362bade748828..f16bb0b33b872 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -8427,9 +8427,9 @@ multiclass SIMDThreeSameVectorFMLIndex<bit U, bits<4> opc, string asm,
                                   V128, v4f32, v8f16, OpNode>;
 }
 
-let mayRaiseFPException = 1, Uses = [FPCR] in
 multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
                          SDPatternOperator OpNode> {
+  let mayRaiseFPException = 1, Uses = [FPCR] in {
   let Predicates = [HasNEON, HasFullFP16] in {
   def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b00, opc,
                                       V64, V64,
@@ -8532,6 +8532,29 @@ multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
     let Inst{11} = idx{0};
     let Inst{21} = 0;
   }
+  } // mayRaiseFPException = 1, Uses = [FPCR]
+
+  let Predicates = [HasNEON, HasFullFP16] in {
+  def : Pat<(f16 (OpNode
+                    (f16 (vector_extract (v8f16 V128:$Rn), (i64 0))),
+                    (f16 (vector_extract (v8f16 V128:$Rm), VectorIndexH:$idx)))),
+            (!cast<Instruction>(NAME # v1i16_indexed)
+               (EXTRACT_SUBREG V128:$Rn, hsub), V128:$Rm, VectorIndexH:$idx)>;
+  }
+
+  let Predicates = [HasNEON] in {
+  def : Pat<(f32 (OpNode
+                    (f32 (vector_extract (v4f32 V128:$Rn), (i64 0))),
+                    (f32 (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx)))),
+            (!cast<Instruction>(NAME # v1i32_indexed)
+               (EXTRACT_SUBREG V128:$Rn, ssub), V128:$Rm, VectorIndexS:$idx)>;
+
+  def : Pat<(f64 (OpNode
+                    (f64 (vector_extract (v2f64 V128:$Rn), (i64 0))),
+                    (f64 (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx)))),
+            (!cast<Instruction>(NAME # v1i64_indexed)
+               (EXTRACT_SUBREG V128:$Rn, dsub), V128:$Rm, VectorIndexD:$idx)>;
+  }
 }
 
 multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index ec5f840e857e2..7199b80826d3a 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4443,6 +4443,33 @@ defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", any_fmul>;
 }
 defm FSUB   : TwoOperandFPData<0b0011, "fsub", any_fsub>;
 
+multiclass FMULScalarFromIndexedLane0Patterns<string inst,
+                                              string inst_f16_suffix,
+                                              string inst_f32_suffix,
+                                              string inst_f64_suffix,
+                                              SDPatternOperator OpNode,
+                                              list<Predicate> preds = []> {
+  let Predicates = !listconcat(preds, [HasFullFP16]) in {
+  def : Pat<(f16 (OpNode (f16 FPR16:$Rn),
+                         (f16 (vector_extract (v8f16 V128:$Rm), (i64 0))))),
+            (!cast<Instruction>(inst # inst_f16_suffix)
+               FPR16:$Rn, (EXTRACT_SUBREG V128:$Rm, hsub))>;
+  }
+  let Predicates = preds in {
+  def : Pat<(f32 (OpNode (f32 FPR32:$Rn),
+                         (f32 (vector_extract (v4f32 V128:$Rm), (i64 0))))),
+            (!cast<Instruction>(inst # inst_f32_suffix)
+               FPR32:$Rn, (EXTRACT_SUBREG V128:$Rm, ssub))>;
+  def : Pat<(f64 (OpNode (f64 FPR64:$Rn),
+                         (f64 (vector_extract (v2f64 V128:$Rm), (i64 0))))),
+            (!cast<Instruction>(inst # inst_f64_suffix)
+               FPR64:$Rn, (EXTRACT_SUBREG V128:$Rm, dsub))>;
+  }
+}
+
+defm : FMULScalarFromIndexedLane0Patterns<"FMUL", "Hrr", "Srr", "Drr",
+                                          any_fmul>;
+
 // Match reassociated forms of FNMUL.
 def : Pat<(fmul (fneg FPR16:$a), (f16 FPR16:$b)),
           (FNMULHrr FPR16:$a, FPR16:$b)>,
@@ -5248,6 +5275,10 @@ let Predicates = [HasRDM] in {
                                             (SQRDMLSHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
 }
 
+defm : FMULScalarFromIndexedLane0Patterns<"FMULX", "16", "32", "64",
+                                          int_aarch64_neon_fmulx,
+                                          [HasNEONorSME]>;
+
 def : InstAlias<"cmls $dst, $src1, $src2",
                 (CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
 def : InstAlias<"cmle $dst, $src1, $src2",
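Note on the two .td changes above: lane 0 of a NEON vector register aliases the corresponding scalar FP register (v1.s[0] is s1), so a multiply whose operand is a lane-0 vector_extract can be selected as the plain scalar instruction via EXTRACT_SUBREG instead of the by-element encoding; the SIMDFPIndexed patterns additionally fold a lane-0 extract on the first operand of an extract-by-extract multiply into the indexed scalar form. A minimal sketch of the IR affected by the FMUL case (hypothetical function, mirroring the updated tests below):

define float @fmul_lane0(float %a, <4 x float> %v) {
  %e = extractelement <4 x float> %v, i32 0   ; lane 0 == s-subregister of %v
  %m = fmul float %a, %e                      ; was: fmul s0, s0, v1.s[0]
  ret float %m                                ; now: fmul s0, s0, s1
}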
diff --git a/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll b/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
index f12f3719e10cf..e17a0a96955b1 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
@@ -17,7 +17,7 @@ entry:
 ; CHECK-LABEL: %for.body
 ; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 ; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
-; CHECK: fmla.d {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}[0]
+; CHECK: fmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
@@ -59,7 +59,7 @@ entry:
 ; CHECK-LABEL: %for.body
 ; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 ; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
-; CHECK: fmla.s {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}[0]
+; CHECK: fmadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
diff --git a/llvm/test/CodeGen/AArch64/arm64-fml-combines.ll b/llvm/test/CodeGen/AArch64/arm64-fml-combines.ll
index c9a7ebb7c98be..ce3581030646d 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fml-combines.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fml-combines.ll
@@ -10,7 +10,7 @@ entry:
 ; CHECK-LABEL: %for.body
 ; CHECK: fmls.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 ; CHECK: fmls.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
-; CHECK: fmls.d {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}[0]
+; CHECK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
   %indvars.iv.next = sub nuw nsw i64 %indvars.iv, 1
@@ -52,7 +52,7 @@ entry:
 ; CHECK-LABEL: %for.body
 ; CHECK: fmls.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 ; CHECK: fmls.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
-; CHECK: fmls.s {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}[0]
+; CHECK: fmsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 for.body:                                         ; preds = %for.body, %entry
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
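Note on the two combine tests above: the check updates look like a knock-on effect of the lane-0 fold. Once the lane-0 multiply is a scalar fmul, fast-math contraction forms the scalar fmadd/fmsub rather than the indexed fmla/fmls. A reduced sketch of that interaction (hypothetical, not one of the tests; register assignments are illustrative):

define double @contract_lane0(double %acc, double %a, <2 x double> %v) {
  %e = extractelement <2 x double> %v, i64 0
  %m = fmul fast double %a, %e      ; lane-0 multiply, now scalar
  %r = fadd fast double %m, %acc    ; contracts to: fmadd d0, d1, d2, d0
  ret double %r                     ; (previously: fmla.d ..., v2.d[0])
}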
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
index 1f1d2326492b2..cb87ba9a4ed6c 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -8,6 +8,8 @@
 declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>)
 declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
 
+declare double @llvm.aarch64.neon.fmulx.f64(double, double)
+
 declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
 declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32>, <2 x i32>, i32)
 declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32>, <4 x i32>, i32)
@@ -2066,6 +2068,19 @@ entry:
   ret <4 x float> %vmulx2.i
 }
 
+define <1 x double> @test_vmulx_lane_f64(<1 x double> %a, <1 x double> %v) {
+; CHECK-LABEL: test_vmulx_lane_f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmulx d0, d0, d1
+; CHECK-NEXT:    ret
+entry:
+  %vget_lane = extractelement <1 x double> %a, i64 0
+  %vget_lane3 = extractelement <1 x double> %v, i64 0
+  %vmulxd_f64.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %vget_lane, double %vget_lane3)
+  %vset_lane = insertelement <1 x double> poison, double %vmulxd_f64.i, i64 0
+  ret <1 x double> %vset_lane
+}
+
 define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) {
 ; CHECK-LABEL: test_vmulxq_lane_f64:
 ; CHECK:       // %bb.0: // %entry
@@ -2100,6 +2115,19 @@ entry:
   ret <4 x float> %vmulx2.i
 }
 
+define <1 x double> @test_vmulx_laneq_f64(<1 x double> %a, <2 x double> %v) {
+; CHECK-LABEL: test_vmulx_laneq_f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmulx d0, d0, v1.d[1]
+; CHECK-NEXT:    ret
+entry:
+  %vget_lane = extractelement <1 x double> %a, i64 0
+  %vgetq_lane = extractelement <2 x double> %v, i64 1
+  %vmulxd_f64.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %vget_lane, double %vgetq_lane)
+  %vset_lane = insertelement <1 x double> poison, double %vmulxd_f64.i, i64 0
+  ret <1 x double> %vset_lane
+}
+
 define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) {
 ; CHECK-LABEL: test_vmulxq_laneq_f64:
 ; CHECK:       // %bb.0: // %entry
@@ -3560,7 +3588,7 @@ entry:
 define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) {
 ; CHECK-LABEL: test_vmul_laneq_f64_0:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmul d0, d0, v1.d[0]
+; CHECK-NEXT:    fmul d0, d0, d1
 ; CHECK-NEXT:    ret
 entry:
   %0 = bitcast <1 x double> %a to <8 x i8>
@@ -3651,6 +3679,19 @@ entry:
   ret <4 x float> %vmulx2.i
 }
 
+define <1 x double> @test_vmulx_laneq_f64_0(<1 x double> %a, <2 x double> %v) {
+; CHECK-LABEL: test_vmulx_laneq_f64_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmulx d0, d0, d1
+; CHECK-NEXT:    ret
+entry:
+  %vget_lane = extractelement <1 x double> %a, i64 0
+  %vgetq_lane = extractelement <2 x double> %v, i64 0
+  %vmulxd_f64.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %vget_lane, double %vgetq_lane)
+  %vset_lane = insertelement <1 x double> poison, double %vmulxd_f64.i, i64 0
+  ret <1 x double> %vset_lane
+}
+
 define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {
 ; CHECK-LABEL: test_vmulxq_laneq_f64_0:
 ; CHECK:       // %bb.0: // %entry
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll b/llvm/test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll
index 5d5b940174c4b..091cda89bfe40 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll
@@ -1,8 +1,19 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 
-define float @test_fmul_lane_ss2S(float %a, <2 x float> %v) {
-; CHECK-LABEL: test_fmul_lane_ss2S:
+define float @test_fmul_lane_ss2S_0(float %a, <2 x float> %v) {
+; CHECK-LABEL: test_fmul_lane_ss2S_0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    fmul s0, s0, s1
+; CHECK-NEXT:    ret
+  %tmp1 = extractelement <2 x float> %v, i32 0
+  %tmp2 = fmul float %a, %tmp1
+  ret float %tmp2
+}
+
+define float @test_fmul_lane_ss2S_1(float %a, <2 x float> %v) {
+; CHECK-LABEL: test_fmul_lane_ss2S_1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-NEXT:    fmul s0, s0, v1.s[1]
@@ -12,8 +23,8 @@ define float @test_fmul_lane_ss2S(float %a, <2 x float> %v) {
   ret float %tmp2;
 }
 
-define float @test_fmul_lane_ss2S_swap(float %a, <2 x float> %v) {
-; CHECK-LABEL: test_fmul_lane_ss2S_swap:
+define float @test_fmul_lane_ss2S_1_swap(float %a, <2 x float> %v) {
+; CHECK-LABEL: test_fmul_lane_ss2S_1_swap:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-NEXT:    fmul s0, s0, v1.s[1]
@@ -23,9 +34,18 @@ define float @test_fmul_lane_ss2S_swap(float %a, <2 x float> %v) {
   ret float %tmp2;
 }
 
+define float @test_fmul_lane_ss4S_0(float %a, <4 x float> %v) {
+; CHECK-LABEL: test_fmul_lane_ss4S_0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmul s0, s0, s1
+; CHECK-NEXT:    ret
+  %tmp1 = extractelement <4 x float> %v, i32 0
+  %tmp2 = fmul float %a, %tmp1
+  ret float %tmp2
+}
 
-define float @test_fmul_lane_ss4S(float %a, <4 x float> %v) {
-; CHECK-LABEL: test_fmul_lane_ss4S:
+define float @test_fmul_lane_ss4S_3(float %a, <4 x float> %v) {
+; CHECK-LABEL: test_fmul_lane_ss4S_3:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmul s0, s0, v1.s[3]
 ; CHECK-NEXT:    ret
@@ -34,8 +54,8 @@ define float @test_fmul_lane_ss4S(float %a, <4 x float> %v) {
   ret float %tmp2;
 }
 
-define float @test_fmul_lane_ss4S_swap(float %a, <4 x float> %v) {
-; CHECK-LABEL: test_fmul_lane_ss4S_swap:
+define float @test_fmul_lane_ss4S_3_swap(float %a, <4 x float> %v) {
+; CHECK-LABEL: test_fmul_lane_ss4S_3_swap:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmul s0, s0, v1.s[3]
 ; CHECK-NEXT:    ret
@@ -56,9 +76,18 @@ define double @test_fmul_lane_ddD(double %a, <1 x double> %v) {
 }
 
 
+define double @test_fmul_lane_dd2D_0(double %a, <2 x double> %v) {
+; CHECK-LABEL: test_fmul_lane_dd2D_0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmul d0, d0, d1
+; CHECK-NEXT:    ret
+  %tmp1 = extractelement <2 x double> %v, i32 0
+  %tmp2 = fmul double %a, %tmp1
+  ret double %tmp2
+}
 
-define double @test_fmul_lane_dd2D(double %a, <2 x double> %v) {
-; CHECK-LABEL: test_fmul_lane_dd2D:
+define double @test_fmul_lane_dd2D_1(double %a, <2 x double> %v) {
+; CHECK-LABEL: test_fmul_lane_dd2D_1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmul d0, d0, v1.d[1]
 ; CHECK-NEXT:    ret
@@ -68,8 +97,8 @@ define double @test_fmul_lane_dd2D(double %a, <2 x double> %v) {
 }
 
 
-define double @test_fmul_lane_dd2D_swap(double %a, <2 x double> %v) {
-; CHECK-LABEL: test_fmul_lane_dd2D_swap:
+define double @test_fmul_lane_dd2D_1_swap(double %a, <2 x double> %v) {
+; CHECK-LABEL: test_fmul_lane_dd2D_1_swap:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmul d0, d0, v1.d[1]
 ; CHECK-NEXT:    ret
@@ -80,8 +109,19 @@ define double @test_fmul_lane_dd2D_swap(double %a, <2 x double> %v) {
 
 declare float @llvm.aarch64.neon.fmulx.f32(float, float)
 
-define float @test_fmulx_lane_f32(float %a, <2 x float> %v) {
-; CHECK-LABEL: test_fmulx_lane_f32:
+define float @test_fmulx_lane_f32_0(float %a, <2 x float> %v) {
+; CHECK-LABEL: test_fmulx_lane_f32_0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    fmulx s0, s0, s1
+; CHECK-NEXT:    ret
+  %tmp1 = extractelement <2 x float> %v, i32 0
+  %tmp2 = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %tmp1)
+  ret float %tmp2;
+}
+
+define float @test_fmulx_lane_f32_1(float %a, <2 x float> %v) {
+; CHECK-LABEL: test_fmulx_lane_f32_1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-NEXT:    fmulx s0, s0, v1.s[1]
@@ -91,8 +131,18 @@ define float @test_fmulx_lane_f32(float %a, <2 x float> %v) {
   ret float %tmp2;
 }
 
-define float @test_fmulx_laneq_f32(float %a, <4 x float> %v) {
-; CHECK-LABEL: test_fmulx_laneq_f32:
+define float @test_fmulx_laneq_f32_0(float %a, <4 x float> %v) {
+; CHECK-LABEL: test_fmulx_laneq_f32_0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmulx s0, s0, s1
+; CHECK-NEXT:    ret
+  %tmp1 = extractelement <4 x float> %v, i32 0
+  %tmp2 = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %tmp1)
+  ret float %tmp2;
+}
+
+define float @test_fmulx_laneq_f32_3(float %a, <4 x float> %v) {
+; CHECK-LABEL: test_fmulx_laneq_f32_3:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmulx s0, s0, v1.s[3]
 ; CHECK-NEXT:    ret
@@ -101,8 +151,8 @@ define float @test_fmulx_laneq_f32(float %a, <4 x float> %v) {
   ret float %tmp2;
 }
 
-define float @test_fmulx_laneq_f32_swap(float %a, <4 x float> %v) {
-; CHECK-LABEL: test_fmulx_laneq_f32_swap:
+define float @test_fmulx_laneq_f32_3_swap(float %a, <4 x float> %v) {
+; CHECK-LABEL: test_fmulx_laneq_f32_3_swap:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    fmulx s0, s0, v1.s[3]
 ; CHECK-NEXT:    ret
@@ -126,7 +176,7 @@ define double @test_fmulx_lane_f64(double %a, <1 x double> %v) {
 define double @test_fmulx_laneq_f64_0(double %a, <2 x double> %v) {
 ; CHECK-LABEL: test_fmulx_laneq_f64_0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmulx d0, d0, v1.d[0]
+; CHECK-NEXT:    fmulx d0, d0, d1
 ; CHECK-NEXT:    ret
   %tmp1 = extractelement <2 x double> %v, i32 0
   %tmp2 = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %tmp1)
@@ -154,3 +204,27 @@ define double @test_fmulx_laneq_f64_1_swap(double %a, <2 x double> %v) {
   ret double %tmp2;
 }
 
+define float @test_fmulx_horizontal_f32(<2 x float> %v) {
+; CHECK-LABEL: test_fmulx_horizontal_f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmulx s0, s0, v0.s[1]
+; CHECK-NEXT:    ret
+entry:
+  %0 = extractelement <2 x float> %v, i32 0
+  %1 = extractelement <2 x float> %v, i32 1
+  %2 = call float @llvm.aarch64.neon.fmulx.f32(float %0, float %1)
+  ret float %2
+}
+
+define double @test_fmulx_horizontal_f64(<2 x double> %v) {
+; CHECK-LABEL: test_fmulx_horizontal_f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmulx d0, d0, v0.d[1]
+; CHECK-NEXT:    ret
+entry:
+  %0 = extractelement <2 x double> %v, i32 0
+  %1 = extractelement <2 x double> %v, i32 1
+  %2 = call double @llvm.aarch64.neon.fmulx.f64(double %0, double %1)
+  ret double %2
+}
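Note on the test_fmulx_horizontal_* functions added above: they exercise the second new pattern set (the one added to SIMDFPIndexed in AArch64InstrFormats.td). When both operands are extracted from the same vector, the lane-0 extract folds into the source register of the existing indexed scalar instruction (FMULXv1i64_indexed for f64). Selection sketch, matching test_fmulx_horizontal_f64:

  %lo = extractelement <2 x double> %v, i64 0   ; becomes d0, the d-view of v0
  %hi = extractelement <2 x double> %v, i64 1   ; stays a lane reference
  %r  = call double @llvm.aarch64.neon.fmulx.f64(double %lo, double %hi)
  ; selects: fmulx d0, d0, v0.d[1]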
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll
index 72e5b0eef9d02..cb0a9f5236b5d 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll
@@ -11,7 +11,7 @@ define <2 x half> @complex_mul_v2f16(<2 x half> %a, <2 x half> %b) {
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    mov h3, v0.h[1]
 ; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    fmul h4, h2, v0.h[0]
+; CHECK-NEXT:    fmul h4, h0, v1.h[1]
 ; CHECK-NEXT:    fnmul h2, h3, h2
 ; CHECK-NEXT:    fmla h4, h3, v1.h[0]
 ; CHECK-NEXT:    fmla h2, h0, v1.h[0]
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
index c92ea2fcfe6a8..5d956332de977 100644
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
@@ -228,11 +228,11 @@ entry:
   ret <8 x half> %mul
 }
 
-define dso_local half @t_vmulh_lane_f16(half %a, <4 x half> %c, i32 %lane) {
-; CHECK-LABEL: t_vmulh_lane_f16:
+define dso_local half @t_vmulh_lane0_f16(half %a, <4 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vmulh_lane0_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT:    fmul h0, h0, v1.h[0]
+; CHECK-NEXT:    fmul h0, h0, h1
 ; CHECK-NEXT:    ret
 entry:
   %0 = extractelement <4 x half> %c, i32 0
@@ -240,10 +240,22 @@ entry:
   ret half %1
 }
 
-define dso_local half @t_vmulh_laneq_f16(half %a, <8 x half> %c, i32 %lane) {
-; CHECK-LABEL: t_vmulh_laneq_f16:
+define dso_local half @t_vmulh_lane3_f16(half %a, <4 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vmulh_lane3_f16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmul h0, h0, v1.h[0]
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    fmul h0, h0, v1.h[3]
+; CHECK-NEXT:    ret
+entry:
+  %0 = extractelement <4 x half> %c, i32 3
+  %1 = fmul half %0, %a
+  ret half %1
+}
+
+define dso_local half @t_vmulh_laneq0_f16(half %a, <8 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vmulh_laneq0_f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmul h0, h0, h1
 ; CHECK-NEXT:    ret
 entry:
   %0 = extractelement <8 x half> %c, i32 0
@@ -251,6 +263,17 @@ entry:
   ret half %1
 }
 
+define dso_local half @t_vmulh_laneq7_f16(half %a, <8 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vmulh_laneq7_f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmul h0, h0, v1.h[7]
+; CHECK-NEXT:    ret
+entry:
+  %0 = extractelement <8 x half> %c, i32 7
+  %1 = fmul half %0, %a
+  ret half %1
+}
+
 define dso_local half @t_vmulx_f16(half %a, half %b) {
 ; CHECK-LABEL: t_vmulx_f16:
 ; CHECK:       // %bb.0: // %entry
@@ -261,8 +284,20 @@ entry:
   ret half %fmulx.i
 }
 
-define dso_local half @t_vmulxh_lane_f16(half %a, <4 x half> %b, i32 %lane) {
-; CHECK-LABEL: t_vmulxh_lane_f16:
+define dso_local half @t_vmulxh_lane0_f16(half %a, <4 x half> %b) {
+; CHECK-LABEL: t_vmulxh_lane0_f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    fmulx h0, h0, h1
+; CHECK-NEXT:    ret
+entry:
+  %extract = extractelement <4 x half> %b, i32 0
+  %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %extract)
+  ret half %fmulx.i
+}
+
+define dso_local half @t_vmulxh_lane3_f16(half %a, <4 x half> %b, i32 %lane) {
+; CHECK-LABEL: t_vmulxh_lane3_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-NEXT:    fmulx h0, h0, v1.h[3]
@@ -319,8 +354,19 @@ entry:
   ret <8 x half> %vmulx2.i
 }
 
-define dso_local half @t_vmulxh_laneq_f16(half %a, <8 x half> %b, i32 %lane) {
-; CHECK-LABEL: t_vmulxh_laneq_f16:
+define dso_local half @t_vmulxh_laneq0_f16(half %a, <8 x half> %b) {
+; CHECK-LABEL: t_vmulxh_laneq0_f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmulx h0, h0, h1
+; CHECK-NEXT:    ret
+entry:
+  %extract = extractelement <8 x half> %b, i32 0
+  %fmulx.i = tail call half @llvm.aarch64.neon.fmulx.f16(half %a, half %extract)
+  ret half %fmulx.i
+}
+
+define dso_local half @t_vmulxh_laneq7_f16(half %a, <8 x half> %b, i32 %lane) {
+; CHECK-LABEL: t_vmulxh_laneq7_f16:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    fmulx h0, h0, v1.h[7]
 ; CHECK-NEXT:    ret
@@ -418,3 +464,16 @@ entry:
   %1 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
   ret half %1
 }
+
+define half @test_fmulx_horizontal_f16(<2 x half> %v) {
+; CHECK-LABEL: test_fmulx_horizontal_f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmulx h0, h0, v0.h[1]
+; CHECK-NEXT:    ret
+entry:
+  %0 = extractelement <2 x half> %v, i32 0
+  %1 = extractelement <2 x half> %v, i32 1
+  %2 = call half @llvm.aarch64.neon.fmulx.f16(half %0, half %1)
+  ret half %2
+}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll
index 854e340a4ea01..ce7ae1e426bda 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll
@@ -70,15 +70,15 @@ define float @test_v16f32(<16 x float> %a) nounwind {
 ; CHECK-NEXT:    fmul s4, s0, v0.s[1]
 ; CHECK-NEXT:    fmul s4, s4, v0.s[2]
 ; CHECK-NEXT:    fmul s0, s4, v0.s[3]
-; CHECK-NEXT:    fmul s0, s0, v1.s[0]
+; CHECK-NEXT:    fmul s0, s0, s1
 ; CHECK-NEXT:    fmul s0, s0, v1.s[1]
 ; CHECK-NEXT:    fmul s0, s0, v1.s[2]
 ; CHECK-NEXT:    fmul s0, s0, v1.s[3]
-; CHECK-NEXT:    fmul s0, s0, v2.s[0]
+; CHECK-NEXT:    fmul s0, s0, s2
 ; CHECK-NEXT:    fmul s0, s0, v2.s[1]
 ; CHECK-NEXT:    fmul s0, s0, v2.s[2]
 ; CHECK-NEXT:    fmul s0, s0, v2.s[3]
-; CHECK-NEXT:    fmul s0, s0, v3.s[0]
+; CHECK-NEXT:    fmul s0, s0, s3
 ; CHECK-NEXT:    fmul s0, s0, v3.s[1]
 ; CHECK-NEXT:    fmul s0, s0, v3.s[2]
 ; CHECK-NEXT:    fmul s0, s0, v3.s[3]
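Note on the vecreduce update above: it is the same lane-0 fold surfacing in the sequential legalization of a strict (non-fast) llvm.vector.reduce.fmul — the first multiply taken from each vector register reads lane 0 and now uses the s-register operand, while the remaining lanes keep the by-element form. A reduced sketch (hypothetical function, mirroring test_v16f32):

define float @reduce_strict(float %start, <4 x float> %v) {
  ; expands to: fmul s0, s0, s1        (lane 0, scalar form)
  ;             fmul s0, s0, v1.s[1]
  ;             fmul s0, s0, v1.s[2]
  ;             fmul s0, s0, v1.s[3]
  %r = call float @llvm.vector.reduce.fmul.f32.v4f32(float %start, <4 x float> %v)
  ret float %r
}
declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)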