diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 37e7153be5720e..f2eb32e191430f 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -2831,6 +2831,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { return Legalized; } case TargetOpcode::G_VECREDUCE_FADD: + case TargetOpcode::G_VECREDUCE_FMUL: case TargetOpcode::G_VECREDUCE_FMIN: case TargetOpcode::G_VECREDUCE_FMAX: case TargetOpcode::G_VECREDUCE_FMINIMUM: diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 8b909f53c84460..301c2b99b38bae 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -985,6 +985,19 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .clampMaxNumElements(1, s16, 8) .lower(); + // For fmul reductions we need to split up into individual operations. We + // clamp to 128-bit vectors, then to 64-bit vectors to produce a cascade of + // smaller types, followed by scalarizing what remains. 
+ getActionDefinitionsBuilder(G_VECREDUCE_FMUL) + .minScalarOrElt(0, MinFPScalar) + .clampMaxNumElements(1, s64, 2) + .clampMaxNumElements(1, s32, 4) + .clampMaxNumElements(1, s16, 8) + .clampMaxNumElements(1, s32, 2) + .clampMaxNumElements(1, s16, 4) + .scalarize(1) + .lower(); + getActionDefinitionsBuilder(G_VECREDUCE_ADD) .legalFor({{s8, v16s8}, {s8, v8s8}, diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-fmul.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-fmul.mir new file mode 100644 index 00000000000000..9d9e96d35d90d5 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-reduce-fmul.mir @@ -0,0 +1,31 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64 -run-pass=legalizer -global-isel %s -o - | FileCheck %s + +--- +name: mul_2H +tracksRegLiveness: true +body: | + bb.1: + liveins: $q0, $q1 + + ; CHECK-LABEL: name: mul_2H + ; CHECK: liveins: $q0, $q1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1 + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(<4 x s32>) = G_FMUL [[COPY]], [[COPY1]] + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[FMUL]](<4 x s32>) + ; CHECK-NEXT: [[FMUL1:%[0-9]+]]:_(<2 x s32>) = G_FMUL [[UV]], [[UV1]] + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[FMUL1]](<2 x s32>) + ; CHECK-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[UV3]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[FMUL2]](s32) + ; CHECK-NEXT: $s0 = COPY [[COPY2]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $s0 + %1:_(<4 x s32>) = COPY $q0 + %2:_(<4 x s32>) = COPY $q1 + %0:_(<8 x s32>) = G_CONCAT_VECTORS %1(<4 x s32>), %2(<4 x s32>) + %5:_(s32) = nnan ninf nsz arcp contract afn reassoc G_VECREDUCE_FMUL %0(<8 x s32>) + $s0 = COPY %5(s32) + RET_ReallyLR implicit $s0 + +... 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index 178db852e35b7e..866fe6fa8cb3fb 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -736,8 +736,8 @@ # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_VECREDUCE_FMUL (opcode {{[0-9]+}}): 2 type indices, 0 imm indices -# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined -# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_VECREDUCE_FMAX (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} # DEBUG-NEXT: .. 
type index coverage check SKIPPED: user-defined predicate detected diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll index ff2c5c44d41218..67b4ebb3382487 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll @@ -1,13 +1,22 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD-NOFP16 -; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD-FP16 +; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16 +; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16 +; RUN: llc -mtriple=aarch64-none-eabi -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16 +; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16 define float @mul_HalfS(<2 x float> %bin.rdx) { -; CHECK-LABEL: mul_HalfS: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: fmul s0, s0, v0.s[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: mul_HalfS: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fmul s0, s0, v0.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: mul_HalfS: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: ret %r = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %bin.rdx) ret float %r } @@ -40,6 +49,27 @@ define half @mul_HalfH(<4 x half> %bin.rdx) { ; 
CHECK-SD-FP16-NEXT: fmul h1, h1, v0.h[2] ; CHECK-SD-FP16-NEXT: fmul h0, h1, v0.h[3] ; CHECK-SD-FP16-NEXT: ret +; +; CHECK-GI-NOFP16-LABEL: mul_HalfH: +; CHECK-GI-NOFP16: // %bb.0: +; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h +; CHECK-GI-NOFP16-NEXT: mov d1, v0.d[1] +; CHECK-GI-NOFP16-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] +; CHECK-GI-NOFP16-NEXT: fmul s0, s0, s1 +; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 +; CHECK-GI-NOFP16-NEXT: ret +; +; CHECK-GI-FP16-LABEL: mul_HalfH: +; CHECK-GI-FP16: // %bb.0: +; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-FP16-NEXT: mov h2, v0.h[2] +; CHECK-GI-FP16-NEXT: mov h3, v0.h[3] +; CHECK-GI-FP16-NEXT: fmul h0, h0, h1 +; CHECK-GI-FP16-NEXT: fmul h1, h2, h3 +; CHECK-GI-FP16-NEXT: fmul h0, h0, h1 +; CHECK-GI-FP16-NEXT: ret %r = call fast half @llvm.vector.reduce.fmul.f16.v4f16(half 1.0, <4 x half> %bin.rdx) ret half %r } @@ -93,17 +123,49 @@ define half @mul_H(<8 x half> %bin.rdx) { ; CHECK-SD-FP16-NEXT: fmul h1, h1, v0.h[2] ; CHECK-SD-FP16-NEXT: fmul h0, h1, v0.h[3] ; CHECK-SD-FP16-NEXT: ret +; +; CHECK-GI-NOFP16-LABEL: mul_H: +; CHECK-GI-NOFP16: // %bb.0: +; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h +; CHECK-GI-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-GI-NOFP16-NEXT: fmul v0.4s, v1.4s, v0.4s +; CHECK-GI-NOFP16-NEXT: mov d1, v0.d[1] +; CHECK-GI-NOFP16-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] +; CHECK-GI-NOFP16-NEXT: fmul s0, s0, s1 +; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 +; CHECK-GI-NOFP16-NEXT: ret +; +; CHECK-GI-FP16-LABEL: mul_H: +; CHECK-GI-FP16: // %bb.0: +; CHECK-GI-FP16-NEXT: mov d1, v0.d[1] +; CHECK-GI-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h +; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-FP16-NEXT: mov h2, v0.h[2] +; CHECK-GI-FP16-NEXT: mov h3, v0.h[3] +; CHECK-GI-FP16-NEXT: fmul h0, h0, h1 +; CHECK-GI-FP16-NEXT: fmul h1, h2, h3 +; CHECK-GI-FP16-NEXT: fmul h0, h0, h1 +; CHECK-GI-FP16-NEXT: ret %r = call fast half 
@llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %bin.rdx) ret half %r } define float @mul_S(<4 x float> %bin.rdx) { -; CHECK-LABEL: mul_S: -; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fmul v0.2s, v0.2s, v1.2s -; CHECK-NEXT: fmul s0, s0, v0.s[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: mul_S: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: fmul s0, s0, v0.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: mul_S: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: ret %r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %bin.rdx) ret float %r } @@ -205,18 +267,56 @@ define half @mul_2H(<16 x half> %bin.rdx) { ; CHECK-SD-FP16-NEXT: fmul h1, h1, v0.h[2] ; CHECK-SD-FP16-NEXT: fmul h0, h1, v0.h[3] ; CHECK-SD-FP16-NEXT: ret +; +; CHECK-GI-NOFP16-LABEL: mul_2H: +; CHECK-GI-NOFP16: // %bb.0: +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h +; CHECK-GI-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h +; CHECK-GI-NOFP16-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-GI-NOFP16-NEXT: fmul v0.4s, v2.4s, v0.4s +; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v3.4s, v1.4s +; CHECK-GI-NOFP16-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-GI-NOFP16-NEXT: mov d1, v0.d[1] +; CHECK-GI-NOFP16-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] +; CHECK-GI-NOFP16-NEXT: fmul s0, s0, s1 +; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 +; CHECK-GI-NOFP16-NEXT: ret +; +; CHECK-GI-FP16-LABEL: mul_2H: +; CHECK-GI-FP16: // %bb.0: +; CHECK-GI-FP16-NEXT: fmul v0.8h, v0.8h, v1.8h +; CHECK-GI-FP16-NEXT: mov d1, v0.d[1] +; CHECK-GI-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h +; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-FP16-NEXT: mov h2, v0.h[2] +; CHECK-GI-FP16-NEXT: mov h3, v0.h[3] +; CHECK-GI-FP16-NEXT: fmul h0, h0, h1 
+; CHECK-GI-FP16-NEXT: fmul h1, h2, h3 +; CHECK-GI-FP16-NEXT: fmul h0, h0, h1 +; CHECK-GI-FP16-NEXT: ret %r = call fast half @llvm.vector.reduce.fmul.f16.v16f16(half 1.0, <16 x half> %bin.rdx) ret half %r } define float @mul_2S(<8 x float> %bin.rdx) { -; CHECK-LABEL: mul_2S: -; CHECK: // %bb.0: -; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fmul v0.2s, v0.2s, v1.2s -; CHECK-NEXT: fmul s0, s0, v0.s[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: mul_2S: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: fmul s0, s0, v0.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: mul_2S: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: ret %r = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %bin.rdx) ret float %r } @@ -233,15 +333,26 @@ define double @mul_2D(<4 x double> %bin.rdx) { ; added at least one test where the start value is not 1.0. 
define float @mul_S_init_42(<4 x float> %bin.rdx) { -; CHECK-LABEL: mul_S_init_42: -; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: mov w8, #1109917696 // =0x42280000 -; CHECK-NEXT: fmul v0.2s, v0.2s, v1.2s -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: fmul s0, s0, v0.s[1] -; CHECK-NEXT: fmul s0, s0, s1 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: mul_S_init_42: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov w8, #1109917696 // =0x42280000 +; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: fmov s1, w8 +; CHECK-SD-NEXT: fmul s0, s0, v0.s[1] +; CHECK-SD-NEXT: fmul s0, s0, s1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: mul_S_init_42: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: mov w8, #1109917696 // =0x42280000 +; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: ret %r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 42.0, <4 x float> %bin.rdx) ret float %r } @@ -335,6 +446,51 @@ define half @fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-SD-FP16-NEXT: fmul h1, h1, v0.h[2] ; CHECK-SD-FP16-NEXT: fmul h0, h1, v0.h[3] ; CHECK-SD-FP16-NEXT: ret +; +; CHECK-GI-NOFP16-LABEL: fmul_reduct_reassoc_v8f16: +; CHECK-GI-NOFP16: // %bb.0: +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v0.4h +; CHECK-GI-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h +; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v1.4h +; CHECK-GI-NOFP16-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-GI-NOFP16-NEXT: fmul v0.4s, v2.4s, v0.4s +; CHECK-GI-NOFP16-NEXT: fmul v1.4s, v3.4s, v1.4s +; CHECK-GI-NOFP16-NEXT: mov d2, v0.d[1] +; CHECK-GI-NOFP16-NEXT: mov d3, v1.d[1] +; CHECK-GI-NOFP16-NEXT: fmul v0.2s, v0.2s, v2.2s +; CHECK-GI-NOFP16-NEXT: fmul v1.2s, v1.2s, v3.2s +; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[1] +; CHECK-GI-NOFP16-NEXT: mov s3, v1.s[1] +; CHECK-GI-NOFP16-NEXT: fmul s0, s0, 
s2 +; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s3 +; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 +; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 +; CHECK-GI-NOFP16-NEXT: fcvt s0, h0 +; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 +; CHECK-GI-NOFP16-NEXT: fmul s0, s0, s1 +; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 +; CHECK-GI-NOFP16-NEXT: ret +; +; CHECK-GI-FP16-LABEL: fmul_reduct_reassoc_v8f16: +; CHECK-GI-FP16: // %bb.0: +; CHECK-GI-FP16-NEXT: mov d2, v0.d[1] +; CHECK-GI-FP16-NEXT: mov d3, v1.d[1] +; CHECK-GI-FP16-NEXT: fmul v0.4h, v0.4h, v2.4h +; CHECK-GI-FP16-NEXT: fmul v1.4h, v1.4h, v3.4h +; CHECK-GI-FP16-NEXT: mov h2, v0.h[1] +; CHECK-GI-FP16-NEXT: mov h3, v0.h[2] +; CHECK-GI-FP16-NEXT: mov h4, v0.h[3] +; CHECK-GI-FP16-NEXT: mov h5, v1.h[1] +; CHECK-GI-FP16-NEXT: mov h6, v1.h[2] +; CHECK-GI-FP16-NEXT: mov h7, v1.h[3] +; CHECK-GI-FP16-NEXT: fmul h0, h0, h2 +; CHECK-GI-FP16-NEXT: fmul h2, h3, h4 +; CHECK-GI-FP16-NEXT: fmul h1, h1, h5 +; CHECK-GI-FP16-NEXT: fmul h3, h6, h7 +; CHECK-GI-FP16-NEXT: fmul h0, h0, h2 +; CHECK-GI-FP16-NEXT: fmul h1, h1, h3 +; CHECK-GI-FP16-NEXT: fmul h0, h0, h1 +; CHECK-GI-FP16-NEXT: ret %r1 = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %a) %r2 = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %b) %r = fmul fast half %r1, %r2 @@ -342,15 +498,30 @@ define half @fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) { } define float @fmul_reduct_reassoc_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: fmul_reduct_reassoc_v8f32: -; CHECK: // %bb.0: -; CHECK-NEXT: fmul v2.4s, v2.4s, v3.4s -; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: fmul v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fmul v0.2s, v0.2s, v1.2s -; CHECK-NEXT: fmul s0, s0, v0.s[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fmul_reduct_reassoc_v8f32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmul v2.4s, v2.4s, v3.4s +; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: ext 
v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: fmul s0, s0, v0.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fmul_reduct_reassoc_v8f32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: fmul v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s +; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov s3, v1.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s2 +; CHECK-GI-NEXT: fmul s1, s1, s3 +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a) %r2 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %b) %r = fmul fast float %r1, %r2 @@ -358,13 +529,26 @@ define float @fmul_reduct_reassoc_v8f32(<8 x float> %a, <8 x float> %b) { } define float @fmul_reduct_reassoc_v4f32(<4 x float> %a, <4 x float> %b) { -; CHECK-LABEL: fmul_reduct_reassoc_v4f32: -; CHECK: // %bb.0: -; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fmul v0.2s, v0.2s, v1.2s -; CHECK-NEXT: fmul s0, s0, v0.s[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fmul_reduct_reassoc_v4f32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: fmul s0, s0, v0.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fmul_reduct_reassoc_v4f32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s +; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov s3, v1.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s2 +; CHECK-GI-NEXT: fmul s1, s1, s3 +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, 
<4 x float> %a) %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b) %r = fmul fast float %r1, %r2 @@ -372,17 +556,31 @@ define float @fmul_reduct_reassoc_v4f32(<4 x float> %a, <4 x float> %b) { } define float @fmul_reduct_reassoc_v4f32_init(float %i, <4 x float> %a, <4 x float> %b) { -; CHECK-LABEL: fmul_reduct_reassoc_v4f32_init: -; CHECK: // %bb.0: -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: fmul v1.2s, v1.2s, v3.2s -; CHECK-NEXT: ext v3.16b, v2.16b, v2.16b, #8 -; CHECK-NEXT: fmul s1, s1, v1.s[1] -; CHECK-NEXT: fmul v2.2s, v2.2s, v3.2s -; CHECK-NEXT: fmul s0, s0, s1 -; CHECK-NEXT: fmul s1, s2, v2.s[1] -; CHECK-NEXT: fmul s0, s0, s1 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fmul_reduct_reassoc_v4f32_init: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-SD-NEXT: fmul v1.2s, v1.2s, v3.2s +; CHECK-SD-NEXT: ext v3.16b, v2.16b, v2.16b, #8 +; CHECK-SD-NEXT: fmul s1, s1, v1.s[1] +; CHECK-SD-NEXT: fmul v2.2s, v2.2s, v3.2s +; CHECK-SD-NEXT: fmul s0, s0, s1 +; CHECK-SD-NEXT: fmul s1, s2, v2.s[1] +; CHECK-SD-NEXT: fmul s0, s0, s1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fmul_reduct_reassoc_v4f32_init: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s +; CHECK-GI-NEXT: mov d3, v2.d[1] +; CHECK-GI-NEXT: mov s4, v1.s[1] +; CHECK-GI-NEXT: fmul v2.2s, v2.2s, v3.2s +; CHECK-GI-NEXT: fmul s1, s1, s4 +; CHECK-GI-NEXT: mov s3, v2.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: fmul s1, s2, s3 +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %i, <4 x float> %a) %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b) %r = fmul fast float %r1, %r2 @@ -390,14 +588,28 @@ define float @fmul_reduct_reassoc_v4f32_init(float %i, <4 x float> %a, <4 x floa } define float @fmul_reduct_reassoc_v4v8f32(<4 x float> %a, <8 x float> %b) { -; CHECK-LABEL: 
fmul_reduct_reassoc_v4v8f32: -; CHECK: // %bb.0: -; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s -; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: fmul v0.2s, v0.2s, v1.2s -; CHECK-NEXT: fmul s0, s0, v0.s[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fmul_reduct_reassoc_v4v8f32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmul v1.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: fmul v0.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: fmul s0, s0, v0.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fmul_reduct_reassoc_v4v8f32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmul v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s +; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov s3, v1.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s2 +; CHECK-GI-NEXT: fmul s1, s1, s3 +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a) %r2 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %b) %r = fmul fast float %r1, %r2 @@ -405,13 +617,22 @@ define float @fmul_reduct_reassoc_v4v8f32(<4 x float> %a, <8 x float> %b) { } define double @fmul_reduct_reassoc_v4f64(<4 x double> %a, <4 x double> %b) { -; CHECK-LABEL: fmul_reduct_reassoc_v4f64: -; CHECK: // %bb.0: -; CHECK-NEXT: fmul v2.2d, v2.2d, v3.2d -; CHECK-NEXT: fmul v0.2d, v0.2d, v1.2d -; CHECK-NEXT: fmul v0.2d, v0.2d, v2.2d -; CHECK-NEXT: fmul d0, d0, v0.d[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fmul_reduct_reassoc_v4f64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmul v2.2d, v2.2d, v3.2d +; CHECK-SD-NEXT: fmul v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: fmul v0.2d, v0.2d, v2.2d +; CHECK-SD-NEXT: fmul d0, d0, v0.d[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fmul_reduct_reassoc_v4f64: +; CHECK-GI: // %bb.0: +; 
CHECK-GI-NEXT: fmul v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: fmul v1.2d, v2.2d, v3.2d +; CHECK-GI-NEXT: fmul d0, d0, v0.d[1] +; CHECK-GI-NEXT: fmul d1, d1, v1.d[1] +; CHECK-GI-NEXT: fmul d0, d0, d1 +; CHECK-GI-NEXT: ret %r1 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %a) %r2 = call fast double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %b) %r = fmul fast double %r1, %r2 @@ -419,17 +640,31 @@ define double @fmul_reduct_reassoc_v4f64(<4 x double> %a, <4 x double> %b) { } define float @fmul_reduct_reassoc_v4f32_extrause(<4 x float> %a, <4 x float> %b) { -; CHECK-LABEL: fmul_reduct_reassoc_v4f32_extrause: -; CHECK: // %bb.0: -; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: fmul v0.2s, v0.2s, v2.2s -; CHECK-NEXT: fmul v1.2s, v1.2s, v3.2s -; CHECK-NEXT: fmul s0, s0, v0.s[1] -; CHECK-NEXT: fmul s1, s1, v1.s[1] -; CHECK-NEXT: fmul s1, s0, s1 -; CHECK-NEXT: fmul s0, s1, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: fmul_reduct_reassoc_v4f32_extrause: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v2.2s +; CHECK-SD-NEXT: fmul v1.2s, v1.2s, v3.2s +; CHECK-SD-NEXT: fmul s0, s0, v0.s[1] +; CHECK-SD-NEXT: fmul s1, s1, v1.s[1] +; CHECK-SD-NEXT: fmul s1, s0, s1 +; CHECK-SD-NEXT: fmul s0, s1, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: fmul_reduct_reassoc_v4f32_extrause: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s +; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov s3, v1.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s2 +; CHECK-GI-NEXT: fmul s1, s1, s3 +; CHECK-GI-NEXT: fmul s1, s0, s1 +; CHECK-GI-NEXT: fmul s0, s1, s0 +; CHECK-GI-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a) %r2 = call fast float 
@llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b) %r = fmul fast float %r1, %r2