diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index cf1b11c14b6d0..9d98e839da869 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1368,7 +1368,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
   setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN,
                        ISD::INTRINSIC_WO_CHAIN, ISD::ADD, ISD::SUB, ISD::AND,
-                       ISD::OR, ISD::XOR, ISD::SETCC, ISD::SELECT});
+                       ISD::OR, ISD::XOR, ISD::SETCC, ISD::SELECT,
+                       ISD::VECREDUCE_FMAXIMUM, ISD::VECREDUCE_FMINIMUM});
 
   if (Subtarget.is64Bit())
     setTargetDAGCombine(ISD::SRA);
 
@@ -15650,6 +15651,22 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
     return SDValue();
   }
+  case ISD::VECREDUCE_FMAXIMUM:
+  case ISD::VECREDUCE_FMINIMUM: {
+    EVT RT = N->getValueType(0);
+    SDValue N0 = N->getOperand(0);
+
+    // Use a NaN-unaware fmax/fmin reduction; detect NaNs via an fadd reduction.
+    unsigned ReducedMinMaxOpc = N->getOpcode() == ISD::VECREDUCE_FMAXIMUM
+                                    ? ISD::VECREDUCE_FMAX
+                                    : ISD::VECREDUCE_FMIN;
+    SDValue MinMax = DAG.getNode(ReducedMinMaxOpc, DL, RT, N0);
+    if (N0->getFlags().hasNoNaNs())
+      return MinMax;
+    SDValue Sum = DAG.getNode(ISD::VECREDUCE_FADD, DL, RT, N0);
+    SDValue SumIsNonNan = DAG.getSetCC(DL, XLenVT, Sum, Sum, ISD::SETOEQ);
+    return DAG.getSelect(DL, RT, SumIsNonNan, MinMax, Sum);
+  }
   }
 
   return SDValue();
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
index 3f6aa72bc2e3b..2de4b32dab197 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
 
 declare half @llvm.vector.reduce.fadd.v1f16(half, <1 x half>)
 
@@ -1592,3 +1592,949 @@ define float @vreduce_nsz_fadd_v4f32(ptr %x, float %s) {
   %red = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float %s, <4 x float> %v)
   ret float %red
 }
+
+declare half @llvm.vector.reduce.fminimum.v2f16(<2 x half>)
+
+define half @vreduce_fminimum_v2f16(ptr %x) {
+; CHECK-LABEL: vreduce_fminimum_v2f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vfredusum.vs v9, v8, v9
+; CHECK-NEXT:    vfmv.f.s fa0, v9
+; CHECK-NEXT:    feq.h a0, fa0, fa0
+; CHECK-NEXT:    beqz a0, .LBB99_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB99_2:
+; CHECK-NEXT:    ret
+  %v = load <2 x half>, ptr %x
+  %red = call half @llvm.vector.reduce.fminimum.v2f16(<2 x half> %v)
+  ret half %red
+}
+
+declare half @llvm.vector.reduce.fminimum.v4f16(<4 x half>)
+
+define half @vreduce_fminimum_v4f16(ptr %x) {
+; CHECK-LABEL: vreduce_fminimum_v4f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vfredusum.vs v9, v8, v9
+; CHECK-NEXT:    vfmv.f.s fa0, v9
+; CHECK-NEXT:    feq.h a0, fa0, fa0
+; CHECK-NEXT:    beqz a0, .LBB100_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB100_2:
+; CHECK-NEXT:    ret
+  %v = load <4 x half>, ptr %x
+  %red = call half @llvm.vector.reduce.fminimum.v4f16(<4 x half> %v)
+  ret half %red
+}
+
+define half @vreduce_fminimum_v4f16_nonans(ptr %x) {
+; CHECK-LABEL: vreduce_fminimum_v4f16_nonans:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vfredusum.vs v9, v8, v9
+; CHECK-NEXT:    vfmv.f.s fa0, v9
+; CHECK-NEXT:    feq.h a0, fa0, fa0
+; CHECK-NEXT:    beqz a0, .LBB101_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB101_2:
+; CHECK-NEXT:    ret
+  %v = load <4 x half>, ptr %x
+  %red = call nnan half @llvm.vector.reduce.fminimum.v4f16(<4 x half> %v)
+  ret half %red
+}
+
+define half @vreduce_fminimum_v4f16_nonans_noinfs(ptr %x) {
+; CHECK-LABEL: vreduce_fminimum_v4f16_nonans_noinfs:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vfredusum.vs v9, v8, v9
+; CHECK-NEXT:    vfmv.f.s fa0, v9
+; CHECK-NEXT:    feq.h a0, fa0, fa0
+; CHECK-NEXT:    beqz a0, .LBB102_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB102_2:
+; CHECK-NEXT:    ret
+  %v = load <4 x half>, ptr %x
+  %red = call nnan ninf half @llvm.vector.reduce.fminimum.v4f16(<4 x half> %v)
+  ret half %red
+}
+
+declare half @llvm.vector.reduce.fminimum.v128f16(<128 x half>)
+
+define half @vreduce_fminimum_v128f16(ptr %x) {
+; CHECK-LABEL: vreduce_fminimum_v128f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    vle16.v v16, (a0)
+; CHECK-NEXT:    vfadd.vv v24, v8, v16
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vfredusum.vs v24, v24, v0
+; CHECK-NEXT:    vfmv.f.s fa0, v24
+; CHECK-NEXT:    feq.h a0, fa0, fa0
+; CHECK-NEXT:    beqz a0, .LBB103_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB103_2:
+; CHECK-NEXT:    ret
+  %v = load <128 x half>, ptr %x
+  %red = call half @llvm.vector.reduce.fminimum.v128f16(<128 x half> %v)
+  ret half %red
+}
+
+declare float @llvm.vector.reduce.fminimum.v2f32(<2 x float>)
+
+define float @vreduce_fminimum_v2f32(ptr %x) {
+; CHECK-LABEL: vreduce_fminimum_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    lui a0, 524288
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vfredusum.vs v9, v8, v9
+; CHECK-NEXT:    vfmv.f.s fa0, v9
+; CHECK-NEXT:    feq.s a0, fa0, fa0
+; CHECK-NEXT:    beqz a0, .LBB104_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB104_2:
+; CHECK-NEXT:    ret
+  %v = load <2 x float>, ptr %x
+  %red = call float @llvm.vector.reduce.fminimum.v2f32(<2 x float> %v)
+  ret float %red
+}
+
+declare float @llvm.vector.reduce.fminimum.v4f32(<4 x float>)
+
+define float @vreduce_fminimum_v4f32(ptr %x) {
+; CHECK-LABEL: vreduce_fminimum_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    lui a0, 524288
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vfredusum.vs v9, v8, v9
+; CHECK-NEXT:    vfmv.f.s fa0, v9
+; CHECK-NEXT:    feq.s a0, fa0, fa0
+; CHECK-NEXT:    beqz a0, .LBB105_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB105_2:
+; CHECK-NEXT:    ret
+  %v = load <4 x float>, ptr %x
+  %red = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %v)
+  ret float %red
+}
+
+define float @vreduce_fminimum_v4f32_nonans(ptr %x) {
+; CHECK-LABEL: vreduce_fminimum_v4f32_nonans:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    lui a0, 524288
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vfredusum.vs v9, v8, v9
+; CHECK-NEXT:    vfmv.f.s fa0, v9
+; CHECK-NEXT:    feq.s a0, fa0, fa0
+; CHECK-NEXT:    beqz a0, .LBB106_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB106_2:
+; CHECK-NEXT:    ret
+  %v = load <4 x float>, ptr %x
+  %red = call nnan float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %v)
+  ret float %red
+}
+
+define float @vreduce_fminimum_v4f32_nonans_noinfs(ptr %x) {
+; CHECK-LABEL: vreduce_fminimum_v4f32_nonans_noinfs:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    lui a0, 524288
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vfredusum.vs v9, v8, v9
+; CHECK-NEXT:    vfmv.f.s fa0, v9
+; CHECK-NEXT:    feq.s a0, fa0, fa0
+; CHECK-NEXT:    beqz a0, .LBB107_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB107_2:
+; CHECK-NEXT:    ret
+  %v = load <4 x float>, ptr %x
+  %red = call nnan ninf float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %v)
+  ret float %red
+}
+
+declare float @llvm.vector.reduce.fminimum.v128f32(<128 x float>)
+
+define float @vreduce_fminimum_v128f32(ptr %x) {
+; CHECK-LABEL: vreduce_fminimum_v128f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vle32.v v16, (a0)
+; CHECK-NEXT:    addi a1, a0, 384
+; CHECK-NEXT:    vle32.v v8, (a1)
+; CHECK-NEXT:    addi a1, a0, 256
+; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    vle32.v v0, (a0)
+; CHECK-NEXT:    vle32.v v24, (a1)
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfadd.vv v8, v0, v8
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfadd.vv v16, v16, v24
+; CHECK-NEXT:    vfadd.vv v8, v16, v8
+; CHECK-NEXT:    lui a0, 524288
+; CHECK-NEXT:    vmv.s.x v16, a0
+; CHECK-NEXT:    vfredusum.vs v8, v8, v16
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:    feq.s a0, fa0, fa0
+; CHECK-NEXT:    beqz a0, .LBB108_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v8, v0, v8
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v16, v16, v24
+; CHECK-NEXT:    vfmin.vv v8, v16, v8
+; CHECK-NEXT:    vfredmin.vs v8, v8, v8
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB108_2:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %v = load <128 x float>, ptr %x
+  %red = call float @llvm.vector.reduce.fminimum.v128f32(<128 x float> %v)
+  ret float %red
+}
+
+declare double @llvm.vector.reduce.fminimum.v2f64(<2 x double>)
+
+define double @vreduce_fminimum_v2f64(ptr %x) {
+; RV32-LABEL: vreduce_fminimum_v2f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    fcvt.d.w fa5, zero
+; RV32-NEXT:    fneg.d fa5, fa5
+; RV32-NEXT:    vfmv.s.f v9, fa5
+; RV32-NEXT:    vfredusum.vs v9, v8, v9
+; RV32-NEXT:    vfmv.f.s fa0, v9
+; RV32-NEXT:    feq.d a0, fa0, fa0
+; RV32-NEXT:    beqz a0, .LBB109_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    vfredmin.vs v8, v8, v8
+; RV32-NEXT:    vfmv.f.s fa0, v8
+; RV32-NEXT:  .LBB109_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vreduce_fminimum_v2f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    li a0, -1
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    vmv.s.x v9, a0
+; RV64-NEXT:    vfredusum.vs v9, v8, v9
+; RV64-NEXT:    vfmv.f.s fa0, v9
+; RV64-NEXT:    feq.d a0, fa0, fa0
+; RV64-NEXT:    beqz a0, .LBB109_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    vfredmin.vs v8, v8, v8
+; RV64-NEXT:    vfmv.f.s fa0, v8
+; RV64-NEXT:  .LBB109_2:
+; RV64-NEXT:    ret
+  %v = load <2 x double>, ptr %x
+  %red = call double @llvm.vector.reduce.fminimum.v2f64(<2 x double> %v)
+  ret double %red
+}
+
+declare double @llvm.vector.reduce.fminimum.v4f64(<4 x double>)
+
+define double @vreduce_fminimum_v4f64(ptr %x) {
+; RV32-LABEL: vreduce_fminimum_v4f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    fcvt.d.w fa5, zero
+; RV32-NEXT:    fneg.d fa5, fa5
+; RV32-NEXT:    vfmv.s.f v10, fa5
+; RV32-NEXT:    vfredusum.vs v10, v8, v10
+; RV32-NEXT:    vfmv.f.s fa0, v10
+; RV32-NEXT:    feq.d a0, fa0, fa0
+; RV32-NEXT:    beqz a0, .LBB110_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    vfredmin.vs v8, v8, v8
+; RV32-NEXT:    vfmv.f.s fa0, v8
+; RV32-NEXT:  .LBB110_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vreduce_fminimum_v4f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    li a0, -1
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    vmv.s.x v10, a0
+; RV64-NEXT:    vfredusum.vs v10, v8, v10
+; RV64-NEXT:    vfmv.f.s fa0, v10
+; RV64-NEXT:    feq.d a0, fa0, fa0
+; RV64-NEXT:    beqz a0, .LBB110_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    vfredmin.vs v8, v8, v8
+; RV64-NEXT:    vfmv.f.s fa0, v8
+; RV64-NEXT:  .LBB110_2:
+; RV64-NEXT:    ret
+  %v = load <4 x double>, ptr %x
+  %red = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %v)
+  ret double %red
+}
+
+define double @vreduce_fminimum_v4f64_nonans(ptr %x) {
+; RV32-LABEL: vreduce_fminimum_v4f64_nonans:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    fcvt.d.w fa5, zero
+; RV32-NEXT:    fneg.d fa5, fa5
+; RV32-NEXT:    vfmv.s.f v10, fa5
+; RV32-NEXT:    vfredusum.vs v10, v8, v10
+; RV32-NEXT:    vfmv.f.s fa0, v10
+; RV32-NEXT:    feq.d a0, fa0, fa0
+; RV32-NEXT:    beqz a0, .LBB111_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    vfredmin.vs v8, v8, v8
+; RV32-NEXT:    vfmv.f.s fa0, v8
+; RV32-NEXT:  .LBB111_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vreduce_fminimum_v4f64_nonans:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    li a0, -1
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    vmv.s.x v10, a0
+; RV64-NEXT:    vfredusum.vs v10, v8, v10
+; RV64-NEXT:    vfmv.f.s fa0, v10
+; RV64-NEXT:    feq.d a0, fa0, fa0
+; RV64-NEXT:    beqz a0, .LBB111_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    vfredmin.vs v8, v8, v8
+; RV64-NEXT:    vfmv.f.s fa0, v8
+; RV64-NEXT:  .LBB111_2:
+; RV64-NEXT:    ret
+  %v = load <4 x double>, ptr %x
+  %red = call nnan double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %v)
+  ret double %red
+}
+
+define double @vreduce_fminimum_v4f64_nonans_noinfs(ptr %x) {
+; RV32-LABEL: vreduce_fminimum_v4f64_nonans_noinfs:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    fcvt.d.w fa5, zero
+; RV32-NEXT:    fneg.d fa5, fa5
+; RV32-NEXT:    vfmv.s.f v10, fa5
+; RV32-NEXT:    vfredusum.vs v10, v8, v10
+; RV32-NEXT:    vfmv.f.s fa0, v10
+; RV32-NEXT:    feq.d a0, fa0, fa0
+; RV32-NEXT:    beqz a0, .LBB112_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    vfredmin.vs v8, v8, v8
+; RV32-NEXT:    vfmv.f.s fa0, v8
+; RV32-NEXT:  .LBB112_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vreduce_fminimum_v4f64_nonans_noinfs:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    li a0, -1
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    vmv.s.x v10, a0
+; RV64-NEXT:    vfredusum.vs v10, v8, v10
+; RV64-NEXT:    vfmv.f.s fa0, v10
+; RV64-NEXT:    feq.d a0, fa0, fa0
+; RV64-NEXT:    beqz a0, .LBB112_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    vfredmin.vs v8, v8, v8
+; RV64-NEXT:    vfmv.f.s fa0, v8
+; RV64-NEXT:  .LBB112_2:
+; RV64-NEXT:    ret
+  %v = load <4 x double>, ptr %x
+  %red = call nnan ninf double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %v)
+  ret double %red
+}
+
+declare double @llvm.vector.reduce.fminimum.v32f64(<32 x double>)
+
+define double @vreduce_fminimum_v32f64(ptr %x) {
+; RV32-LABEL: vreduce_fminimum_v32f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    addi a0, a0, 128
+; RV32-NEXT:    vle64.v v16, (a0)
+; RV32-NEXT:    vfadd.vv v24, v8, v16
+; RV32-NEXT:    fcvt.d.w fa5, zero
+; RV32-NEXT:    fneg.d fa5, fa5
+; RV32-NEXT:    vfmv.s.f v0, fa5
+; RV32-NEXT:    vfredusum.vs v24, v24, v0
+; RV32-NEXT:    vfmv.f.s fa0, v24
+; RV32-NEXT:    feq.d a0, fa0, fa0
+; RV32-NEXT:    beqz a0, .LBB113_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    vfmin.vv v8, v8, v16
+; RV32-NEXT:    vfredmin.vs v8, v8, v8
+; RV32-NEXT:    vfmv.f.s fa0, v8
+; RV32-NEXT:  .LBB113_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vreduce_fminimum_v32f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    addi a0, a0, 128
+; RV64-NEXT:    vle64.v v16, (a0)
+; RV64-NEXT:    vfadd.vv v24, v8, v16
+; RV64-NEXT:    li a0, -1
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    vmv.s.x v0, a0
+; RV64-NEXT:    vfredusum.vs v24, v24, v0
+; RV64-NEXT:    vfmv.f.s fa0, v24
+; RV64-NEXT:    feq.d a0, fa0, fa0
+; RV64-NEXT:    beqz a0, .LBB113_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    vfmin.vv v8, v8, v16
+; RV64-NEXT:    vfredmin.vs v8, v8, v8
+; RV64-NEXT:    vfmv.f.s fa0, v8
+; RV64-NEXT:  .LBB113_2:
+; RV64-NEXT:    ret
+  %v = load <32 x double>, ptr %x
+  %red = call double @llvm.vector.reduce.fminimum.v32f64(<32 x double> %v)
+  ret double %red
+}
+
+declare half @llvm.vector.reduce.fmaximum.v2f16(<2 x half>)
+
+define half @vreduce_fmaximum_v2f16(ptr %x) {
+; CHECK-LABEL: vreduce_fmaximum_v2f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vfredusum.vs v9, v8, v9
+; CHECK-NEXT:    vfmv.f.s fa0, v9
+; CHECK-NEXT:    feq.h a0, fa0, fa0
+; CHECK-NEXT:    beqz a0, .LBB114_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB114_2:
+; CHECK-NEXT:    ret
+  %v = load <2 x half>, ptr %x
+  %red = call half @llvm.vector.reduce.fmaximum.v2f16(<2 x half> %v)
+  ret half %red
+}
+
+declare half @llvm.vector.reduce.fmaximum.v4f16(<4 x half>)
+
+define half @vreduce_fmaximum_v4f16(ptr %x) {
+; CHECK-LABEL: vreduce_fmaximum_v4f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vfredusum.vs v9, v8, v9
+; CHECK-NEXT:    vfmv.f.s fa0, v9
+; CHECK-NEXT:    feq.h a0, fa0, fa0
+; CHECK-NEXT:    beqz a0, .LBB115_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB115_2:
+; CHECK-NEXT:    ret
+  %v = load <4 x half>, ptr %x
+  %red = call half @llvm.vector.reduce.fmaximum.v4f16(<4 x half> %v)
+  ret half %red
+}
+
+define half @vreduce_fmaximum_v4f16_nonans(ptr %x) {
+; CHECK-LABEL: vreduce_fmaximum_v4f16_nonans:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vfredusum.vs v9, v8, v9
+; CHECK-NEXT:    vfmv.f.s fa0, v9
+; CHECK-NEXT:    feq.h a0, fa0, fa0
+; CHECK-NEXT:    beqz a0, .LBB116_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB116_2:
+; CHECK-NEXT:    ret
+  %v = load <4 x half>, ptr %x
+  %red = call nnan half @llvm.vector.reduce.fmaximum.v4f16(<4 x half> %v)
+  ret half %red
+}
+
+define half @vreduce_fmaximum_v4f16_nonans_noinfs(ptr %x) {
+; CHECK-LABEL: vreduce_fmaximum_v4f16_nonans_noinfs:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vfredusum.vs v9, v8, v9
+; CHECK-NEXT:    vfmv.f.s fa0, v9
+; CHECK-NEXT:    feq.h a0, fa0, fa0
+; CHECK-NEXT:    beqz a0, .LBB117_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB117_2:
+; CHECK-NEXT:    ret
+  %v = load <4 x half>, ptr %x
+  %red = call nnan ninf half @llvm.vector.reduce.fmaximum.v4f16(<4 x half> %v)
+  ret half %red
+}
+
+declare half @llvm.vector.reduce.fmaximum.v128f16(<128 x half>)
+
+define half @vreduce_fmaximum_v128f16(ptr %x) {
+; CHECK-LABEL: vreduce_fmaximum_v128f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 64
+; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    vle16.v v16, (a0)
+; CHECK-NEXT:    vfadd.vv v24, v8, v16
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    vmv.s.x v0, a0
+; CHECK-NEXT:    vfredusum.vs v24, v24, v0
+; CHECK-NEXT:    vfmv.f.s fa0, v24
+; CHECK-NEXT:    feq.h a0, fa0, fa0
+; CHECK-NEXT:    beqz a0, .LBB118_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB118_2:
+; CHECK-NEXT:    ret
+  %v = load <128 x half>, ptr %x
+  %red = call half @llvm.vector.reduce.fmaximum.v128f16(<128 x half> %v)
+  ret half %red
+}
+
+declare float @llvm.vector.reduce.fmaximum.v2f32(<2 x float>)
+
+define float @vreduce_fmaximum_v2f32(ptr %x) {
+; CHECK-LABEL: vreduce_fmaximum_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    lui a0, 524288
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vfredusum.vs v9, v8, v9
+; CHECK-NEXT:    vfmv.f.s fa0, v9
+; CHECK-NEXT:    feq.s a0, fa0, fa0
+; CHECK-NEXT:    beqz a0, .LBB119_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB119_2:
+; CHECK-NEXT:    ret
+  %v = load <2 x float>, ptr %x
+  %red = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> %v)
+  ret float %red
+}
+
+declare float @llvm.vector.reduce.fmaximum.v4f32(<4 x float>)
+
+define float @vreduce_fmaximum_v4f32(ptr %x) {
+; CHECK-LABEL: vreduce_fmaximum_v4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    lui a0, 524288
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vfredusum.vs v9, v8, v9
+; CHECK-NEXT:    vfmv.f.s fa0, v9
+; CHECK-NEXT:    feq.s a0, fa0, fa0
+; CHECK-NEXT:    beqz a0, .LBB120_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB120_2:
+; CHECK-NEXT:    ret
+  %v = load <4 x float>, ptr %x
+  %red = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %v)
+  ret float %red
+}
+
+define float @vreduce_fmaximum_v4f32_nonans(ptr %x) {
+; CHECK-LABEL: vreduce_fmaximum_v4f32_nonans:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    lui a0, 524288
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vfredusum.vs v9, v8, v9
+; CHECK-NEXT:    vfmv.f.s fa0, v9
+; CHECK-NEXT:    feq.s a0, fa0, fa0
+; CHECK-NEXT:    beqz a0, .LBB121_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB121_2:
+; CHECK-NEXT:    ret
+  %v = load <4 x float>, ptr %x
+  %red = call nnan float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %v)
+  ret float %red
+}
+
+define float @vreduce_fmaximum_v4f32_nonans_noinfs(ptr %x) {
+; CHECK-LABEL: vreduce_fmaximum_v4f32_nonans_noinfs:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    lui a0, 524288
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vfredusum.vs v9, v8, v9
+; CHECK-NEXT:    vfmv.f.s fa0, v9
+; CHECK-NEXT:    feq.s a0, fa0, fa0
+; CHECK-NEXT:    beqz a0, .LBB122_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB122_2:
+; CHECK-NEXT:    ret
+  %v = load <4 x float>, ptr %x
+  %red = call nnan ninf float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %v)
+  ret float %red
+}
+
+declare float @llvm.vector.reduce.fmaximum.v128f32(<128 x float>)
+
+define float @vreduce_fmaximum_v128f32(ptr %x) {
+; CHECK-LABEL: vreduce_fmaximum_v128f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT:    vle32.v v16, (a0)
+; CHECK-NEXT:    addi a1, a0, 384
+; CHECK-NEXT:    vle32.v v8, (a1)
+; CHECK-NEXT:    addi a1, a0, 256
+; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    vle32.v v0, (a0)
+; CHECK-NEXT:    vle32.v v24, (a1)
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfadd.vv v8, v0, v8
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfadd.vv v16, v16, v24
+; CHECK-NEXT:    vfadd.vv v8, v16, v8
+; CHECK-NEXT:    lui a0, 524288
+; CHECK-NEXT:    vmv.s.x v16, a0
+; CHECK-NEXT:    vfredusum.vs v8, v8, v16
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:    feq.s a0, fa0, fa0
+; CHECK-NEXT:    beqz a0, .LBB123_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v8, v0, v8
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v16, v16, v24
+; CHECK-NEXT:    vfmax.vv v8, v16, v8
+; CHECK-NEXT:    vfredmax.vs v8, v8, v8
+; CHECK-NEXT:    vfmv.f.s fa0, v8
+; CHECK-NEXT:  .LBB123_2:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %v = load <128 x float>, ptr %x
+  %red = call float @llvm.vector.reduce.fmaximum.v128f32(<128 x float> %v)
+  ret float %red
+}
+
+declare double @llvm.vector.reduce.fmaximum.v2f64(<2 x double>)
+
+define double @vreduce_fmaximum_v2f64(ptr %x) {
+; RV32-LABEL: vreduce_fmaximum_v2f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    fcvt.d.w fa5, zero
+; RV32-NEXT:    fneg.d fa5, fa5
+; RV32-NEXT:    vfmv.s.f v9, fa5
+; RV32-NEXT:    vfredusum.vs v9, v8, v9
+; RV32-NEXT:    vfmv.f.s fa0, v9
+; RV32-NEXT:    feq.d a0, fa0, fa0
+; RV32-NEXT:    beqz a0, .LBB124_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    vfredmax.vs v8, v8, v8
+; RV32-NEXT:    vfmv.f.s fa0, v8
+; RV32-NEXT:  .LBB124_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vreduce_fmaximum_v2f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    li a0, -1
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    vmv.s.x v9, a0
+; RV64-NEXT:    vfredusum.vs v9, v8, v9
+; RV64-NEXT:    vfmv.f.s fa0, v9
+; RV64-NEXT:    feq.d a0, fa0, fa0
+; RV64-NEXT:    beqz a0, .LBB124_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    vfredmax.vs v8, v8, v8
+; RV64-NEXT:    vfmv.f.s fa0, v8
+; RV64-NEXT:  .LBB124_2:
+; RV64-NEXT:    ret
+  %v = load <2 x double>, ptr %x
+  %red = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %v)
+  ret double %red
+}
+
+declare double @llvm.vector.reduce.fmaximum.v4f64(<4 x double>)
+
+define double @vreduce_fmaximum_v4f64(ptr %x) {
+; RV32-LABEL: vreduce_fmaximum_v4f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    fcvt.d.w fa5, zero
+; RV32-NEXT:    fneg.d fa5, fa5
+; RV32-NEXT:    vfmv.s.f v10, fa5
+; RV32-NEXT:    vfredusum.vs v10, v8, v10
+; RV32-NEXT:    vfmv.f.s fa0, v10
+; RV32-NEXT:    feq.d a0, fa0, fa0
+; RV32-NEXT:    beqz a0, .LBB125_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    vfredmax.vs v8, v8, v8
+; RV32-NEXT:    vfmv.f.s fa0, v8
+; RV32-NEXT:  .LBB125_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vreduce_fmaximum_v4f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    li a0, -1
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    vmv.s.x v10, a0
+; RV64-NEXT:    vfredusum.vs v10, v8, v10
+; RV64-NEXT:    vfmv.f.s fa0, v10
+; RV64-NEXT:    feq.d a0, fa0, fa0
+; RV64-NEXT:    beqz a0, .LBB125_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    vfredmax.vs v8, v8, v8
+; RV64-NEXT:    vfmv.f.s fa0, v8
+; RV64-NEXT:  .LBB125_2:
+; RV64-NEXT:    ret
+  %v = load <4 x double>, ptr %x
+  %red = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %v)
+  ret double %red
+}
+
+define double @vreduce_fmaximum_v4f64_nonans(ptr %x) {
+; RV32-LABEL: vreduce_fmaximum_v4f64_nonans:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    fcvt.d.w fa5, zero
+; RV32-NEXT:    fneg.d fa5, fa5
+; RV32-NEXT:    vfmv.s.f v10, fa5
+; RV32-NEXT:    vfredusum.vs v10, v8, v10
+; RV32-NEXT:    vfmv.f.s fa0, v10
+; RV32-NEXT:    feq.d a0, fa0, fa0
+; RV32-NEXT:    beqz a0, .LBB126_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    vfredmax.vs v8, v8, v8
+; RV32-NEXT:    vfmv.f.s fa0, v8
+; RV32-NEXT:  .LBB126_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vreduce_fmaximum_v4f64_nonans:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    li a0, -1
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    vmv.s.x v10, a0
+; RV64-NEXT:    vfredusum.vs v10, v8, v10
+; RV64-NEXT:    vfmv.f.s fa0, v10
+; RV64-NEXT:    feq.d a0, fa0, fa0
+; RV64-NEXT:    beqz a0, .LBB126_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    vfredmax.vs v8, v8, v8
+; RV64-NEXT:    vfmv.f.s fa0, v8
+; RV64-NEXT:  .LBB126_2:
+; RV64-NEXT:    ret
+  %v = load <4 x double>, ptr %x
+  %red = call nnan double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %v)
+  ret double %red
+}
+
+define double @vreduce_fmaximum_v4f64_nonans_noinfs(ptr %x) {
+; RV32-LABEL: vreduce_fmaximum_v4f64_nonans_noinfs:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    fcvt.d.w fa5, zero
+; RV32-NEXT:    fneg.d fa5, fa5
+; RV32-NEXT:    vfmv.s.f v10, fa5
+; RV32-NEXT:    vfredusum.vs v10, v8, v10
+; RV32-NEXT:    vfmv.f.s fa0, v10
+; RV32-NEXT:    feq.d a0, fa0, fa0
+; RV32-NEXT:    beqz a0, .LBB127_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    vfredmax.vs v8, v8, v8
+; RV32-NEXT:    vfmv.f.s fa0, v8
+; RV32-NEXT:  .LBB127_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vreduce_fmaximum_v4f64_nonans_noinfs:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    li a0, -1
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    vmv.s.x v10, a0
+; RV64-NEXT:    vfredusum.vs v10, v8, v10
+; RV64-NEXT:    vfmv.f.s fa0, v10
+; RV64-NEXT:    feq.d a0, fa0, fa0
+; RV64-NEXT:    beqz a0, .LBB127_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    vfredmax.vs v8, v8, v8
+; RV64-NEXT:    vfmv.f.s fa0, v8
+; RV64-NEXT:  .LBB127_2:
+; RV64-NEXT:    ret
+  %v = load <4 x double>, ptr %x
+  %red = call nnan ninf double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %v)
+  ret double %red
+}
+
+declare double @llvm.vector.reduce.fmaximum.v32f64(<32 x double>)
+
+define double @vreduce_fmaximum_v32f64(ptr %x) {
+; RV32-LABEL: vreduce_fmaximum_v32f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT:    vle64.v v8, (a0)
+; RV32-NEXT:    addi a0, a0, 128
+; RV32-NEXT:    vle64.v v16, (a0)
+; RV32-NEXT:    vfadd.vv v24, v8, v16
+; RV32-NEXT:    fcvt.d.w fa5, zero
+; RV32-NEXT:    fneg.d fa5, fa5
+; RV32-NEXT:    vfmv.s.f v0, fa5
+; RV32-NEXT:    vfredusum.vs v24, v24, v0
+; RV32-NEXT:    vfmv.f.s fa0, v24
+; RV32-NEXT:    feq.d a0, fa0, fa0
+; RV32-NEXT:    beqz a0, .LBB128_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    vfmax.vv v8, v8, v16
+; RV32-NEXT:    vfredmax.vs v8, v8, v8
+; RV32-NEXT:    vfmv.f.s fa0, v8
+; RV32-NEXT:  .LBB128_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vreduce_fmaximum_v32f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT:    vle64.v v8, (a0)
+; RV64-NEXT:    addi a0, a0, 128
+; RV64-NEXT:    vle64.v v16, (a0)
+; RV64-NEXT:    vfadd.vv v24, v8, v16
+; RV64-NEXT:    li a0, -1
+; RV64-NEXT:    slli a0, a0, 63
+; RV64-NEXT:    vmv.s.x v0, a0
+; RV64-NEXT:    vfredusum.vs v24, v24, v0
+; RV64-NEXT:    vfmv.f.s fa0, v24
+; RV64-NEXT:    feq.d a0, fa0, fa0
+; RV64-NEXT:    beqz a0, .LBB128_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    vfmax.vv v8, v8, v16
+; RV64-NEXT:    vfredmax.vs v8, v8, v8
+; RV64-NEXT:    vfmv.f.s fa0, v8
+; RV64-NEXT:  .LBB128_2:
+; RV64-NEXT:    ret
+  %v = load <32 x double>, ptr %x
+  %red = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> %v)
+  ret double %red
+}