diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 413af1ff4b943..84a5223f91f01 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -11112,31 +11112,6 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, } } -/// Given an integer binary operator, return the generic ISD::VECREDUCE_OP -/// which corresponds to it. -static unsigned getVecReduceOpcode(unsigned Opc) { - switch (Opc) { - default: - llvm_unreachable("Unhandled binary to transfrom reduction"); - case ISD::ADD: - return ISD::VECREDUCE_ADD; - case ISD::UMAX: - return ISD::VECREDUCE_UMAX; - case ISD::SMAX: - return ISD::VECREDUCE_SMAX; - case ISD::UMIN: - return ISD::VECREDUCE_UMIN; - case ISD::SMIN: - return ISD::VECREDUCE_SMIN; - case ISD::AND: - return ISD::VECREDUCE_AND; - case ISD::OR: - return ISD::VECREDUCE_OR; - case ISD::XOR: - return ISD::VECREDUCE_XOR; - } -} - /// Perform two related transforms whose purpose is to incrementally recognize /// an explode_vector followed by scalar reduction as a vector reduction node. /// This exists to recover from a deficiency in SLP which can't handle @@ -11155,15 +11130,8 @@ combineBinOpOfExtractToReduceTree(SDNode *N, SelectionDAG &DAG, const SDLoc DL(N); const EVT VT = N->getValueType(0); - - // TODO: Handle floating point here. - if (!VT.isInteger()) - return SDValue(); - - const unsigned Opc = N->getOpcode(); - const unsigned ReduceOpc = getVecReduceOpcode(Opc); - assert(Opc == ISD::getVecReduceBaseOpcode(ReduceOpc) && - "Inconsistent mappings"); + [[maybe_unused]] const unsigned Opc = N->getOpcode(); + assert(Opc == ISD::ADD && "extend this to other reduction types"); const SDValue LHS = N->getOperand(0); const SDValue RHS = N->getOperand(1); @@ -11193,13 +11161,13 @@ combineBinOpOfExtractToReduceTree(SDNode *N, SelectionDAG &DAG, EVT ReduceVT = EVT::getVectorVT(*DAG.getContext(), VT, 2); SDValue Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReduceVT, SrcVec, DAG.getVectorIdxConstant(0, DL)); - return DAG.getNode(ReduceOpc, DL, VT, Vec); + return DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, Vec); } // Match (binop (reduce (extract_subvector V, 0), // (extract_vector_elt V, sizeof(SubVec)))) // into a reduction of one more element from the original vector V. 
- if (LHS.getOpcode() != ReduceOpc) + if (LHS.getOpcode() != ISD::VECREDUCE_ADD) return SDValue(); SDValue ReduceVec = LHS.getOperand(0); @@ -11215,7 +11183,7 @@ combineBinOpOfExtractToReduceTree(SDNode *N, SelectionDAG &DAG, EVT ReduceVT = EVT::getVectorVT(*DAG.getContext(), VT, Idx + 1); SDValue Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReduceVT, SrcVec, DAG.getVectorIdxConstant(0, DL)); - return DAG.getNode(ReduceOpc, DL, VT, Vec); + return DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, Vec); } } @@ -11723,8 +11691,6 @@ static SDValue performANDCombine(SDNode *N, if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget)) return V; - if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget)) - return V; if (DCI.isAfterLegalizeDAG()) if (SDValue V = combineDeMorganOfBoolean(N, DAG)) @@ -11777,8 +11743,6 @@ static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget)) return V; - if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget)) - return V; if (DCI.isAfterLegalizeDAG()) if (SDValue V = combineDeMorganOfBoolean(N, DAG)) @@ -11830,9 +11794,6 @@ static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget)) return V; - if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget)) - return V; - // fold (xor (select cond, 0, y), x) -> // (select cond, x, (xor x, y)) return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false, Subtarget); @@ -14038,13 +13999,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SMAX: case ISD::SMIN: case ISD::FMAXNUM: - case ISD::FMINNUM: { - if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget)) - return V; - if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget)) - return V; - return SDValue(); - } + case ISD::FMINNUM: + return combineBinOpToReduce(N, DAG, Subtarget); case ISD::SETCC: return performSETCCCombine(N, DAG, Subtarget); case ISD::SIGN_EXTEND_INREG: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll index f3570495600f3..ab137b1ac8182 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll @@ -5,10 +5,11 @@ define i8 @explode_2xi8(<2 x i8> %v) { ; CHECK-LABEL: explode_2xi8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredxor.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: xor a0, a0, a1 ; CHECK-NEXT: ret %e0 = extractelement <2 x i8> %v, i32 0 %e1 = extractelement <2 x i8> %v, i32 1 @@ -20,16 +21,16 @@ define i8 @explode_4xi8(<4 x i8> %v) { ; CHECK-LABEL: explode_4xi8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 3 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 1 ; CHECK-NEXT: vmv.x.s a1, v9 -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vredxor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a2, v8 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: add a0, a2, a0 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a2, v9 +; CHECK-NEXT: vslidedown.vi v8, v8, 3 +; CHECK-NEXT: vmv.x.s a3, v8 +; 
CHECK-NEXT: xor a0, a0, a1 +; CHECK-NEXT: add a2, a2, a3 +; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: ret %e0 = extractelement <4 x i8> %v, i32 0 %e1 = extractelement <4 x i8> %v, i32 1 @@ -46,28 +47,28 @@ define i8 @explode_8xi8(<8 x i8> %v) { ; CHECK-LABEL: explode_8xi8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 3 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 1 ; CHECK-NEXT: vmv.x.s a1, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 4 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 ; CHECK-NEXT: vmv.x.s a2, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 5 +; CHECK-NEXT: vslidedown.vi v9, v8, 3 ; CHECK-NEXT: vmv.x.s a3, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 6 +; CHECK-NEXT: vslidedown.vi v9, v8, 4 ; CHECK-NEXT: vmv.x.s a4, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 7 +; CHECK-NEXT: vslidedown.vi v9, v8, 5 ; CHECK-NEXT: vmv.x.s a5, v9 -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vredxor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a6, v8 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: add a0, a6, a0 +; CHECK-NEXT: vslidedown.vi v9, v8, 6 +; CHECK-NEXT: vmv.x.s a6, v9 +; CHECK-NEXT: vslidedown.vi v8, v8, 7 +; CHECK-NEXT: vmv.x.s a7, v8 +; CHECK-NEXT: xor a0, a0, a1 ; CHECK-NEXT: add a2, a2, a3 -; CHECK-NEXT: add a2, a2, a4 ; CHECK-NEXT: add a0, a0, a2 -; CHECK-NEXT: add a0, a0, a5 +; CHECK-NEXT: add a4, a4, a5 +; CHECK-NEXT: add a4, a4, a6 +; CHECK-NEXT: add a0, a0, a4 +; CHECK-NEXT: add a0, a0, a7 ; CHECK-NEXT: ret %e0 = extractelement <8 x i8> %v, i32 0 %e1 = extractelement <8 x i8> %v, i32 1 @@ -88,56 +89,119 @@ define i8 @explode_8xi8(<8 x i8> %v) { } define i8 @explode_16xi8(<16 x i8> %v) { -; CHECK-LABEL: explode_16xi8: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 3 -; CHECK-NEXT: vmv.x.s a1, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 4 -; CHECK-NEXT: vmv.x.s a2, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 5 -; CHECK-NEXT: vmv.x.s a3, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 6 -; CHECK-NEXT: vmv.x.s a4, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 7 -; CHECK-NEXT: vmv.x.s a5, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 8 -; CHECK-NEXT: vmv.x.s a6, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 9 -; CHECK-NEXT: vmv.x.s a7, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 10 -; CHECK-NEXT: vmv.x.s t0, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 11 -; CHECK-NEXT: vmv.x.s t1, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 12 -; CHECK-NEXT: vmv.x.s t2, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 13 -; CHECK-NEXT: vmv.x.s t3, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 14 -; CHECK-NEXT: vmv.x.s t4, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 15 -; CHECK-NEXT: vmv.x.s t5, v9 -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vredxor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s t6, v8 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: add a0, t6, a0 -; CHECK-NEXT: add a2, a2, a3 -; CHECK-NEXT: add a2, a2, a4 -; CHECK-NEXT: add a0, a0, a2 -; CHECK-NEXT: add a5, a5, a6 -; CHECK-NEXT: add a5, a5, a7 -; CHECK-NEXT: add a5, a5, t0 -; CHECK-NEXT: add a0, a0, a5 -; CHECK-NEXT: add t1, t1, t2 -; CHECK-NEXT: add t1, t1, t3 -; CHECK-NEXT: add t1, t1, t4 -; CHECK-NEXT: add t1, t1, t5 -; CHECK-NEXT: add a0, a0, t1 -; CHECK-NEXT: ret +; RV32-LABEL: explode_16xi8: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: 
.cfi_def_cfa_offset 16 +; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset s0, -4 +; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vslidedown.vi v9, v8, 1 +; RV32-NEXT: vmv.x.s a1, v9 +; RV32-NEXT: vslidedown.vi v9, v8, 2 +; RV32-NEXT: vmv.x.s a2, v9 +; RV32-NEXT: vslidedown.vi v9, v8, 3 +; RV32-NEXT: vmv.x.s a3, v9 +; RV32-NEXT: vslidedown.vi v9, v8, 4 +; RV32-NEXT: vmv.x.s a4, v9 +; RV32-NEXT: vslidedown.vi v9, v8, 5 +; RV32-NEXT: vmv.x.s a5, v9 +; RV32-NEXT: vslidedown.vi v9, v8, 6 +; RV32-NEXT: vmv.x.s a6, v9 +; RV32-NEXT: vslidedown.vi v9, v8, 7 +; RV32-NEXT: vmv.x.s a7, v9 +; RV32-NEXT: vslidedown.vi v9, v8, 8 +; RV32-NEXT: vmv.x.s t0, v9 +; RV32-NEXT: vslidedown.vi v9, v8, 9 +; RV32-NEXT: vmv.x.s t1, v9 +; RV32-NEXT: vslidedown.vi v9, v8, 10 +; RV32-NEXT: vmv.x.s t2, v9 +; RV32-NEXT: vslidedown.vi v9, v8, 11 +; RV32-NEXT: vmv.x.s t3, v9 +; RV32-NEXT: vslidedown.vi v9, v8, 12 +; RV32-NEXT: vmv.x.s t4, v9 +; RV32-NEXT: vslidedown.vi v9, v8, 13 +; RV32-NEXT: vmv.x.s t5, v9 +; RV32-NEXT: vslidedown.vi v9, v8, 14 +; RV32-NEXT: vmv.x.s t6, v9 +; RV32-NEXT: vslidedown.vi v8, v8, 15 +; RV32-NEXT: vmv.x.s s0, v8 +; RV32-NEXT: xor a0, a0, a1 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: add a4, a4, a5 +; RV32-NEXT: add a4, a4, a6 +; RV32-NEXT: add a0, a0, a4 +; RV32-NEXT: add a7, a7, t0 +; RV32-NEXT: add a7, a7, t1 +; RV32-NEXT: add a7, a7, t2 +; RV32-NEXT: add a0, a0, a7 +; RV32-NEXT: add t3, t3, t4 +; RV32-NEXT: add t3, t3, t5 +; RV32-NEXT: add t3, t3, t6 +; RV32-NEXT: add t3, t3, s0 +; RV32-NEXT: add a0, a0, t3 +; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: explode_16xi8: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset s0, -8 +; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vslidedown.vi v9, v8, 1 +; RV64-NEXT: vmv.x.s a1, v9 +; RV64-NEXT: vslidedown.vi v9, v8, 2 +; RV64-NEXT: vmv.x.s a2, v9 +; RV64-NEXT: vslidedown.vi v9, v8, 3 +; RV64-NEXT: vmv.x.s a3, v9 +; RV64-NEXT: vslidedown.vi v9, v8, 4 +; RV64-NEXT: vmv.x.s a4, v9 +; RV64-NEXT: vslidedown.vi v9, v8, 5 +; RV64-NEXT: vmv.x.s a5, v9 +; RV64-NEXT: vslidedown.vi v9, v8, 6 +; RV64-NEXT: vmv.x.s a6, v9 +; RV64-NEXT: vslidedown.vi v9, v8, 7 +; RV64-NEXT: vmv.x.s a7, v9 +; RV64-NEXT: vslidedown.vi v9, v8, 8 +; RV64-NEXT: vmv.x.s t0, v9 +; RV64-NEXT: vslidedown.vi v9, v8, 9 +; RV64-NEXT: vmv.x.s t1, v9 +; RV64-NEXT: vslidedown.vi v9, v8, 10 +; RV64-NEXT: vmv.x.s t2, v9 +; RV64-NEXT: vslidedown.vi v9, v8, 11 +; RV64-NEXT: vmv.x.s t3, v9 +; RV64-NEXT: vslidedown.vi v9, v8, 12 +; RV64-NEXT: vmv.x.s t4, v9 +; RV64-NEXT: vslidedown.vi v9, v8, 13 +; RV64-NEXT: vmv.x.s t5, v9 +; RV64-NEXT: vslidedown.vi v9, v8, 14 +; RV64-NEXT: vmv.x.s t6, v9 +; RV64-NEXT: vslidedown.vi v8, v8, 15 +; RV64-NEXT: vmv.x.s s0, v8 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: add a0, a0, a2 +; RV64-NEXT: add a4, a4, a5 +; RV64-NEXT: add a4, a4, a6 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: add a7, a7, t0 +; RV64-NEXT: add a7, a7, t1 +; RV64-NEXT: add a7, a7, t2 +; RV64-NEXT: add a0, a0, a7 +; RV64-NEXT: add t3, t3, t4 +; RV64-NEXT: add t3, t3, t5 +; RV64-NEXT: add t3, t3, t6 +; RV64-NEXT: add t3, t3, s0 +; RV64-NEXT: add a0, a0, t3 +; RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret %e0 = 
extractelement <16 x i8> %v, i32 0 %e1 = extractelement <16 x i8> %v, i32 1 %e2 = extractelement <16 x i8> %v, i32 2 @@ -175,10 +239,11 @@ define i8 @explode_16xi8(<16 x i8> %v) { define i16 @explode_2xi16(<2 x i16> %v) { ; CHECK-LABEL: explode_2xi16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredxor.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: xor a0, a0, a1 ; CHECK-NEXT: ret %e0 = extractelement <2 x i16> %v, i32 0 %e1 = extractelement <2 x i16> %v, i32 1 @@ -190,16 +255,16 @@ define i16 @explode_4xi16(<4 x i16> %v) { ; CHECK-LABEL: explode_4xi16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 3 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 1 ; CHECK-NEXT: vmv.x.s a1, v9 -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vredxor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a2, v8 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: add a0, a2, a0 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vmv.x.s a2, v9 +; CHECK-NEXT: vslidedown.vi v8, v8, 3 +; CHECK-NEXT: vmv.x.s a3, v8 +; CHECK-NEXT: xor a0, a0, a1 +; CHECK-NEXT: add a2, a2, a3 +; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: ret %e0 = extractelement <4 x i16> %v, i32 0 %e1 = extractelement <4 x i16> %v, i32 1 @@ -216,28 +281,28 @@ define i16 @explode_8xi16(<8 x i16> %v) { ; CHECK-LABEL: explode_8xi16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v9, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 3 +; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v9, v8, 1 ; CHECK-NEXT: vmv.x.s a1, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 4 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 ; CHECK-NEXT: vmv.x.s a2, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 5 +; CHECK-NEXT: vslidedown.vi v9, v8, 3 ; CHECK-NEXT: vmv.x.s a3, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 6 +; CHECK-NEXT: vslidedown.vi v9, v8, 4 ; CHECK-NEXT: vmv.x.s a4, v9 -; CHECK-NEXT: vslidedown.vi v9, v8, 7 +; CHECK-NEXT: vslidedown.vi v9, v8, 5 ; CHECK-NEXT: vmv.x.s a5, v9 -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vredxor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s a6, v8 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: add a0, a6, a0 +; CHECK-NEXT: vslidedown.vi v9, v8, 6 +; CHECK-NEXT: vmv.x.s a6, v9 +; CHECK-NEXT: vslidedown.vi v8, v8, 7 +; CHECK-NEXT: vmv.x.s a7, v8 +; CHECK-NEXT: xor a0, a0, a1 ; CHECK-NEXT: add a2, a2, a3 -; CHECK-NEXT: add a2, a2, a4 ; CHECK-NEXT: add a0, a0, a2 -; CHECK-NEXT: add a0, a0, a5 +; CHECK-NEXT: add a4, a4, a5 +; CHECK-NEXT: add a4, a4, a6 +; CHECK-NEXT: add a0, a0, a4 +; CHECK-NEXT: add a0, a0, a7 ; CHECK-NEXT: ret %e0 = extractelement <8 x i16> %v, i32 0 %e1 = extractelement <8 x i16> %v, i32 1 @@ -258,57 +323,121 @@ define i16 @explode_8xi16(<8 x i16> %v) { } define i16 @explode_16xi16(<16 x i16> %v) { -; CHECK-LABEL: explode_16xi16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v8, 2 -; CHECK-NEXT: vmv.x.s a0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 3 -; CHECK-NEXT: vmv.x.s a1, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 4 -; CHECK-NEXT: vmv.x.s a2, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 5 -; 
CHECK-NEXT: vmv.x.s a3, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 6 -; CHECK-NEXT: vmv.x.s a4, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 7 -; CHECK-NEXT: vmv.x.s a5, v10 -; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v8, 8 -; CHECK-NEXT: vmv.x.s a6, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 9 -; CHECK-NEXT: vmv.x.s a7, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 10 -; CHECK-NEXT: vmv.x.s t0, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 11 -; CHECK-NEXT: vmv.x.s t1, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 12 -; CHECK-NEXT: vmv.x.s t2, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 13 -; CHECK-NEXT: vmv.x.s t3, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 14 -; CHECK-NEXT: vmv.x.s t4, v10 -; CHECK-NEXT: vslidedown.vi v10, v8, 15 -; CHECK-NEXT: vmv.x.s t5, v10 -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vredxor.vs v8, v8, v9 -; CHECK-NEXT: vmv.x.s t6, v8 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: add a0, t6, a0 -; CHECK-NEXT: add a2, a2, a3 -; CHECK-NEXT: add a2, a2, a4 -; CHECK-NEXT: add a0, a0, a2 -; CHECK-NEXT: add a5, a5, a6 -; CHECK-NEXT: add a5, a5, a7 -; CHECK-NEXT: add a5, a5, t0 -; CHECK-NEXT: add a0, a0, a5 -; CHECK-NEXT: add t1, t1, t2 -; CHECK-NEXT: add t1, t1, t3 -; CHECK-NEXT: add t1, t1, t4 -; CHECK-NEXT: add t1, t1, t5 -; CHECK-NEXT: add a0, a0, t1 -; CHECK-NEXT: ret +; RV32-LABEL: explode_16xi16: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset s0, -4 +; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vslidedown.vi v10, v8, 1 +; RV32-NEXT: vmv.x.s a1, v10 +; RV32-NEXT: vslidedown.vi v10, v8, 2 +; RV32-NEXT: vmv.x.s a2, v10 +; RV32-NEXT: vslidedown.vi v10, v8, 3 +; RV32-NEXT: vmv.x.s a3, v10 +; RV32-NEXT: vslidedown.vi v10, v8, 4 +; RV32-NEXT: vmv.x.s a4, v10 +; RV32-NEXT: vslidedown.vi v10, v8, 5 +; RV32-NEXT: vmv.x.s a5, v10 +; RV32-NEXT: vslidedown.vi v10, v8, 6 +; RV32-NEXT: vmv.x.s a6, v10 +; RV32-NEXT: vslidedown.vi v10, v8, 7 +; RV32-NEXT: vmv.x.s a7, v10 +; RV32-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; RV32-NEXT: vslidedown.vi v10, v8, 8 +; RV32-NEXT: vmv.x.s t0, v10 +; RV32-NEXT: vslidedown.vi v10, v8, 9 +; RV32-NEXT: vmv.x.s t1, v10 +; RV32-NEXT: vslidedown.vi v10, v8, 10 +; RV32-NEXT: vmv.x.s t2, v10 +; RV32-NEXT: vslidedown.vi v10, v8, 11 +; RV32-NEXT: vmv.x.s t3, v10 +; RV32-NEXT: vslidedown.vi v10, v8, 12 +; RV32-NEXT: vmv.x.s t4, v10 +; RV32-NEXT: vslidedown.vi v10, v8, 13 +; RV32-NEXT: vmv.x.s t5, v10 +; RV32-NEXT: vslidedown.vi v10, v8, 14 +; RV32-NEXT: vmv.x.s t6, v10 +; RV32-NEXT: vslidedown.vi v8, v8, 15 +; RV32-NEXT: vmv.x.s s0, v8 +; RV32-NEXT: xor a0, a0, a1 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: add a4, a4, a5 +; RV32-NEXT: add a4, a4, a6 +; RV32-NEXT: add a0, a0, a4 +; RV32-NEXT: add a7, a7, t0 +; RV32-NEXT: add a7, a7, t1 +; RV32-NEXT: add a7, a7, t2 +; RV32-NEXT: add a0, a0, a7 +; RV32-NEXT: add t3, t3, t4 +; RV32-NEXT: add t3, t3, t5 +; RV32-NEXT: add t3, t3, t6 +; RV32-NEXT: add t3, t3, s0 +; RV32-NEXT: add a0, a0, t3 +; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: explode_16xi16: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset s0, -8 +; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64-NEXT: vmv.x.s a0, 
v8 +; RV64-NEXT: vslidedown.vi v10, v8, 1 +; RV64-NEXT: vmv.x.s a1, v10 +; RV64-NEXT: vslidedown.vi v10, v8, 2 +; RV64-NEXT: vmv.x.s a2, v10 +; RV64-NEXT: vslidedown.vi v10, v8, 3 +; RV64-NEXT: vmv.x.s a3, v10 +; RV64-NEXT: vslidedown.vi v10, v8, 4 +; RV64-NEXT: vmv.x.s a4, v10 +; RV64-NEXT: vslidedown.vi v10, v8, 5 +; RV64-NEXT: vmv.x.s a5, v10 +; RV64-NEXT: vslidedown.vi v10, v8, 6 +; RV64-NEXT: vmv.x.s a6, v10 +; RV64-NEXT: vslidedown.vi v10, v8, 7 +; RV64-NEXT: vmv.x.s a7, v10 +; RV64-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; RV64-NEXT: vslidedown.vi v10, v8, 8 +; RV64-NEXT: vmv.x.s t0, v10 +; RV64-NEXT: vslidedown.vi v10, v8, 9 +; RV64-NEXT: vmv.x.s t1, v10 +; RV64-NEXT: vslidedown.vi v10, v8, 10 +; RV64-NEXT: vmv.x.s t2, v10 +; RV64-NEXT: vslidedown.vi v10, v8, 11 +; RV64-NEXT: vmv.x.s t3, v10 +; RV64-NEXT: vslidedown.vi v10, v8, 12 +; RV64-NEXT: vmv.x.s t4, v10 +; RV64-NEXT: vslidedown.vi v10, v8, 13 +; RV64-NEXT: vmv.x.s t5, v10 +; RV64-NEXT: vslidedown.vi v10, v8, 14 +; RV64-NEXT: vmv.x.s t6, v10 +; RV64-NEXT: vslidedown.vi v8, v8, 15 +; RV64-NEXT: vmv.x.s s0, v8 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: add a0, a0, a2 +; RV64-NEXT: add a4, a4, a5 +; RV64-NEXT: add a4, a4, a6 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: add a7, a7, t0 +; RV64-NEXT: add a7, a7, t1 +; RV64-NEXT: add a7, a7, t2 +; RV64-NEXT: add a0, a0, a7 +; RV64-NEXT: add t3, t3, t4 +; RV64-NEXT: add t3, t3, t5 +; RV64-NEXT: add t3, t3, t6 +; RV64-NEXT: add t3, t3, s0 +; RV64-NEXT: add a0, a0, t3 +; RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret %e0 = extractelement <16 x i16> %v, i32 0 %e1 = extractelement <16 x i16> %v, i32 1 %e2 = extractelement <16 x i16> %v, i32 2 @@ -346,10 +475,11 @@ define i16 @explode_16xi16(<16 x i16> %v) { define i32 @explode_2xi32(<2 x i32> %v) { ; CHECK-LABEL: explode_2xi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vredxor.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vmv.x.s a1, v8 +; CHECK-NEXT: xor a0, a0, a1 ; CHECK-NEXT: ret %e0 = extractelement <2 x i32> %v, i32 0 %e1 = extractelement <2 x i32> %v, i32 1 @@ -361,31 +491,31 @@ define i32 @explode_4xi32(<4 x i32> %v) { ; RV32-LABEL: explode_4xi32: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32-NEXT: vslidedown.vi v9, v8, 2 -; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: vslidedown.vi v9, v8, 3 +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vslidedown.vi v9, v8, 1 ; RV32-NEXT: vmv.x.s a1, v9 -; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vredxor.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a2, v8 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a0, a2, a0 +; RV32-NEXT: vslidedown.vi v9, v8, 2 +; RV32-NEXT: vmv.x.s a2, v9 +; RV32-NEXT: vslidedown.vi v8, v8, 3 +; RV32-NEXT: vmv.x.s a3, v8 +; RV32-NEXT: xor a0, a0, a1 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: add a0, a0, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: explode_4xi32: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vi v9, v8, 2 -; RV64-NEXT: vmv.x.s a0, v9 -; RV64-NEXT: vslidedown.vi v9, v8, 3 +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vslidedown.vi v9, v8, 1 ; RV64-NEXT: vmv.x.s a1, v9 -; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vredxor.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a2, v8 -; RV64-NEXT: 
add a0, a0, a1 -; RV64-NEXT: addw a0, a2, a0 +; RV64-NEXT: vslidedown.vi v9, v8, 2 +; RV64-NEXT: vmv.x.s a2, v9 +; RV64-NEXT: vslidedown.vi v8, v8, 3 +; RV64-NEXT: vmv.x.s a3, v8 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: addw a0, a0, a2 ; RV64-NEXT: ret %e0 = extractelement <4 x i32> %v, i32 0 %e1 = extractelement <4 x i32> %v, i32 1 @@ -402,57 +532,57 @@ define i32 @explode_8xi32(<8 x i32> %v) { ; RV32-LABEL: explode_8xi32: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vslidedown.vi v10, v8, 1 +; RV32-NEXT: vmv.x.s a1, v10 ; RV32-NEXT: vslidedown.vi v10, v8, 2 -; RV32-NEXT: vmv.x.s a0, v10 +; RV32-NEXT: vmv.x.s a2, v10 ; RV32-NEXT: vslidedown.vi v10, v8, 3 -; RV32-NEXT: vmv.x.s a1, v10 +; RV32-NEXT: vmv.x.s a3, v10 ; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32-NEXT: vslidedown.vi v10, v8, 4 -; RV32-NEXT: vmv.x.s a2, v10 -; RV32-NEXT: vslidedown.vi v10, v8, 5 -; RV32-NEXT: vmv.x.s a3, v10 -; RV32-NEXT: vslidedown.vi v10, v8, 6 ; RV32-NEXT: vmv.x.s a4, v10 -; RV32-NEXT: vslidedown.vi v10, v8, 7 +; RV32-NEXT: vslidedown.vi v10, v8, 5 ; RV32-NEXT: vmv.x.s a5, v10 -; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vredxor.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a6, v8 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a0, a6, a0 +; RV32-NEXT: vslidedown.vi v10, v8, 6 +; RV32-NEXT: vmv.x.s a6, v10 +; RV32-NEXT: vslidedown.vi v8, v8, 7 +; RV32-NEXT: vmv.x.s a7, v8 +; RV32-NEXT: xor a0, a0, a1 ; RV32-NEXT: add a2, a2, a3 -; RV32-NEXT: add a2, a2, a4 ; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: add a0, a0, a5 +; RV32-NEXT: add a4, a4, a5 +; RV32-NEXT: add a4, a4, a6 +; RV32-NEXT: add a0, a0, a4 +; RV32-NEXT: add a0, a0, a7 ; RV32-NEXT: ret ; ; RV64-LABEL: explode_8xi32: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vslidedown.vi v10, v8, 1 +; RV64-NEXT: vmv.x.s a1, v10 ; RV64-NEXT: vslidedown.vi v10, v8, 2 -; RV64-NEXT: vmv.x.s a0, v10 +; RV64-NEXT: vmv.x.s a2, v10 ; RV64-NEXT: vslidedown.vi v10, v8, 3 -; RV64-NEXT: vmv.x.s a1, v10 +; RV64-NEXT: vmv.x.s a3, v10 ; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64-NEXT: vslidedown.vi v10, v8, 4 -; RV64-NEXT: vmv.x.s a2, v10 -; RV64-NEXT: vslidedown.vi v10, v8, 5 -; RV64-NEXT: vmv.x.s a3, v10 -; RV64-NEXT: vslidedown.vi v10, v8, 6 ; RV64-NEXT: vmv.x.s a4, v10 -; RV64-NEXT: vslidedown.vi v10, v8, 7 +; RV64-NEXT: vslidedown.vi v10, v8, 5 ; RV64-NEXT: vmv.x.s a5, v10 -; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vredxor.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a6, v8 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: add a0, a6, a0 +; RV64-NEXT: vslidedown.vi v10, v8, 6 +; RV64-NEXT: vmv.x.s a6, v10 +; RV64-NEXT: vslidedown.vi v8, v8, 7 +; RV64-NEXT: vmv.x.s a7, v8 +; RV64-NEXT: xor a0, a0, a1 ; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: add a2, a2, a4 ; RV64-NEXT: add a0, a0, a2 -; RV64-NEXT: addw a0, a0, a5 +; RV64-NEXT: add a4, a4, a5 +; RV64-NEXT: add a4, a4, a6 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: addw a0, a0, a7 ; RV64-NEXT: ret %e0 = extractelement <8 x i32> %v, i32 0 %e1 = extractelement <8 x i32> %v, i32 1 @@ -479,57 +609,60 @@ define i32 @explode_16xi32(<16 x i32> %v) { ; RV32-NEXT: .cfi_def_cfa_offset 128 ; RV32-NEXT: sw ra, 124(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 116(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: .cfi_offset s0, 
-8 +; RV32-NEXT: .cfi_offset s2, -12 ; RV32-NEXT: addi s0, sp, 128 ; RV32-NEXT: .cfi_def_cfa s0, 0 ; RV32-NEXT: andi sp, sp, -64 ; RV32-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vslidedown.vi v12, v8, 1 +; RV32-NEXT: vmv.x.s a1, v12 ; RV32-NEXT: vslidedown.vi v12, v8, 2 -; RV32-NEXT: vmv.x.s a0, v12 +; RV32-NEXT: vmv.x.s a2, v12 ; RV32-NEXT: vslidedown.vi v12, v8, 3 -; RV32-NEXT: vmv.x.s a1, v12 +; RV32-NEXT: vmv.x.s a3, v12 ; RV32-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV32-NEXT: vslidedown.vi v12, v8, 4 -; RV32-NEXT: vmv.x.s a2, v12 +; RV32-NEXT: vmv.x.s a4, v12 ; RV32-NEXT: vslidedown.vi v12, v8, 5 -; RV32-NEXT: vmv.x.s a3, v12 +; RV32-NEXT: vmv.x.s a5, v12 ; RV32-NEXT: vslidedown.vi v12, v8, 6 -; RV32-NEXT: vmv.x.s a4, v12 +; RV32-NEXT: vmv.x.s a6, v12 ; RV32-NEXT: vslidedown.vi v12, v8, 7 -; RV32-NEXT: vmv.x.s a5, v12 -; RV32-NEXT: mv a6, sp +; RV32-NEXT: vmv.x.s a7, v12 +; RV32-NEXT: mv t0, sp ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vse32.v v8, (a6) -; RV32-NEXT: lw a6, 32(sp) -; RV32-NEXT: lw a7, 36(sp) -; RV32-NEXT: lw t0, 40(sp) -; RV32-NEXT: lw t1, 44(sp) -; RV32-NEXT: lw t2, 48(sp) -; RV32-NEXT: lw t3, 52(sp) -; RV32-NEXT: lw t4, 56(sp) -; RV32-NEXT: lw t5, 60(sp) -; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vredxor.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s t6, v8 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a0, t6, a0 +; RV32-NEXT: vse32.v v8, (t0) +; RV32-NEXT: lw t0, 32(sp) +; RV32-NEXT: lw t1, 36(sp) +; RV32-NEXT: lw t2, 40(sp) +; RV32-NEXT: lw t3, 44(sp) +; RV32-NEXT: lw t4, 48(sp) +; RV32-NEXT: lw t5, 52(sp) +; RV32-NEXT: lw t6, 56(sp) +; RV32-NEXT: lw s2, 60(sp) +; RV32-NEXT: xor a0, a0, a1 ; RV32-NEXT: add a2, a2, a3 -; RV32-NEXT: add a2, a2, a4 ; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: add a5, a5, a6 -; RV32-NEXT: add a0, a0, a5 +; RV32-NEXT: add a4, a4, a5 +; RV32-NEXT: add a4, a4, a6 +; RV32-NEXT: add a0, a0, a4 ; RV32-NEXT: add a7, a7, t0 -; RV32-NEXT: add a7, a7, t1 ; RV32-NEXT: add a0, a0, a7 -; RV32-NEXT: add t2, t2, t3 -; RV32-NEXT: add t2, t2, t4 -; RV32-NEXT: add t2, t2, t5 -; RV32-NEXT: add a0, a0, t2 +; RV32-NEXT: add t1, t1, t2 +; RV32-NEXT: add t1, t1, t3 +; RV32-NEXT: add a0, a0, t1 +; RV32-NEXT: add t4, t4, t5 +; RV32-NEXT: add t4, t4, t6 +; RV32-NEXT: add t4, t4, s2 +; RV32-NEXT: add a0, a0, t4 ; RV32-NEXT: addi sp, s0, -128 ; RV32-NEXT: lw ra, 124(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 116(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 128 ; RV32-NEXT: ret ; @@ -539,57 +672,60 @@ define i32 @explode_16xi32(<16 x i32> %v) { ; RV64-NEXT: .cfi_def_cfa_offset 128 ; RV64-NEXT: sd ra, 120(sp) # 8-byte Folded Spill ; RV64-NEXT: sd s0, 112(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 104(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 ; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: .cfi_offset s2, -24 ; RV64-NEXT: addi s0, sp, 128 ; RV64-NEXT: .cfi_def_cfa s0, 0 ; RV64-NEXT: andi sp, sp, -64 ; RV64-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vslidedown.vi v12, v8, 1 +; RV64-NEXT: vmv.x.s a1, v12 ; RV64-NEXT: vslidedown.vi v12, v8, 2 -; RV64-NEXT: vmv.x.s a0, v12 +; RV64-NEXT: vmv.x.s a2, v12 ; RV64-NEXT: vslidedown.vi v12, v8, 3 -; RV64-NEXT: vmv.x.s a1, v12 +; RV64-NEXT: vmv.x.s a3, v12 ; RV64-NEXT: vsetivli zero, 1, e32, m2, ta, ma ; RV64-NEXT: vslidedown.vi v12, v8, 4 -; RV64-NEXT: vmv.x.s a2, v12 +; RV64-NEXT: vmv.x.s a4, v12 ; 
RV64-NEXT: vslidedown.vi v12, v8, 5 -; RV64-NEXT: vmv.x.s a3, v12 +; RV64-NEXT: vmv.x.s a5, v12 ; RV64-NEXT: vslidedown.vi v12, v8, 6 -; RV64-NEXT: vmv.x.s a4, v12 +; RV64-NEXT: vmv.x.s a6, v12 ; RV64-NEXT: vslidedown.vi v12, v8, 7 -; RV64-NEXT: vmv.x.s a5, v12 -; RV64-NEXT: mv a6, sp +; RV64-NEXT: vmv.x.s a7, v12 +; RV64-NEXT: mv t0, sp ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV64-NEXT: vse32.v v8, (a6) -; RV64-NEXT: lw a6, 32(sp) -; RV64-NEXT: lw a7, 36(sp) -; RV64-NEXT: lw t0, 40(sp) -; RV64-NEXT: lw t1, 44(sp) -; RV64-NEXT: lw t2, 48(sp) -; RV64-NEXT: lw t3, 52(sp) -; RV64-NEXT: lw t4, 56(sp) -; RV64-NEXT: lw t5, 60(sp) -; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vredxor.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s t6, v8 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: add a0, t6, a0 +; RV64-NEXT: vse32.v v8, (t0) +; RV64-NEXT: lw t0, 32(sp) +; RV64-NEXT: lw t1, 36(sp) +; RV64-NEXT: lw t2, 40(sp) +; RV64-NEXT: lw t3, 44(sp) +; RV64-NEXT: lw t4, 48(sp) +; RV64-NEXT: lw t5, 52(sp) +; RV64-NEXT: lw t6, 56(sp) +; RV64-NEXT: lw s2, 60(sp) +; RV64-NEXT: xor a0, a0, a1 ; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: add a2, a2, a4 ; RV64-NEXT: add a0, a0, a2 -; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: add a0, a0, a5 +; RV64-NEXT: add a4, a4, a5 +; RV64-NEXT: add a4, a4, a6 +; RV64-NEXT: add a0, a0, a4 ; RV64-NEXT: add a7, a7, t0 -; RV64-NEXT: add a7, a7, t1 ; RV64-NEXT: add a0, a0, a7 -; RV64-NEXT: add t2, t2, t3 -; RV64-NEXT: add t2, t2, t4 -; RV64-NEXT: add t2, t2, t5 -; RV64-NEXT: addw a0, a0, t2 +; RV64-NEXT: add t1, t1, t2 +; RV64-NEXT: add t1, t1, t3 +; RV64-NEXT: add a0, a0, t1 +; RV64-NEXT: add t4, t4, t5 +; RV64-NEXT: add t4, t4, t6 +; RV64-NEXT: add t4, t4, s2 +; RV64-NEXT: addw a0, a0, t4 ; RV64-NEXT: addi sp, s0, -128 ; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 104(sp) # 8-byte Folded Reload ; RV64-NEXT: addi sp, sp, 128 ; RV64-NEXT: ret %e0 = extractelement <16 x i32> %v, i32 0 @@ -629,22 +765,26 @@ define i32 @explode_16xi32(<16 x i32> %v) { define i64 @explode_2xi64(<2 x i64> %v) { ; RV32-LABEL: explode_2xi64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vredxor.vs v8, v8, v9 -; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 32 +; RV32-NEXT: li a0, 32 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vsrl.vx v8, v8, a1 -; RV32-NEXT: vmv.x.s a1, v8 +; RV32-NEXT: vsrl.vx v9, v8, a0 +; RV32-NEXT: vmv.x.s a1, v9 +; RV32-NEXT: vmv.x.s a2, v8 +; RV32-NEXT: vslidedown.vi v8, v8, 1 +; RV32-NEXT: vsrl.vx v9, v8, a0 +; RV32-NEXT: vmv.x.s a0, v9 +; RV32-NEXT: vmv.x.s a3, v8 +; RV32-NEXT: xor a1, a1, a0 +; RV32-NEXT: xor a0, a2, a3 ; RV32-NEXT: ret ; ; RV64-LABEL: explode_2xi64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vredxor.vs v8, v8, v9 +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vslidedown.vi v8, v8, 1 +; RV64-NEXT: vmv.x.s a1, v8 +; RV64-NEXT: xor a0, a0, a1 ; RV64-NEXT: ret %e0 = extractelement <2 x i64> %v, i32 0 %e1 = extractelement <2 x i64> %v, i32 1 @@ -655,46 +795,49 @@ define i64 @explode_2xi64(<2 x i64> %v) { define i64 @explode_4xi64(<4 x i64> %v) { ; RV32-LABEL: explode_4xi64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma -; RV32-NEXT: vslidedown.vi v10, v8, 2 ; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsrl.vx v12, v10, a0 -; RV32-NEXT: vmv.x.s a1, v12 -; 
RV32-NEXT: vmv.x.s a2, v10 -; RV32-NEXT: vslidedown.vi v10, v8, 3 +; RV32-NEXT: vsetivli zero, 1, e64, m2, ta, ma +; RV32-NEXT: vsrl.vx v10, v8, a0 +; RV32-NEXT: vmv.x.s a1, v10 +; RV32-NEXT: vmv.x.s a2, v8 +; RV32-NEXT: vslidedown.vi v10, v8, 1 ; RV32-NEXT: vsrl.vx v12, v10, a0 ; RV32-NEXT: vmv.x.s a3, v12 ; RV32-NEXT: vmv.x.s a4, v10 -; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vredxor.vs v8, v8, v9 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vsrl.vx v9, v8, a0 -; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: vmv.x.s a5, v8 -; RV32-NEXT: add a2, a5, a2 -; RV32-NEXT: sltu a5, a2, a5 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a0, a0, a5 -; RV32-NEXT: add a1, a0, a3 -; RV32-NEXT: add a0, a2, a4 -; RV32-NEXT: sltu a2, a0, a2 +; RV32-NEXT: vslidedown.vi v10, v8, 2 +; RV32-NEXT: vsrl.vx v12, v10, a0 +; RV32-NEXT: vmv.x.s a5, v12 +; RV32-NEXT: vmv.x.s a6, v10 +; RV32-NEXT: vslidedown.vi v8, v8, 3 +; RV32-NEXT: vsrl.vx v10, v8, a0 +; RV32-NEXT: vmv.x.s a0, v10 +; RV32-NEXT: vmv.x.s a7, v8 +; RV32-NEXT: xor a1, a1, a3 +; RV32-NEXT: xor a2, a2, a4 +; RV32-NEXT: add a6, a2, a6 +; RV32-NEXT: sltu a2, a6, a2 +; RV32-NEXT: add a1, a1, a5 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add a1, a1, a0 +; RV32-NEXT: add a0, a6, a7 +; RV32-NEXT: sltu a2, a0, a6 ; RV32-NEXT: add a1, a1, a2 ; RV32-NEXT: ret ; ; RV64-LABEL: explode_4xi64: ; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vslidedown.vi v10, v8, 1 +; RV64-NEXT: vmv.x.s a1, v10 ; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma ; RV64-NEXT: vslidedown.vi v10, v8, 2 -; RV64-NEXT: vmv.x.s a0, v10 -; RV64-NEXT: vslidedown.vi v10, v8, 3 -; RV64-NEXT: vmv.x.s a1, v10 -; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vredxor.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a2, v8 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: add a0, a2, a0 +; RV64-NEXT: vmv.x.s a2, v10 +; RV64-NEXT: vslidedown.vi v8, v8, 3 +; RV64-NEXT: vmv.x.s a3, v8 +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: add a0, a0, a2 ; RV64-NEXT: ret %e0 = extractelement <4 x i64> %v, i32 0 %e1 = extractelement <4 x i64> %v, i32 1 @@ -710,63 +853,71 @@ define i64 @explode_4xi64(<4 x i64> %v) { define i64 @explode_8xi64(<8 x i64> %v) { ; RV32-LABEL: explode_8xi64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma -; RV32-NEXT: vslidedown.vi v12, v8, 2 +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset s0, -4 ; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsrl.vx v16, v12, a0 -; RV32-NEXT: vmv.x.s a1, v16 -; RV32-NEXT: vmv.x.s a2, v12 -; RV32-NEXT: vslidedown.vi v12, v8, 3 +; RV32-NEXT: vsetivli zero, 1, e64, m4, ta, ma +; RV32-NEXT: vsrl.vx v12, v8, a0 +; RV32-NEXT: vmv.x.s a1, v12 +; RV32-NEXT: vmv.x.s a2, v8 +; RV32-NEXT: vslidedown.vi v12, v8, 1 ; RV32-NEXT: vsrl.vx v16, v12, a0 ; RV32-NEXT: vmv.x.s a3, v16 ; RV32-NEXT: vmv.x.s a4, v12 -; RV32-NEXT: vslidedown.vi v12, v8, 4 +; RV32-NEXT: vslidedown.vi v12, v8, 2 ; RV32-NEXT: vsrl.vx v16, v12, a0 ; RV32-NEXT: vmv.x.s a5, v16 ; RV32-NEXT: vmv.x.s a6, v12 -; RV32-NEXT: vslidedown.vi v12, v8, 5 +; RV32-NEXT: vslidedown.vi v12, v8, 3 ; RV32-NEXT: vsrl.vx v16, v12, a0 ; RV32-NEXT: vmv.x.s a7, v16 ; RV32-NEXT: vmv.x.s t0, v12 -; RV32-NEXT: vslidedown.vi v12, v8, 6 +; RV32-NEXT: vslidedown.vi v12, v8, 4 ; RV32-NEXT: vsrl.vx v16, v12, a0 ; RV32-NEXT: vmv.x.s t1, v16 ; RV32-NEXT: vmv.x.s 
t2, v12 -; RV32-NEXT: vslidedown.vi v12, v8, 7 +; RV32-NEXT: vslidedown.vi v12, v8, 5 ; RV32-NEXT: vsrl.vx v16, v12, a0 ; RV32-NEXT: vmv.x.s t3, v16 ; RV32-NEXT: vmv.x.s t4, v12 -; RV32-NEXT: vmv.s.x v9, zero -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vredxor.vs v8, v8, v9 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vsrl.vx v9, v8, a0 -; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: vmv.x.s t5, v8 -; RV32-NEXT: add a2, t5, a2 -; RV32-NEXT: sltu t5, a2, t5 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a0, a0, t5 -; RV32-NEXT: add a0, a0, a3 -; RV32-NEXT: add a4, a2, a4 -; RV32-NEXT: sltu a1, a4, a2 +; RV32-NEXT: vslidedown.vi v12, v8, 6 +; RV32-NEXT: vsrl.vx v16, v12, a0 +; RV32-NEXT: vmv.x.s t5, v16 +; RV32-NEXT: vmv.x.s t6, v12 +; RV32-NEXT: vslidedown.vi v8, v8, 7 +; RV32-NEXT: vsrl.vx v12, v8, a0 +; RV32-NEXT: vmv.x.s a0, v12 +; RV32-NEXT: vmv.x.s s0, v8 +; RV32-NEXT: xor a1, a1, a3 +; RV32-NEXT: xor a2, a2, a4 +; RV32-NEXT: add a6, a2, a6 +; RV32-NEXT: sltu a2, a6, a2 ; RV32-NEXT: add a1, a1, a5 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a6, a4, a6 -; RV32-NEXT: sltu a1, a6, a4 +; RV32-NEXT: add a1, a1, a2 ; RV32-NEXT: add a1, a1, a7 -; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: add t0, a6, t0 -; RV32-NEXT: sltu a1, t0, a6 -; RV32-NEXT: add a1, a1, t1 -; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: sltu a2, t0, a6 +; RV32-NEXT: add a2, a2, t1 +; RV32-NEXT: add a1, a1, a2 ; RV32-NEXT: add t2, t0, t2 -; RV32-NEXT: sltu a1, t2, t0 -; RV32-NEXT: add a1, a1, t3 -; RV32-NEXT: add a1, a0, a1 -; RV32-NEXT: add a0, t2, t4 -; RV32-NEXT: sltu a2, a0, t2 +; RV32-NEXT: sltu a2, t2, t0 +; RV32-NEXT: add a2, a2, t3 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add t4, t2, t4 +; RV32-NEXT: sltu a2, t4, t2 +; RV32-NEXT: add a2, a2, t5 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add t6, t4, t6 +; RV32-NEXT: sltu a2, t6, t4 +; RV32-NEXT: add a0, a2, a0 +; RV32-NEXT: add a1, a1, a0 +; RV32-NEXT: add a0, t6, s0 +; RV32-NEXT: sltu a2, a0, t6 ; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: explode_8xi64: @@ -780,28 +931,29 @@ define i64 @explode_8xi64(<8 x i64> %v) { ; RV64-NEXT: addi s0, sp, 128 ; RV64-NEXT: .cfi_def_cfa s0, 0 ; RV64-NEXT: andi sp, sp, -64 +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vslidedown.vi v12, v8, 1 +; RV64-NEXT: vmv.x.s a1, v12 ; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma ; RV64-NEXT: vslidedown.vi v12, v8, 2 -; RV64-NEXT: vmv.x.s a0, v12 +; RV64-NEXT: vmv.x.s a2, v12 ; RV64-NEXT: vslidedown.vi v12, v8, 3 -; RV64-NEXT: vmv.x.s a1, v12 -; RV64-NEXT: mv a2, sp +; RV64-NEXT: vmv.x.s a3, v12 +; RV64-NEXT: mv a4, sp ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vse64.v v8, (a2) -; RV64-NEXT: ld a2, 32(sp) -; RV64-NEXT: ld a3, 40(sp) -; RV64-NEXT: ld a4, 48(sp) -; RV64-NEXT: ld a5, 56(sp) -; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vredxor.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s a6, v8 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: add a0, a6, a0 +; RV64-NEXT: vse64.v v8, (a4) +; RV64-NEXT: ld a4, 32(sp) +; RV64-NEXT: ld a5, 40(sp) +; RV64-NEXT: ld a6, 48(sp) +; RV64-NEXT: ld a7, 56(sp) +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: add a2, a2, a3 ; RV64-NEXT: add a0, a0, a2 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: add a0, a0, a3 +; RV64-NEXT: add a0, a0, a4 +; RV64-NEXT: add a5, a5, a6 ; RV64-NEXT: add a0, a0, a5 +; RV64-NEXT: add a0, a0, a7 ; RV64-NEXT: addi sp, s0, 
-128 ; RV64-NEXT: ld ra, 120(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s0, 112(sp) # 8-byte Folded Reload @@ -856,130 +1008,130 @@ define i64 @explode_16xi64(<16 x i64> %v) { ; RV32-NEXT: .cfi_offset s9, -44 ; RV32-NEXT: .cfi_offset s10, -48 ; RV32-NEXT: .cfi_offset s11, -52 -; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma -; RV32-NEXT: vslidedown.vi v16, v8, 2 ; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s t6, v24 +; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vx v16, v8, a0 ; RV32-NEXT: vmv.x.s a1, v16 ; RV32-NEXT: sw a1, 8(sp) # 4-byte Folded Spill -; RV32-NEXT: vslidedown.vi v16, v8, 3 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s a1, v24 -; RV32-NEXT: sw a1, 4(sp) # 4-byte Folded Spill -; RV32-NEXT: vmv.x.s a2, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 4 -; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s s0, v24 -; RV32-NEXT: vmv.x.s a3, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 5 +; RV32-NEXT: vmv.x.s a2, v8 +; RV32-NEXT: vslidedown.vi v16, v8, 1 ; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s s1, v24 +; RV32-NEXT: vmv.x.s a3, v24 ; RV32-NEXT: vmv.x.s a4, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 6 +; RV32-NEXT: vslidedown.vi v16, v8, 2 ; RV32-NEXT: vsrl.vx v24, v16, a0 -; RV32-NEXT: vmv.x.s s2, v24 -; RV32-NEXT: vmv.x.s a5, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 7 +; RV32-NEXT: vmv.x.s a5, v24 +; RV32-NEXT: vmv.x.s a6, v16 +; RV32-NEXT: vslidedown.vi v16, v8, 3 +; RV32-NEXT: vsrl.vx v24, v16, a0 +; RV32-NEXT: vmv.x.s a7, v24 +; RV32-NEXT: vmv.x.s t0, v16 +; RV32-NEXT: vslidedown.vi v16, v8, 4 ; RV32-NEXT: vsrl.vx v24, v16, a0 ; RV32-NEXT: vmv.x.s s3, v24 -; RV32-NEXT: vmv.x.s a6, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 8 +; RV32-NEXT: vmv.x.s t1, v16 +; RV32-NEXT: vslidedown.vi v16, v8, 5 ; RV32-NEXT: vsrl.vx v24, v16, a0 ; RV32-NEXT: vmv.x.s s4, v24 -; RV32-NEXT: vmv.x.s a7, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 9 +; RV32-NEXT: vmv.x.s t2, v16 +; RV32-NEXT: vslidedown.vi v16, v8, 6 ; RV32-NEXT: vsrl.vx v24, v16, a0 ; RV32-NEXT: vmv.x.s s5, v24 -; RV32-NEXT: vmv.x.s t0, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 10 +; RV32-NEXT: vmv.x.s t3, v16 +; RV32-NEXT: vslidedown.vi v16, v8, 7 ; RV32-NEXT: vsrl.vx v24, v16, a0 ; RV32-NEXT: vmv.x.s s6, v24 -; RV32-NEXT: vmv.x.s t1, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 11 +; RV32-NEXT: vmv.x.s t4, v16 +; RV32-NEXT: vslidedown.vi v16, v8, 8 ; RV32-NEXT: vsrl.vx v24, v16, a0 ; RV32-NEXT: vmv.x.s s7, v24 -; RV32-NEXT: vmv.x.s t2, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 12 +; RV32-NEXT: vmv.x.s t5, v16 +; RV32-NEXT: vslidedown.vi v16, v8, 9 ; RV32-NEXT: vsrl.vx v24, v16, a0 ; RV32-NEXT: vmv.x.s s8, v24 -; RV32-NEXT: vmv.x.s t3, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 13 +; RV32-NEXT: vmv.x.s t6, v16 +; RV32-NEXT: vslidedown.vi v16, v8, 10 ; RV32-NEXT: vsrl.vx v24, v16, a0 ; RV32-NEXT: vmv.x.s s9, v24 -; RV32-NEXT: vmv.x.s t4, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 14 +; RV32-NEXT: vmv.x.s s0, v16 +; RV32-NEXT: vslidedown.vi v16, v8, 11 ; RV32-NEXT: vsrl.vx v24, v16, a0 ; RV32-NEXT: vmv.x.s s10, v24 -; RV32-NEXT: vmv.x.s t5, v16 -; RV32-NEXT: vslidedown.vi v16, v8, 15 +; RV32-NEXT: vmv.x.s s1, v16 +; RV32-NEXT: vslidedown.vi v16, v8, 12 ; RV32-NEXT: vsrl.vx v24, v16, a0 ; RV32-NEXT: vmv.x.s s11, v24 -; RV32-NEXT: vmv.s.x v9, zero +; RV32-NEXT: vmv.x.s s2, v16 +; RV32-NEXT: vslidedown.vi v24, v8, 13 +; RV32-NEXT: vsrl.vx v16, v24, a0 ; RV32-NEXT: vmv.x.s ra, v16 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV32-NEXT: vredxor.vs v8, v8, v9 -; RV32-NEXT: 
vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vsrl.vx v9, v8, a0 -; RV32-NEXT: vmv.x.s a0, v9 -; RV32-NEXT: add a1, a0, t6 -; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: lw t6, 8(sp) # 4-byte Folded Reload -; RV32-NEXT: add t6, a0, t6 -; RV32-NEXT: sltu a0, t6, a0 -; RV32-NEXT: add a0, a1, a0 -; RV32-NEXT: lw a1, 4(sp) # 4-byte Folded Reload -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a2, t6, a2 -; RV32-NEXT: sltu a1, a2, t6 -; RV32-NEXT: add a1, a1, s0 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a3, a2, a3 -; RV32-NEXT: sltu a1, a3, a2 -; RV32-NEXT: add a1, a1, s1 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a4, a3, a4 -; RV32-NEXT: sltu a1, a4, a3 -; RV32-NEXT: add a1, a1, s2 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a5, a4, a5 -; RV32-NEXT: sltu a1, a5, a4 -; RV32-NEXT: add a1, a1, s3 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a6, a5, a6 -; RV32-NEXT: sltu a1, a6, a5 -; RV32-NEXT: add a1, a1, s4 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a7, a6, a7 -; RV32-NEXT: sltu a1, a7, a6 -; RV32-NEXT: add a1, a1, s5 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add t0, a7, t0 -; RV32-NEXT: sltu a1, t0, a7 -; RV32-NEXT: add a1, a1, s6 -; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: vslidedown.vi v16, v8, 14 +; RV32-NEXT: vsrl.vx v0, v16, a0 +; RV32-NEXT: vslidedown.vi v8, v8, 15 +; RV32-NEXT: vmv.x.s a1, v24 +; RV32-NEXT: vsrl.vx v24, v8, a0 +; RV32-NEXT: lw a0, 8(sp) # 4-byte Folded Reload +; RV32-NEXT: xor a0, a0, a3 +; RV32-NEXT: xor a2, a2, a4 +; RV32-NEXT: add a0, a0, a5 +; RV32-NEXT: add a6, a2, a6 +; RV32-NEXT: sltu a2, a6, a2 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: add a0, a0, a7 +; RV32-NEXT: add t0, a6, t0 +; RV32-NEXT: sltu a2, t0, a6 +; RV32-NEXT: add a2, a2, s3 +; RV32-NEXT: add a0, a0, a2 ; RV32-NEXT: add t1, t0, t1 -; RV32-NEXT: sltu a1, t1, t0 -; RV32-NEXT: add a1, a1, s7 -; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: sltu a2, t1, t0 +; RV32-NEXT: add a2, a2, s4 +; RV32-NEXT: add a0, a0, a2 ; RV32-NEXT: add t2, t1, t2 -; RV32-NEXT: sltu a1, t2, t1 -; RV32-NEXT: add a1, a1, s8 -; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: sltu a2, t2, t1 +; RV32-NEXT: add a2, a2, s5 +; RV32-NEXT: add a0, a0, a2 ; RV32-NEXT: add t3, t2, t3 -; RV32-NEXT: sltu a1, t3, t2 -; RV32-NEXT: add a1, a1, s9 -; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: sltu a2, t3, t2 +; RV32-NEXT: add a2, a2, s6 +; RV32-NEXT: add a0, a0, a2 ; RV32-NEXT: add t4, t3, t4 -; RV32-NEXT: sltu a1, t4, t3 -; RV32-NEXT: add a1, a1, s10 -; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: sltu a2, t4, t3 +; RV32-NEXT: add a2, a2, s7 +; RV32-NEXT: add a0, a0, a2 ; RV32-NEXT: add t5, t4, t5 -; RV32-NEXT: sltu a1, t5, t4 -; RV32-NEXT: add a1, a1, s11 +; RV32-NEXT: sltu a2, t5, t4 +; RV32-NEXT: add a2, a2, s8 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: add t6, t5, t6 +; RV32-NEXT: sltu a2, t6, t5 +; RV32-NEXT: add a2, a2, s9 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: add s0, t6, s0 +; RV32-NEXT: sltu a2, s0, t6 +; RV32-NEXT: add a2, a2, s10 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: add s1, s0, s1 +; RV32-NEXT: sltu a2, s1, s0 +; RV32-NEXT: add a2, a2, s11 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: add s2, s1, s2 +; RV32-NEXT: sltu a2, s2, s1 +; RV32-NEXT: add a2, a2, ra +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: vmv.x.s a2, v0 +; RV32-NEXT: add a1, s2, a1 +; RV32-NEXT: sltu a3, a1, s2 +; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: vmv.x.s a3, v16 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: vmv.x.s a2, v24 +; RV32-NEXT: add a3, a1, a3 +; RV32-NEXT: sltu a1, a3, a1 +; RV32-NEXT: add a1, a1, a2 ; RV32-NEXT: add a1, a0, a1 -; RV32-NEXT: add a0, t5, ra -; 
RV32-NEXT: sltu a2, a0, t5 +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: add a0, a3, a0 +; RV32-NEXT: sltu a2, a0, a3 ; RV32-NEXT: add a1, a1, a2 ; RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload @@ -1003,52 +1155,56 @@ define i64 @explode_16xi64(<16 x i64> %v) { ; RV64-NEXT: .cfi_def_cfa_offset 256 ; RV64-NEXT: sd ra, 248(sp) # 8-byte Folded Spill ; RV64-NEXT: sd s0, 240(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s2, 232(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 ; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: .cfi_offset s2, -24 ; RV64-NEXT: addi s0, sp, 256 ; RV64-NEXT: .cfi_def_cfa s0, 0 ; RV64-NEXT: andi sp, sp, -128 +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64-NEXT: vmv.x.s a0, v8 +; RV64-NEXT: vslidedown.vi v16, v8, 1 +; RV64-NEXT: vmv.x.s a1, v16 ; RV64-NEXT: vsetivli zero, 1, e64, m2, ta, ma ; RV64-NEXT: vslidedown.vi v16, v8, 2 -; RV64-NEXT: vmv.x.s a0, v16 +; RV64-NEXT: vmv.x.s a2, v16 ; RV64-NEXT: vslidedown.vi v16, v8, 3 -; RV64-NEXT: vmv.x.s a1, v16 -; RV64-NEXT: mv a2, sp +; RV64-NEXT: vmv.x.s a3, v16 +; RV64-NEXT: mv a4, sp ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vse64.v v8, (a2) -; RV64-NEXT: ld a2, 32(sp) -; RV64-NEXT: ld a3, 40(sp) -; RV64-NEXT: ld a4, 48(sp) -; RV64-NEXT: ld a5, 56(sp) -; RV64-NEXT: ld a6, 64(sp) -; RV64-NEXT: ld a7, 72(sp) -; RV64-NEXT: ld t0, 80(sp) -; RV64-NEXT: ld t1, 88(sp) -; RV64-NEXT: ld t2, 96(sp) -; RV64-NEXT: ld t3, 104(sp) -; RV64-NEXT: ld t4, 112(sp) -; RV64-NEXT: ld t5, 120(sp) -; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vredxor.vs v8, v8, v9 -; RV64-NEXT: vmv.x.s t6, v8 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: add a0, t6, a0 +; RV64-NEXT: vse64.v v8, (a4) +; RV64-NEXT: ld a4, 32(sp) +; RV64-NEXT: ld a5, 40(sp) +; RV64-NEXT: ld a6, 48(sp) +; RV64-NEXT: ld a7, 56(sp) +; RV64-NEXT: ld t0, 64(sp) +; RV64-NEXT: ld t1, 72(sp) +; RV64-NEXT: ld t2, 80(sp) +; RV64-NEXT: ld t3, 88(sp) +; RV64-NEXT: ld t4, 96(sp) +; RV64-NEXT: ld t5, 104(sp) +; RV64-NEXT: ld t6, 112(sp) +; RV64-NEXT: ld s2, 120(sp) +; RV64-NEXT: xor a0, a0, a1 +; RV64-NEXT: add a2, a2, a3 ; RV64-NEXT: add a0, a0, a2 -; RV64-NEXT: add a3, a3, a4 -; RV64-NEXT: add a0, a0, a3 +; RV64-NEXT: add a0, a0, a4 ; RV64-NEXT: add a5, a5, a6 -; RV64-NEXT: add a5, a5, a7 ; RV64-NEXT: add a0, a0, a5 -; RV64-NEXT: add t0, t0, t1 -; RV64-NEXT: add t0, t0, t2 -; RV64-NEXT: add t0, t0, t3 -; RV64-NEXT: add a0, a0, t0 -; RV64-NEXT: add t4, t4, t5 -; RV64-NEXT: add a0, a0, t4 +; RV64-NEXT: add a7, a7, t0 +; RV64-NEXT: add a7, a7, t1 +; RV64-NEXT: add a0, a0, a7 +; RV64-NEXT: add t2, t2, t3 +; RV64-NEXT: add t2, t2, t4 +; RV64-NEXT: add t2, t2, t5 +; RV64-NEXT: add a0, a0, t2 +; RV64-NEXT: add t6, t6, s2 +; RV64-NEXT: add a0, a0, t6 ; RV64-NEXT: addi sp, s0, -256 ; RV64-NEXT: ld ra, 248(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s0, 240(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s2, 232(sp) # 8-byte Folded Reload ; RV64-NEXT: addi sp, sp, 256 ; RV64-NEXT: ret %e0 = extractelement <16 x i64> %v, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll index 8c96392f08a5d..173b70def03d4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-formation.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v,+m,+zbb 
-verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+v,+m,+zbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+v,+m -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+v,+m -verify-machineinstrs < %s | FileCheck %s
 
 define i32 @reduce_sum_2xi32(<2 x i32> %v) {
 ; CHECK-LABEL: reduce_sum_2xi32:
@@ -448,336 +448,3 @@ define i32 @reduce_sum_16xi32_prefix15(ptr %p) {
 %add13 = add i32 %add12, %e14
 ret i32 %add13
 }
-
-;; Most of the cornercases are exercised above, the following just
-;; makes sure that other opcodes work as expected.
-
-define i32 @reduce_xor_16xi32_prefix2(ptr %p) {
-; CHECK-LABEL: reduce_xor_16xi32_prefix2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vmv.s.x v9, zero
-; CHECK-NEXT: vredxor.vs v8, v8, v9
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
- %v = load <16 x i32>, ptr %p, align 256
- %e0 = extractelement <16 x i32> %v, i32 0
- %e1 = extractelement <16 x i32> %v, i32 1
- %xor0 = xor i32 %e0, %e1
- ret i32 %xor0
-}
-
-define i32 @reduce_xor_16xi32_prefix5(ptr %p) {
-; CHECK-LABEL: reduce_xor_16xi32_prefix5:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a1, 224
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.s.x v0, a1
-; CHECK-NEXT: vmv.v.i v8, -1
-; CHECK-NEXT: vmerge.vim v8, v8, 0, v0
-; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vle32.v v10, (a0)
-; CHECK-NEXT: vsext.vf4 v12, v8
-; CHECK-NEXT: vand.vv v8, v10, v12
-; CHECK-NEXT: vmv.s.x v10, zero
-; CHECK-NEXT: vredxor.vs v8, v8, v10
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
- %v = load <16 x i32>, ptr %p, align 256
- %e0 = extractelement <16 x i32> %v, i32 0
- %e1 = extractelement <16 x i32> %v, i32 1
- %e2 = extractelement <16 x i32> %v, i32 2
- %e3 = extractelement <16 x i32> %v, i32 3
- %e4 = extractelement <16 x i32> %v, i32 4
- %xor0 = xor i32 %e0, %e1
- %xor1 = xor i32 %xor0, %e2
- %xor2 = xor i32 %xor1, %e3
- %xor3 = xor i32 %xor2, %e4
- ret i32 %xor3
-}
-
-define i32 @reduce_and_16xi32_prefix2(ptr %p) {
-; CHECK-LABEL: reduce_and_16xi32_prefix2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vredand.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
- %v = load <16 x i32>, ptr %p, align 256
- %e0 = extractelement <16 x i32> %v, i32 0
- %e1 = extractelement <16 x i32> %v, i32 1
- %and0 = and i32 %e0, %e1
- ret i32 %and0
-}
-
-define i32 @reduce_and_16xi32_prefix5(ptr %p) {
-; CHECK-LABEL: reduce_and_16xi32_prefix5:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, -1
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vle32.v v10, (a0)
-; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v10, v8, 5
-; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v10, v8, 6
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vslideup.vi v10, v8, 7
-; CHECK-NEXT: vredand.vs v8, v10, v10
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
- %v = load <16 x i32>, ptr %p, align 256
- %e0 = extractelement <16 x i32> %v, i32 0
- %e1 = extractelement <16 x i32> %v, i32 1
- %e2 = extractelement <16 x i32> %v, i32 2
- %e3 = extractelement <16 x i32> %v, i32 3
- %e4 = extractelement <16 x i32> %v, i32 4
- %and0 = and i32 %e0, %e1
- %and1 = and i32 %and0, %e2
- %and2 = and i32 %and1, %e3
- %and3 = and i32 %and2, %e4
- ret i32 %and3
-}
-
-define i32 @reduce_or_16xi32_prefix2(ptr %p) {
-; CHECK-LABEL: reduce_or_16xi32_prefix2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vredor.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
- %v = load <16 x i32>, ptr %p, align 256
- %e0 = extractelement <16 x i32> %v, i32 0
- %e1 = extractelement <16 x i32> %v, i32 1
- %or0 = or i32 %e0, %e1
- ret i32 %or0
-}
-
-define i32 @reduce_or_16xi32_prefix5(ptr %p) {
-; CHECK-LABEL: reduce_or_16xi32_prefix5:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a1, 224
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.s.x v0, a1
-; CHECK-NEXT: vmv.v.i v8, -1
-; CHECK-NEXT: vmerge.vim v8, v8, 0, v0
-; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vle32.v v10, (a0)
-; CHECK-NEXT: vsext.vf4 v12, v8
-; CHECK-NEXT: vand.vv v8, v10, v12
-; CHECK-NEXT: vredor.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
- %v = load <16 x i32>, ptr %p, align 256
- %e0 = extractelement <16 x i32> %v, i32 0
- %e1 = extractelement <16 x i32> %v, i32 1
- %e2 = extractelement <16 x i32> %v, i32 2
- %e3 = extractelement <16 x i32> %v, i32 3
- %e4 = extractelement <16 x i32> %v, i32 4
- %or0 = or i32 %e0, %e1
- %or1 = or i32 %or0, %e2
- %or2 = or i32 %or1, %e3
- %or3 = or i32 %or2, %e4
- ret i32 %or3
-}
-
-declare i32 @llvm.smax.i32(i32 %a, i32 %b)
-declare i32 @llvm.smin.i32(i32 %a, i32 %b)
-declare i32 @llvm.umax.i32(i32 %a, i32 %b)
-declare i32 @llvm.umin.i32(i32 %a, i32 %b)
-
-define i32 @reduce_smax_16xi32_prefix2(ptr %p) {
-; CHECK-LABEL: reduce_smax_16xi32_prefix2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vredmax.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
- %v = load <16 x i32>, ptr %p, align 256
- %e0 = extractelement <16 x i32> %v, i32 0
- %e1 = extractelement <16 x i32> %v, i32 1
- %smax0 = call i32 @llvm.smax.i32(i32 %e0, i32 %e1)
- ret i32 %smax0
-}
-
-define i32 @reduce_smax_16xi32_prefix5(ptr %p) {
-; CHECK-LABEL: reduce_smax_16xi32_prefix5:
-; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, 524288
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vmv.s.x v10, a1
-; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v10, 5
-; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v10, 6
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v10, 7
-; CHECK-NEXT: vredmax.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
- %v = load <16 x i32>, ptr %p, align 256
- %e0 = extractelement <16 x i32> %v, i32 0
- %e1 = extractelement <16 x i32> %v, i32 1
- %e2 = extractelement <16 x i32> %v, i32 2
- %e3 = extractelement <16 x i32> %v, i32 3
- %e4 = extractelement <16 x i32> %v, i32 4
- %smax0 = call i32 @llvm.smax.i32(i32 %e0, i32 %e1)
- %smax1 = call i32 @llvm.smax.i32(i32 %smax0, i32 %e2)
- %smax2 = call i32 @llvm.smax.i32(i32 %smax1, i32 %e3)
- %smax3 = call i32 @llvm.smax.i32(i32 %smax2, i32 %e4)
- ret i32 %smax3
-}
-
-define i32 @reduce_smin_16xi32_prefix2(ptr %p) {
-; CHECK-LABEL: reduce_smin_16xi32_prefix2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vredmin.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
- %v = load <16 x i32>, ptr %p, align 256
- %e0 = extractelement <16 x i32> %v, i32 0
- %e1 = extractelement <16 x i32> %v, i32 1
- %smin0 = call i32 @llvm.smin.i32(i32 %e0, i32 %e1)
- ret i32 %smin0
-}
-
-define i32 @reduce_smin_16xi32_prefix5(ptr %p) {
-; RV32-LABEL: reduce_smin_16xi32_prefix5:
-; RV32: # %bb.0:
-; RV32-NEXT: lui a1, 524288
-; RV32-NEXT: addi a1, a1, -1
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vle32.v v8, (a0)
-; RV32-NEXT: vmv.s.x v10, a1
-; RV32-NEXT: vsetivli zero, 6, e32, m2, tu, ma
-; RV32-NEXT: vslideup.vi v8, v10, 5
-; RV32-NEXT: vsetivli zero, 7, e32, m2, tu, ma
-; RV32-NEXT: vslideup.vi v8, v10, 6
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vslideup.vi v8, v10, 7
-; RV32-NEXT: vredmin.vs v8, v8, v8
-; RV32-NEXT: vmv.x.s a0, v8
-; RV32-NEXT: ret
-;
-; RV64-LABEL: reduce_smin_16xi32_prefix5:
-; RV64: # %bb.0:
-; RV64-NEXT: lui a1, 524288
-; RV64-NEXT: addiw a1, a1, -1
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64-NEXT: vle32.v v8, (a0)
-; RV64-NEXT: vmv.s.x v10, a1
-; RV64-NEXT: vsetivli zero, 6, e32, m2, tu, ma
-; RV64-NEXT: vslideup.vi v8, v10, 5
-; RV64-NEXT: vsetivli zero, 7, e32, m2, tu, ma
-; RV64-NEXT: vslideup.vi v8, v10, 6
-; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV64-NEXT: vslideup.vi v8, v10, 7
-; RV64-NEXT: vredmin.vs v8, v8, v8
-; RV64-NEXT: vmv.x.s a0, v8
-; RV64-NEXT: ret
- %v = load <16 x i32>, ptr %p, align 256
- %e0 = extractelement <16 x i32> %v, i32 0
- %e1 = extractelement <16 x i32> %v, i32 1
- %e2 = extractelement <16 x i32> %v, i32 2
- %e3 = extractelement <16 x i32> %v, i32 3
- %e4 = extractelement <16 x i32> %v, i32 4
- %smin0 = call i32 @llvm.smin.i32(i32 %e0, i32 %e1)
- %smin1 = call i32 @llvm.smin.i32(i32 %smin0, i32 %e2)
- %smin2 = call i32 @llvm.smin.i32(i32 %smin1, i32 %e3)
- %smin3 = call i32 @llvm.smin.i32(i32 %smin2, i32 %e4)
- ret i32 %smin3
-}
-
-define i32 @reduce_umax_16xi32_prefix2(ptr %p) {
-; CHECK-LABEL: reduce_umax_16xi32_prefix2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vredmaxu.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
- %v = load <16 x i32>, ptr %p, align 256
- %e0 = extractelement <16 x i32> %v, i32 0
- %e1 = extractelement <16 x i32> %v, i32 1
- %umax0 = call i32 @llvm.umax.i32(i32 %e0, i32 %e1)
- ret i32 %umax0
-}
-
-define i32 @reduce_umax_16xi32_prefix5(ptr %p) {
-; CHECK-LABEL: reduce_umax_16xi32_prefix5:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a1, 224
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vmv.s.x v0, a1
-; CHECK-NEXT: vmv.v.i v8, -1
-; CHECK-NEXT: vmerge.vim v8, v8, 0, v0
-; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vle32.v v10, (a0)
-; CHECK-NEXT: vsext.vf4 v12, v8
-; CHECK-NEXT: vand.vv v8, v10, v12
-; CHECK-NEXT: vredmaxu.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
- %v = load <16 x i32>, ptr %p, align 256
- %e0 = extractelement <16 x i32> %v, i32 0
- %e1 = extractelement <16 x i32> %v, i32 1
- %e2 = extractelement <16 x i32> %v, i32 2
- %e3 = extractelement <16 x i32> %v, i32 3
- %e4 = extractelement <16 x i32> %v, i32 4
- %umax0 = call i32 @llvm.umax.i32(i32 %e0, i32 %e1)
- %umax1 = call i32 @llvm.umax.i32(i32 %umax0, i32 %e2)
- %umax2 = call i32 @llvm.umax.i32(i32 %umax1, i32 %e3)
- %umax3 = call i32 @llvm.umax.i32(i32 %umax2, i32 %e4)
- ret i32 %umax3
-}
-
-define i32 @reduce_umin_16xi32_prefix2(ptr %p) {
-; CHECK-LABEL: reduce_umin_16xi32_prefix2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vredminu.vs v8, v8, v8
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
- %v = load <16 x i32>, ptr %p, align 256
- %e0 = extractelement <16 x i32> %v, i32 0
- %e1 = extractelement <16 x i32> %v, i32 1
- %umin0 = call i32 @llvm.umin.i32(i32 %e0, i32 %e1)
- ret i32 %umin0
-}
-
-define i32 @reduce_umin_16xi32_prefix5(ptr %p) {
-; CHECK-LABEL: reduce_umin_16xi32_prefix5:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v8, -1
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vle32.v v10, (a0)
-; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v10, v8, 5
-; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, ma
-; CHECK-NEXT: vslideup.vi v10, v8, 6
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT: vslideup.vi v10, v8, 7
-; CHECK-NEXT: vredminu.vs v8, v10, v10
-; CHECK-NEXT: vmv.x.s a0, v8
-; CHECK-NEXT: ret
- %v = load <16 x i32>, ptr %p, align 256
- %e0 = extractelement <16 x i32> %v, i32 0
- %e1 = extractelement <16 x i32> %v, i32 1
- %e2 = extractelement <16 x i32> %v, i32 2
- %e3 = extractelement <16 x i32> %v, i32 3
- %e4 = extractelement <16 x i32> %v, i32 4
- %umin0 = call i32 @llvm.umin.i32(i32 %e0, i32 %e1)
- %umin1 = call i32 @llvm.umin.i32(i32 %umin0, i32 %e2)
- %umin2 = call i32 @llvm.umin.i32(i32 %umin1, i32 %e3)
- %umin3 = call i32 @llvm.umin.i32(i32 %umin2, i32 %e4)
- ret i32 %umin3
-}