diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 5b2d185594f44..18d3e66bc0763 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -340,6 +340,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
          {MVT::v16i8, MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v8i16, MVT::v4i16,
           MVT::v2i16, MVT::v4i32, MVT::v2i32, MVT::v2i64}) {
       setOperationAction(ISD::TRUNCATE, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
     }
   }

@@ -377,6 +378,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
       setOperationAction(ISD::ABDS, VT, Legal);
       setOperationAction(ISD::ABDU, VT, Legal);
+      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
     }
     for (MVT VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32})
       setOperationAction(ISD::BITREVERSE, VT, Custom);
@@ -522,10 +524,62 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
     return lowerFP_TO_BF16(Op, DAG);
   case ISD::BF16_TO_FP:
     return lowerBF16_TO_FP(Op, DAG);
+  case ISD::VECREDUCE_ADD:
+    return lowerVECREDUCE_ADD(Op, DAG);
   }
   return SDValue();
 }

+// Lower vecreduce_add using vhaddw instructions.
+// For example:
+//   call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
+// can be lowered to:
+//   VHADDW_D_W vr0, vr0, vr0
+//   VHADDW_Q_D vr0, vr0, vr0
+//   VPICKVE2GR_D a0, vr0, 0
+//   ADDI_W a0, a0, 0
+SDValue LoongArchTargetLowering::lowerVECREDUCE_ADD(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+
+  SDLoc DL(Op);
+  MVT OpVT = Op.getSimpleValueType();
+  SDValue Val = Op.getOperand(0);
+
+  unsigned NumEles = Val.getSimpleValueType().getVectorNumElements();
+  unsigned EleBits = Val.getSimpleValueType().getScalarSizeInBits();
+
+  unsigned LegalVecSize = 128;
+  bool isLASX256Vector =
+      Subtarget.hasExtLASX() && Val.getValueSizeInBits() == 256;
+
+  // Widen the operand vector until its type becomes legal.
+  while (!isTypeLegal(Val.getSimpleValueType())) {
+    Val = DAG.WidenVector(Val, DL);
+  }
+
+  // NumEles determines the number of reduction iterations; v4i32 on LSX and
+  // v8i32 on LASX should take the same number of iterations.
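+  // For instance, with a v8i32 input on LASX, NumEles is halved to 4, so the
+  // loop emits VHADDW first on v8i32 and then on v4i64; the XVPERMI/ADD pair
+  // below then folds the upper 128-bit half into the low half before the
+  // final element extract.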
+  if (isLASX256Vector) {
+    NumEles /= 2;
+    LegalVecSize = 256;
+  }
+
+  for (unsigned i = 1; i < NumEles; i *= 2, EleBits *= 2) {
+    MVT IntTy = MVT::getIntegerVT(EleBits);
+    MVT VecTy = MVT::getVectorVT(IntTy, LegalVecSize / EleBits);
+    Val = DAG.getNode(LoongArchISD::VHADDW, DL, VecTy, Val, Val);
+  }
+
+  if (isLASX256Vector) {
+    SDValue Tmp = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, Val,
+                              DAG.getConstant(2, DL, MVT::i64));
+    Val = DAG.getNode(ISD::ADD, DL, MVT::v4i64, Tmp, Val);
+  }
+
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Val,
+                     DAG.getConstant(0, DL, Subtarget.getGRLenVT()));
+}
+
 SDValue LoongArchTargetLowering::lowerPREFETCH(SDValue Op,
                                                SelectionDAG &DAG) const {
   unsigned IsData = Op.getConstantOperandVal(4);
@@ -6659,6 +6713,7 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
     NODE_NAME_CASE(XVMSKGEZ)
     NODE_NAME_CASE(XVMSKEQZ)
     NODE_NAME_CASE(XVMSKNEZ)
+    NODE_NAME_CASE(VHADDW)
   }
 #undef NODE_NAME_CASE
   return nullptr;
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index f79ba7450cc36..40e237b1c69e4 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -177,6 +177,9 @@ enum NodeType : unsigned {
   XVMSKEQZ,
   XVMSKNEZ,

+  // Vector Horizontal Addition with Widening
+  VHADDW
+
   // Intrinsic operations end =============================================
 };
 } // end namespace LoongArchISD
@@ -386,6 +389,7 @@ class LoongArchTargetLowering : public TargetLowering {
   SDValue lowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerBF16_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerVECREDUCE_ADD(SDValue Op, SelectionDAG &DAG) const;
   bool isFPImmLegal(const APFloat &Imm, EVT VT,
                     bool ForCodeSize) const override;
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index 0696b11d62ac9..cf63750461edd 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1186,6 +1186,17 @@ multiclass PatXrXrXr<SDPatternOperator OpNode, string Inst> {
             (!cast<LAInst>(Inst#"_D") LASX256:$xd, LASX256:$xj, LASX256:$xk)>;
 }

+multiclass PatXrXrW<SDPatternOperator OpNode, string Inst> {
+  def : Pat<(OpNode (v32i8 LASX256:$vj), (v32i8 LASX256:$vk)),
+            (!cast<LAInst>(Inst#"_H_B") LASX256:$vj, LASX256:$vk)>;
+  def : Pat<(OpNode (v16i16 LASX256:$vj), (v16i16 LASX256:$vk)),
+            (!cast<LAInst>(Inst#"_W_H") LASX256:$vj, LASX256:$vk)>;
+  def : Pat<(OpNode (v8i32 LASX256:$vj), (v8i32 LASX256:$vk)),
+            (!cast<LAInst>(Inst#"_D_W") LASX256:$vj, LASX256:$vk)>;
+  def : Pat<(OpNode (v4i64 LASX256:$vj), (v4i64 LASX256:$vk)),
+            (!cast<LAInst>(Inst#"_Q_D") LASX256:$vj, LASX256:$vk)>;
+}
+
 multiclass PatShiftXrXr<SDPatternOperator OpNode, string Inst> {
   def : Pat<(OpNode (v32i8 LASX256:$xj), (and vsplati8_imm_eq_7,
                                               (v32i8 LASX256:$xk))),
@@ -1513,6 +1524,9 @@ def : Pat<(bswap (v8i32 LASX256:$xj)), (XVSHUF4I_B LASX256:$xj, 0b00011011)>;
 def : Pat<(bswap (v4i64 LASX256:$xj)),
           (XVSHUF4I_W (XVSHUF4I_B LASX256:$xj, 0b00011011), 0b10110001)>;

+// XVHADDW_{H_B/W_H/D_W/Q_D}
+defm : PatXrXrW<loongarch_vhaddw, "XVHADDW">;
+
 // XVFADD_{S/D}
 defm : PatXrXrF<fadd, "XVFADD">;

diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index 3c9defb0366ff..168a60004fc0f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -71,6 +71,8 @@ def loongarch_vsrli : SDNode<"LoongArchISD::VSRLI", SDT_LoongArchV1RUimm>;
 def loongarch_vbsll : SDNode<"LoongArchISD::VBSLL", SDT_LoongArchV1RUimm>;
SDNode<"LoongArchISD::VBSLL", SDT_LoongArchV1RUimm>; def loongarch_vbsrl : SDNode<"LoongArchISD::VBSRL", SDT_LoongArchV1RUimm>; +def loongarch_vhaddw : SDNode<"LoongArchISD::VHADDW", SDT_LoongArchV2R>; + def loongarch_vldrepl : SDNode<"LoongArchISD::VLDREPL", SDT_LoongArchVLDREPL, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; @@ -1364,6 +1366,17 @@ multiclass PatVrVrVr { (!cast(Inst#"_D") LSX128:$vd, LSX128:$vj, LSX128:$vk)>; } +multiclass PatVrVrW { + def : Pat<(OpNode(v16i8 LSX128:$vj), (v16i8 LSX128:$vk)), + (!cast(Inst#"_H_B") LSX128:$vj, LSX128:$vk)>; + def : Pat<(OpNode(v8i16 LSX128:$vj), (v8i16 LSX128:$vk)), + (!cast(Inst#"_W_H") LSX128:$vj, LSX128:$vk)>; + def : Pat<(OpNode(v4i32 LSX128:$vj), (v4i32 LSX128:$vk)), + (!cast(Inst#"_D_W") LSX128:$vj, LSX128:$vk)>; + def : Pat<(OpNode(v2i64 LSX128:$vj), (v2i64 LSX128:$vk)), + (!cast(Inst#"_Q_D") LSX128:$vj, LSX128:$vk)>; +} + multiclass PatShiftVrVr { def : Pat<(OpNode (v16i8 LSX128:$vj), (and vsplati8_imm_eq_7, (v16i8 LSX128:$vk))), @@ -1709,6 +1722,9 @@ def : Pat<(bswap (v4i32 LSX128:$vj)), (VSHUF4I_B LSX128:$vj, 0b00011011)>; def : Pat<(bswap (v2i64 LSX128:$vj)), (VSHUF4I_W (VSHUF4I_B LSX128:$vj, 0b00011011), 0b10110001)>; +// VHADDW_{H_B/W_H/D_W/Q_D} +defm : PatVrVrW; + // VFADD_{S/D} defm : PatVrVrF; diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp index ede5477f04bda..efe898c33072e 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp @@ -95,4 +95,13 @@ unsigned LoongArchTTIImpl::getPrefetchDistance() const { return 200; } bool LoongArchTTIImpl::enableWritePrefetching() const { return true; } +bool LoongArchTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const { + switch (II->getIntrinsicID()) { + default: + return true; + case Intrinsic::vector_reduce_add: + return false; + } +} + // TODO: Implement more hooks to provide TTI machinery for LoongArch. diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h index d43d2cb0eb124..e3f16c7804994 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h @@ -53,6 +53,8 @@ class LoongArchTTIImpl : public BasicTTIImplBase { unsigned getPrefetchDistance() const override; bool enableWritePrefetching() const override; + bool shouldExpandReduction(const IntrinsicInst *II) const override; + // TODO: Implement more hooks to provide TTI machinery for LoongArch. 
}; diff --git a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-add.ll b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-add.ll index bf5effd7b3912..7268eb24ee51c 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-add.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-add.ll @@ -1,27 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 - ; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s define void @vec_reduce_add_v32i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_add_v32i8: ; CHECK: # %bb.0: ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78 -; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 228 -; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1 -; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68 -; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8 -; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1 -; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68 -; CHECK-NEXT: xvsrli.d $xr1, $xr1, 32 -; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1 -; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68 -; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 14 -; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1 -; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68 -; CHECK-NEXT: xvrepl128vei.b $xr1, $xr1, 1 -; CHECK-NEXT: xvadd.b $xr0, $xr0, $xr1 -; CHECK-NEXT: xvstelm.b $xr0, $a1, 0, 0 +; CHECK-NEXT: xvhaddw.h.b $xr0, $xr0, $xr0 +; CHECK-NEXT: xvhaddw.w.h $xr0, $xr0, $xr0 +; CHECK-NEXT: xvhaddw.d.w $xr0, $xr0, $xr0 +; CHECK-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 2 +; CHECK-NEXT: xvadd.d $xr0, $xr1, $xr0 +; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0 +; CHECK-NEXT: st.b $a0, $a1, 0 ; CHECK-NEXT: ret %v = load <32 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %v) @@ -33,19 +24,13 @@ define void @vec_reduce_add_v16i16(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_add_v16i16: ; CHECK: # %bb.0: ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78 -; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 228 -; CHECK-NEXT: xvadd.h $xr0, $xr0, $xr1 -; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68 -; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8 -; CHECK-NEXT: xvadd.h $xr0, $xr0, $xr1 -; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68 -; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 14 -; CHECK-NEXT: xvadd.h $xr0, $xr0, $xr1 -; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68 -; CHECK-NEXT: xvrepl128vei.h $xr1, $xr1, 1 -; CHECK-NEXT: xvadd.h $xr0, $xr0, $xr1 -; CHECK-NEXT: xvstelm.h $xr0, $a1, 0, 0 +; CHECK-NEXT: xvhaddw.w.h $xr0, $xr0, $xr0 +; CHECK-NEXT: xvhaddw.d.w $xr0, $xr0, $xr0 +; CHECK-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 2 +; CHECK-NEXT: xvadd.d $xr0, $xr1, $xr0 +; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0 +; CHECK-NEXT: st.h $a0, $a1, 0 ; CHECK-NEXT: ret %v = load <16 x i16>, ptr %src %res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %v) @@ -57,16 +42,12 @@ define void @vec_reduce_add_v8i32(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_add_v8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78 -; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 228 -; CHECK-NEXT: xvadd.w $xr0, $xr0, $xr1 -; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68 -; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 14 -; CHECK-NEXT: xvadd.w $xr0, $xr0, $xr1 -; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68 -; CHECK-NEXT: xvrepl128vei.w $xr1, $xr1, 1 -; CHECK-NEXT: xvadd.w $xr0, $xr0, $xr1 -; CHECK-NEXT: xvstelm.w $xr0, $a1, 0, 0 +; CHECK-NEXT: xvhaddw.d.w $xr0, $xr0, $xr0 +; CHECK-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 2 +; CHECK-NEXT: xvadd.d $xr0, $xr1, $xr0 +; 
CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0 +; CHECK-NEXT: st.w $a0, $a1, 0 ; CHECK-NEXT: ret %v = load <8 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v) @@ -78,14 +59,9 @@ define void @vec_reduce_add_v4i64(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_add_v4i64: ; CHECK: # %bb.0: ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0) -; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI3_0) -; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78 -; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr2 -; CHECK-NEXT: xvadd.d $xr0, $xr0, $xr1 -; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68 -; CHECK-NEXT: xvrepl128vei.d $xr1, $xr1, 1 -; CHECK-NEXT: xvadd.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 2 +; CHECK-NEXT: xvadd.d $xr0, $xr1, $xr0 ; CHECK-NEXT: xvstelm.d $xr0, $a1, 0, 0 ; CHECK-NEXT: ret %v = load <4 x i64>, ptr %src @@ -93,4 +69,3 @@ define void @vec_reduce_add_v4i64(ptr %src, ptr %dst) nounwind { store i64 %res, ptr %dst ret void } - diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-add.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-add.ll index a71bdea917cba..57fd09ed2e09b 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-add.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-add.ll @@ -5,15 +5,12 @@ define void @vec_reduce_add_v16i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_add_v16i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vadd.b $vr0, $vr0, $vr1 -; CHECK-NEXT: vsrli.d $vr1, $vr0, 32 -; CHECK-NEXT: vadd.b $vr0, $vr0, $vr1 -; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14 -; CHECK-NEXT: vadd.b $vr0, $vr0, $vr1 -; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1 -; CHECK-NEXT: vadd.b $vr0, $vr0, $vr1 -; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0 +; CHECK-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 +; CHECK-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; CHECK-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; CHECK-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 +; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0 +; CHECK-NEXT: st.b $a0, $a1, 0 ; CHECK-NEXT: ret %v = load <16 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v) @@ -21,17 +18,62 @@ define void @vec_reduce_add_v16i8(ptr %src, ptr %dst) nounwind { ret void } +define void @vec_reduce_add_v8i8(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: vec_reduce_add_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.d $a0, $a0, 0 +; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; CHECK-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 +; CHECK-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; CHECK-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 0 +; CHECK-NEXT: st.b $a0, $a1, 0 +; CHECK-NEXT: ret + %v = load <8 x i8>, ptr %src + %res = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v) + store i8 %res, ptr %dst + ret void +} + +define void @vec_reduce_add_v4i8(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: vec_reduce_add_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.w $a0, $a0, 0 +; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0 +; CHECK-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 +; CHECK-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; CHECK-NEXT: vpickve2gr.h $a0, $vr0, 0 +; CHECK-NEXT: st.b $a0, $a1, 0 +; CHECK-NEXT: ret + %v = load <4 x i8>, ptr %src + %res = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %v) + store i8 %res, ptr %dst + ret void +} + +define void @vec_reduce_add_v2i8(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: vec_reduce_add_v2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.h $a0, $a0, 0 +; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0 +; CHECK-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 +; 
CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0 +; CHECK-NEXT: ret + %v = load <2 x i8>, ptr %src + %res = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %v) + store i8 %res, ptr %dst + ret void +} + define void @vec_reduce_add_v8i16(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_add_v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vadd.h $vr0, $vr0, $vr1 -; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14 -; CHECK-NEXT: vadd.h $vr0, $vr0, $vr1 -; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1 -; CHECK-NEXT: vadd.h $vr0, $vr0, $vr1 -; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0 +; CHECK-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; CHECK-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; CHECK-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 +; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0 +; CHECK-NEXT: st.h $a0, $a1, 0 ; CHECK-NEXT: ret %v = load <8 x i16>, ptr %src %res = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v) @@ -39,15 +81,44 @@ define void @vec_reduce_add_v8i16(ptr %src, ptr %dst) nounwind { ret void } +define void @vec_reduce_add_v4i16(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: vec_reduce_add_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.d $a0, $a0, 0 +; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; CHECK-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; CHECK-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 0 +; CHECK-NEXT: st.h $a0, $a1, 0 +; CHECK-NEXT: ret + %v = load <4 x i16>, ptr %src + %res = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v) + store i16 %res, ptr %dst + ret void +} + +define void @vec_reduce_add_v2i16(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: vec_reduce_add_v2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.w $a0, $a0, 0 +; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0 +; CHECK-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0 +; CHECK-NEXT: ret + %v = load <2 x i16>, ptr %src + %res = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %v) + store i16 %res, ptr %dst + ret void +} + define void @vec_reduce_add_v4i32(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_add_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vshuf4i.w $vr1, $vr0, 14 -; CHECK-NEXT: vadd.w $vr0, $vr0, $vr1 -; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1 -; CHECK-NEXT: vadd.w $vr0, $vr0, $vr1 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 +; CHECK-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; CHECK-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 +; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0 +; CHECK-NEXT: st.w $a0, $a1, 0 ; CHECK-NEXT: ret %v = load <4 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v) @@ -55,12 +126,25 @@ define void @vec_reduce_add_v4i32(ptr %src, ptr %dst) nounwind { ret void } +define void @vec_reduce_add_v2i32(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: vec_reduce_add_v2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.d $a0, $a0, 0 +; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; CHECK-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 +; CHECK-NEXT: ret + %v = load <2 x i32>, ptr %src + %res = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %v) + store i32 %res, ptr %dst + ret void +} + define void @vec_reduce_add_v2i64(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_add_v2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1 -; CHECK-NEXT: vadd.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 ; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 ; CHECK-NEXT: ret %v = load <2 x i64>, ptr %src