diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index d9d6f0bcdcb84..f935442404cde 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1186,6 +1186,12 @@ class SelectionDAG {
   LLVM_ABI SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC,
                                    bool ConstantFold = true);
 
+  /// Return a vector with the first 'Len' lanes set to true and the
+  /// remaining lanes set to false. The mask's value type matches that of
+  /// an ISD::SETCC whose operands are vectors of type VT.
+  LLVM_ABI SDValue getMaskFromElementCount(const SDLoc &DL, EVT VT,
+                                           ElementCount Len);
+
   /// Return a GLOBAL_OFFSET_TABLE node. This does not have a useful SDLoc.
   SDValue getGLOBAL_OFFSET_TABLE(EVT VT) {
     return getNode(ISD::GLOBAL_OFFSET_TABLE, SDLoc(), VT);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 87d5453cd98cf..bcaac40de5459 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -6201,8 +6201,33 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
     return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, Parts);
   }
 
-  report_fatal_error("Don't know how to widen the result of "
-                     "EXTRACT_SUBVECTOR for scalable vectors");
+  // Fallback to extracting through memory.
+
+  Align Alignment = DAG.getReducedAlign(InVT, /*UseABI=*/false);
+  SDValue StackPtr = DAG.CreateStackTemporary(InVT.getStoreSize(), Alignment);
+  auto &MF = DAG.getMachineFunction();
+  auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+  auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+  MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
+      PtrInfo, MachineMemOperand::MOStore,
+      LocationSize::beforeOrAfterPointer(), Alignment);
+  MachineMemOperand *LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
+      PtrInfo, MachineMemOperand::MOLoad,
+      LocationSize::beforeOrAfterPointer(), Alignment);
+
+  // Write out the input vector.
+  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, StoreMMO);
+
+  // Build a mask to match the length of the non-widened result.
+  SDValue Mask =
+      DAG.getMaskFromElementCount(dl, WidenVT, VT.getVectorElementCount());
+
+  // Read back the sub-vector, setting the remaining lanes to poison.
+  StackPtr = TLI.getVectorSubVecPointer(DAG, StackPtr, InVT, VT, Idx);
+  return DAG.getMaskedLoad(
+      WidenVT, dl, Ch, StackPtr, DAG.getUNDEF(StackPtr.getValueType()), Mask,
+      DAG.getPOISON(WidenVT), VT, LoadMMO, ISD::UNINDEXED, ISD::NON_EXTLOAD);
 }
 
 // We could try widening the input to the right length but for now, extract
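The fallback above is easiest to see as plain data movement: spill the wide input to a stack slot, rebase the pointer at the sub-vector's lane offset, then masked-load only the lanes the narrow result actually defines. Below is a minimal standalone model of that dataflow, not the LLVM API: the helper name `extractViaMemory` is hypothetical, plain arrays stand in for scalable vectors, and poison lanes are modelled as zeros.

```c++
#include <cassert>
#include <cstdio>
#include <cstring>
#include <vector>

// Standalone model of the extract-through-memory fallback: spill the wide
// input, then do a masked read of Len lanes at lane offset Idx, widened out
// to WideLen lanes.
std::vector<int> extractViaMemory(const std::vector<int> &In, size_t Idx,
                                  size_t Len, size_t WideLen) {
  assert(Idx + Len <= In.size() && Len <= WideLen);
  std::vector<int> Stack(In);       // DAG.getStore of InOp to the temporary.
  std::vector<int> Out(WideLen, 0); // Widened result; poison lanes as zero.
  // Masked load: only the first Len lanes are read, from base + Idx.
  std::memcpy(Out.data(), Stack.data() + Idx, Len * sizeof(int));
  return Out;
}

int main() {
  std::vector<int> Vec{1, 2, 3, 4, 5, 6, 7, 8};
  for (int Lane : extractViaMemory(Vec, 2, 2, 4))
    std::printf("%d ", Lane); // Prints: 3 4 0 0
}
```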
@@ -6306,11 +6331,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
   if (VT.isVector()) {
     // If all else fails replace the load with a wide masked load.
     SDLoc DL(N);
-    EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
-
-    SDValue Len = DAG.getElementCount(DL, IdxVT, VT.getVectorElementCount());
-    SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WideMaskVT,
-                               DAG.getConstant(0, DL, IdxVT), Len);
+    SDValue Mask =
+        DAG.getMaskFromElementCount(DL, WideVT, VT.getVectorElementCount());
 
     SDValue NewLoad = DAG.getMaskedLoad(
         WideVT, DL, LD->getChain(), LD->getBasePtr(), LD->getOffset(), Mask,
@@ -7447,9 +7469,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_INSERT_SUBVECTOR(SDNode *N) {
   SDValue InVec = N->getOperand(0);
   EVT OrigVT = SubVec.getValueType();
 
-  if (getTypeAction(SubVec.getValueType()) == TargetLowering::TypeWidenVector)
-    SubVec = GetWidenedVector(SubVec);
-
+  SubVec = GetWidenedVector(SubVec);
   EVT SubVT = SubVec.getValueType();
 
   // Whether or not all the elements of the widened SubVec will be inserted into
@@ -7471,17 +7491,52 @@ SDValue DAGTypeLegalizer::WidenVecOp_INSERT_SUBVECTOR(SDNode *N) {
     }
   }
 
+  if (!IndicesValid)
+    report_fatal_error(
+        "Don't know how to widen the operands for INSERT_SUBVECTOR");
+
   SDLoc DL(N);
   // We need to make sure that the indices are still valid, otherwise we might
   // widen what was previously well-defined to something undefined.
-  if (IndicesValid && InVec.isUndef() && N->getConstantOperandVal(2) == 0)
+  if (InVec.isUndef() && N->getConstantOperandVal(2) == 0)
     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, InVec, SubVec,
                        N->getOperand(2));
 
-  if (!IndicesValid || OrigVT.isScalableVector())
-    report_fatal_error(
-        "Don't know how to widen the operands for INSERT_SUBVECTOR");
+  if (OrigVT.isScalableVector()) {
+    // Fallback to inserting through memory.
+
+    Align Alignment = DAG.getReducedAlign(VT, /*UseABI=*/false);
+    SDValue StackPtr = DAG.CreateStackTemporary(VT.getStoreSize(), Alignment);
+    auto &MF = DAG.getMachineFunction();
+    auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+    auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+    MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
+        PtrInfo, MachineMemOperand::MOStore,
+        LocationSize::beforeOrAfterPointer(), Alignment);
+    MachineMemOperand *LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
+        PtrInfo, MachineMemOperand::MOLoad,
+        LocationSize::beforeOrAfterPointer(), Alignment);
+
+    // Write out the vector being inserted into.
+    SDValue Ch =
+        DAG.getStore(DAG.getEntryNode(), DL, InVec, StackPtr, StoreMMO);
+
+    // Build a mask to match the length of the sub-vector.
+    SDValue Mask =
+        DAG.getMaskFromElementCount(DL, SubVT, OrigVT.getVectorElementCount());
+
+    // Overwrite the sub-vector at the required offset.
+    SDValue SubVecPtr =
+        TLI.getVectorSubVecPointer(DAG, StackPtr, VT, OrigVT, N->getOperand(2));
+    Ch = DAG.getMaskedStore(Ch, DL, SubVec, SubVecPtr,
+                            DAG.getUNDEF(SubVecPtr.getValueType()), Mask,
+                            OrigVT, StoreMMO, ISD::UNINDEXED,
+                            /*IsTruncating=*/false);
+
+    // Read back the result from the base of the stack temporary.
+    return DAG.getLoad(VT, DL, Ch, StackPtr, LoadMMO);
+  }
 
   // If the operands can't be widened legally, just replace the INSERT_SUBVECTOR
   // with a series of INSERT_VECTOR_ELT
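The insert path is the mirror image: write out the full container, masked-store only the sub-vector's live lanes at the rebased offset, then reload the whole container. A sketch under the same assumptions as before (hypothetical `insertViaMemory`, plain arrays standing in for scalable vectors):

```c++
#include <cassert>
#include <vector>

// Standalone model of the insert-through-memory fallback. Sub may be wider
// than Len lanes (it has been widened); only its first Len lanes land.
std::vector<int> insertViaMemory(const std::vector<int> &Vec,
                                 const std::vector<int> &Sub, size_t Idx,
                                 size_t Len) {
  assert(Idx + Len <= Vec.size() && Len <= Sub.size());
  std::vector<int> Stack(Vec);     // Store the vector being inserted into.
  for (size_t I = 0; I < Len; ++I) // Masked store of the sub-vector's lanes.
    Stack[Idx + I] = Sub[I];
  return Stack; // Reload the whole result from the temporary's base pointer.
}
```

Note that only the masked store uses the rebased sub-vector pointer; the final read of the result must come from the base of the temporary, otherwise a nonzero index would return shifted data and read past the end of the stack slot.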
@@ -7560,12 +7615,9 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
   if (StVT.isVector()) {
     // If all else fails replace the store with a wide masked store.
     SDLoc DL(N);
-    EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
-
     SDValue WideStVal = GetWidenedVector(StVal);
-    SDValue Len = DAG.getElementCount(DL, IdxVT, StVT.getVectorElementCount());
-    SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WideMaskVT,
-                               DAG.getConstant(0, DL, IdxVT), Len);
+    SDValue Mask =
+        DAG.getMaskFromElementCount(DL, WideVT, StVT.getVectorElementCount());
 
     return DAG.getMaskedStore(ST->getChain(), DL, WideStVal, ST->getBasePtr(),
                               ST->getOffset(), Mask, ST->getMemoryVT(),
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 95f53fe0bfdba..d976c0ce1b901 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2112,6 +2112,14 @@ SDValue SelectionDAG::getElementCount(const SDLoc &DL, EVT VT, ElementCount EC,
   return getConstant(EC.getKnownMinValue(), DL, VT);
 }
 
+SDValue SelectionDAG::getMaskFromElementCount(const SDLoc &DL, EVT DataVT,
+                                              ElementCount EC) {
+  EVT IdxVT = TLI->getVectorIdxTy(getDataLayout());
+  EVT MaskVT = TLI->getSetCCResultType(getDataLayout(), *getContext(), DataVT);
+  return getNode(ISD::GET_ACTIVE_LANE_MASK, DL, MaskVT,
+                 getConstant(0, DL, IdxVT), getElementCount(DL, IdxVT, EC));
+}
+
 SDValue SelectionDAG::getStepVector(const SDLoc &DL, EVT ResVT) {
   APInt One(ResVT.getScalarSizeInBits(), 1);
   return getStepVector(DL, ResVT, One);
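getMaskFromElementCount is a thin wrapper: ISD::GET_ACTIVE_LANE_MASK(Base, Len) defines lane i as Base + i < Len, and with Base pinned to the constant zero that is exactly a first-Len-lanes-true predicate. A sketch of the lane semantics (hypothetical helper, a scalar loop in place of the DAG node):

```c++
#include <cstddef>
#include <vector>

// Lane semantics of ISD::GET_ACTIVE_LANE_MASK: mask[i] = Base + i < Len.
// getMaskFromElementCount pins Base to 0, so the first Len lanes are true.
std::vector<bool> activeLaneMask(size_t NumLanes, size_t Base, size_t Len) {
  std::vector<bool> Mask(NumLanes);
  for (size_t I = 0; I < NumLanes; ++I)
    Mask[I] = Base + I < Len;
  return Mask;
}
```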
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
index 4aaa25e5e66c5..8d0c71502f1e3 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
@@ -3,12 +3,199 @@
 
 ; Extracting illegal subvectors
 
-define <vscale x 1 x i32> @extract_nxv1i32_nxv4i32(<vscale x 4 x i32> %vec) nounwind {
-; CHECK-LABEL: extract_nxv1i32_nxv4i32:
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_0(<vscale x 4 x i32> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1i32_nxv4i32_0:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ret
-  %retval = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 0)
-  ret <vscale x 1 x i32> %retval
+  %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 0)
+  %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
+  ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_1(<vscale x 4 x i32> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1i32_nxv4i32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 1)
+  %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
+  ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_2(<vscale x 4 x i32> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1i32_nxv4i32_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cnth x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 2)
+  %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
+  ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_3(<vscale x 4 x i32> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1i32_nxv4i32_3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 3)
+  %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
+  ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 2 x float> @extract_nxv1f32_nxv2f32_0(<vscale x 2 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv2f32_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+  %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv2f32(<vscale x 2 x float> %vec, i64 0)
+  %retval = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.nxv1f32(<vscale x 2 x float> poison, <vscale x 1 x float> %e, i64 0)
+  ret <vscale x 2 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 2 x float> @extract_nxv1f32_nxv2f32_1(<vscale x 2 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv2f32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: addpl x9, sp, #4
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.d }, p1/z, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv2f32(<vscale x 2 x float> %vec, i64 1)
+  %retval = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.nxv1f32(<vscale x 2 x float> poison, <vscale x 1 x float> %e, i64 0)
+  ret <vscale x 2 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x float> @extract_nxv1f32_nxv4f32_0(<vscale x 4 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv4f32_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+  %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %vec, i64 0)
+  %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> poison, <vscale x 1 x float> %e, i64 0)
+  ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x float> @extract_nxv1f32_nxv4f32_1(<vscale x 4 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv4f32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %vec, i64 1)
+  %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> poison, <vscale x 1 x float> %e, i64 0)
+  ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x float> @extract_nxv1f32_nxv4f32_2(<vscale x 4 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv4f32_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %vec, i64 2)
+  %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> poison, <vscale x 1 x float> %e, i64 0)
+  ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x float> @extract_nxv1f32_nxv4f32_3(<vscale x 4 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv4f32_3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %vec, i64 3)
+  %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> poison, <vscale x 1 x float> %e, i64 0)
+  ret <vscale x 4 x float> %retval
 }
 
 define <vscale x 1 x i16> @extract_nxv1i16_nxv6i16(<vscale x 6 x i16> %vec) nounwind {
@@ -19,9 +206,6 @@ define <vscale x 1 x i16> @extract_nxv1i16_nxv6i16(<vscale x 6 x i16> %vec) noun
   ret <vscale x 1 x i16> %retval
 }
 
-declare <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32>, i64)
-declare <vscale x 1 x i16> @llvm.vector.extract.nxv1i16.nxv6i16(<vscale x 6 x i16>, i64)
-
 ;
 ; Extract half i1 vector that needs promotion from legal type.
 ;
@@ -43,8 +227,6 @@ define <vscale x 8 x i1> @extract_nxv8i1_nxv16i1_8(<vscale x 16 x i1> %in) {
   ret <vscale x 8 x i1> %res
 }
 
-declare <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1>, i64)
-
 ;
 ; Extract i1 vector that needs widening from one that needs widening.
 ;
@@ -99,8 +281,6 @@ define <vscale x 14 x i1> @extract_nxv14i1_nxv28i1_14(<vscale x 28 x i1> %in) uw
   ret <vscale x 14 x i1> %res
 }
 
-declare <vscale x 14 x i1> @llvm.vector.extract.nxv14i1.nxv28i1(<vscale x 28 x i1>, i64)
-
 ;
 ; Extract half i1 vector that needs promotion from one that needs splitting.
 ;
@@ -140,8 +320,6 @@ define <vscale x 8 x i1> @extract_nxv8i1_nxv32i1_24(<vscale x 32 x i1> %in) {
   ret <vscale x 8 x i1> %res
 }
 
-declare <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv32i1(<vscale x 32 x i1>, i64)
-
 ;
 ; Extract 1/4th i1 vector that needs promotion from legal type.
 ;
@@ -185,8 +363,6 @@ define <vscale x 4 x i1> @extract_nxv4i1_nxv16i1_12(<vscale x 16 x i1> %in) {
   ret <vscale x 4 x i1> %res
 }
 
-declare <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1>, i64)
-
 ;
 ; Extract 1/8th i1 vector that needs promotion from legal type.
 ;
@@ -278,8 +454,6 @@ define <vscale x 2 x i1> @extract_nxv2i1_nxv16i1_14(<vscale x 16 x i1> %in) {
   ret <vscale x 2 x i1> %res
 }
 
-declare <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv16i1(<vscale x 16 x i1>, i64)
-
 ;
 ; Extract i1 vector that needs promotion from one that needs widening.
 ;
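A pattern worth calling out in the CHECK lines above: `rdvl x8, #1` reads the SVE vector length in bytes, i.e. 16 * vscale, so the following `lsr x8, x8, #4` leaves exactly vscale, the element count of an nxv1 sub-vector, which `whilelo` then turns into the first-N-lanes predicate built by getMaskFromElementCount. A sketch of that arithmetic (hypothetical helper name):

```c++
#include <cstdio>

// rdvl x8, #1 yields the SVE register size in bytes, i.e. 16 * vscale.
unsigned nxv1ElementCount(unsigned RdvlBytes) {
  return RdvlBytes >> 4; // lsr x8, x8, #4: bytes / 16 == vscale.
}

int main() {
  // On a 256-bit implementation (vscale == 2), whilelo would then enable
  // the first two lanes of the predicate.
  std::printf("%u\n", nxv1ElementCount(32)); // Prints: 2
}
```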
@@ -313,8 +487,6 @@ define <vscale x 4 x i1> @extract_nxv4i1_nxv12i1_8(<vscale x 12 x i1> %in) {
   ret <vscale x 4 x i1> %res
 }
 
-declare <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv12i1(<vscale x 12 x i1>, i64)
-
 ;
 ; Extract 1/8th i8 vector that needs promotion from legal type.
 ;
@@ -406,8 +578,6 @@ define <vscale x 2 x i8> @extract_nxv2i8_nxv16i8_14(<vscale x 16 x i8> %in) {
   ret <vscale x 2 x i8> %res
 }
 
-declare <vscale x 2 x i8> @llvm.vector.extract.nxv2i8.nxv16i8(<vscale x 16 x i8>, i64)
-
 ;
 ; Extract i8 vector that needs promotion from one that needs widening.
 ;
@@ -441,8 +611,6 @@ define <vscale x 4 x i8> @extract_nxv4i8_nxv12i8_8(<vscale x 12 x i8> %in) {
   ret <vscale x 4 x i8> %res
 }
 
-declare <vscale x 4 x i8> @llvm.vector.extract.nxv4i8.nxv12i8(<vscale x 12 x i8>, i64)
-
 ;
 ; Extract i8 vector that needs both widening + promotion from one that needs widening.
 ; (nxv6i8 -> nxv8i8 -> nxv8i16)
@@ -474,8 +642,6 @@ define <vscale x 6 x i8> @extract_nxv6i8_nxv12i8_6(<vscale x 12 x i8> %in) {
   ret <vscale x 6 x i8> %res
 }
 
-declare <vscale x 6 x i8> @llvm.vector.extract.nxv6i8.nxv12i8(<vscale x 12 x i8>, i64)
-
 ;
 ; Extract half i8 vector that needs promotion from one that needs splitting.
 ;
@@ -515,8 +681,6 @@ define <vscale x 8 x i8> @extract_nxv8i8_nxv32i8_24(<vscale x 32 x i8> %in) {
   ret <vscale x 8 x i8> %res
 }
 
-declare <vscale x 8 x i8> @llvm.vector.extract.nxv8i8.nxv32i8(<vscale x 32 x i8>, i64)
-
 ;
 ; Extract half i8 vector that needs promotion from legal type.
 ;
@@ -538,8 +702,6 @@ define <vscale x 8 x i8> @extract_nxv8i8_nxv16i8_8(<vscale x 16 x i8> %in) {
   ret <vscale x 8 x i8> %res
 }
 
-declare <vscale x 8 x i8> @llvm.vector.extract.nxv8i8.nxv16i8(<vscale x 16 x i8>, i64)
-
 ;
 ; Extract i8 vector that needs widening from one that needs widening.
 ;
@@ -625,8 +787,6 @@ define <vscale x 14 x i8> @extract_nxv14i8_nxv28i8_14(<vscale x 28 x i8> %in) {
   ret <vscale x 14 x i8> %res
 }
 
-declare <vscale x 14 x i8> @llvm.vector.extract.nxv14i8.nxv28i8(<vscale x 28 x i8>, i64)
-
 ;
 ; Extract 1/4th i8 vector that needs promotion from legal type.
 ;
@@ -670,8 +830,6 @@ define <vscale x 4 x i8> @extract_nxv4i8_nxv16i8_12(<vscale x 16 x i8> %in) {
   ret <vscale x 4 x i8> %res
 }
 
-declare <vscale x 4 x i8> @llvm.vector.extract.nxv4i8.nxv16i8(<vscale x 16 x i8>, i64)
-
 ;
 ; Extract f16 vector that needs promotion from one that needs widening.
 ;
@@ -705,8 +863,6 @@ define <vscale x 2 x half> @extract_nxv2f16_nxv6f16_4(<vscale x 6 x half> %in) {
   ret <vscale x 2 x half> %res
 }
 
-declare <vscale x 2 x half> @llvm.vector.extract.nxv2f16.nxv6f16(<vscale x 6 x half>, i64)
-
 ;
 ; Extract half f16 vector that needs promotion from legal type.
 ;
@@ -728,8 +884,6 @@ define <vscale x 4 x half> @extract_nxv4f16_nxv8f16_4(<vscale x 8 x half> %in) {
   ret <vscale x 4 x half> %res
 }
 
-declare <vscale x 4 x half> @llvm.vector.extract.nxv4f16.nxv8f16(<vscale x 8 x half>, i64)
-
 ;
 ; Extract f16 vector that needs widening from one that needs widening.
 ;
@@ -757,8 +911,6 @@ define <vscale x 6 x half> @extract_nxv6f16_nxv12f16_6(<vscale x 12 x half> %in
   ret <vscale x 6 x half> %res
 }
 
-declare <vscale x 6 x half> @llvm.vector.extract.nxv6f16.nxv12f16(<vscale x 12 x half>, i64)
-
 ;
 ; Extract half f16 vector that needs promotion from one that needs splitting.
 ;
@@ -798,8 +950,6 @@ define <vscale x 4 x half> @extract_nxv4f16_nxv16f16_12(<vscale x 16 x half> %in
   ret <vscale x 4 x half> %res
 }
 
-declare <vscale x 4 x half> @llvm.vector.extract.nxv4f16.nxv16f16(<vscale x 16 x half>, i64)
-
 ;
 ; Extract 1/4th f16 vector that needs promotion from legal type.
 ;
@@ -843,8 +993,6 @@ define <vscale x 2 x half> @extract_nxv2f16_nxv8f16_6(<vscale x 8 x half> %in) {
   ret <vscale x 2 x half> %res
 }
 
-declare <vscale x 2 x half> @llvm.vector.extract.nxv2f16.nxv8f16(<vscale x 8 x half>, i64)
-
 ;
 ; Extract half bf16 vector that needs promotion from legal type.
 ;
@@ -866,8 +1014,6 @@ define <vscale x 4 x bfloat> @extract_nxv4bf16_nxv8bf16_4(<vscale x 8 x bfloat>
   ret <vscale x 4 x bfloat> %res
 }
 
-declare <vscale x 4 x bfloat> @llvm.vector.extract.nxv4bf16.nxv8bf16(<vscale x 8 x bfloat>, i64)
-
 ;
 ; Extract bf16 vector that needs widening from one that needs widening.
 ;
@@ -895,8 +1041,6 @@ define <vscale x 6 x bfloat> @extract_nxv6bf16_nxv12bf16_6(<vscale x 12 x bfloat
   ret <vscale x 6 x bfloat> %res
 }
 
-declare <vscale x 6 x bfloat> @llvm.vector.extract.nxv6bf16.nxv12bf16(<vscale x 12 x bfloat>, i64)
-
 ;
 ; Extract bf16 vector that needs promotion from one that needs widening.
 ;
@@ -930,8 +1074,6 @@ define <vscale x 2 x bfloat> @extract_nxv2bf16_nxv6bf16_4(<vscale x 6 x bfloat>
   ret <vscale x 2 x bfloat> %res
 }
 
-declare <vscale x 2 x bfloat> @llvm.vector.extract.nxv2bf16.nxv6bf16(<vscale x 6 x bfloat>, i64)
-
 ;
 ; Extract 1/4th bf16 vector that needs promotion from legal type.
 ;
@@ -975,8 +1117,6 @@ define <vscale x 2 x bfloat> @extract_nxv2bf16_nxv8bf16_6(<vscale x 8 x bfloat>
   ret <vscale x 2 x bfloat> %res
 }
 
-declare <vscale x 2 x bfloat> @llvm.vector.extract.nxv2bf16.nxv8bf16(<vscale x 8 x bfloat>, i64)
-
 ;
 ; Extract half bf16 vector that needs promotion from one that needs splitting.
 ;
@@ -1016,9 +1156,6 @@ define <vscale x 4 x bfloat> @extract_nxv4bf16_nxv16bf16_12(<vscale x 16 x bfloa
   ret <vscale x 4 x bfloat> %res
 }
 
-declare <vscale x 4 x bfloat> @llvm.vector.extract.nxv4bf16.nxv16bf16(<vscale x 16 x bfloat>, i64)
-
-
 ;
 ; Extract from a splat
 ;
@@ -1070,9 +1207,6 @@ define <vscale x 2 x i1> @extract_nxv2i1_nxv16i1_all_zero() {
   ret <vscale x 2 x i1> %ext
 }
 
-declare <vscale x 2 x float> @llvm.vector.extract.nxv2f32.nxv4f32(<vscale x 4 x float>, i64)
-declare <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32>, i64)
-
 ;
 ; Extract nxv1i1 type from: nxv2i1
 ;
@@ -1427,8 +1561,3 @@ define <vscale x 1 x i1> @extract_nxv1i1_nxv16i1_15(<vscale x 16 x i1> %in) {
   %res = call <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv16i1(<vscale x 16 x i1> %in, i64 15)
   ret <vscale x 1 x i1> %res
 }
-
-declare <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv2i1(<vscale x 2 x i1>, i64)
-declare <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv4i1(<vscale x 4 x i1>, i64)
-declare <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv8i1(<vscale x 8 x i1>, i64)
-declare <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv16i1(<vscale x 16 x i1>, i64)
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
index 73c783d4735f8..26b4739ad4e61 100644
--- a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
@@ -1322,49 +1322,246 @@ define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_15(<vscale x 16 x i1> %vec, <vs
   ret <vscale x 16 x i1> %res
 }
 
-attributes #0 = { vscale_range(2,2) }
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_0(<vscale x 4 x i32> %vec, <vscale x 4 x i32> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1i32_nxv4i32_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: st1w { z1.s }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %i = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %subvec, i64 0)
+  %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> %vec, <vscale x 1 x i32> %i, i64 0)
+  ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_1(<vscale x 4 x i32> %vec, <vscale x 4 x i32> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1i32_nxv4i32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x10, x9, x8
+; CHECK-NEXT: st1w { z1.s }, p0, [x10]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %i = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %subvec, i64 0)
+  %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> %vec, <vscale x 1 x i32> %i, i64 1)
+  ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_2(<vscale x 4 x i32> %vec, <vscale x 4 x i32> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1i32_nxv4i32_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cnth x8
+; CHECK-NEXT: add x10, x9, x8
+; CHECK-NEXT: st1w { z1.s }, p0, [x10]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %i = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %subvec, i64 0)
+  %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> %vec, <vscale x 1 x i32> %i, i64 2)
+  ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_3(<vscale x 4 x i32> %vec, <vscale x 4 x i32> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1i32_nxv4i32_3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: add x10, x9, x8
+; CHECK-NEXT: st1w { z1.s }, p0, [x10]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %i = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %subvec, i64 0)
+  %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> %vec, <vscale x 1 x i32> %i, i64 3)
+  ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 2 x float> @insert_nxv1f32_nxv2f32_0(<vscale x 2 x float> %vec, <vscale x 2 x float> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1f32_nxv2f32_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NEXT: st1w { z1.d }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv2f32(<vscale x 2 x float> %subvec, i64 0)
+  %retval = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.nxv1f32(<vscale x 2 x float> %vec, <vscale x 1 x float> %i, i64 0)
+  ret <vscale x 2 x float> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 2 x float> @insert_nxv1f32_nxv2f32_1(<vscale x 2 x float> %vec, <vscale x 2 x float> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1f32_nxv2f32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: addpl x9, sp, #4
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: st1w { z1.d }, p1, [x8]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv2f32(<vscale x 2 x float> %subvec, i64 0)
+  %retval = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.nxv1f32(<vscale x 2 x float> %vec, <vscale x 1 x float> %i, i64 1)
+  ret <vscale x 2 x float> %retval
+}
-declare <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, i64)
-
-declare <vscale x 6 x i16> @llvm.vector.insert.nxv6i16.nxv1i16(<vscale x 6 x i16>, <vscale x 1 x i16>, i64)
-declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.nxv2i16(<vscale x 8 x i16>, <vscale x 2 x i16>, i64)
-declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
-
-declare <vscale x 3 x i32> @llvm.vector.insert.nxv3i32.nxv2i32(<vscale x 3 x i32>, <vscale x 2 x i32>, i64)
-declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32>, <vscale x 1 x i32>, i64)
-declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
-declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv12i32(<vscale x 4 x i32>, <vscale x 12 x i32>, i64)
-declare <vscale x 6 x i32> @llvm.vector.insert.nxv6i32.nxv2i32(<vscale x 6 x i32>, <vscale x 2 x i32>, i64)
-declare <vscale x 6 x i32> @llvm.vector.insert.nxv6i32.nxv3i32(<vscale x 6 x i32>, <vscale x 3 x i32>, i64)
-
-declare <vscale x 2 x bfloat> @llvm.vector.insert.nxv2bf16.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, i64)
-declare <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.nxv2bf16(<vscale x 4 x bfloat>, <vscale x 2 x bfloat>, i64)
-declare <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, i64)
-declare <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.v4bf16(<vscale x 4 x bfloat>, <4 x bfloat>, i64)
-declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
-declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.nxv4bf16(<vscale x 8 x bfloat>, <vscale x 4 x bfloat>, i64)
-declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat>, <8 x bfloat>, i64)
-
-declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64)
-declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64>, <4 x i64>, i64)
-declare <vscale x 16 x i64> @llvm.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64>, <vscale x 8 x i64>, i64)
-declare <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64>, <2 x i64>, i64)
-
-declare <vscale x 4 x half> @llvm.vector.insert.nxv4f16.nxv2f16(<vscale x 4 x half>, <vscale x 2 x half>, i64)
-declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.nxv2f16(<vscale x 8 x half>, <vscale x 2 x half>, i64)
-declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.nxv4f16(<vscale x 8 x half>, <vscale x 4 x half>, i64)
-
-declare <vscale x 3 x float> @llvm.vector.insert.nxv3f32.nxv2f32(<vscale x 3 x float>, <vscale x 2 x float>, i64)
-declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float>, <vscale x 1 x float>, i64)
-declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float>, <vscale x 2 x float>, i64)
-
-declare <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.v8i1(<vscale x 2 x i1>, <8 x i1>, i64)
-declare <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.v16i1(<vscale x 4 x i1>, <16 x i1>, i64)
-declare <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.v32i1(<vscale x 8 x i1>, <32 x i1>, i64)
-declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1>, <vscale x 1 x i1>, i64)
-declare <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1>, <vscale x 1 x i1>, i64)
-declare <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.nxv1i1(<vscale x 4 x i1>, <vscale x 1 x i1>, i64)
-declare <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.nxv1i1(<vscale x 2 x i1>, <vscale x 1 x i1>, i64)
-declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1>, <vscale x 4 x i1>, i64)
-declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv8i1(<vscale x 16 x i1>, <vscale x 8 x i1>, i64)
-declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v64i1(<vscale x 16 x i1>, <64 x i1>, i64)
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_0(<vscale x 4 x float> %vec, <vscale x 4 x float> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1f32_nxv4f32_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: st1w { z1.d }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %subvec, i64 0)
+  %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> %vec, <vscale x 1 x float> %i, i64 0)
+  ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_1(<vscale x 4 x float> %vec, <vscale x 4 x float> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1f32_nxv4f32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x10, x9, x8
+; CHECK-NEXT: st1w { z1.d }, p0, [x10]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %subvec, i64 0)
+  %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> %vec, <vscale x 1 x float> %i, i64 1)
+  ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_2(<vscale x 4 x float> %vec, <vscale x 4 x float> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1f32_nxv4f32_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: cnth x8
+; CHECK-NEXT: st1w { z1.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %subvec, i64 0)
+  %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> %vec, <vscale x 1 x float> %i, i64 2)
+  ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_3(<vscale x 4 x float> %vec, <vscale x 4 x float> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1f32_nxv4f32_3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: add x10, x9, x8
+; CHECK-NEXT: st1w { z1.d }, p0, [x10]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %subvec, i64 0)
+  %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> %vec, <vscale x 1 x float> %i, i64 3)
+  ret <vscale x 4 x float> %retval
+}
+
+attributes #0 = { vscale_range(2,2) }