[LLVM][CodeGen] Remove failure cases when widening EXTRACT/INSERT_SUBVECTOR. #162308
base: main
Conversation
This PR implements catch-all handling for widening the scalable subvector operand (INSERT_SUBVECTOR) or result (EXTRACT_SUBVECTOR). It does this via the stack using masked memory operations. With general handling available we can add optimisations for specific cases.
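As a concrete illustration, here is one of the new tests from the patch: the <vscale x 1 x i32> result type is not legal on SVE and must be widened to <vscale x 4 x i32>. With a non-zero index this previously hit report_fatal_error; it now legalises via a stack store plus a masked load. The result is reinserted into a legal type because returning <vscale x 1 x i32> directly would rely on an undefined calling convention.

define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_1(<vscale x 4 x i32> %vec) nounwind {
  ; Extract the subvector at index 1; the widened result keeps the extracted
  ; lanes and sets the remaining lanes to poison via the masked load.
  %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 1)
  %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
  ret <vscale x 4 x i32> %retval
}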
@llvm/pr-subscribers-backend-aarch64

Author: Paul Walker (paulwalker-arm)

Changes

This PR implements catch-all handling for widening the scalable subvector operand (INSERT_SUBVECTOR) or result (EXTRACT_SUBVECTOR). It does this via the stack using masked memory operations. With general handling available we can add optimisations for specific cases.

Patch is 41.08 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/162308.diff

5 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index d9d6f0bcdcb84..f935442404cde 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1186,6 +1186,12 @@ class SelectionDAG {
LLVM_ABI SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC,
bool ConstantFold = true);
+ /// Return a vector with the first 'Len' lanes set to true and remaining lanes
+ /// set to false. The mask's ValueType is the same as when comparing vectors
+ /// of type VT.
+ LLVM_ABI SDValue getMaskFromElementCount(const SDLoc &DL, EVT VT,
+ ElementCount Len);
+
/// Return a GLOBAL_OFFSET_TABLE node. This does not have a useful SDLoc.
SDValue getGLOBAL_OFFSET_TABLE(EVT VT) {
return getNode(ISD::GLOBAL_OFFSET_TABLE, SDLoc(), VT);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 87d5453cd98cf..bcaac40de5459 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -6201,8 +6201,33 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, Parts);
}
- report_fatal_error("Don't know how to widen the result of "
- "EXTRACT_SUBVECTOR for scalable vectors");
+ // Fallback to extracting through memory.
+
+ Align Alignment = DAG.getReducedAlign(InVT, /*UseABI=*/false);
+ SDValue StackPtr = DAG.CreateStackTemporary(InVT.getStoreSize(), Alignment);
+ auto &MF = DAG.getMachineFunction();
+ auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+ MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOStore,
+ LocationSize::beforeOrAfterPointer(), Alignment);
+ MachineMemOperand *LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOLoad,
+ LocationSize::beforeOrAfterPointer(), Alignment);
+
+ // Write out the input vector.
+ SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, StoreMMO);
+
+ // Build a mask to match the length of the non-widened result.
+ SDValue Mask =
+ DAG.getMaskFromElementCount(dl, WidenVT, VT.getVectorElementCount());
+
+ // Read back the sub-vector setting the remaining lanes to poison.
+ StackPtr = TLI.getVectorSubVecPointer(DAG, StackPtr, InVT, VT, Idx);
+ return DAG.getMaskedLoad(
+ WidenVT, dl, Ch, StackPtr, DAG.getUNDEF(StackPtr.getValueType()), Mask,
+ DAG.getPOISON(WidenVT), VT, LoadMMO, ISD::UNINDEXED, ISD::NON_EXTLOAD);
}
// We could try widening the input to the right length but for now, extract
@@ -6306,11 +6331,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
if (VT.isVector()) {
// If all else fails replace the load with a wide masked load.
SDLoc DL(N);
- EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
-
- SDValue Len = DAG.getElementCount(DL, IdxVT, VT.getVectorElementCount());
- SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WideMaskVT,
- DAG.getConstant(0, DL, IdxVT), Len);
+ SDValue Mask =
+ DAG.getMaskFromElementCount(DL, WideVT, VT.getVectorElementCount());
SDValue NewLoad = DAG.getMaskedLoad(
WideVT, DL, LD->getChain(), LD->getBasePtr(), LD->getOffset(), Mask,
@@ -7447,9 +7469,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_INSERT_SUBVECTOR(SDNode *N) {
SDValue InVec = N->getOperand(0);
EVT OrigVT = SubVec.getValueType();
- if (getTypeAction(SubVec.getValueType()) == TargetLowering::TypeWidenVector)
- SubVec = GetWidenedVector(SubVec);
-
+ SubVec = GetWidenedVector(SubVec);
EVT SubVT = SubVec.getValueType();
// Whether or not all the elements of the widened SubVec will be inserted into
@@ -7471,17 +7491,52 @@ SDValue DAGTypeLegalizer::WidenVecOp_INSERT_SUBVECTOR(SDNode *N) {
}
}
+ if (!IndicesValid)
+ report_fatal_error(
+ "Don't know how to widen the operands for INSERT_SUBVECTOR");
+
SDLoc DL(N);
// We need to make sure that the indices are still valid, otherwise we might
// widen what was previously well-defined to something undefined.
- if (IndicesValid && InVec.isUndef() && N->getConstantOperandVal(2) == 0)
+ if (InVec.isUndef() && N->getConstantOperandVal(2) == 0)
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, InVec, SubVec,
N->getOperand(2));
- if (!IndicesValid || OrigVT.isScalableVector())
- report_fatal_error(
- "Don't know how to widen the operands for INSERT_SUBVECTOR");
+ if (OrigVT.isScalableVector()) {
+ // Fallback to inserting through memory.
+
+ Align Alignment = DAG.getReducedAlign(VT, /*UseABI=*/false);
+ SDValue StackPtr = DAG.CreateStackTemporary(VT.getStoreSize(), Alignment);
+ auto &MF = DAG.getMachineFunction();
+ auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+ MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOStore,
+ LocationSize::beforeOrAfterPointer(), Alignment);
+ MachineMemOperand *LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOLoad,
+ LocationSize::beforeOrAfterPointer(), Alignment);
+
+ // Write out the vector being inserted into.
+ SDValue Ch =
+ DAG.getStore(DAG.getEntryNode(), DL, InVec, StackPtr, StoreMMO);
+
+ // Build a mask to match the length of the sub-vector.
+ SDValue Mask =
+ DAG.getMaskFromElementCount(DL, SubVT, OrigVT.getVectorElementCount());
+
+ // Overwrite the sub-vector at the required offset.
+ StackPtr =
+ TLI.getVectorSubVecPointer(DAG, StackPtr, VT, OrigVT, N->getOperand(2));
+ Ch = DAG.getMaskedStore(Ch, DL, SubVec, StackPtr,
+ DAG.getUNDEF(StackPtr.getValueType()), Mask, VT,
+ StoreMMO, ISD::UNINDEXED, ISD::NON_EXTLOAD);
+
+ // Read back the result.
+ return DAG.getLoad(VT, DL, Ch, StackPtr, LoadMMO);
+ }
// If the operands can't be widened legally, just replace the INSERT_SUBVECTOR
// with a series of INSERT_VECTOR_ELT
@@ -7560,12 +7615,9 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
if (StVT.isVector()) {
// If all else fails replace the store with a wide masked store.
SDLoc DL(N);
- EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
-
SDValue WideStVal = GetWidenedVector(StVal);
- SDValue Len = DAG.getElementCount(DL, IdxVT, StVT.getVectorElementCount());
- SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WideMaskVT,
- DAG.getConstant(0, DL, IdxVT), Len);
+ SDValue Mask =
+ DAG.getMaskFromElementCount(DL, WideVT, StVT.getVectorElementCount());
return DAG.getMaskedStore(ST->getChain(), DL, WideStVal, ST->getBasePtr(),
ST->getOffset(), Mask, ST->getMemoryVT(),
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 95f53fe0bfdba..d976c0ce1b901 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2112,6 +2112,14 @@ SDValue SelectionDAG::getElementCount(const SDLoc &DL, EVT VT, ElementCount EC,
return getConstant(EC.getKnownMinValue(), DL, VT);
}
+SDValue SelectionDAG::getMaskFromElementCount(const SDLoc &DL, EVT DataVT,
+ ElementCount EC) {
+ EVT IdxVT = TLI->getVectorIdxTy(getDataLayout());
+ EVT MaskVT = TLI->getSetCCResultType(getDataLayout(), *getContext(), DataVT);
+ return getNode(ISD::GET_ACTIVE_LANE_MASK, DL, MaskVT,
+ getConstant(0, DL, IdxVT), getElementCount(DL, IdxVT, EC));
+}
+
SDValue SelectionDAG::getStepVector(const SDLoc &DL, EVT ResVT) {
APInt One(ResVT.getScalarSizeInBits(), 1);
return getStepVector(DL, ResVT, One);
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
index 4aaa25e5e66c5..8d0c71502f1e3 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
@@ -3,12 +3,199 @@
; Extracting illegal subvectors
-define <vscale x 1 x i32> @extract_nxv1i32_nxv4i32(<vscale x 4 x i32> %vec) nounwind {
-; CHECK-LABEL: extract_nxv1i32_nxv4i32:
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_0(<vscale x 4 x i32> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1i32_nxv4i32_0:
; CHECK: // %bb.0:
; CHECK-NEXT: ret
- %retval = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 0)
- ret <vscale x 1 x i32> %retval
+ %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 0)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_1(<vscale x 4 x i32> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1i32_nxv4i32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 1)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_2(<vscale x 4 x i32> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1i32_nxv4i32_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cnth x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 2)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_3(<vscale x 4 x i32> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1i32_nxv4i32_3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 3)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 2 x float> @extract_nxv1f32_nxv2f32_0(<vscale x 2 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv2f32_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv2f32(<vscale x 2 x float> %vec, i64 0)
+ %retval = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.nxv1f32(<vscale x 2 x float> poison, <vscale x 1 x float> %e, i64 0)
+ ret <vscale x 2 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 2 x float> @extract_nxv1f32_nxv2f32_1(<vscale x 2 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv2f32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: addpl x9, sp, #4
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.d }, p1/z, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv2f32(<vscale x 2 x float> %vec, i64 1)
+ %retval = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.nxv1f32(<vscale x 2 x float> poison, <vscale x 1 x float> %e, i64 0)
+ ret <vscale x 2 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x float> @extract_nxv1f32_nxv4f32_0(<vscale x 4 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv4f32_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %vec, i64 0)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> poison, <vscale x 1 x float> %e, i64 0)
+ ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x float> @extract_nxv1f32_nxv4f32_1(<vscale x 4 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv4f32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %vec, i64 1)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> poison, <vscale x 1 x float> %e, i64 0)
+ ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x float> @extract_nxv1f32_nxv4f32_2(<vscale x 4 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv4f32_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %vec, i64 2)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> poison, <vscale x 1 x float> %e, i64 0)
+ ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x float> @extract_nxv1f32_nxv4f32_3(<vscale x 4 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv4f32_3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %vec, i64 3)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> poison, <vscale x 1 x float> %e, i64 0)
+ ret <vscale x 4 x float> %retval
}
define <vscale x 1 x i16> @extract_nxv1i16_nxv6i16(<vscale x 6 x i16> %vec) nounwind {
@@ -19,9 +206,6 @@ define <vscale x 1 x i16> @extract_nxv1i16_nxv6i16(<vscale x 6 x i16> %vec) noun
ret <vscale x 1 x i16> %retval
}
-declare <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32>, i64)
-declare <vscale x 1 x i16> @llvm.vector.extract.nxv1i16.nxv6i16(<vscale x 6 x i16>, i64)
-
;
; Extract half i1 vector that needs promotion from legal type.
;
@@ -43,8 +227,6 @@ define <vscale x 8 x i1> @extract_nxv8i1_nxv16i1_8(<vscale x 16 x i1> %in) {
ret <vscale x 8 x i1> %res
}
-declare <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1>, i64)
-
;
; Extract i1 vector that needs widening from one that needs widening.
;
@@ -99,8 +281,6 @@ define <vscale x 14 x i1> @extract_nxv14i1_nxv28i1_14(<vscale x 28 x i1> %in) uw
ret <vscale x 14 x i1> %res
}
-declare <vscale x 14 x i1> @llvm.vector.extract.nxv14i1.nxv28i1(<vscale x 28 x i1>, i64)
-
;
; Extract half i1 vector that needs promotion from one that needs splitting.
;
@@ -140,8 +320,6 @@ define <vscale x 8 x i1> @extract_nxv8i1_nxv32i1_24(<vscale x 32 x i1> %in) {
ret <vscale x 8 x i1> %res
}
-declare <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv32i1(<vscale x 32 x i1>, i64)
-
;
; Extract 1/4th i1 vector that needs promotion from legal type.
;
@@ -185,8 +363,6 @@ define <vscale x 4 x i1> @extract_nxv4i1_nxv16i1_12(<vscale x 16 x i1> %in) {
ret <vscale x 4 x i1> %res
}
-declare <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1>, i64)
-
;
; Extract 1/8th i1 vector that needs promotion from legal type.
;
@@ -278,8 +454,6 @@ define <vscale x 2 x i1> @extract_nxv2i1_nxv16i1_14(<vscale x 16 x i1> %in) {
ret <vscale x 2 x i1> %res
}
-declare <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv16i1(<vscale x 16 x i1>, i64)
-
;
; Extract i1 vector that needs promotion from one that needs widening.
;
@@ -313,8 +487,6 @@ define <vscale x 4 x i1> @extract_nxv4i1_nxv12i1_8(<vscale x 12 x i1> %in) {
ret <vscale x 4 x i1> %res
}
-declare <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv12i1(<vscale x 12 x i1>, i64)
-
;
; Extract 1/8th i8 vector that needs promotion from legal type.
;
@@ -406,8 +578,6 @@ define <vscale x 2 x i8> @extract_nxv2i8_nxv16i8_14(<vscale x 16 x i8> %in) {
ret <vscale x 2 x i8> %res
}
-declare <vscale x 2 x i8> @llvm.vector.extract.nxv2i8.nxv16i8(<vscale x 16 x i8>, i64)
-
;
; Extract i8 vector that...
[truncated]
@llvm/pr-subscribers-llvm-selectiondag
declare <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32>, i64)
declare <vscale x 1 x i16> @llvm.vector.extract.nxv1i16.nxv6i16(<vscale x 6 x i16>, i64)
We don't need to declare intrinsics anymore, so rather than add more I figured I'd just remove the existing ones. I'm happy to re-add them if reviewers prefer to keep them.
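For context, a minimal sketch of what this relies on, assuming a recent LLVM where the textual IR parser implicitly declares known intrinsics (the function name here is made up):

define <vscale x 1 x i16> @no_declare_needed(<vscale x 6 x i16> %vec) {
  ; No preceding 'declare' line is required for llvm.vector.extract;
  ; the parser materialises the intrinsic declaration automatically.
  %r = call <vscale x 1 x i16> @llvm.vector.extract.nxv1i16.nxv6i16(<vscale x 6 x i16> %vec, i64 0)
  ret <vscale x 1 x i16> %r
}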
if (getTypeAction(SubVec.getValueType()) == TargetLowering::TypeWidenVector)
  SubVec = GetWidenedVector(SubVec);
This change is not strictly necessary, but I don't see any way the condition can be false: at this point the result (and therefore the first operand) must already be legal, so only SubVec can require widening.
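Put differently (a hypothetical sketch, not part of the patch), the dropped guard could instead be asserted:

// Sketch only: WidenVecOp_INSERT_SUBVECTOR is reached because some operand
// needs widening, and with a legal result type that operand can only be
// SubVec, so its type action must be TypeWidenVector.
assert(getTypeAction(SubVec.getValueType()) ==
           TargetLowering::TypeWidenVector &&
       "expected the subvector operand to require widening");
SubVec = GetWidenedVector(SubVec);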