[AArch64] Improve lowering for scalable masked interleaving stores #156718
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-aarch64

Author: David Sherwood (david-arm)

Changes

Similar to #154338, this PR aims to support lowering of certain IR to SVE's st2 and st4 instructions. The typical IR scenario looks like:

%mask = .. @llvm.vector.interleave2(<vscale x 16 x i1> %m, <vscale x 16 x i1> %m)
%val = .. @llvm.vector.interleave2(<vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2)
.. @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> %val, ..., <vscale x 32 x i1> %mask)

where we're interleaving both the value and the mask being passed to the wide store. When the mask interleave parts are identical we can lower this to st2b.

This PR adds a DAG combine for lowering this kind of IR pattern to st2X and st4X SVE instructions.

Patch is 42.75 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/156718.diff

4 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 4248b0144ef18..f807f8f8fe1eb 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -3346,6 +3346,14 @@ namespace ISD {
Ld->getAddressingMode() == ISD::UNINDEXED;
}
+ /// Returns true if the specified node is a non-extending and unindexed
+ /// masked store.
+ inline bool isNormalMaskedStore(const SDNode *N) {
+ auto *St = dyn_cast<MaskedStoreSDNode>(N);
+ return St && !St->isTruncatingStore() &&
+ St->getAddressingMode() == ISD::UNINDEXED;
+ }
+
/// Attempt to match a unary predicate against a scalar/splat constant or
/// every element of a constant BUILD_VECTOR.
/// If AllowUndef is true, then UNDEF elements will pass nullptr to Match.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b72a05223160f..9f5f37686f2aa 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -24632,6 +24632,106 @@ static SDValue performSTORECombine(SDNode *N,
return SDValue();
}
+static bool
+isSequentialConcatOfVectorInterleave(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
+ if (N->getOpcode() != ISD::CONCAT_VECTORS)
+ return false;
+
+ unsigned NumParts = N->getNumOperands();
+
+ // We should be concatenating each sequential result from a
+ // VECTOR_INTERLEAVE.
+ SDNode *InterleaveOp = N->getOperand(0).getNode();
+ if (InterleaveOp->getOpcode() != ISD::VECTOR_INTERLEAVE ||
+ InterleaveOp->getNumOperands() != NumParts)
+ return false;
+
+ for (unsigned I = 0; I < NumParts; I++) {
+ if (N->getOperand(I) != SDValue(InterleaveOp, I))
+ return false;
+ }
+
+ Ops.append(InterleaveOp->op_begin(), InterleaveOp->op_end());
+ return true;
+}
+
+static SDValue getNarrowMaskForInterleavedOps(SelectionDAG &DAG, SDLoc &DL,
+ SDValue WideMask,
+ unsigned RequiredNumParts) {
+ if (WideMask->getOpcode() == ISD::CONCAT_VECTORS) {
+ SmallVector<SDValue, 4> MaskInterleaveOps;
+ if (!isSequentialConcatOfVectorInterleave(WideMask.getNode(),
+ MaskInterleaveOps))
+ return SDValue();
+
+ if (MaskInterleaveOps.size() != RequiredNumParts)
+ return SDValue();
+
+ // Make sure the inputs to the vector interleave are identical.
+ if (!llvm::all_equal(MaskInterleaveOps))
+ return SDValue();
+
+ return MaskInterleaveOps[0];
+ } else if (WideMask->getOpcode() == ISD::SPLAT_VECTOR) {
+ ElementCount EC = WideMask.getValueType().getVectorElementCount();
+ assert(EC.isKnownMultipleOf(RequiredNumParts) &&
+ "Expected element count divisible by number of parts");
+ EC = EC.divideCoefficientBy(RequiredNumParts);
+ return DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::getVectorVT(MVT::i1, EC),
+ WideMask->getOperand(0));
+ }
+ return SDValue();
+}
+
+static SDValue
+performStoreInterleaveCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ if (!DCI.isBeforeLegalize())
+ return SDValue();
+
+ MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
+ SDValue WideValue = MST->getValue();
+
+ // Bail out if the stored value has an unexpected number of uses, since we'll
+ // have to perform manual interleaving and may as well just use normal masked
+ // stores. Also, discard masked stores that are truncating or indexed.
+ if (!WideValue.hasOneUse() || !ISD::isNormalMaskedStore(MST) ||
+ !MST->getOffset().isUndef())
+ return SDValue();
+
+ SmallVector<SDValue, 4> ValueInterleaveOps;
+ if (!isSequentialConcatOfVectorInterleave(WideValue.getNode(),
+ ValueInterleaveOps))
+ return SDValue();
+
+ unsigned NumParts = ValueInterleaveOps.size();
+ if (NumParts != 2 && NumParts != 4)
+ return SDValue();
+
+ // At the moment we're unlikely to see a fixed-width vector deinterleave as
+ // we usually generate shuffles instead.
+ EVT SubVecTy = ValueInterleaveOps[0].getValueType();
+ if (!SubVecTy.isScalableVT() ||
+ SubVecTy.getSizeInBits().getKnownMinValue() != 128 ||
+ !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy))
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue NarrowMask =
+ getNarrowMaskForInterleavedOps(DAG, DL, MST->getMask(), NumParts);
+ if (!NarrowMask)
+ return SDValue();
+
+ const Intrinsic::ID IID =
+ NumParts == 2 ? Intrinsic::aarch64_sve_st2 : Intrinsic::aarch64_sve_st4;
+ SDValue Res;
+ SmallVector<SDValue, 8> NewStOps;
+ NewStOps.append({MST->getChain(), DAG.getConstant(IID, DL, MVT::i32)});
+ NewStOps.append(ValueInterleaveOps);
+ NewStOps.append({NarrowMask, MST->getBasePtr()});
+ return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, NewStOps);
+}
+
static SDValue performMSTORECombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
@@ -24641,6 +24741,9 @@ static SDValue performMSTORECombine(SDNode *N,
SDValue Mask = MST->getMask();
SDLoc DL(N);
+ if (SDValue Res = performStoreInterleaveCombine(N, DCI, DAG))
+ return Res;
+
// If this is a UZP1 followed by a masked store, fold this into a masked
// truncating store. We can do this even if this is already a masked
// truncstore.
@@ -27254,43 +27357,11 @@ static SDValue performVectorDeinterleaveCombine(
return SDValue();
// Now prove that the mask is an interleave of identical masks.
- SDValue Mask = MaskedLoad->getMask();
- if (Mask->getOpcode() != ISD::SPLAT_VECTOR &&
- Mask->getOpcode() != ISD::CONCAT_VECTORS)
- return SDValue();
-
- SDValue NarrowMask;
SDLoc DL(N);
- if (Mask->getOpcode() == ISD::CONCAT_VECTORS) {
- if (Mask->getNumOperands() != NumParts)
- return SDValue();
-
- // We should be concatenating each sequential result from a
- // VECTOR_INTERLEAVE.
- SDNode *InterleaveOp = Mask->getOperand(0).getNode();
- if (InterleaveOp->getOpcode() != ISD::VECTOR_INTERLEAVE ||
- InterleaveOp->getNumOperands() != NumParts)
- return SDValue();
-
- for (unsigned I = 0; I < NumParts; I++) {
- if (Mask.getOperand(I) != SDValue(InterleaveOp, I))
- return SDValue();
- }
-
- // Make sure the inputs to the vector interleave are identical.
- if (!llvm::all_equal(InterleaveOp->op_values()))
- return SDValue();
-
- NarrowMask = InterleaveOp->getOperand(0);
- } else { // ISD::SPLAT_VECTOR
- ElementCount EC = Mask.getValueType().getVectorElementCount();
- assert(EC.isKnownMultipleOf(NumParts) &&
- "Expected element count divisible by number of parts");
- EC = EC.divideCoefficientBy(NumParts);
- NarrowMask =
- DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::getVectorVT(MVT::i1, EC),
- Mask->getOperand(0));
- }
+ SDValue NarrowMask =
+ getNarrowMaskForInterleavedOps(DAG, DL, MaskedLoad->getMask(), NumParts);
+ if (!NarrowMask)
+ return SDValue();
const Intrinsic::ID IID = NumParts == 2 ? Intrinsic::aarch64_sve_ld2_sret
: Intrinsic::aarch64_sve_ld4_sret;
diff --git a/llvm/test/CodeGen/AArch64/fixed_masked_interleaved_stores.ll b/llvm/test/CodeGen/AArch64/fixed_masked_interleaved_stores.ll
new file mode 100644
index 0000000000000..4b2df12e6202d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fixed_masked_interleaved_stores.ll
@@ -0,0 +1,455 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define void @foo_st2_v16i8(<16 x i1> %mask, <16 x i8> %val1, <16 x i8> %val2, ptr %p) {
+; CHECK-LABEL: foo_st2_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip2 v3.16b, v0.16b, v0.16b
+; CHECK-NEXT: zip1 v0.16b, v0.16b, v0.16b
+; CHECK-NEXT: adrp x8, .LCPI0_0
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT: shl v3.16b, v3.16b, #7
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
+; CHECK-NEXT: cmlt v3.16b, v3.16b, #0
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT: and v3.16b, v3.16b, v4.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v4.16b
+; CHECK-NEXT: ext v4.16b, v3.16b, v3.16b, #8
+; CHECK-NEXT: ext v5.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: zip1 v3.16b, v3.16b, v4.16b
+; CHECK-NEXT: zip1 v0.16b, v0.16b, v5.16b
+; CHECK-NEXT: addv h3, v3.8h
+; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: zip1 v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: bfi w8, w9, #16, #16
+; CHECK-NEXT: tbnz w8, #0, .LBB0_33
+; CHECK-NEXT: // %bb.1: // %else
+; CHECK-NEXT: tbnz w8, #1, .LBB0_34
+; CHECK-NEXT: .LBB0_2: // %else2
+; CHECK-NEXT: tbnz w8, #2, .LBB0_35
+; CHECK-NEXT: .LBB0_3: // %else4
+; CHECK-NEXT: tbnz w8, #3, .LBB0_36
+; CHECK-NEXT: .LBB0_4: // %else6
+; CHECK-NEXT: tbnz w8, #4, .LBB0_37
+; CHECK-NEXT: .LBB0_5: // %else8
+; CHECK-NEXT: tbnz w8, #5, .LBB0_38
+; CHECK-NEXT: .LBB0_6: // %else10
+; CHECK-NEXT: tbnz w8, #6, .LBB0_39
+; CHECK-NEXT: .LBB0_7: // %else12
+; CHECK-NEXT: tbnz w8, #7, .LBB0_40
+; CHECK-NEXT: .LBB0_8: // %else14
+; CHECK-NEXT: tbnz w8, #8, .LBB0_41
+; CHECK-NEXT: .LBB0_9: // %else16
+; CHECK-NEXT: tbnz w8, #9, .LBB0_42
+; CHECK-NEXT: .LBB0_10: // %else18
+; CHECK-NEXT: tbnz w8, #10, .LBB0_43
+; CHECK-NEXT: .LBB0_11: // %else20
+; CHECK-NEXT: tbnz w8, #11, .LBB0_44
+; CHECK-NEXT: .LBB0_12: // %else22
+; CHECK-NEXT: tbnz w8, #12, .LBB0_45
+; CHECK-NEXT: .LBB0_13: // %else24
+; CHECK-NEXT: tbnz w8, #13, .LBB0_46
+; CHECK-NEXT: .LBB0_14: // %else26
+; CHECK-NEXT: tbnz w8, #14, .LBB0_47
+; CHECK-NEXT: .LBB0_15: // %else28
+; CHECK-NEXT: tbnz w8, #15, .LBB0_48
+; CHECK-NEXT: .LBB0_16: // %else30
+; CHECK-NEXT: zip2 v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: tbnz w8, #16, .LBB0_49
+; CHECK-NEXT: .LBB0_17: // %else32
+; CHECK-NEXT: tbnz w8, #17, .LBB0_50
+; CHECK-NEXT: .LBB0_18: // %else34
+; CHECK-NEXT: tbnz w8, #18, .LBB0_51
+; CHECK-NEXT: .LBB0_19: // %else36
+; CHECK-NEXT: tbnz w8, #19, .LBB0_52
+; CHECK-NEXT: .LBB0_20: // %else38
+; CHECK-NEXT: tbnz w8, #20, .LBB0_53
+; CHECK-NEXT: .LBB0_21: // %else40
+; CHECK-NEXT: tbnz w8, #21, .LBB0_54
+; CHECK-NEXT: .LBB0_22: // %else42
+; CHECK-NEXT: tbnz w8, #22, .LBB0_55
+; CHECK-NEXT: .LBB0_23: // %else44
+; CHECK-NEXT: tbnz w8, #23, .LBB0_56
+; CHECK-NEXT: .LBB0_24: // %else46
+; CHECK-NEXT: tbnz w8, #24, .LBB0_57
+; CHECK-NEXT: .LBB0_25: // %else48
+; CHECK-NEXT: tbnz w8, #25, .LBB0_58
+; CHECK-NEXT: .LBB0_26: // %else50
+; CHECK-NEXT: tbnz w8, #26, .LBB0_59
+; CHECK-NEXT: .LBB0_27: // %else52
+; CHECK-NEXT: tbnz w8, #27, .LBB0_60
+; CHECK-NEXT: .LBB0_28: // %else54
+; CHECK-NEXT: tbnz w8, #28, .LBB0_61
+; CHECK-NEXT: .LBB0_29: // %else56
+; CHECK-NEXT: tbnz w8, #29, .LBB0_62
+; CHECK-NEXT: .LBB0_30: // %else58
+; CHECK-NEXT: tbnz w8, #30, .LBB0_63
+; CHECK-NEXT: .LBB0_31: // %else60
+; CHECK-NEXT: tbnz w8, #31, .LBB0_64
+; CHECK-NEXT: .LBB0_32: // %else62
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB0_33: // %cond.store
+; CHECK-NEXT: str b0, [x0]
+; CHECK-NEXT: tbz w8, #1, .LBB0_2
+; CHECK-NEXT: .LBB0_34: // %cond.store1
+; CHECK-NEXT: mov b3, v0.b[1]
+; CHECK-NEXT: stur b3, [x0, #1]
+; CHECK-NEXT: tbz w8, #2, .LBB0_3
+; CHECK-NEXT: .LBB0_35: // %cond.store3
+; CHECK-NEXT: mov b3, v0.b[2]
+; CHECK-NEXT: stur b3, [x0, #2]
+; CHECK-NEXT: tbz w8, #3, .LBB0_4
+; CHECK-NEXT: .LBB0_36: // %cond.store5
+; CHECK-NEXT: mov b3, v0.b[3]
+; CHECK-NEXT: stur b3, [x0, #3]
+; CHECK-NEXT: tbz w8, #4, .LBB0_5
+; CHECK-NEXT: .LBB0_37: // %cond.store7
+; CHECK-NEXT: mov b3, v0.b[4]
+; CHECK-NEXT: stur b3, [x0, #4]
+; CHECK-NEXT: tbz w8, #5, .LBB0_6
+; CHECK-NEXT: .LBB0_38: // %cond.store9
+; CHECK-NEXT: mov b3, v0.b[5]
+; CHECK-NEXT: stur b3, [x0, #5]
+; CHECK-NEXT: tbz w8, #6, .LBB0_7
+; CHECK-NEXT: .LBB0_39: // %cond.store11
+; CHECK-NEXT: mov b3, v0.b[6]
+; CHECK-NEXT: stur b3, [x0, #6]
+; CHECK-NEXT: tbz w8, #7, .LBB0_8
+; CHECK-NEXT: .LBB0_40: // %cond.store13
+; CHECK-NEXT: mov b3, v0.b[7]
+; CHECK-NEXT: stur b3, [x0, #7]
+; CHECK-NEXT: tbz w8, #8, .LBB0_9
+; CHECK-NEXT: .LBB0_41: // %cond.store15
+; CHECK-NEXT: mov b3, v0.b[8]
+; CHECK-NEXT: stur b3, [x0, #8]
+; CHECK-NEXT: tbz w8, #9, .LBB0_10
+; CHECK-NEXT: .LBB0_42: // %cond.store17
+; CHECK-NEXT: mov b3, v0.b[9]
+; CHECK-NEXT: stur b3, [x0, #9]
+; CHECK-NEXT: tbz w8, #10, .LBB0_11
+; CHECK-NEXT: .LBB0_43: // %cond.store19
+; CHECK-NEXT: mov b3, v0.b[10]
+; CHECK-NEXT: stur b3, [x0, #10]
+; CHECK-NEXT: tbz w8, #11, .LBB0_12
+; CHECK-NEXT: .LBB0_44: // %cond.store21
+; CHECK-NEXT: mov b3, v0.b[11]
+; CHECK-NEXT: stur b3, [x0, #11]
+; CHECK-NEXT: tbz w8, #12, .LBB0_13
+; CHECK-NEXT: .LBB0_45: // %cond.store23
+; CHECK-NEXT: mov b3, v0.b[12]
+; CHECK-NEXT: stur b3, [x0, #12]
+; CHECK-NEXT: tbz w8, #13, .LBB0_14
+; CHECK-NEXT: .LBB0_46: // %cond.store25
+; CHECK-NEXT: mov b3, v0.b[13]
+; CHECK-NEXT: stur b3, [x0, #13]
+; CHECK-NEXT: tbz w8, #14, .LBB0_15
+; CHECK-NEXT: .LBB0_47: // %cond.store27
+; CHECK-NEXT: mov b3, v0.b[14]
+; CHECK-NEXT: stur b3, [x0, #14]
+; CHECK-NEXT: tbz w8, #15, .LBB0_16
+; CHECK-NEXT: .LBB0_48: // %cond.store29
+; CHECK-NEXT: mov b0, v0.b[15]
+; CHECK-NEXT: stur b0, [x0, #15]
+; CHECK-NEXT: zip2 v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: tbz w8, #16, .LBB0_17
+; CHECK-NEXT: .LBB0_49: // %cond.store31
+; CHECK-NEXT: stur b0, [x0, #16]
+; CHECK-NEXT: tbz w8, #17, .LBB0_18
+; CHECK-NEXT: .LBB0_50: // %cond.store33
+; CHECK-NEXT: mov b1, v0.b[1]
+; CHECK-NEXT: stur b1, [x0, #17]
+; CHECK-NEXT: tbz w8, #18, .LBB0_19
+; CHECK-NEXT: .LBB0_51: // %cond.store35
+; CHECK-NEXT: mov b1, v0.b[2]
+; CHECK-NEXT: stur b1, [x0, #18]
+; CHECK-NEXT: tbz w8, #19, .LBB0_20
+; CHECK-NEXT: .LBB0_52: // %cond.store37
+; CHECK-NEXT: mov b1, v0.b[3]
+; CHECK-NEXT: stur b1, [x0, #19]
+; CHECK-NEXT: tbz w8, #20, .LBB0_21
+; CHECK-NEXT: .LBB0_53: // %cond.store39
+; CHECK-NEXT: mov b1, v0.b[4]
+; CHECK-NEXT: stur b1, [x0, #20]
+; CHECK-NEXT: tbz w8, #21, .LBB0_22
+; CHECK-NEXT: .LBB0_54: // %cond.store41
+; CHECK-NEXT: mov b1, v0.b[5]
+; CHECK-NEXT: stur b1, [x0, #21]
+; CHECK-NEXT: tbz w8, #22, .LBB0_23
+; CHECK-NEXT: .LBB0_55: // %cond.store43
+; CHECK-NEXT: mov b1, v0.b[6]
+; CHECK-NEXT: stur b1, [x0, #22]
+; CHECK-NEXT: tbz w8, #23, .LBB0_24
+; CHECK-NEXT: .LBB0_56: // %cond.store45
+; CHECK-NEXT: mov b1, v0.b[7]
+; CHECK-NEXT: stur b1, [x0, #23]
+; CHECK-NEXT: tbz w8, #24, .LBB0_25
+; CHECK-NEXT: .LBB0_57: // %cond.store47
+; CHECK-NEXT: mov b1, v0.b[8]
+; CHECK-NEXT: stur b1, [x0, #24]
+; CHECK-NEXT: tbz w8, #25, .LBB0_26
+; CHECK-NEXT: .LBB0_58: // %cond.store49
+; CHECK-NEXT: mov b1, v0.b[9]
+; CHECK-NEXT: stur b1, [x0, #25]
+; CHECK-NEXT: tbz w8, #26, .LBB0_27
+; CHECK-NEXT: .LBB0_59: // %cond.store51
+; CHECK-NEXT: mov b1, v0.b[10]
+; CHECK-NEXT: stur b1, [x0, #26]
+; CHECK-NEXT: tbz w8, #27, .LBB0_28
+; CHECK-NEXT: .LBB0_60: // %cond.store53
+; CHECK-NEXT: mov b1, v0.b[11]
+; CHECK-NEXT: stur b1, [x0, #27]
+; CHECK-NEXT: tbz w8, #28, .LBB0_29
+; CHECK-NEXT: .LBB0_61: // %cond.store55
+; CHECK-NEXT: mov b1, v0.b[12]
+; CHECK-NEXT: stur b1, [x0, #28]
+; CHECK-NEXT: tbz w8, #29, .LBB0_30
+; CHECK-NEXT: .LBB0_62: // %cond.store57
+; CHECK-NEXT: mov b1, v0.b[13]
+; CHECK-NEXT: stur b1, [x0, #29]
+; CHECK-NEXT: tbz w8, #30, .LBB0_31
+; CHECK-NEXT: .LBB0_63: // %cond.store59
+; CHECK-NEXT: mov b1, v0.b[14]
+; CHECK-NEXT: stur b1, [x0, #30]
+; CHECK-NEXT: tbz w8, #31, .LBB0_32
+; CHECK-NEXT: .LBB0_64: // %cond.store61
+; CHECK-NEXT: mov b0, v0.b[15]
+; CHECK-NEXT: stur b0, [x0, #31]
+; CHECK-NEXT: ret
+ %interleaved.mask = tail call <32 x i1> @llvm.vector.interleave2.v32i1(<16 x i1> %mask, <16 x i1> %mask)
+ %strided.vec = tail call <32 x i8> @llvm.vector.interleave2.v32i8(<16 x i8> %val1, <16 x i8> %val2)
+ tail call void @llvm.masked.store.v32i8.p0(<32 x i8> %strided.vec, ptr %p, i32 1, <32 x i1> %interleaved.mask)
+ ret void
+}
+
+define void @foo_st2_v8i16(<8 x i1> %mask, <8 x i16> %val1, <8 x i16> %val2, ptr %p) {
+; CHECK-LABEL: foo_st2_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: adrp x8, .LCPI1_0
+; CHECK-NEXT: zip1 v0.16b, v0.16b, v0.16b
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI1_0]
+; CHECK-NEXT: shl v0.16b, v0.16b, #7
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: zip1 v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: addv h3, v0.8h
+; CHECK-NEXT: zip1 v0.8h, v1.8h, v2.8h
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: tbnz w8, #0, .LBB1_17
+; CHECK-NEXT: // %bb.1: // %else
+; CHECK-NEXT: tbnz w8, #1, .LBB1_18
+; CHECK-NEXT: .LBB1_2: // %else2
+; CHECK-NEXT: tbnz w8, #2, .LBB1_19
+; CHECK-NEXT: .LBB1_3: // %else4
+; CHECK-NEXT: tbnz w8, #3, .LBB1_20
+; CHECK-NEXT: .LBB1_4: // %else6
+; CHECK-NEXT: tbnz w8, #4, .LBB1_21
+; CHECK-NEXT: .LBB1_5: // %else8
+; CHECK-NEXT: tbnz w8, #5, .LBB1_22
+; CHECK-NEXT: .LBB1_6: // %else10
+; CHECK-NEXT: tbnz w8, #6, .LBB1_23
+; CHECK-NEXT: .LBB1_7: // %else12
+; CHECK-NEXT: tbnz w8, #7, .LBB1_24
+; CHECK-NEXT: .LBB1_8: // %else14
+; CHECK-NEXT: zip2 v0.8h, v1.8h, v2.8h
+; CHECK-NEXT: tbnz w8, #8, .LBB1_25
+; CHECK-NEXT: .LBB1_9: // %else16
+; CHECK-NEXT: tbnz w8, #9, .LBB1_26
+; CHECK-NEXT: .LBB1_10: // %else18
+; CHECK-NEXT: tbnz w8, #10, .LBB1_27
+; CHECK-NEXT: .LBB1_11: // %else20
+; CHECK-NEXT: tbnz w8, #11, .LBB1_28
+; CHECK-NEXT: .LBB1_12: // %else22
+; CHECK-NEXT: tbnz w8, #12, .LBB1_29
+; CHECK-NEXT: .LBB1_13: // %else24
+; CHECK-NEXT: tbnz w8, #13, .LBB1_30
+; CHECK-NEXT: .LBB1_14: // %else26
+; CHECK-NEXT: tbnz w8, #14, .LBB1_31
+; CHECK-NEXT: .LBB1_15: // %else28
+; CHECK-NEXT: tbnz w8, #15, .LBB1_32
+; CHECK-NEXT: .LBB1_16: // %else30
+; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB1_17: // %cond.store
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: tbz w8, #1, .LBB1_2
+; CHECK-NEXT: .LBB1_18: // %cond.store1
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: str h3, [x0, #2]
+; CHECK-NEXT: tbz w8, #2, .LBB1_3
+; CHECK-NEXT: .LBB1_19: // %cond.store3
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: str h3, [x0, #4]
+; CHECK-NEXT: tbz w8, #3, .LBB1_4
+; CHECK-NEXT: .LBB1_20: // %cond.store5
+; CHECK-NEXT: mov h3, v0.h[3]
+; CHECK-NEXT: str h3, [x0, #6]
+; CHECK-NEXT: tbz w8, #4, .LBB1_5
+; CHECK-NEXT: .LBB1_21: // %cond.store7
+; CHECK-NEXT: mov h3, v0.h[4]
+; CHECK-NEXT: str h3, [x0, #8]
+; CHECK-NEXT: tbz w8, #5, .LBB1_6
+; CHECK-NEXT: .LBB1_22: // %cond.store9
+; CHECK-NEXT: mov h3, v0.h[5]
+; CHECK-NEXT: str h3, [x0, #10]
+; CHECK-NEXT: tbz w8, #6, .LBB1_7
+; CHECK-NEXT: .LBB1_23: // %cond.store11
+; CHECK-NEXT: mov h3, v0.h[6]
+; CHECK-NEXT: str h3, [x0, #12]
+; CHECK-NEXT: tbz w8, #7, .LBB1_8
+; CHECK-NEXT: .LBB1_24: // %cond.store13
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: str h0, [x0, #14]
+; CHECK-NEXT: zip2 v0.8h, v1.8h, v2.8h
+; CHECK-NEXT: tbz w8, #8, .LBB1_9
+; CHECK-NEXT: .LBB1_25: // %cond.store15
+; CHECK-NEXT: str h0, [x0, #16]
+; CHECK-NEXT: tbz w8, #9, .LBB1_10
+; CHECK-NEXT: .LBB1_26: // %cond.store17
+; CHECK-NEXT: mov h1, v0.h[1]
+; CHECK-NEXT: str h1, [x0, #18]
+; CHECK-NEXT: tbz w8, #10, .LBB1_11
+; CHECK-NEXT: .LBB1_27: // %cond.store19
+; CHECK-NEXT: mov h1, v0.h[2]
+; CHECK-NEXT: str h1, [x0, #20]
+; CHECK-NEXT: tbz w8, #11, .LBB1_12
+; CHECK-NEXT: .LBB1_28: // %cond.store21
+; CHECK-NEXT: mov h1, v0.h[3]
+; CHECK-NEXT: str h1, [x0, #22]
+; CHECK-NEXT: tbz w8, #12, .LBB1_13
+; CHECK-NEXT: .LBB1_29: // %cond.store23
+; CHECK-NEXT: mov h1, v0.h[4]
+; CHECK-NEXT: ...
[truncated]
Left some comments & suggestions but mostly minor, otherwise LGTM cheers!
tail can be dropped from calls?
Suggested change:
-; Number of parts in mask interleave must match deinterleave.
+; Number of parts in mask interleave must match interleave.
add -asm-verbose=0 to get rid of all this MI metadata?
Not sure if things have changed, but historically adding this option broke the auto-update script?
ah ok wasn't aware of that. I just tried it on this test and the changes looked sensible at least
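For reference, a sketch of what the suggested RUN line might look like; this assumes -asm-verbose=0 simply composes with the existing llc options, and whether utils/update_llc_test_checks.py copes with it is the open question above:

; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -asm-verbose=0 < %s | FileCheck %s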
Force-pushed from 4cce191 to 45807c8 (commit message unchanged; see the description below).
for (unsigned I = 0; I < NumParts; I++) {
  if (N->getOperand(I) != SDValue(InterleaveOp, I))
    return false;
}

Suggested change (drop the braces around the single-statement loop body):
for (unsigned I = 0; I < NumParts; I++)
  if (N->getOperand(I) != SDValue(InterleaveOp, I))
    return false;
if (!WideValue.hasOneUse() || !ISD::isNormalMaskedStore(MST) ||
    !MST->getOffset().isUndef())

Worth adding "!MST->isSimple() ||"?
Rebased and tested this downstream, seems fine.
Similar to #154338, this PR aims to support lowering of certain IR
to SVE's st2 and st4 instructions. The typical IR scenario looks
like:
%mask = .. @llvm.vector.interleave2(<vscale x 16 x i1> %m, <vscale x 16 x i1> %m)
%val = .. @llvm.vector.interleave2(<vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2)
.. @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> %val, ..., <vscale x 32 x i1> %mask)
where we're interleaving both the value and the mask being passed
to the wide store. When the mask interleave parts are identical
we can lower this to st2b.
This PR adds a DAG combine for lowering this kind of IR pattern
to st2X and st4X SVE instructions.
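
For illustration, here is a minimal, self-contained sketch of the scalable IR shape described above; the function name, value names, and the i32 1 alignment are illustrative rather than taken from the patch's tests. Per the description, the intent is that the wide masked store below gets selected as a single st2b:

; Interleave the mask with itself and the two value vectors with each other,
; then perform one wide masked store of the interleaved data.
define void @st2_sketch(<vscale x 16 x i1> %m, <vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2, ptr %p) {
  %mask = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %m, <vscale x 16 x i1> %m)
  %val = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> %v1, <vscale x 16 x i8> %v2)
  call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> %val, ptr %p, i32 1, <vscale x 32 x i1> %mask)
  ret void
}

declare <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
declare <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8>, ptr, i32 immarg, <vscale x 32 x i1>)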