[AArch64] Scalarize extracted vector loads. #159714
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: David Green (davemgreen)

Changes

Given a vector load that is only extracted from, it is more efficient to perform the individual loads than a single load and many extracts. This adds a late optimization for scalarizing extracted vector loads that do not have any other uses and will not be more efficiently kept in vector registers.

Patch is 68.73 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/159714.diff

12 Files Affected:
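To make the intent concrete, here is a minimal sketch (hypothetical IR, not one of the patch's test cases) of the pattern the combine targets: a vector load whose only uses are lane extracts. The expectation, matching the test updates below, is that the backend emits scalar loads (e.g. an ldp into two x-registers) rather than a q-register load followed by cross-register-bank moves.

; Hypothetical example: the <2 x i64> load is only used by extracts,
; so each lane can be loaded directly into a GPR.
define i64 @extract_only(ptr %p) {
  %v = load <2 x i64>, ptr %p, align 16
  %lo = extractelement <2 x i64> %v, i64 0
  %hi = extractelement <2 x i64> %v, i64 1
  %sum = add i64 %lo, %hi
  ret i64 %sum
}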
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index cd7f0e719ad0c..175874febf321 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -20467,6 +20467,81 @@ performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
}
}
+ // Given an extract(load) or extract(extend(load)), produce a scalar load
+ // instead to avoid the cross-register-bank copies.
+ if (DCI.isAfterLegalizeDAG() && Subtarget->isLittleEndian() &&
+ VT.isInteger() && isa<ConstantSDNode>(N1) &&
+ !N0.getValueType().isScalableVector()) {
+ SDValue LoadN0 = N0;
+ // Look through sext/zext and extract_subvector / insert_subvector if
+ // required.
+ if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
+ N0.getOpcode() == ISD::SIGN_EXTEND ||
+ N0.getOpcode() == ISD::ANY_EXTEND) &&
+ N0.getOperand(0).hasOneUse())
+ LoadN0 = N0.getOperand(0);
+ unsigned OffsetElts = 0;
+ if (LoadN0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ !LoadN0.getOperand(0).getValueType().isScalableVector()) {
+ OffsetElts = LoadN0.getConstantOperandVal(1);
+ LoadN0 = LoadN0.getOperand(0);
+ }
+ if (LoadN0.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ LoadN0.getOperand(0).isUndef() &&
+ isNullConstant(LoadN0.getOperand(2)) && LoadN0.getOperand(1).hasOneUse())
+ LoadN0 = LoadN0.getOperand(1);
+
+ // Check all the uses are valid and can be scalarized. We check that all the
+ // uses are extracts and those extracts are not re-inserted into an
+ // operation best treated as a vector register.
+ auto Load = dyn_cast<LoadSDNode>(LoadN0);
+ if (Load && Load->isSimple() && ISD::isNormalLoad(Load) &&
+ Load->getMemoryVT().isByteSized() &&
+ all_of(N0->uses(), [&](const SDUse &U) {
+ return U.getResNo() != N0.getResNo() ||
+ (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ !any_of(U.getUser()->uses(), [](const SDUse &U2) {
+ return U2.getUser()->getOpcode() ==
+ ISD::INSERT_VECTOR_ELT ||
+ U2.getUser()->getOpcode() == ISD::BUILD_VECTOR ||
+ U2.getUser()->getOpcode() == ISD::SCALAR_TO_VECTOR;
+ }));
+ })) {
+
+ SDLoc DL(Load);
+ EVT ScalarVT = Load->getValueType(0).getScalarType();
+ if (ScalarVT.getSizeInBits() < 32)
+ ScalarVT = MVT::i32;
+
+ // Generate a new scalar load.
+ unsigned Offset = (OffsetElts + N->getConstantOperandVal(1)) *
+ Load->getValueType(0).getScalarSizeInBits() / 8;
+ SDValue BasePtr = DAG.getObjectPtrOffset(
+ DL, Load->getBasePtr(), DAG.getConstant(Offset, DL, MVT::i64));
+ ISD::LoadExtType ExtType =
+ N0.getOpcode() == ISD::ZERO_EXTEND
+ ? ISD::ZEXTLOAD
+ : (N0.getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD
+ : ISD::EXTLOAD);
+ SDValue ScalarLoad =
+ DAG.getExtLoad(ExtType, DL, ScalarVT, Load->getChain(), BasePtr,
+ Load->getPointerInfo().getWithOffset(Offset),
+ Load->getValueType(0).getScalarType(),
+ commonAlignment(Load->getAlign(), Offset),
+ Load->getMemOperand()->getFlags(), Load->getAAInfo());
+ DAG.makeEquivalentMemoryOrdering(Load, ScalarLoad);
+
+ // Extend back to the original type if we looked through an extend above.
+ if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
+ N0.getOpcode() == ISD::SIGN_EXTEND ||
+ N0.getOpcode() == ISD::ANY_EXTEND) &&
+ ScalarVT.getScalarSizeInBits() < VT.getScalarSizeInBits())
+ ScalarLoad = DAG.getNode(N0.getOpcode(), DL, VT, ScalarLoad);
+
+ return ScalarLoad;
+ }
+ }
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
index 2b9e334cc7812..2b313fa8ce55f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
@@ -53,18 +53,15 @@ define <4 x half> @uitofp_v4i64_to_v4f16(ptr %ptr) {
define <4 x bfloat> @uitofp_v4i64_to_v4bf16(ptr %ptr) {
; CHECK-LABEL: uitofp_v4i64_to_v4bf16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp q0, q2, [x0]
-; CHECK-NEXT: mov x8, v0.d[1]
-; CHECK-NEXT: fmov x9, d0
-; CHECK-NEXT: ucvtf s1, x9
-; CHECK-NEXT: mov x9, v2.d[1]
-; CHECK-NEXT: ucvtf s0, x8
-; CHECK-NEXT: fmov x8, d2
-; CHECK-NEXT: ucvtf s2, x8
+; CHECK-NEXT: ldp x8, x9, [x0]
+; CHECK-NEXT: movi v2.4s, #127, msl #8
+; CHECK-NEXT: ucvtf s0, x9
+; CHECK-NEXT: ucvtf s1, x8
+; CHECK-NEXT: ldp x8, x9, [x0, #16]
; CHECK-NEXT: mov v1.s[1], v0.s[0]
+; CHECK-NEXT: ucvtf s0, x8
+; CHECK-NEXT: mov v1.s[2], v0.s[0]
; CHECK-NEXT: ucvtf s0, x9
-; CHECK-NEXT: mov v1.s[2], v2.s[0]
-; CHECK-NEXT: movi v2.4s, #127, msl #8
; CHECK-NEXT: mov v1.s[3], v0.s[0]
; CHECK-NEXT: movi v0.4s, #1
; CHECK-NEXT: ushr v3.4s, v1.4s, #16
diff --git a/llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll b/llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll
index 59f887a1143c0..a93203793307a 100644
--- a/llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll
@@ -4,10 +4,8 @@
define i32 @foo(ptr %__a) nounwind {
; CHECK-LABEL: foo:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: umov.h w8, v0[0]
-; CHECK-NEXT: umov.h w9, v0[0]
-; CHECK-NEXT: add w0, w9, w8, uxth #1
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: add w0, w8, w8, lsl #1
; CHECK-NEXT: ret
%tmp18 = load <4 x i16>, ptr %__a, align 8
%vget_lane = extractelement <4 x i16> %tmp18, i32 0
diff --git a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
index 114203e46f196..13093cb2204ce 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
@@ -105,13 +105,13 @@ define i32 @ldr_int_volatile(ptr %a) nounwind {
; CHECK: Cluster ld/st SU(1) - SU(3)
; CHECK: SU(1): %{{[0-9]+}}:fpr128 = LDRQui
; CHECK: SU(3): %{{[0-9]+}}:fpr128 = LDRQui
-define <2 x i64> @ldq_cluster(ptr %p) {
- %tmp1 = load <2 x i64>, < 2 x i64>* %p, align 8
+define <4 x i32> @ldq_cluster(ptr %p) {
+ %tmp1 = load <4 x i32>, ptr %p, align 8
%add.ptr2 = getelementptr inbounds i64, ptr %p, i64 2
- %tmp2 = add nsw <2 x i64> %tmp1, %tmp1
- %tmp3 = load <2 x i64>, ptr %add.ptr2, align 8
- %res = mul nsw <2 x i64> %tmp2, %tmp3
- ret <2 x i64> %res
+ %tmp2 = add nsw <4 x i32> %tmp1, %tmp1
+ %tmp3 = load <4 x i32>, ptr %add.ptr2, align 8
+ %res = mul nsw <4 x i32> %tmp2, %tmp3
+ ret <4 x i32> %res
}
; CHECK: ********** MI Scheduling **********
@@ -215,7 +215,7 @@ exit:
; CHECK: ********** MI Scheduling **********
; CHECK: LDURXi_LDRXui:%bb.0 entry
; CHECK: Cluster ld/st SU(3) - SU(4)
-; CHECK: SU(3): %{{[0-9]+}}:gpr64 = LDURXi
+; CHECK: SU(3): %{{[0-9]+}}:gpr64 = LDURXi
; CHECK: SU(4): %{{[0-9]+}}:gpr64 = LDRXui
;
define void @LDURXi_LDRXui(ptr nocapture readonly %arg, ptr nocapture readonly %wa, ptr nocapture readonly %wb) {
diff --git a/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll b/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll
index baca159f9dd55..02dfaa19acc9d 100644
--- a/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll
@@ -4,11 +4,9 @@
define void @autogen_SD19655(ptr %addr, ptr %addrfloat) {
; CHECK-LABEL: autogen_SD19655:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldr q0, [x0]
-; CHECK-NEXT: mov.d x8, v0[1]
-; CHECK-NEXT: fmov x9, d0
-; CHECK-NEXT: scvtf s1, x9
-; CHECK-NEXT: scvtf s0, x8
+; CHECK-NEXT: ldp x8, x9, [x0]
+; CHECK-NEXT: scvtf s0, x9
+; CHECK-NEXT: scvtf s1, x8
; CHECK-NEXT: mov.s v1[1], v0[0]
; CHECK-NEXT: str d1, [x1]
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
index 6ab703c08b837..121cc30692124 100644
--- a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
@@ -1114,16 +1114,10 @@ entry:
}
define ptr @v3ext(<3 x ptr> %a, <3 x ptr> %b, <3 x ptr> %x) {
-; CHECK-SD-LABEL: v3ext:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldr d0, [sp]
-; CHECK-SD-NEXT: fmov x0, d0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: v3ext:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr x0, [sp]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: v3ext:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr x0, [sp]
+; CHECK-NEXT: ret
entry:
%c = extractelement <3 x ptr> %x, i32 2
ret ptr %c
diff --git a/llvm/test/CodeGen/AArch64/itofp-bf16.ll b/llvm/test/CodeGen/AArch64/itofp-bf16.ll
index 42641693c4081..0d3ae559449a4 100644
--- a/llvm/test/CodeGen/AArch64/itofp-bf16.ll
+++ b/llvm/test/CodeGen/AArch64/itofp-bf16.ll
@@ -740,162 +740,151 @@ entry:
define <32 x bfloat> @stofp_v32i64_v32bf16(<32 x i64> %a) {
; CHECK-LABEL: stofp_v32i64_v32bf16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: fmov x10, d2
-; CHECK-NEXT: mov x9, v3.d[1]
-; CHECK-NEXT: mov x8, v2.d[1]
-; CHECK-NEXT: fmov x11, d3
-; CHECK-NEXT: fmov x12, d0
-; CHECK-NEXT: movi v3.4s, #1
-; CHECK-NEXT: scvtf s2, x10
-; CHECK-NEXT: mov x10, v0.d[1]
-; CHECK-NEXT: scvtf s19, x9
-; CHECK-NEXT: fmov x9, d1
-; CHECK-NEXT: scvtf s16, x11
-; CHECK-NEXT: mov x11, v6.d[1]
-; CHECK-NEXT: scvtf s0, x12
-; CHECK-NEXT: scvtf s18, x8
-; CHECK-NEXT: mov x8, v1.d[1]
+; CHECK-NEXT: ldp x8, x9, [sp, #32]
+; CHECK-NEXT: mov x13, v2.d[1]
+; CHECK-NEXT: ldp x10, x12, [sp, #96]
+; CHECK-NEXT: fmov x14, d3
+; CHECK-NEXT: movi v17.4s, #1
+; CHECK-NEXT: scvtf s18, x9
+; CHECK-NEXT: scvtf s16, x8
+; CHECK-NEXT: ldp x8, x9, [sp, #48]
+; CHECK-NEXT: scvtf s23, x12
; CHECK-NEXT: scvtf s20, x10
-; CHECK-NEXT: scvtf s17, x9
-; CHECK-NEXT: mov x9, v7.d[1]
-; CHECK-NEXT: mov x10, v4.d[1]
-; CHECK-NEXT: scvtf s21, x11
-; CHECK-NEXT: fmov x11, d6
-; CHECK-NEXT: mov v2.s[1], v18.s[0]
-; CHECK-NEXT: scvtf s25, x8
-; CHECK-NEXT: movi v6.4s, #127, msl #8
-; CHECK-NEXT: mov v0.s[1], v20.s[0]
-; CHECK-NEXT: ldp q24, q20, [sp, #32]
-; CHECK-NEXT: scvtf s22, x9
-; CHECK-NEXT: fmov x9, d4
-; CHECK-NEXT: scvtf s1, x11
-; CHECK-NEXT: scvtf s26, x10
-; CHECK-NEXT: fmov x11, d7
-; CHECK-NEXT: mov v2.s[2], v16.s[0]
-; CHECK-NEXT: ldp q18, q16, [sp]
-; CHECK-NEXT: mov x8, v24.d[1]
-; CHECK-NEXT: scvtf s4, x9
-; CHECK-NEXT: fmov x9, d5
-; CHECK-NEXT: mov v0.s[2], v17.s[0]
-; CHECK-NEXT: mov v1.s[1], v21.s[0]
-; CHECK-NEXT: scvtf s23, x11
-; CHECK-NEXT: mov x11, v5.d[1]
-; CHECK-NEXT: mov v2.s[3], v19.s[0]
+; CHECK-NEXT: mov x10, v0.d[1]
+; CHECK-NEXT: scvtf s27, x13
; CHECK-NEXT: scvtf s21, x8
-; CHECK-NEXT: mov x8, v20.d[1]
-; CHECK-NEXT: scvtf s17, x9
-; CHECK-NEXT: fmov x9, d24
-; CHECK-NEXT: mov v4.s[1], v26.s[0]
-; CHECK-NEXT: mov v0.s[3], v25.s[0]
-; CHECK-NEXT: ldp q26, q24, [sp, #96]
-; CHECK-NEXT: mov v1.s[2], v23.s[0]
-; CHECK-NEXT: ldp q25, q23, [sp, #64]
-; CHECK-NEXT: scvtf s7, x11
-; CHECK-NEXT: scvtf s27, x8
-; CHECK-NEXT: fmov x8, d18
-; CHECK-NEXT: scvtf s5, x9
-; CHECK-NEXT: mov x10, v26.d[1]
-; CHECK-NEXT: mov x9, v18.d[1]
-; CHECK-NEXT: fmov x11, d20
-; CHECK-NEXT: mov v4.s[2], v17.s[0]
-; CHECK-NEXT: mov v1.s[3], v22.s[0]
-; CHECK-NEXT: ushr v19.4s, v2.4s, #16
-; CHECK-NEXT: scvtf s17, x8
-; CHECK-NEXT: fmov x8, d26
-; CHECK-NEXT: add v26.4s, v2.4s, v6.4s
+; CHECK-NEXT: ldp x8, x11, [sp]
+; CHECK-NEXT: mov v16.s[1], v18.s[0]
+; CHECK-NEXT: scvtf s24, x9
+; CHECK-NEXT: movi v18.4s, #127, msl #8
+; CHECK-NEXT: mov v20.s[1], v23.s[0]
; CHECK-NEXT: scvtf s22, x11
-; CHECK-NEXT: mov x11, v25.d[1]
-; CHECK-NEXT: mov v5.s[1], v21.s[0]
-; CHECK-NEXT: scvtf s28, x10
-; CHECK-NEXT: fmov x10, d16
-; CHECK-NEXT: scvtf s21, x9
-; CHECK-NEXT: fmov x9, d25
-; CHECK-NEXT: scvtf s18, x8
-; CHECK-NEXT: mov x8, v16.d[1]
-; CHECK-NEXT: mov v4.s[3], v7.s[0]
-; CHECK-NEXT: and v19.16b, v19.16b, v3.16b
-; CHECK-NEXT: scvtf s16, x10
-; CHECK-NEXT: fmov x10, d24
+; CHECK-NEXT: ldp x11, x12, [sp, #16]
+; CHECK-NEXT: scvtf s19, x8
+; CHECK-NEXT: mov x8, v3.d[1]
+; CHECK-NEXT: mov v16.s[2], v21.s[0]
; CHECK-NEXT: scvtf s25, x11
-; CHECK-NEXT: scvtf s20, x9
-; CHECK-NEXT: mov x9, v24.d[1]
-; CHECK-NEXT: mov v17.s[1], v21.s[0]
-; CHECK-NEXT: fmov x11, d23
-; CHECK-NEXT: mov v18.s[1], v28.s[0]
-; CHECK-NEXT: scvtf s24, x8
-; CHECK-NEXT: scvtf s21, x10
-; CHECK-NEXT: mov x10, v23.d[1]
-; CHECK-NEXT: mov v5.s[2], v22.s[0]
-; CHECK-NEXT: ushr v22.4s, v1.4s, #16
-; CHECK-NEXT: ushr v28.4s, v0.4s, #16
+; CHECK-NEXT: ldp x9, x11, [sp, #112]
+; CHECK-NEXT: mov v19.s[1], v22.s[0]
+; CHECK-NEXT: scvtf s22, x12
+; CHECK-NEXT: scvtf s26, x9
+; CHECK-NEXT: ldp x9, x12, [sp, #64]
; CHECK-NEXT: scvtf s23, x11
-; CHECK-NEXT: mov v20.s[1], v25.s[0]
-; CHECK-NEXT: scvtf s25, x9
-; CHECK-NEXT: mov v17.s[2], v16.s[0]
-; CHECK-NEXT: add v16.4s, v19.4s, v26.4s
-; CHECK-NEXT: ushr v26.4s, v4.4s, #16
-; CHECK-NEXT: mov v18.s[2], v21.s[0]
-; CHECK-NEXT: scvtf s7, x10
-; CHECK-NEXT: and v22.16b, v22.16b, v3.16b
-; CHECK-NEXT: mov v5.s[3], v27.s[0]
-; CHECK-NEXT: and v21.16b, v28.16b, v3.16b
-; CHECK-NEXT: fcmeq v19.4s, v2.4s, v2.4s
-; CHECK-NEXT: mov v20.s[2], v23.s[0]
-; CHECK-NEXT: add v23.4s, v0.4s, v6.4s
+; CHECK-NEXT: mov v16.s[3], v24.s[0]
+; CHECK-NEXT: fmov x11, d2
+; CHECK-NEXT: scvtf s24, x12
+; CHECK-NEXT: scvtf s2, x9
+; CHECK-NEXT: mov x9, v6.d[1]
+; CHECK-NEXT: ldp x12, x13, [sp, #80]
+; CHECK-NEXT: scvtf s21, x11
+; CHECK-NEXT: mov x11, v4.d[1]
+; CHECK-NEXT: mov v19.s[2], v25.s[0]
+; CHECK-NEXT: mov v20.s[2], v26.s[0]
+; CHECK-NEXT: ushr v25.4s, v16.4s, #16
+; CHECK-NEXT: scvtf s26, x14
+; CHECK-NEXT: scvtf s3, x12
+; CHECK-NEXT: mov v2.s[1], v24.s[0]
+; CHECK-NEXT: scvtf s24, x10
+; CHECK-NEXT: fmov x10, d6
+; CHECK-NEXT: fmov x12, d0
+; CHECK-NEXT: scvtf s6, x9
+; CHECK-NEXT: mov v21.s[1], v27.s[0]
+; CHECK-NEXT: scvtf s27, x11
+; CHECK-NEXT: fmov x11, d7
+; CHECK-NEXT: mov v19.s[3], v22.s[0]
+; CHECK-NEXT: mov v20.s[3], v23.s[0]
+; CHECK-NEXT: add v22.4s, v16.4s, v18.4s
+; CHECK-NEXT: mov v2.s[2], v3.s[0]
+; CHECK-NEXT: scvtf s3, x10
+; CHECK-NEXT: fmov x10, d4
+; CHECK-NEXT: scvtf s0, x12
+; CHECK-NEXT: and v23.16b, v25.16b, v17.16b
+; CHECK-NEXT: mov x9, v1.d[1]
+; CHECK-NEXT: fmov x12, d5
+; CHECK-NEXT: mov v21.s[2], v26.s[0]
+; CHECK-NEXT: scvtf s25, x13
+; CHECK-NEXT: scvtf s4, x10
+; CHECK-NEXT: fmov x10, d1
+; CHECK-NEXT: add v26.4s, v20.4s, v18.4s
+; CHECK-NEXT: mov v3.s[1], v6.s[0]
+; CHECK-NEXT: scvtf s6, x11
+; CHECK-NEXT: mov x11, v5.d[1]
+; CHECK-NEXT: scvtf s5, x8
+; CHECK-NEXT: mov v0.s[1], v24.s[0]
+; CHECK-NEXT: add v22.4s, v23.4s, v22.4s
+; CHECK-NEXT: scvtf s1, x10
+; CHECK-NEXT: mov x10, v7.d[1]
+; CHECK-NEXT: scvtf s7, x12
+; CHECK-NEXT: mov v4.s[1], v27.s[0]
+; CHECK-NEXT: ushr v23.4s, v19.4s, #16
+; CHECK-NEXT: mov v2.s[3], v25.s[0]
+; CHECK-NEXT: mov v3.s[2], v6.s[0]
+; CHECK-NEXT: add v25.4s, v19.4s, v18.4s
+; CHECK-NEXT: ushr v24.4s, v20.4s, #16
+; CHECK-NEXT: mov v21.s[3], v5.s[0]
+; CHECK-NEXT: scvtf s5, x11
+; CHECK-NEXT: fcmeq v29.4s, v20.4s, v20.4s
+; CHECK-NEXT: scvtf s6, x10
+; CHECK-NEXT: and v23.16b, v23.16b, v17.16b
+; CHECK-NEXT: mov v0.s[2], v1.s[0]
+; CHECK-NEXT: scvtf s1, x9
+; CHECK-NEXT: mov v4.s[2], v7.s[0]
+; CHECK-NEXT: and v24.16b, v24.16b, v17.16b
+; CHECK-NEXT: fcmeq v7.4s, v16.4s, v16.4s
+; CHECK-NEXT: orr v16.4s, #64, lsl #16
+; CHECK-NEXT: fcmeq v31.4s, v2.4s, v2.4s
+; CHECK-NEXT: add v27.4s, v21.4s, v18.4s
+; CHECK-NEXT: orr v20.4s, #64, lsl #16
+; CHECK-NEXT: mov v3.s[3], v6.s[0]
+; CHECK-NEXT: add v6.4s, v23.4s, v25.4s
+; CHECK-NEXT: ushr v23.4s, v21.4s, #16
+; CHECK-NEXT: mov v0.s[3], v1.s[0]
+; CHECK-NEXT: mov v4.s[3], v5.s[0]
+; CHECK-NEXT: ushr v1.4s, v2.4s, #16
+; CHECK-NEXT: add v24.4s, v24.4s, v26.4s
+; CHECK-NEXT: add v25.4s, v2.4s, v18.4s
+; CHECK-NEXT: fcmeq v5.4s, v19.4s, v19.4s
+; CHECK-NEXT: and v23.16b, v23.16b, v17.16b
+; CHECK-NEXT: orr v19.4s, #64, lsl #16
; CHECK-NEXT: orr v2.4s, #64, lsl #16
-; CHECK-NEXT: mov v17.s[3], v24.s[0]
-; CHECK-NEXT: add v24.4s, v1.4s, v6.4s
-; CHECK-NEXT: fcmeq v27.4s, v1.4s, v1.4s
-; CHECK-NEXT: mov v18.s[3], v25.s[0]
-; CHECK-NEXT: add v25.4s, v4.4s, v6.4s
-; CHECK-NEXT: orr v1.4s, #64, lsl #16
-; CHECK-NEXT: bit v2.16b, v16.16b, v19.16b
-; CHECK-NEXT: mov v20.s[3], v7.s[0]
-; CHECK-NEXT: add v22.4s, v22.4s, v24.4s
-; CHECK-NEXT: add v7.4s, v21.4s, v23.4s
-; CHECK-NEXT: ushr v24.4s, v17.4s, #16
-; CHECK-NEXT: and v23.16b, v26.16b, v3.16b
-; CHECK-NEXT: ushr v26.4s, v5.4s, #16
-; CHECK-NEXT: ushr v28.4s, v18.4s, #16
-; CHECK-NEXT: add v30.4s, v17.4s, v6.4s
-; CHECK-NEXT: add v31.4s, v18.4s, v6.4s
-; CHECK-NEXT: fcmeq v21.4s, v0.4s, v0.4s
-; CHECK-NEXT: orr v0.4s, #64, lsl #16
-; CHECK-NEXT: bit v1.16b, v22.16b, v27.16b
-; CHECK-NEXT: ushr v29.4s, v20.4s, #16
-; CHECK-NEXT: and v24.16b, v24.16b, v3.16b
-; CHECK-NEXT: add v23.4s, v23.4s, v25.4s
-; CHECK-NEXT: and v28.16b, v28.16b, v3.16b
-; CHECK-NEXT: and v25.16b, v26.16b, v3.16b
-; CHECK-NEXT: add v26.4s, v5.4s, v6.4s
-; CHECK-NEXT: add v6.4s, v20.4s, v6.4s
-; CHECK-NEXT: and v3.16b, v29.16b, v3.16b
-; CHECK-NEXT: add v24.4s, v24.4s, v30.4s
-; CHECK-NEXT: fcmeq v30.4s, v17.4s, v17.4s
-; CHECK-NEXT: add v28.4s, v28.4s, v31.4s
-; CHECK-NEXT: fcmeq v31.4s, v18.4s, v18.4s
-; CHECK-NEXT: fcmeq v29.4s, v4.4s, v4.4s
+; CHECK-NEXT: ushr v28.4s, v3.4s, #16
+; CHECK-NEXT: and v1.16b, v1.16b, v17.16b
+; CHECK-NEXT: bsl v7.16b, v22.16b, v16.16b
+; CHECK-NEXT: ushr v26.4s, v0.4s, #16
+; CHECK-NEXT: ushr v30.4s, v4.4s, #16
+; CHECK-NEXT: add v23.4s, v23.4s, v27.4s
+; CHECK-NEXT: bsl v5.16b, v6.16b, v19.16b
+; CHECK-NEXT: mov v6.16b, v29.16b
+; CHECK-NEXT: and v27.16b, v28.16b, v17.16b
+; CHECK-NEXT: add v28.4s, v3.4s, v18.4s
+; CHECK-NEXT: add v1.4s, v1.4s, v25.4s
+; CHECK-NEXT: and v25.16b, v26.16b, v17.16b
+; CHECK-NEXT: add v26.4s, v0.4s, v18.4s
+; CHECK-NEXT: and v17.16b, v30.16b, v17.16b
+; CHECK-NEXT: add v18.4s, v4.4s, v18.4s
+; CHECK-NEXT: fcmeq v30.4s, v21.4s, v21.4s
+; CHECK-NEXT: orr v21.4s, #64, lsl #16
+; CHECK-NEXT: add v27.4s, v27.4s, v28.4s
+; CHECK-NEXT: fcmeq v28.4s, v3.4s, v3.4s
+; CHECK-NEXT: orr v3.4s, #64, lsl #16
; CHECK-NEXT: add v25.4s, v25.4s, v26.4s
-; CHECK-NEXT: fcmeq v26.4s, v5.4s, v5.4s
+; CHECK-NEXT: fcmeq v26.4s, v0.4s, v0.4s
+; CHECK-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-NEXT: add v17.4s, v17.4s, v18.4s
+; CHECK-NEXT: fcmeq v18.4s, v4.4s, v4.4s
; CHECK-NEXT: orr v4.4s, #64, lsl #16
-; CHECK-NEXT: add v3.4s, v3.4s, v6.4s
-; CHECK-NEXT: fcmeq v6.4s, v20.4s, v20.4s
-; CHECK-NEXT: orr v5.4s, #64, lsl #16
-; CHECK-NEXT: orr v17.4s, #64, lsl #16
-; CHECK-NEXT: orr v18.4s, #64, lsl #16
-; CHECK-NEXT: orr v20.4s, #64, lsl #16
-; CHECK-NEXT: bit v0.16b, v7.16b, v21.16b
-; CHECK-NEXT: mov v7.16b, v30.16b
-; CHECK-NEXT: mov v16.16b, v31.16b
-; CHECK-NEXT: bit v4.16b, v23.16b, v29.16b
-; CHECK-NEXT: bit v5.16b, v25.16b, v26.16b
-; CHECK-NEXT: bif v3.16b, v20.16b, v6.16b
-; CHECK-NEXT: bsl v7.16b, v24.16b, v17.16b
-; CHECK-NEXT: bsl v16.16b, v28.16b, v18.16b
-; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
-; CHECK-NEXT: uzp2 v1.8h, v4.8h, v1.8h
-; CHECK-NEXT: uzp2 v2.8h, v7.8h, v5.8h
-; CHECK-NEXT: uzp2 v3.8h, v3.8h, v16.8h
+; CHECK-NEXT: mov v16.16b, v30.16b
+; CHECK-NEXT: bsl v6.16b, v24.16b, v20.16b
+; CHECK-NEXT: bif v1.16b, v2.16b, v31.16b
+; CHECK-NEXT: mov ...
[truncated]
✅ With the latest revision this PR passed the C/C++ code formatter.
Nice optimisation.
Just one nit about tests: is there a test with scalable vectors where this shouldn't trigger?
Thanks - I think we should be OK to enable it for scalable vectors too, provided that all the extracts are from constant offsets.
Cheers, LGTM.
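Following up on the scalable-vector discussion, here is a sketch of the kind of case that could be scalarized once the combine allows scalable types (illustrative only; whether the final revision of the patch handles it is not visible in the truncated diff above): the extract index is a compile-time constant, so the element sits at a fixed byte offset from the base pointer regardless of vscale.

; Hypothetical example: lane 0 of a scalable load is at offset 0 from
; the base pointer, so a single scalar ldr could replace the vector load.
define i32 @extract_scalable_lane0(ptr %p) {
  %v = load <vscale x 4 x i32>, ptr %p, align 16
  %e = extractelement <vscale x 4 x i32> %v, i64 0
  ret i32 %e
}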
Given a vector load that is only extracted from, it is more efficient to perform the individual loads than a single load and many extracts. This adds a late optimization for scalarizing extracted vector loads that do not have any other uses and will not be more efficiently kept in vector registers.
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/59/builds/24620