diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index cd7f0e719ad0c..09b31616e0882 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -20467,6 +20467,69 @@ performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
     }
   }
+  // Given an extract(load) or extract(extend(load)), produce a scalar load
+  // instead to avoid the cross-register-bank copies. For example, extracting
+  // lane 1 of a v4i32 load from p becomes an i32 load from p+4.
+  if (DCI.isAfterLegalizeDAG() && Subtarget->isLittleEndian() &&
+      VT.isInteger() && isa<ConstantSDNode>(N1)) {
+    SDValue LoadN0 = N0;
+    // Look through sext/zext and extract_subvector / insert_subvector if
+    // required.
+    if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
+         N0.getOpcode() == ISD::SIGN_EXTEND ||
+         N0.getOpcode() == ISD::ANY_EXTEND) &&
+        N0.getOperand(0).hasOneUse())
+      LoadN0 = N0.getOperand(0);
+    unsigned OffsetElts = 0;
+    if (LoadN0.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+      OffsetElts = LoadN0.getConstantOperandVal(1);
+      LoadN0 = LoadN0.getOperand(0);
+    }
+    if (LoadN0.getOpcode() == ISD::INSERT_SUBVECTOR &&
+        LoadN0.getOperand(0).isUndef() &&
+        isNullConstant(LoadN0.getOperand(2)) &&
+        LoadN0.getOperand(1).hasOneUse())
+      LoadN0 = LoadN0.getOperand(1);
+
+    // Check that all the uses are valid and can be scalarized: all the uses
+    // should be extracts, and those extracts should not be re-inserted into
+    // an operation best treated as a vector register.
+    auto *Load = dyn_cast<LoadSDNode>(LoadN0);
+    if (Load && Load->isSimple() && ISD::isNormalLoad(Load) &&
+        Load->getMemoryVT().isByteSized() &&
+        all_of(N0->uses(), [&](const SDUse &U) {
+          return U.getResNo() != N0.getResNo() ||
+                 (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+                  !any_of(U.getUser()->uses(), [](const SDUse &U2) {
+                    return U2.getUser()->getOpcode() ==
+                               ISD::INSERT_VECTOR_ELT ||
+                           U2.getUser()->getOpcode() == ISD::BUILD_VECTOR ||
+                           U2.getUser()->getOpcode() == ISD::SCALAR_TO_VECTOR;
+                  }));
+        })) {
+
+      SDLoc DL(Load);
+
+      // Generate a new scalar load.
+      unsigned Offset = (OffsetElts + N->getConstantOperandVal(1)) *
+                        Load->getValueType(0).getScalarSizeInBits() / 8;
+      SDValue BasePtr = DAG.getObjectPtrOffset(
+          DL, Load->getBasePtr(), DAG.getConstant(Offset, DL, MVT::i64));
+      ISD::LoadExtType ExtType =
+          N0.getOpcode() == ISD::ZERO_EXTEND
+              ? ISD::ZEXTLOAD
+              : (N0.getOpcode() == ISD::SIGN_EXTEND ?
ISD::SEXTLOAD + : ISD::EXTLOAD); + SDValue ScalarLoad = + DAG.getExtLoad(ExtType, DL, VT, Load->getChain(), BasePtr, + Load->getPointerInfo().getWithOffset(Offset), + Load->getValueType(0).getScalarType(), + commonAlignment(Load->getAlign(), Offset), + Load->getMemOperand()->getFlags(), Load->getAAInfo()); + DAG.makeEquivalentMemoryOrdering(Load, ScalarLoad); + return ScalarLoad; + } + } + return SDValue(); } diff --git a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll index 2b9e334cc7812..2b313fa8ce55f 100644 --- a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll +++ b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll @@ -53,18 +53,15 @@ define <4 x half> @uitofp_v4i64_to_v4f16(ptr %ptr) { define <4 x bfloat> @uitofp_v4i64_to_v4bf16(ptr %ptr) { ; CHECK-LABEL: uitofp_v4i64_to_v4bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q2, [x0] -; CHECK-NEXT: mov x8, v0.d[1] -; CHECK-NEXT: fmov x9, d0 -; CHECK-NEXT: ucvtf s1, x9 -; CHECK-NEXT: mov x9, v2.d[1] -; CHECK-NEXT: ucvtf s0, x8 -; CHECK-NEXT: fmov x8, d2 -; CHECK-NEXT: ucvtf s2, x8 +; CHECK-NEXT: ldp x8, x9, [x0] +; CHECK-NEXT: movi v2.4s, #127, msl #8 +; CHECK-NEXT: ucvtf s0, x9 +; CHECK-NEXT: ucvtf s1, x8 +; CHECK-NEXT: ldp x8, x9, [x0, #16] ; CHECK-NEXT: mov v1.s[1], v0.s[0] +; CHECK-NEXT: ucvtf s0, x8 +; CHECK-NEXT: mov v1.s[2], v0.s[0] ; CHECK-NEXT: ucvtf s0, x9 -; CHECK-NEXT: mov v1.s[2], v2.s[0] -; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: mov v1.s[3], v0.s[0] ; CHECK-NEXT: movi v0.4s, #1 ; CHECK-NEXT: ushr v3.4s, v1.4s, #16 diff --git a/llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll b/llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll index 59f887a1143c0..a93203793307a 100644 --- a/llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll +++ b/llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll @@ -4,10 +4,8 @@ define i32 @foo(ptr %__a) nounwind { ; CHECK-LABEL: foo: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: umov.h w8, v0[0] -; CHECK-NEXT: umov.h w9, v0[0] -; CHECK-NEXT: add w0, w9, w8, uxth #1 +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: add w0, w8, w8, lsl #1 ; CHECK-NEXT: ret %tmp18 = load <4 x i16>, ptr %__a, align 8 %vget_lane = extractelement <4 x i16> %tmp18, i32 0 diff --git a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll index 114203e46f196..13093cb2204ce 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll @@ -105,13 +105,13 @@ define i32 @ldr_int_volatile(ptr %a) nounwind { ; CHECK: Cluster ld/st SU(1) - SU(3) ; CHECK: SU(1): %{{[0-9]+}}:fpr128 = LDRQui ; CHECK: SU(3): %{{[0-9]+}}:fpr128 = LDRQui -define <2 x i64> @ldq_cluster(ptr %p) { - %tmp1 = load <2 x i64>, < 2 x i64>* %p, align 8 +define <4 x i32> @ldq_cluster(ptr %p) { + %tmp1 = load <4 x i32>, ptr %p, align 8 %add.ptr2 = getelementptr inbounds i64, ptr %p, i64 2 - %tmp2 = add nsw <2 x i64> %tmp1, %tmp1 - %tmp3 = load <2 x i64>, ptr %add.ptr2, align 8 - %res = mul nsw <2 x i64> %tmp2, %tmp3 - ret <2 x i64> %res + %tmp2 = add nsw <4 x i32> %tmp1, %tmp1 + %tmp3 = load <4 x i32>, ptr %add.ptr2, align 8 + %res = mul nsw <4 x i32> %tmp2, %tmp3 + ret <4 x i32> %res } ; CHECK: ********** MI Scheduling ********** @@ -215,7 +215,7 @@ exit: ; CHECK: ********** MI Scheduling ********** ; CHECK: LDURXi_LDRXui:%bb.0 entry ; CHECK: Cluster ld/st SU(3) - SU(4) -; CHECK: SU(3): %{{[0-9]+}}:gpr64 = LDURXi +; CHECK: SU(3): %{{[0-9]+}}:gpr64 = LDURXi ; CHECK: SU(4): %{{[0-9]+}}:gpr64 = 
LDRXui ; define void @LDURXi_LDRXui(ptr nocapture readonly %arg, ptr nocapture readonly %wa, ptr nocapture readonly %wb) { diff --git a/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll b/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll index baca159f9dd55..02dfaa19acc9d 100644 --- a/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll @@ -4,11 +4,9 @@ define void @autogen_SD19655(ptr %addr, ptr %addrfloat) { ; CHECK-LABEL: autogen_SD19655: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: mov.d x8, v0[1] -; CHECK-NEXT: fmov x9, d0 -; CHECK-NEXT: scvtf s1, x9 -; CHECK-NEXT: scvtf s0, x8 +; CHECK-NEXT: ldp x8, x9, [x0] +; CHECK-NEXT: scvtf s0, x9 +; CHECK-NEXT: scvtf s1, x8 ; CHECK-NEXT: mov.s v1[1], v0[0] ; CHECK-NEXT: str d1, [x1] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll index 6ab703c08b837..121cc30692124 100644 --- a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll @@ -1114,16 +1114,10 @@ entry: } define ptr @v3ext(<3 x ptr> %a, <3 x ptr> %b, <3 x ptr> %x) { -; CHECK-SD-LABEL: v3ext: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldr d0, [sp] -; CHECK-SD-NEXT: fmov x0, d0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: v3ext: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr x0, [sp] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: v3ext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr x0, [sp] +; CHECK-NEXT: ret entry: %c = extractelement <3 x ptr> %x, i32 2 ret ptr %c diff --git a/llvm/test/CodeGen/AArch64/itofp-bf16.ll b/llvm/test/CodeGen/AArch64/itofp-bf16.ll index 42641693c4081..0d3ae559449a4 100644 --- a/llvm/test/CodeGen/AArch64/itofp-bf16.ll +++ b/llvm/test/CodeGen/AArch64/itofp-bf16.ll @@ -740,162 +740,151 @@ entry: define <32 x bfloat> @stofp_v32i64_v32bf16(<32 x i64> %a) { ; CHECK-LABEL: stofp_v32i64_v32bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov x10, d2 -; CHECK-NEXT: mov x9, v3.d[1] -; CHECK-NEXT: mov x8, v2.d[1] -; CHECK-NEXT: fmov x11, d3 -; CHECK-NEXT: fmov x12, d0 -; CHECK-NEXT: movi v3.4s, #1 -; CHECK-NEXT: scvtf s2, x10 -; CHECK-NEXT: mov x10, v0.d[1] -; CHECK-NEXT: scvtf s19, x9 -; CHECK-NEXT: fmov x9, d1 -; CHECK-NEXT: scvtf s16, x11 -; CHECK-NEXT: mov x11, v6.d[1] -; CHECK-NEXT: scvtf s0, x12 -; CHECK-NEXT: scvtf s18, x8 -; CHECK-NEXT: mov x8, v1.d[1] +; CHECK-NEXT: ldp x8, x9, [sp, #32] +; CHECK-NEXT: mov x13, v2.d[1] +; CHECK-NEXT: ldp x10, x12, [sp, #96] +; CHECK-NEXT: fmov x14, d3 +; CHECK-NEXT: movi v17.4s, #1 +; CHECK-NEXT: scvtf s18, x9 +; CHECK-NEXT: scvtf s16, x8 +; CHECK-NEXT: ldp x8, x9, [sp, #48] +; CHECK-NEXT: scvtf s23, x12 ; CHECK-NEXT: scvtf s20, x10 -; CHECK-NEXT: scvtf s17, x9 -; CHECK-NEXT: mov x9, v7.d[1] -; CHECK-NEXT: mov x10, v4.d[1] -; CHECK-NEXT: scvtf s21, x11 -; CHECK-NEXT: fmov x11, d6 -; CHECK-NEXT: mov v2.s[1], v18.s[0] -; CHECK-NEXT: scvtf s25, x8 -; CHECK-NEXT: movi v6.4s, #127, msl #8 -; CHECK-NEXT: mov v0.s[1], v20.s[0] -; CHECK-NEXT: ldp q24, q20, [sp, #32] -; CHECK-NEXT: scvtf s22, x9 -; CHECK-NEXT: fmov x9, d4 -; CHECK-NEXT: scvtf s1, x11 -; CHECK-NEXT: scvtf s26, x10 -; CHECK-NEXT: fmov x11, d7 -; CHECK-NEXT: mov v2.s[2], v16.s[0] -; CHECK-NEXT: ldp q18, q16, [sp] -; CHECK-NEXT: mov x8, v24.d[1] -; CHECK-NEXT: scvtf s4, x9 -; CHECK-NEXT: fmov x9, d5 -; CHECK-NEXT: mov v0.s[2], v17.s[0] -; CHECK-NEXT: mov v1.s[1], v21.s[0] -; CHECK-NEXT: scvtf s23, x11 -; CHECK-NEXT: mov x11, v5.d[1] -; CHECK-NEXT: mov v2.s[3], v19.s[0] +; 
CHECK-NEXT: mov x10, v0.d[1] +; CHECK-NEXT: scvtf s27, x13 ; CHECK-NEXT: scvtf s21, x8 -; CHECK-NEXT: mov x8, v20.d[1] -; CHECK-NEXT: scvtf s17, x9 -; CHECK-NEXT: fmov x9, d24 -; CHECK-NEXT: mov v4.s[1], v26.s[0] -; CHECK-NEXT: mov v0.s[3], v25.s[0] -; CHECK-NEXT: ldp q26, q24, [sp, #96] -; CHECK-NEXT: mov v1.s[2], v23.s[0] -; CHECK-NEXT: ldp q25, q23, [sp, #64] -; CHECK-NEXT: scvtf s7, x11 -; CHECK-NEXT: scvtf s27, x8 -; CHECK-NEXT: fmov x8, d18 -; CHECK-NEXT: scvtf s5, x9 -; CHECK-NEXT: mov x10, v26.d[1] -; CHECK-NEXT: mov x9, v18.d[1] -; CHECK-NEXT: fmov x11, d20 -; CHECK-NEXT: mov v4.s[2], v17.s[0] -; CHECK-NEXT: mov v1.s[3], v22.s[0] -; CHECK-NEXT: ushr v19.4s, v2.4s, #16 -; CHECK-NEXT: scvtf s17, x8 -; CHECK-NEXT: fmov x8, d26 -; CHECK-NEXT: add v26.4s, v2.4s, v6.4s +; CHECK-NEXT: ldp x8, x11, [sp] +; CHECK-NEXT: mov v16.s[1], v18.s[0] +; CHECK-NEXT: scvtf s24, x9 +; CHECK-NEXT: movi v18.4s, #127, msl #8 +; CHECK-NEXT: mov v20.s[1], v23.s[0] ; CHECK-NEXT: scvtf s22, x11 -; CHECK-NEXT: mov x11, v25.d[1] -; CHECK-NEXT: mov v5.s[1], v21.s[0] -; CHECK-NEXT: scvtf s28, x10 -; CHECK-NEXT: fmov x10, d16 -; CHECK-NEXT: scvtf s21, x9 -; CHECK-NEXT: fmov x9, d25 -; CHECK-NEXT: scvtf s18, x8 -; CHECK-NEXT: mov x8, v16.d[1] -; CHECK-NEXT: mov v4.s[3], v7.s[0] -; CHECK-NEXT: and v19.16b, v19.16b, v3.16b -; CHECK-NEXT: scvtf s16, x10 -; CHECK-NEXT: fmov x10, d24 +; CHECK-NEXT: ldp x11, x12, [sp, #16] +; CHECK-NEXT: scvtf s19, x8 +; CHECK-NEXT: mov x8, v3.d[1] +; CHECK-NEXT: mov v16.s[2], v21.s[0] ; CHECK-NEXT: scvtf s25, x11 -; CHECK-NEXT: scvtf s20, x9 -; CHECK-NEXT: mov x9, v24.d[1] -; CHECK-NEXT: mov v17.s[1], v21.s[0] -; CHECK-NEXT: fmov x11, d23 -; CHECK-NEXT: mov v18.s[1], v28.s[0] -; CHECK-NEXT: scvtf s24, x8 -; CHECK-NEXT: scvtf s21, x10 -; CHECK-NEXT: mov x10, v23.d[1] -; CHECK-NEXT: mov v5.s[2], v22.s[0] -; CHECK-NEXT: ushr v22.4s, v1.4s, #16 -; CHECK-NEXT: ushr v28.4s, v0.4s, #16 +; CHECK-NEXT: ldp x9, x11, [sp, #112] +; CHECK-NEXT: mov v19.s[1], v22.s[0] +; CHECK-NEXT: scvtf s22, x12 +; CHECK-NEXT: scvtf s26, x9 +; CHECK-NEXT: ldp x9, x12, [sp, #64] ; CHECK-NEXT: scvtf s23, x11 -; CHECK-NEXT: mov v20.s[1], v25.s[0] -; CHECK-NEXT: scvtf s25, x9 -; CHECK-NEXT: mov v17.s[2], v16.s[0] -; CHECK-NEXT: add v16.4s, v19.4s, v26.4s -; CHECK-NEXT: ushr v26.4s, v4.4s, #16 -; CHECK-NEXT: mov v18.s[2], v21.s[0] -; CHECK-NEXT: scvtf s7, x10 -; CHECK-NEXT: and v22.16b, v22.16b, v3.16b -; CHECK-NEXT: mov v5.s[3], v27.s[0] -; CHECK-NEXT: and v21.16b, v28.16b, v3.16b -; CHECK-NEXT: fcmeq v19.4s, v2.4s, v2.4s -; CHECK-NEXT: mov v20.s[2], v23.s[0] -; CHECK-NEXT: add v23.4s, v0.4s, v6.4s +; CHECK-NEXT: mov v16.s[3], v24.s[0] +; CHECK-NEXT: fmov x11, d2 +; CHECK-NEXT: scvtf s24, x12 +; CHECK-NEXT: scvtf s2, x9 +; CHECK-NEXT: mov x9, v6.d[1] +; CHECK-NEXT: ldp x12, x13, [sp, #80] +; CHECK-NEXT: scvtf s21, x11 +; CHECK-NEXT: mov x11, v4.d[1] +; CHECK-NEXT: mov v19.s[2], v25.s[0] +; CHECK-NEXT: mov v20.s[2], v26.s[0] +; CHECK-NEXT: ushr v25.4s, v16.4s, #16 +; CHECK-NEXT: scvtf s26, x14 +; CHECK-NEXT: scvtf s3, x12 +; CHECK-NEXT: mov v2.s[1], v24.s[0] +; CHECK-NEXT: scvtf s24, x10 +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: fmov x12, d0 +; CHECK-NEXT: scvtf s6, x9 +; CHECK-NEXT: mov v21.s[1], v27.s[0] +; CHECK-NEXT: scvtf s27, x11 +; CHECK-NEXT: fmov x11, d7 +; CHECK-NEXT: mov v19.s[3], v22.s[0] +; CHECK-NEXT: mov v20.s[3], v23.s[0] +; CHECK-NEXT: add v22.4s, v16.4s, v18.4s +; CHECK-NEXT: mov v2.s[2], v3.s[0] +; CHECK-NEXT: scvtf s3, x10 +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: scvtf s0, x12 +; CHECK-NEXT: and 
v23.16b, v25.16b, v17.16b +; CHECK-NEXT: mov x9, v1.d[1] +; CHECK-NEXT: fmov x12, d5 +; CHECK-NEXT: mov v21.s[2], v26.s[0] +; CHECK-NEXT: scvtf s25, x13 +; CHECK-NEXT: scvtf s4, x10 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: add v26.4s, v20.4s, v18.4s +; CHECK-NEXT: mov v3.s[1], v6.s[0] +; CHECK-NEXT: scvtf s6, x11 +; CHECK-NEXT: mov x11, v5.d[1] +; CHECK-NEXT: scvtf s5, x8 +; CHECK-NEXT: mov v0.s[1], v24.s[0] +; CHECK-NEXT: add v22.4s, v23.4s, v22.4s +; CHECK-NEXT: scvtf s1, x10 +; CHECK-NEXT: mov x10, v7.d[1] +; CHECK-NEXT: scvtf s7, x12 +; CHECK-NEXT: mov v4.s[1], v27.s[0] +; CHECK-NEXT: ushr v23.4s, v19.4s, #16 +; CHECK-NEXT: mov v2.s[3], v25.s[0] +; CHECK-NEXT: mov v3.s[2], v6.s[0] +; CHECK-NEXT: add v25.4s, v19.4s, v18.4s +; CHECK-NEXT: ushr v24.4s, v20.4s, #16 +; CHECK-NEXT: mov v21.s[3], v5.s[0] +; CHECK-NEXT: scvtf s5, x11 +; CHECK-NEXT: fcmeq v29.4s, v20.4s, v20.4s +; CHECK-NEXT: scvtf s6, x10 +; CHECK-NEXT: and v23.16b, v23.16b, v17.16b +; CHECK-NEXT: mov v0.s[2], v1.s[0] +; CHECK-NEXT: scvtf s1, x9 +; CHECK-NEXT: mov v4.s[2], v7.s[0] +; CHECK-NEXT: and v24.16b, v24.16b, v17.16b +; CHECK-NEXT: fcmeq v7.4s, v16.4s, v16.4s +; CHECK-NEXT: orr v16.4s, #64, lsl #16 +; CHECK-NEXT: fcmeq v31.4s, v2.4s, v2.4s +; CHECK-NEXT: add v27.4s, v21.4s, v18.4s +; CHECK-NEXT: orr v20.4s, #64, lsl #16 +; CHECK-NEXT: mov v3.s[3], v6.s[0] +; CHECK-NEXT: add v6.4s, v23.4s, v25.4s +; CHECK-NEXT: ushr v23.4s, v21.4s, #16 +; CHECK-NEXT: mov v0.s[3], v1.s[0] +; CHECK-NEXT: mov v4.s[3], v5.s[0] +; CHECK-NEXT: ushr v1.4s, v2.4s, #16 +; CHECK-NEXT: add v24.4s, v24.4s, v26.4s +; CHECK-NEXT: add v25.4s, v2.4s, v18.4s +; CHECK-NEXT: fcmeq v5.4s, v19.4s, v19.4s +; CHECK-NEXT: and v23.16b, v23.16b, v17.16b +; CHECK-NEXT: orr v19.4s, #64, lsl #16 ; CHECK-NEXT: orr v2.4s, #64, lsl #16 -; CHECK-NEXT: mov v17.s[3], v24.s[0] -; CHECK-NEXT: add v24.4s, v1.4s, v6.4s -; CHECK-NEXT: fcmeq v27.4s, v1.4s, v1.4s -; CHECK-NEXT: mov v18.s[3], v25.s[0] -; CHECK-NEXT: add v25.4s, v4.4s, v6.4s -; CHECK-NEXT: orr v1.4s, #64, lsl #16 -; CHECK-NEXT: bit v2.16b, v16.16b, v19.16b -; CHECK-NEXT: mov v20.s[3], v7.s[0] -; CHECK-NEXT: add v22.4s, v22.4s, v24.4s -; CHECK-NEXT: add v7.4s, v21.4s, v23.4s -; CHECK-NEXT: ushr v24.4s, v17.4s, #16 -; CHECK-NEXT: and v23.16b, v26.16b, v3.16b -; CHECK-NEXT: ushr v26.4s, v5.4s, #16 -; CHECK-NEXT: ushr v28.4s, v18.4s, #16 -; CHECK-NEXT: add v30.4s, v17.4s, v6.4s -; CHECK-NEXT: add v31.4s, v18.4s, v6.4s -; CHECK-NEXT: fcmeq v21.4s, v0.4s, v0.4s -; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: bit v1.16b, v22.16b, v27.16b -; CHECK-NEXT: ushr v29.4s, v20.4s, #16 -; CHECK-NEXT: and v24.16b, v24.16b, v3.16b -; CHECK-NEXT: add v23.4s, v23.4s, v25.4s -; CHECK-NEXT: and v28.16b, v28.16b, v3.16b -; CHECK-NEXT: and v25.16b, v26.16b, v3.16b -; CHECK-NEXT: add v26.4s, v5.4s, v6.4s -; CHECK-NEXT: add v6.4s, v20.4s, v6.4s -; CHECK-NEXT: and v3.16b, v29.16b, v3.16b -; CHECK-NEXT: add v24.4s, v24.4s, v30.4s -; CHECK-NEXT: fcmeq v30.4s, v17.4s, v17.4s -; CHECK-NEXT: add v28.4s, v28.4s, v31.4s -; CHECK-NEXT: fcmeq v31.4s, v18.4s, v18.4s -; CHECK-NEXT: fcmeq v29.4s, v4.4s, v4.4s +; CHECK-NEXT: ushr v28.4s, v3.4s, #16 +; CHECK-NEXT: and v1.16b, v1.16b, v17.16b +; CHECK-NEXT: bsl v7.16b, v22.16b, v16.16b +; CHECK-NEXT: ushr v26.4s, v0.4s, #16 +; CHECK-NEXT: ushr v30.4s, v4.4s, #16 +; CHECK-NEXT: add v23.4s, v23.4s, v27.4s +; CHECK-NEXT: bsl v5.16b, v6.16b, v19.16b +; CHECK-NEXT: mov v6.16b, v29.16b +; CHECK-NEXT: and v27.16b, v28.16b, v17.16b +; CHECK-NEXT: add v28.4s, v3.4s, v18.4s +; CHECK-NEXT: add v1.4s, v1.4s, 
v25.4s +; CHECK-NEXT: and v25.16b, v26.16b, v17.16b +; CHECK-NEXT: add v26.4s, v0.4s, v18.4s +; CHECK-NEXT: and v17.16b, v30.16b, v17.16b +; CHECK-NEXT: add v18.4s, v4.4s, v18.4s +; CHECK-NEXT: fcmeq v30.4s, v21.4s, v21.4s +; CHECK-NEXT: orr v21.4s, #64, lsl #16 +; CHECK-NEXT: add v27.4s, v27.4s, v28.4s +; CHECK-NEXT: fcmeq v28.4s, v3.4s, v3.4s +; CHECK-NEXT: orr v3.4s, #64, lsl #16 ; CHECK-NEXT: add v25.4s, v25.4s, v26.4s -; CHECK-NEXT: fcmeq v26.4s, v5.4s, v5.4s +; CHECK-NEXT: fcmeq v26.4s, v0.4s, v0.4s +; CHECK-NEXT: orr v0.4s, #64, lsl #16 +; CHECK-NEXT: add v17.4s, v17.4s, v18.4s +; CHECK-NEXT: fcmeq v18.4s, v4.4s, v4.4s ; CHECK-NEXT: orr v4.4s, #64, lsl #16 -; CHECK-NEXT: add v3.4s, v3.4s, v6.4s -; CHECK-NEXT: fcmeq v6.4s, v20.4s, v20.4s -; CHECK-NEXT: orr v5.4s, #64, lsl #16 -; CHECK-NEXT: orr v17.4s, #64, lsl #16 -; CHECK-NEXT: orr v18.4s, #64, lsl #16 -; CHECK-NEXT: orr v20.4s, #64, lsl #16 -; CHECK-NEXT: bit v0.16b, v7.16b, v21.16b -; CHECK-NEXT: mov v7.16b, v30.16b -; CHECK-NEXT: mov v16.16b, v31.16b -; CHECK-NEXT: bit v4.16b, v23.16b, v29.16b -; CHECK-NEXT: bit v5.16b, v25.16b, v26.16b -; CHECK-NEXT: bif v3.16b, v20.16b, v6.16b -; CHECK-NEXT: bsl v7.16b, v24.16b, v17.16b -; CHECK-NEXT: bsl v16.16b, v28.16b, v18.16b -; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h -; CHECK-NEXT: uzp2 v1.8h, v4.8h, v1.8h -; CHECK-NEXT: uzp2 v2.8h, v7.8h, v5.8h -; CHECK-NEXT: uzp2 v3.8h, v3.8h, v16.8h +; CHECK-NEXT: mov v16.16b, v30.16b +; CHECK-NEXT: bsl v6.16b, v24.16b, v20.16b +; CHECK-NEXT: bif v1.16b, v2.16b, v31.16b +; CHECK-NEXT: mov v19.16b, v28.16b +; CHECK-NEXT: uzp2 v2.8h, v5.8h, v7.8h +; CHECK-NEXT: bit v0.16b, v25.16b, v26.16b +; CHECK-NEXT: bsl v16.16b, v23.16b, v21.16b +; CHECK-NEXT: bit v4.16b, v17.16b, v18.16b +; CHECK-NEXT: bsl v19.16b, v27.16b, v3.16b +; CHECK-NEXT: uzp2 v3.8h, v1.8h, v6.8h +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v16.8h +; CHECK-NEXT: uzp2 v1.8h, v4.8h, v19.8h ; CHECK-NEXT: ret entry: %c = sitofp <32 x i64> %a to <32 x bfloat> @@ -905,162 +894,151 @@ entry: define <32 x bfloat> @utofp_v32i64_v32bf16(<32 x i64> %a) { ; CHECK-LABEL: utofp_v32i64_v32bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov x10, d2 -; CHECK-NEXT: mov x9, v3.d[1] -; CHECK-NEXT: mov x8, v2.d[1] -; CHECK-NEXT: fmov x11, d3 -; CHECK-NEXT: fmov x12, d0 -; CHECK-NEXT: movi v3.4s, #1 -; CHECK-NEXT: ucvtf s2, x10 -; CHECK-NEXT: mov x10, v0.d[1] -; CHECK-NEXT: ucvtf s19, x9 -; CHECK-NEXT: fmov x9, d1 -; CHECK-NEXT: ucvtf s16, x11 -; CHECK-NEXT: mov x11, v6.d[1] -; CHECK-NEXT: ucvtf s0, x12 -; CHECK-NEXT: ucvtf s18, x8 -; CHECK-NEXT: mov x8, v1.d[1] +; CHECK-NEXT: ldp x8, x9, [sp, #32] +; CHECK-NEXT: mov x13, v2.d[1] +; CHECK-NEXT: ldp x10, x12, [sp, #96] +; CHECK-NEXT: fmov x14, d3 +; CHECK-NEXT: movi v17.4s, #1 +; CHECK-NEXT: ucvtf s18, x9 +; CHECK-NEXT: ucvtf s16, x8 +; CHECK-NEXT: ldp x8, x9, [sp, #48] +; CHECK-NEXT: ucvtf s23, x12 ; CHECK-NEXT: ucvtf s20, x10 -; CHECK-NEXT: ucvtf s17, x9 -; CHECK-NEXT: mov x9, v7.d[1] -; CHECK-NEXT: mov x10, v4.d[1] -; CHECK-NEXT: ucvtf s21, x11 -; CHECK-NEXT: fmov x11, d6 -; CHECK-NEXT: mov v2.s[1], v18.s[0] -; CHECK-NEXT: ucvtf s25, x8 -; CHECK-NEXT: movi v6.4s, #127, msl #8 -; CHECK-NEXT: mov v0.s[1], v20.s[0] -; CHECK-NEXT: ldp q24, q20, [sp, #32] -; CHECK-NEXT: ucvtf s22, x9 -; CHECK-NEXT: fmov x9, d4 -; CHECK-NEXT: ucvtf s1, x11 -; CHECK-NEXT: ucvtf s26, x10 -; CHECK-NEXT: fmov x11, d7 -; CHECK-NEXT: mov v2.s[2], v16.s[0] -; CHECK-NEXT: ldp q18, q16, [sp] -; CHECK-NEXT: mov x8, v24.d[1] -; CHECK-NEXT: ucvtf s4, x9 -; CHECK-NEXT: fmov x9, d5 -; CHECK-NEXT: mov 
v0.s[2], v17.s[0] -; CHECK-NEXT: mov v1.s[1], v21.s[0] -; CHECK-NEXT: ucvtf s23, x11 -; CHECK-NEXT: mov x11, v5.d[1] -; CHECK-NEXT: mov v2.s[3], v19.s[0] +; CHECK-NEXT: mov x10, v0.d[1] +; CHECK-NEXT: ucvtf s27, x13 ; CHECK-NEXT: ucvtf s21, x8 -; CHECK-NEXT: mov x8, v20.d[1] -; CHECK-NEXT: ucvtf s17, x9 -; CHECK-NEXT: fmov x9, d24 -; CHECK-NEXT: mov v4.s[1], v26.s[0] -; CHECK-NEXT: mov v0.s[3], v25.s[0] -; CHECK-NEXT: ldp q26, q24, [sp, #96] -; CHECK-NEXT: mov v1.s[2], v23.s[0] -; CHECK-NEXT: ldp q25, q23, [sp, #64] -; CHECK-NEXT: ucvtf s7, x11 -; CHECK-NEXT: ucvtf s27, x8 -; CHECK-NEXT: fmov x8, d18 -; CHECK-NEXT: ucvtf s5, x9 -; CHECK-NEXT: mov x10, v26.d[1] -; CHECK-NEXT: mov x9, v18.d[1] -; CHECK-NEXT: fmov x11, d20 -; CHECK-NEXT: mov v4.s[2], v17.s[0] -; CHECK-NEXT: mov v1.s[3], v22.s[0] -; CHECK-NEXT: ushr v19.4s, v2.4s, #16 -; CHECK-NEXT: ucvtf s17, x8 -; CHECK-NEXT: fmov x8, d26 -; CHECK-NEXT: add v26.4s, v2.4s, v6.4s +; CHECK-NEXT: ldp x8, x11, [sp] +; CHECK-NEXT: mov v16.s[1], v18.s[0] +; CHECK-NEXT: ucvtf s24, x9 +; CHECK-NEXT: movi v18.4s, #127, msl #8 +; CHECK-NEXT: mov v20.s[1], v23.s[0] ; CHECK-NEXT: ucvtf s22, x11 -; CHECK-NEXT: mov x11, v25.d[1] -; CHECK-NEXT: mov v5.s[1], v21.s[0] -; CHECK-NEXT: ucvtf s28, x10 -; CHECK-NEXT: fmov x10, d16 -; CHECK-NEXT: ucvtf s21, x9 -; CHECK-NEXT: fmov x9, d25 -; CHECK-NEXT: ucvtf s18, x8 -; CHECK-NEXT: mov x8, v16.d[1] -; CHECK-NEXT: mov v4.s[3], v7.s[0] -; CHECK-NEXT: and v19.16b, v19.16b, v3.16b -; CHECK-NEXT: ucvtf s16, x10 -; CHECK-NEXT: fmov x10, d24 +; CHECK-NEXT: ldp x11, x12, [sp, #16] +; CHECK-NEXT: ucvtf s19, x8 +; CHECK-NEXT: mov x8, v3.d[1] +; CHECK-NEXT: mov v16.s[2], v21.s[0] ; CHECK-NEXT: ucvtf s25, x11 -; CHECK-NEXT: ucvtf s20, x9 -; CHECK-NEXT: mov x9, v24.d[1] -; CHECK-NEXT: mov v17.s[1], v21.s[0] -; CHECK-NEXT: fmov x11, d23 -; CHECK-NEXT: mov v18.s[1], v28.s[0] -; CHECK-NEXT: ucvtf s24, x8 -; CHECK-NEXT: ucvtf s21, x10 -; CHECK-NEXT: mov x10, v23.d[1] -; CHECK-NEXT: mov v5.s[2], v22.s[0] -; CHECK-NEXT: ushr v22.4s, v1.4s, #16 -; CHECK-NEXT: ushr v28.4s, v0.4s, #16 +; CHECK-NEXT: ldp x9, x11, [sp, #112] +; CHECK-NEXT: mov v19.s[1], v22.s[0] +; CHECK-NEXT: ucvtf s22, x12 +; CHECK-NEXT: ucvtf s26, x9 +; CHECK-NEXT: ldp x9, x12, [sp, #64] ; CHECK-NEXT: ucvtf s23, x11 -; CHECK-NEXT: mov v20.s[1], v25.s[0] -; CHECK-NEXT: ucvtf s25, x9 -; CHECK-NEXT: mov v17.s[2], v16.s[0] -; CHECK-NEXT: add v16.4s, v19.4s, v26.4s -; CHECK-NEXT: ushr v26.4s, v4.4s, #16 -; CHECK-NEXT: mov v18.s[2], v21.s[0] -; CHECK-NEXT: ucvtf s7, x10 -; CHECK-NEXT: and v22.16b, v22.16b, v3.16b -; CHECK-NEXT: mov v5.s[3], v27.s[0] -; CHECK-NEXT: and v21.16b, v28.16b, v3.16b -; CHECK-NEXT: fcmeq v19.4s, v2.4s, v2.4s -; CHECK-NEXT: mov v20.s[2], v23.s[0] -; CHECK-NEXT: add v23.4s, v0.4s, v6.4s +; CHECK-NEXT: mov v16.s[3], v24.s[0] +; CHECK-NEXT: fmov x11, d2 +; CHECK-NEXT: ucvtf s24, x12 +; CHECK-NEXT: ucvtf s2, x9 +; CHECK-NEXT: mov x9, v6.d[1] +; CHECK-NEXT: ldp x12, x13, [sp, #80] +; CHECK-NEXT: ucvtf s21, x11 +; CHECK-NEXT: mov x11, v4.d[1] +; CHECK-NEXT: mov v19.s[2], v25.s[0] +; CHECK-NEXT: mov v20.s[2], v26.s[0] +; CHECK-NEXT: ushr v25.4s, v16.4s, #16 +; CHECK-NEXT: ucvtf s26, x14 +; CHECK-NEXT: ucvtf s3, x12 +; CHECK-NEXT: mov v2.s[1], v24.s[0] +; CHECK-NEXT: ucvtf s24, x10 +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: fmov x12, d0 +; CHECK-NEXT: ucvtf s6, x9 +; CHECK-NEXT: mov v21.s[1], v27.s[0] +; CHECK-NEXT: ucvtf s27, x11 +; CHECK-NEXT: fmov x11, d7 +; CHECK-NEXT: mov v19.s[3], v22.s[0] +; CHECK-NEXT: mov v20.s[3], v23.s[0] +; CHECK-NEXT: add v22.4s, 
v16.4s, v18.4s +; CHECK-NEXT: mov v2.s[2], v3.s[0] +; CHECK-NEXT: ucvtf s3, x10 +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: ucvtf s0, x12 +; CHECK-NEXT: and v23.16b, v25.16b, v17.16b +; CHECK-NEXT: mov x9, v1.d[1] +; CHECK-NEXT: fmov x12, d5 +; CHECK-NEXT: mov v21.s[2], v26.s[0] +; CHECK-NEXT: ucvtf s25, x13 +; CHECK-NEXT: ucvtf s4, x10 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: add v26.4s, v20.4s, v18.4s +; CHECK-NEXT: mov v3.s[1], v6.s[0] +; CHECK-NEXT: ucvtf s6, x11 +; CHECK-NEXT: mov x11, v5.d[1] +; CHECK-NEXT: ucvtf s5, x8 +; CHECK-NEXT: mov v0.s[1], v24.s[0] +; CHECK-NEXT: add v22.4s, v23.4s, v22.4s +; CHECK-NEXT: ucvtf s1, x10 +; CHECK-NEXT: mov x10, v7.d[1] +; CHECK-NEXT: ucvtf s7, x12 +; CHECK-NEXT: mov v4.s[1], v27.s[0] +; CHECK-NEXT: ushr v23.4s, v19.4s, #16 +; CHECK-NEXT: mov v2.s[3], v25.s[0] +; CHECK-NEXT: mov v3.s[2], v6.s[0] +; CHECK-NEXT: add v25.4s, v19.4s, v18.4s +; CHECK-NEXT: ushr v24.4s, v20.4s, #16 +; CHECK-NEXT: mov v21.s[3], v5.s[0] +; CHECK-NEXT: ucvtf s5, x11 +; CHECK-NEXT: fcmeq v29.4s, v20.4s, v20.4s +; CHECK-NEXT: ucvtf s6, x10 +; CHECK-NEXT: and v23.16b, v23.16b, v17.16b +; CHECK-NEXT: mov v0.s[2], v1.s[0] +; CHECK-NEXT: ucvtf s1, x9 +; CHECK-NEXT: mov v4.s[2], v7.s[0] +; CHECK-NEXT: and v24.16b, v24.16b, v17.16b +; CHECK-NEXT: fcmeq v7.4s, v16.4s, v16.4s +; CHECK-NEXT: orr v16.4s, #64, lsl #16 +; CHECK-NEXT: fcmeq v31.4s, v2.4s, v2.4s +; CHECK-NEXT: add v27.4s, v21.4s, v18.4s +; CHECK-NEXT: orr v20.4s, #64, lsl #16 +; CHECK-NEXT: mov v3.s[3], v6.s[0] +; CHECK-NEXT: add v6.4s, v23.4s, v25.4s +; CHECK-NEXT: ushr v23.4s, v21.4s, #16 +; CHECK-NEXT: mov v0.s[3], v1.s[0] +; CHECK-NEXT: mov v4.s[3], v5.s[0] +; CHECK-NEXT: ushr v1.4s, v2.4s, #16 +; CHECK-NEXT: add v24.4s, v24.4s, v26.4s +; CHECK-NEXT: add v25.4s, v2.4s, v18.4s +; CHECK-NEXT: fcmeq v5.4s, v19.4s, v19.4s +; CHECK-NEXT: and v23.16b, v23.16b, v17.16b +; CHECK-NEXT: orr v19.4s, #64, lsl #16 ; CHECK-NEXT: orr v2.4s, #64, lsl #16 -; CHECK-NEXT: mov v17.s[3], v24.s[0] -; CHECK-NEXT: add v24.4s, v1.4s, v6.4s -; CHECK-NEXT: fcmeq v27.4s, v1.4s, v1.4s -; CHECK-NEXT: mov v18.s[3], v25.s[0] -; CHECK-NEXT: add v25.4s, v4.4s, v6.4s -; CHECK-NEXT: orr v1.4s, #64, lsl #16 -; CHECK-NEXT: bit v2.16b, v16.16b, v19.16b -; CHECK-NEXT: mov v20.s[3], v7.s[0] -; CHECK-NEXT: add v22.4s, v22.4s, v24.4s -; CHECK-NEXT: add v7.4s, v21.4s, v23.4s -; CHECK-NEXT: ushr v24.4s, v17.4s, #16 -; CHECK-NEXT: and v23.16b, v26.16b, v3.16b -; CHECK-NEXT: ushr v26.4s, v5.4s, #16 -; CHECK-NEXT: ushr v28.4s, v18.4s, #16 -; CHECK-NEXT: add v30.4s, v17.4s, v6.4s -; CHECK-NEXT: add v31.4s, v18.4s, v6.4s -; CHECK-NEXT: fcmeq v21.4s, v0.4s, v0.4s -; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: bit v1.16b, v22.16b, v27.16b -; CHECK-NEXT: ushr v29.4s, v20.4s, #16 -; CHECK-NEXT: and v24.16b, v24.16b, v3.16b -; CHECK-NEXT: add v23.4s, v23.4s, v25.4s -; CHECK-NEXT: and v28.16b, v28.16b, v3.16b -; CHECK-NEXT: and v25.16b, v26.16b, v3.16b -; CHECK-NEXT: add v26.4s, v5.4s, v6.4s -; CHECK-NEXT: add v6.4s, v20.4s, v6.4s -; CHECK-NEXT: and v3.16b, v29.16b, v3.16b -; CHECK-NEXT: add v24.4s, v24.4s, v30.4s -; CHECK-NEXT: fcmeq v30.4s, v17.4s, v17.4s -; CHECK-NEXT: add v28.4s, v28.4s, v31.4s -; CHECK-NEXT: fcmeq v31.4s, v18.4s, v18.4s -; CHECK-NEXT: fcmeq v29.4s, v4.4s, v4.4s +; CHECK-NEXT: ushr v28.4s, v3.4s, #16 +; CHECK-NEXT: and v1.16b, v1.16b, v17.16b +; CHECK-NEXT: bsl v7.16b, v22.16b, v16.16b +; CHECK-NEXT: ushr v26.4s, v0.4s, #16 +; CHECK-NEXT: ushr v30.4s, v4.4s, #16 +; CHECK-NEXT: add v23.4s, v23.4s, v27.4s +; CHECK-NEXT: bsl v5.16b, v6.16b, 
v19.16b +; CHECK-NEXT: mov v6.16b, v29.16b +; CHECK-NEXT: and v27.16b, v28.16b, v17.16b +; CHECK-NEXT: add v28.4s, v3.4s, v18.4s +; CHECK-NEXT: add v1.4s, v1.4s, v25.4s +; CHECK-NEXT: and v25.16b, v26.16b, v17.16b +; CHECK-NEXT: add v26.4s, v0.4s, v18.4s +; CHECK-NEXT: and v17.16b, v30.16b, v17.16b +; CHECK-NEXT: add v18.4s, v4.4s, v18.4s +; CHECK-NEXT: fcmeq v30.4s, v21.4s, v21.4s +; CHECK-NEXT: orr v21.4s, #64, lsl #16 +; CHECK-NEXT: add v27.4s, v27.4s, v28.4s +; CHECK-NEXT: fcmeq v28.4s, v3.4s, v3.4s +; CHECK-NEXT: orr v3.4s, #64, lsl #16 ; CHECK-NEXT: add v25.4s, v25.4s, v26.4s -; CHECK-NEXT: fcmeq v26.4s, v5.4s, v5.4s +; CHECK-NEXT: fcmeq v26.4s, v0.4s, v0.4s +; CHECK-NEXT: orr v0.4s, #64, lsl #16 +; CHECK-NEXT: add v17.4s, v17.4s, v18.4s +; CHECK-NEXT: fcmeq v18.4s, v4.4s, v4.4s ; CHECK-NEXT: orr v4.4s, #64, lsl #16 -; CHECK-NEXT: add v3.4s, v3.4s, v6.4s -; CHECK-NEXT: fcmeq v6.4s, v20.4s, v20.4s -; CHECK-NEXT: orr v5.4s, #64, lsl #16 -; CHECK-NEXT: orr v17.4s, #64, lsl #16 -; CHECK-NEXT: orr v18.4s, #64, lsl #16 -; CHECK-NEXT: orr v20.4s, #64, lsl #16 -; CHECK-NEXT: bit v0.16b, v7.16b, v21.16b -; CHECK-NEXT: mov v7.16b, v30.16b -; CHECK-NEXT: mov v16.16b, v31.16b -; CHECK-NEXT: bit v4.16b, v23.16b, v29.16b -; CHECK-NEXT: bit v5.16b, v25.16b, v26.16b -; CHECK-NEXT: bif v3.16b, v20.16b, v6.16b -; CHECK-NEXT: bsl v7.16b, v24.16b, v17.16b -; CHECK-NEXT: bsl v16.16b, v28.16b, v18.16b -; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h -; CHECK-NEXT: uzp2 v1.8h, v4.8h, v1.8h -; CHECK-NEXT: uzp2 v2.8h, v7.8h, v5.8h -; CHECK-NEXT: uzp2 v3.8h, v3.8h, v16.8h +; CHECK-NEXT: mov v16.16b, v30.16b +; CHECK-NEXT: bsl v6.16b, v24.16b, v20.16b +; CHECK-NEXT: bif v1.16b, v2.16b, v31.16b +; CHECK-NEXT: mov v19.16b, v28.16b +; CHECK-NEXT: uzp2 v2.8h, v5.8h, v7.8h +; CHECK-NEXT: bit v0.16b, v25.16b, v26.16b +; CHECK-NEXT: bsl v16.16b, v23.16b, v21.16b +; CHECK-NEXT: bit v4.16b, v17.16b, v18.16b +; CHECK-NEXT: bsl v19.16b, v27.16b, v3.16b +; CHECK-NEXT: uzp2 v3.8h, v1.8h, v6.8h +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v16.8h +; CHECK-NEXT: uzp2 v1.8h, v4.8h, v19.8h ; CHECK-NEXT: ret entry: %c = uitofp <32 x i64> %a to <32 x bfloat> diff --git a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll index c91de8f3a0a47..e3c623371448b 100644 --- a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll +++ b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll @@ -8,224 +8,209 @@ define dso_local void @run_test() local_unnamed_addr uwtable { ; CHECK-LABEL: run_test: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #208 -; CHECK-NEXT: .cfi_def_cfa_offset 208 -; CHECK-NEXT: stp d15, d14, [sp, #96] // 16-byte Folded Spill -; CHECK-NEXT: stp d13, d12, [sp, #112] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #128] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #144] // 16-byte Folded Spill -; CHECK-NEXT: str x23, [sp, #160] // 8-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #176] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #192] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #192 +; CHECK-NEXT: .cfi_def_cfa_offset 192 +; CHECK-NEXT: stp d15, d14, [sp, #112] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #128] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #144] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #160] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #176] // 16-byte Folded Spill ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w20, -16 -; 
CHECK-NEXT: .cfi_offset w21, -24 -; CHECK-NEXT: .cfi_offset w22, -32 -; CHECK-NEXT: .cfi_offset w23, -48 -; CHECK-NEXT: .cfi_offset b8, -56 -; CHECK-NEXT: .cfi_offset b9, -64 -; CHECK-NEXT: .cfi_offset b10, -72 -; CHECK-NEXT: .cfi_offset b11, -80 -; CHECK-NEXT: .cfi_offset b12, -88 -; CHECK-NEXT: .cfi_offset b13, -96 -; CHECK-NEXT: .cfi_offset b14, -104 -; CHECK-NEXT: .cfi_offset b15, -112 -; CHECK-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: .cfi_offset b8, -24 +; CHECK-NEXT: .cfi_offset b9, -32 +; CHECK-NEXT: .cfi_offset b10, -40 +; CHECK-NEXT: .cfi_offset b11, -48 +; CHECK-NEXT: .cfi_offset b12, -56 +; CHECK-NEXT: .cfi_offset b13, -64 +; CHECK-NEXT: .cfi_offset b14, -72 +; CHECK-NEXT: .cfi_offset b15, -80 +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: adrp x14, B+48 +; CHECK-NEXT: add x14, x14, :lo12:B+48 +; CHECK-NEXT: // implicit-def: $q18 ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: adrp x9, B+48 -; CHECK-NEXT: add x9, x9, :lo12:B+48 +; CHECK-NEXT: mov w9, #8 // =0x8 ; CHECK-NEXT: adrp x10, A ; CHECK-NEXT: add x10, x10, :lo12:A ; CHECK-NEXT: mov x11, xzr -; CHECK-NEXT: // kill: killed $q1 -; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: // kill: killed $q18 +; CHECK-NEXT: // implicit-def: $q18 ; CHECK-NEXT: mov x12, xzr +; CHECK-NEXT: mov x13, x14 ; CHECK-NEXT: // implicit-def: $q0 +; CHECK-NEXT: // implicit-def: $q2 ; CHECK-NEXT: // implicit-def: $q3 ; CHECK-NEXT: // implicit-def: $q4 ; CHECK-NEXT: // implicit-def: $q5 -; CHECK-NEXT: // implicit-def: $q7 -; CHECK-NEXT: // implicit-def: $q10 -; CHECK-NEXT: // implicit-def: $q17 ; CHECK-NEXT: // implicit-def: $q6 -; CHECK-NEXT: // implicit-def: $q18 +; CHECK-NEXT: // implicit-def: $q16 +; CHECK-NEXT: // implicit-def: $q17 +; CHECK-NEXT: // implicit-def: $q7 ; CHECK-NEXT: // implicit-def: $q19 ; CHECK-NEXT: // implicit-def: $q20 ; CHECK-NEXT: // implicit-def: $q21 ; CHECK-NEXT: // implicit-def: $q22 -; CHECK-NEXT: // implicit-def: $q23 ; CHECK-NEXT: // implicit-def: $q24 -; CHECK-NEXT: // implicit-def: $q9 +; CHECK-NEXT: // implicit-def: $q23 +; CHECK-NEXT: // implicit-def: $q25 +; CHECK-NEXT: // implicit-def: $q26 ; CHECK-NEXT: // implicit-def: $q27 -; CHECK-NEXT: // implicit-def: $q12 -; CHECK-NEXT: // implicit-def: $q28 -; CHECK-NEXT: // implicit-def: $q14 -; CHECK-NEXT: // implicit-def: $q15 -; CHECK-NEXT: // implicit-def: $q29 ; CHECK-NEXT: // implicit-def: $q30 +; CHECK-NEXT: // implicit-def: $q8 ; CHECK-NEXT: // implicit-def: $q11 -; CHECK-NEXT: // implicit-def: $q31 +; CHECK-NEXT: // implicit-def: $q12 +; CHECK-NEXT: // implicit-def: $q29 ; CHECK-NEXT: // implicit-def: $q13 -; CHECK-NEXT: // kill: killed $q1 -; CHECK-NEXT: // implicit-def: $q1 -; CHECK-NEXT: // kill: killed $q1 +; CHECK-NEXT: // implicit-def: $q10 +; CHECK-NEXT: // implicit-def: $q15 +; CHECK-NEXT: // kill: killed $q18 +; CHECK-NEXT: // implicit-def: $q18 +; CHECK-NEXT: // kill: killed $q18 ; CHECK-NEXT: .LBB0_1: // %for.cond1.preheader ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: stp q29, q15, [sp] // 32-byte Folded Spill -; CHECK-NEXT: ldr q15, [x8] +; CHECK-NEXT: ldr x17, [x8] ; CHECK-NEXT: ldr x15, [x8] -; CHECK-NEXT: str q14, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: add x20, x10, x11 -; CHECK-NEXT: mov v8.16b, v28.16b -; CHECK-NEXT: fmov x2, d15 -; CHECK-NEXT: mov x17, v15.d[1] -; CHECK-NEXT: ldr q14, [x8] +; CHECK-NEXT: mov v18.16b, v0.16b +; CHECK-NEXT: ldr x16, [x9] +; CHECK-NEXT: stp q15, q4, [sp] // 32-byte Folded Spill +; CHECK-NEXT: add x5, x10, x11 +; CHECK-NEXT: mul 
x1, x15, x17 +; CHECK-NEXT: ldr x2, [x13], #64 +; CHECK-NEXT: ldr x5, [x5, #128] +; CHECK-NEXT: stp q7, q23, [sp, #32] // 32-byte Folded Spill +; CHECK-NEXT: ldr x14, [x14, #8] +; CHECK-NEXT: mul x0, x17, x17 +; CHECK-NEXT: ldr q23, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: mov v9.16b, v30.16b +; CHECK-NEXT: mov v30.16b, v25.16b +; CHECK-NEXT: mov v25.16b, v20.16b +; CHECK-NEXT: mov v20.16b, v6.16b +; CHECK-NEXT: mul x18, x16, x17 +; CHECK-NEXT: mov v6.16b, v1.16b ; CHECK-NEXT: mov v28.16b, v24.16b -; CHECK-NEXT: mov v24.16b, v20.16b -; CHECK-NEXT: mov v20.16b, v17.16b -; CHECK-NEXT: fmov x13, d14 -; CHECK-NEXT: mov x16, v14.d[1] -; CHECK-NEXT: mov v17.16b, v5.16b -; CHECK-NEXT: mul x3, x2, x15 -; CHECK-NEXT: ldr q14, [x9], #64 -; CHECK-NEXT: ldr q5, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x6, [x8] -; CHECK-NEXT: ldr x20, [x20, #128] -; CHECK-NEXT: mul x1, x17, x15 -; CHECK-NEXT: mov x14, v14.d[1] -; CHECK-NEXT: fmov x5, d14 -; CHECK-NEXT: mov v29.16b, v21.16b -; CHECK-NEXT: mov v21.16b, v0.16b -; CHECK-NEXT: mov v25.16b, v6.16b -; CHECK-NEXT: mul x18, x13, x15 -; CHECK-NEXT: mov v6.16b, v2.16b -; CHECK-NEXT: mov v26.16b, v22.16b -; CHECK-NEXT: fmov d15, x3 -; CHECK-NEXT: mov v22.16b, v18.16b -; CHECK-NEXT: mov v18.16b, v7.16b -; CHECK-NEXT: mul x0, x16, x15 -; CHECK-NEXT: mov v7.16b, v3.16b -; CHECK-NEXT: mov v16.16b, v4.16b +; CHECK-NEXT: fmov d14, x1 +; CHECK-NEXT: mov v24.16b, v19.16b +; CHECK-NEXT: mov v19.16b, v5.16b +; CHECK-NEXT: mul x4, x2, x17 +; CHECK-NEXT: mov v31.16b, v26.16b +; CHECK-NEXT: mov v26.16b, v21.16b +; CHECK-NEXT: fmov d15, x0 +; CHECK-NEXT: mov v21.16b, v16.16b +; CHECK-NEXT: mov v16.16b, v2.16b +; CHECK-NEXT: mov v0.16b, v14.16b +; CHECK-NEXT: mul x20, x2, x5 +; CHECK-NEXT: mov v7.16b, v10.16b +; CHECK-NEXT: mov v10.16b, v17.16b +; CHECK-NEXT: mov v17.16b, v3.16b ; CHECK-NEXT: add x11, x11, #8 -; CHECK-NEXT: add x12, x12, #1 -; CHECK-NEXT: mov v15.d[1], x1 -; CHECK-NEXT: mul x4, x14, x15 +; CHECK-NEXT: mov v15.d[1], x18 +; CHECK-NEXT: mul x3, x14, x17 ; CHECK-NEXT: cmp x11, #64 -; CHECK-NEXT: fmov d14, x18 -; CHECK-NEXT: mul x15, x5, x15 -; CHECK-NEXT: add v5.2d, v5.2d, v15.2d -; CHECK-NEXT: mul x21, x2, x6 -; CHECK-NEXT: mov v14.d[1], x0 -; CHECK-NEXT: mul x2, x2, x20 -; CHECK-NEXT: fmov d0, x15 -; CHECK-NEXT: str q5, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: ldr q5, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: mul x22, x13, x20 -; CHECK-NEXT: add v5.2d, v5.2d, v14.2d -; CHECK-NEXT: fmov d3, x21 -; CHECK-NEXT: mul x19, x17, x6 -; CHECK-NEXT: mov v0.d[1], x4 -; CHECK-NEXT: fmov d1, x2 -; CHECK-NEXT: mul x17, x17, x20 -; CHECK-NEXT: str q5, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: add v5.2d, v13.2d, v14.2d -; CHECK-NEXT: fmov d2, x22 -; CHECK-NEXT: ldr q13, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: mul x7, x16, x6 -; CHECK-NEXT: ldp q15, q14, [sp, #16] // 32-byte Folded Reload -; CHECK-NEXT: mov v3.d[1], x19 -; CHECK-NEXT: add v13.2d, v13.2d, v0.2d -; CHECK-NEXT: mul x16, x16, x20 -; CHECK-NEXT: mov v1.d[1], x17 -; CHECK-NEXT: mul x23, x5, x20 -; CHECK-NEXT: str q13, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: mov v13.16b, v5.16b -; CHECK-NEXT: mov v5.16b, v17.16b -; CHECK-NEXT: mov v17.16b, v20.16b -; CHECK-NEXT: mov v20.16b, v24.16b -; CHECK-NEXT: mul x13, x13, x6 -; CHECK-NEXT: mov v24.16b, v28.16b -; CHECK-NEXT: add v11.2d, v11.2d, v3.2d -; CHECK-NEXT: mov v2.d[1], x16 +; CHECK-NEXT: mov v0.d[1], x1 +; CHECK-NEXT: fmov d1, x4 +; CHECK-NEXT: add x12, x12, #1 +; CHECK-NEXT: mul x17, x17, x5 +; CHECK-NEXT: 
fmov d5, x20 +; CHECK-NEXT: mul x6, x15, x15 +; CHECK-NEXT: add v23.2d, v23.2d, v0.2d +; CHECK-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: mov v1.d[1], x3 +; CHECK-NEXT: mul x7, x15, x5 +; CHECK-NEXT: add v0.2d, v0.2d, v15.2d +; CHECK-NEXT: fmov d2, x17 +; CHECK-NEXT: mul x0, x14, x5 +; CHECK-NEXT: fmov d4, x6 +; CHECK-NEXT: mul x19, x16, x5 +; CHECK-NEXT: stp q0, q23, [sp, #64] // 32-byte Folded Spill +; CHECK-NEXT: ldr q0, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: fmov d3, x7 +; CHECK-NEXT: ldr q23, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: mul x17, x2, x15 +; CHECK-NEXT: add v0.2d, v0.2d, v15.2d +; CHECK-NEXT: ldr q15, [sp] // 16-byte Folded Reload +; CHECK-NEXT: mov v5.d[1], x0 +; CHECK-NEXT: mov v4.d[1], x6 +; CHECK-NEXT: mul x16, x16, x15 +; CHECK-NEXT: mov v3.d[1], x7 ; CHECK-NEXT: add v15.2d, v15.2d, v1.2d -; CHECK-NEXT: add v27.2d, v27.2d, v3.2d -; CHECK-NEXT: mul x18, x14, x20 -; CHECK-NEXT: add v23.2d, v23.2d, v3.2d -; CHECK-NEXT: add v19.2d, v19.2d, v3.2d -; CHECK-NEXT: fmov d4, x23 -; CHECK-NEXT: add v10.2d, v10.2d, v3.2d -; CHECK-NEXT: mul x15, x5, x6 -; CHECK-NEXT: fmov d0, x13 -; CHECK-NEXT: add v14.2d, v14.2d, v2.2d -; CHECK-NEXT: add v2.2d, v6.2d, v3.2d -; CHECK-NEXT: mul x14, x14, x6 -; CHECK-NEXT: mov v3.16b, v7.16b -; CHECK-NEXT: mov v7.16b, v18.16b -; CHECK-NEXT: mov v4.d[1], x18 -; CHECK-NEXT: mov v18.16b, v22.16b -; CHECK-NEXT: mov v0.d[1], x7 -; CHECK-NEXT: fmov d1, x15 -; CHECK-NEXT: add v28.2d, v8.2d, v4.2d -; CHECK-NEXT: mov v1.d[1], x14 -; CHECK-NEXT: add v31.2d, v31.2d, v0.2d -; CHECK-NEXT: add v30.2d, v30.2d, v0.2d +; CHECK-NEXT: mov v2.d[1], x19 +; CHECK-NEXT: str q0, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: mov v1.16b, v6.16b +; CHECK-NEXT: mul x14, x14, x15 +; CHECK-NEXT: mov v6.16b, v20.16b +; CHECK-NEXT: mov v20.16b, v25.16b +; CHECK-NEXT: fmov d0, x17 +; CHECK-NEXT: mov v25.16b, v30.16b +; CHECK-NEXT: add v30.2d, v9.2d, v5.2d +; CHECK-NEXT: mov v5.16b, v19.16b +; CHECK-NEXT: mov v19.16b, v24.16b +; CHECK-NEXT: add v11.2d, v11.2d, v3.2d +; CHECK-NEXT: mov v14.d[1], x16 +; CHECK-NEXT: mov v3.16b, v17.16b +; CHECK-NEXT: mov v17.16b, v10.16b +; CHECK-NEXT: mov v10.16b, v7.16b +; CHECK-NEXT: add v8.2d, v8.2d, v2.2d +; CHECK-NEXT: mov v2.16b, v16.16b +; CHECK-NEXT: mov v0.d[1], x14 +; CHECK-NEXT: mov v16.16b, v21.16b +; CHECK-NEXT: mov v21.16b, v26.16b +; CHECK-NEXT: add v13.2d, v13.2d, v4.2d +; CHECK-NEXT: add v26.2d, v31.2d, v4.2d +; CHECK-NEXT: add v24.2d, v28.2d, v4.2d +; CHECK-NEXT: add v19.2d, v19.2d, v4.2d +; CHECK-NEXT: add v6.2d, v6.2d, v4.2d +; CHECK-NEXT: add v1.2d, v1.2d, v4.2d +; CHECK-NEXT: ldp q4, q7, [sp, #16] // 32-byte Folded Reload +; CHECK-NEXT: add v10.2d, v10.2d, v14.2d +; CHECK-NEXT: add v29.2d, v29.2d, v14.2d +; CHECK-NEXT: add v27.2d, v27.2d, v14.2d +; CHECK-NEXT: add v23.2d, v23.2d, v14.2d +; CHECK-NEXT: add v22.2d, v22.2d, v14.2d +; CHECK-NEXT: add v20.2d, v20.2d, v14.2d +; CHECK-NEXT: add v16.2d, v16.2d, v14.2d +; CHECK-NEXT: add v7.2d, v7.2d, v14.2d +; CHECK-NEXT: add v5.2d, v5.2d, v14.2d +; CHECK-NEXT: add v3.2d, v3.2d, v14.2d +; CHECK-NEXT: add v2.2d, v2.2d, v14.2d ; CHECK-NEXT: add v12.2d, v12.2d, v0.2d -; CHECK-NEXT: add v24.2d, v24.2d, v0.2d -; CHECK-NEXT: add v22.2d, v26.2d, v0.2d -; CHECK-NEXT: add v20.2d, v20.2d, v0.2d -; CHECK-NEXT: add v18.2d, v18.2d, v0.2d +; CHECK-NEXT: add v25.2d, v25.2d, v0.2d +; CHECK-NEXT: add v21.2d, v21.2d, v0.2d ; CHECK-NEXT: add v17.2d, v17.2d, v0.2d -; CHECK-NEXT: add v7.2d, v7.2d, v0.2d -; CHECK-NEXT: add v4.2d, v16.2d, v0.2d -; CHECK-NEXT: add 
v3.2d, v3.2d, v0.2d -; CHECK-NEXT: mov v0.16b, v21.16b -; CHECK-NEXT: mov v21.16b, v29.16b -; CHECK-NEXT: ldr q29, [sp] // 16-byte Folded Reload -; CHECK-NEXT: add v9.2d, v9.2d, v1.2d -; CHECK-NEXT: add v6.2d, v25.2d, v1.2d -; CHECK-NEXT: add v5.2d, v5.2d, v1.2d -; CHECK-NEXT: add v29.2d, v29.2d, v1.2d -; CHECK-NEXT: add v21.2d, v21.2d, v1.2d -; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: add v4.2d, v4.2d, v0.2d +; CHECK-NEXT: add v0.2d, v18.2d, v0.2d +; CHECK-NEXT: mov x14, x13 ; CHECK-NEXT: b.ne .LBB0_1 ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup -; CHECK-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp q28, q18, [sp, #64] // 32-byte Folded Reload ; CHECK-NEXT: adrp x8, C ; CHECK-NEXT: add x8, x8, :lo12:C -; CHECK-NEXT: stp q11, q30, [x8, #80] -; CHECK-NEXT: ldp x20, x19, [sp, #192] // 16-byte Folded Reload -; CHECK-NEXT: str q1, [x8] -; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x23, [sp, #160] // 8-byte Folded Reload -; CHECK-NEXT: stp q15, q14, [x8, #144] -; CHECK-NEXT: ldp x22, x21, [sp, #176] // 16-byte Folded Reload -; CHECK-NEXT: stp q1, q13, [x8, #16] -; CHECK-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: stp q28, q12, [x8, #176] -; CHECK-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload -; CHECK-NEXT: stp q1, q31, [x8, #48] -; CHECK-NEXT: ldp d15, d14, [sp, #96] // 16-byte Folded Reload -; CHECK-NEXT: stp q9, q24, [x8, #240] -; CHECK-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload -; CHECK-NEXT: stp q19, q18, [x8, #336] -; CHECK-NEXT: stp q10, q7, [x8, #400] -; CHECK-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload -; CHECK-NEXT: str q29, [x8, #112] -; CHECK-NEXT: str q27, [x8, #208] -; CHECK-NEXT: stp q23, q22, [x8, #272] +; CHECK-NEXT: ldp x20, x19, [sp, #176] // 16-byte Folded Reload +; CHECK-NEXT: stp q10, q13, [x8, #64] +; CHECK-NEXT: stp q28, q18, [x8] +; CHECK-NEXT: ldr q18, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: stp q29, q12, [x8, #96] +; CHECK-NEXT: ldp d13, d12, [sp, #128] // 16-byte Folded Reload +; CHECK-NEXT: stp q18, q15, [x8, #32] +; CHECK-NEXT: ldp d15, d14, [sp, #112] // 16-byte Folded Reload +; CHECK-NEXT: stp q11, q8, [x8, #144] +; CHECK-NEXT: ldp d9, d8, [sp, #160] // 16-byte Folded Reload +; CHECK-NEXT: stp q30, q27, [x8, #176] +; CHECK-NEXT: ldp d11, d10, [sp, #144] // 16-byte Folded Reload +; CHECK-NEXT: str q26, [x8, #208] +; CHECK-NEXT: stp q25, q23, [x8, #240] +; CHECK-NEXT: stp q24, q22, [x8, #272] ; CHECK-NEXT: stp q21, q20, [x8, #304] -; CHECK-NEXT: stp q6, q17, [x8, #368] -; CHECK-NEXT: stp q5, q4, [x8, #432] -; CHECK-NEXT: stp q2, q3, [x8, #464] +; CHECK-NEXT: stp q19, q7, [x8, #336] +; CHECK-NEXT: stp q17, q16, [x8, #368] +; CHECK-NEXT: stp q6, q5, [x8, #400] +; CHECK-NEXT: stp q4, q3, [x8, #432] +; CHECK-NEXT: stp q1, q2, [x8, #464] ; CHECK-NEXT: str q0, [x8, #496] -; CHECK-NEXT: add sp, sp, #208 +; CHECK-NEXT: add sp, sp, #192 ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: .cfi_restore w19 ; CHECK-NEXT: .cfi_restore w20 -; CHECK-NEXT: .cfi_restore w21 -; CHECK-NEXT: .cfi_restore w22 -; CHECK-NEXT: .cfi_restore w23 ; CHECK-NEXT: .cfi_restore b8 ; CHECK-NEXT: .cfi_restore b9 ; CHECK-NEXT: .cfi_restore b10 diff --git a/llvm/test/CodeGen/AArch64/scalarize-vector-load.ll b/llvm/test/CodeGen/AArch64/scalarize-vector-load.ll index eb3a0391eb79e..0ed29b48cf2f8 100644 --- a/llvm/test/CodeGen/AArch64/scalarize-vector-load.ll +++ b/llvm/test/CodeGen/AArch64/scalarize-vector-load.ll @@ -4,36 +4,35 @@ define i8 @scalarize_v16i8(ptr %p) { ; 
CHECK-LABEL: scalarize_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: umov w8, v0.b[0] -; CHECK-NEXT: umov w9, v0.b[1] -; CHECK-NEXT: umov w10, v0.b[2] -; CHECK-NEXT: umov w11, v0.b[3] -; CHECK-NEXT: umov w12, v0.b[4] -; CHECK-NEXT: umov w13, v0.b[5] -; CHECK-NEXT: umov w14, v0.b[6] -; CHECK-NEXT: umov w15, v0.b[7] -; CHECK-NEXT: umov w16, v0.b[8] -; CHECK-NEXT: umov w17, v0.b[9] -; CHECK-NEXT: umov w18, v0.b[10] -; CHECK-NEXT: umov w0, v0.b[11] -; CHECK-NEXT: umov w1, v0.b[12] -; CHECK-NEXT: umov w2, v0.b[13] -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: umov w3, v0.b[14] -; CHECK-NEXT: umov w4, v0.b[15] -; CHECK-NEXT: add w9, w10, w11 -; CHECK-NEXT: add w10, w12, w13 -; CHECK-NEXT: add w11, w14, w15 +; CHECK-NEXT: ldrb w8, [x0, #3] +; CHECK-NEXT: ldrb w9, [x0, #2] +; CHECK-NEXT: ldrb w10, [x0, #1] +; CHECK-NEXT: ldrb w11, [x0] +; CHECK-NEXT: ldrb w13, [x0, #5] +; CHECK-NEXT: ldrb w14, [x0, #4] +; CHECK-NEXT: add w8, w9, w8 +; CHECK-NEXT: ldrb w12, [x0, #15] +; CHECK-NEXT: ldrb w15, [x0, #11] +; CHECK-NEXT: add w10, w11, w10 +; CHECK-NEXT: add w9, w14, w13 +; CHECK-NEXT: ldrb w11, [x0, #10] +; CHECK-NEXT: ldrb w13, [x0, #9] +; CHECK-NEXT: add w8, w10, w8 +; CHECK-NEXT: ldrb w14, [x0, #8] +; CHECK-NEXT: ldrb w16, [x0, #7] +; CHECK-NEXT: add w11, w11, w15 +; CHECK-NEXT: ldrb w17, [x0, #6] +; CHECK-NEXT: ldrb w18, [x0, #14] +; CHECK-NEXT: add w13, w14, w13 +; CHECK-NEXT: ldrb w1, [x0, #13] +; CHECK-NEXT: ldrb w0, [x0, #12] +; CHECK-NEXT: add w16, w17, w16 +; CHECK-NEXT: add w10, w13, w11 +; CHECK-NEXT: add w12, w18, w12 +; CHECK-NEXT: add w9, w9, w16 +; CHECK-NEXT: add w14, w0, w1 ; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: add w12, w16, w17 -; CHECK-NEXT: add w13, w18, w0 -; CHECK-NEXT: add w9, w10, w11 -; CHECK-NEXT: add w14, w1, w2 -; CHECK-NEXT: add w10, w12, w13 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: add w15, w3, w4 -; CHECK-NEXT: add w11, w14, w15 +; CHECK-NEXT: add w11, w14, w12 ; CHECK-NEXT: add w9, w10, w11 ; CHECK-NEXT: add w0, w8, w9 ; CHECK-NEXT: ret @@ -75,22 +74,21 @@ define i8 @scalarize_v16i8(ptr %p) { define i8 @scalarize_v8i8(ptr %p) { ; CHECK-LABEL: scalarize_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: umov w8, v0.b[0] -; CHECK-NEXT: umov w9, v0.b[1] -; CHECK-NEXT: umov w10, v0.b[2] -; CHECK-NEXT: umov w11, v0.b[3] -; CHECK-NEXT: umov w12, v0.b[4] -; CHECK-NEXT: umov w13, v0.b[5] -; CHECK-NEXT: umov w14, v0.b[6] -; CHECK-NEXT: umov w15, v0.b[7] -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: add w9, w10, w11 -; CHECK-NEXT: add w10, w12, w13 -; CHECK-NEXT: add w11, w14, w15 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: add w9, w10, w11 -; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: ldrb w8, [x0, #7] +; CHECK-NEXT: ldrb w9, [x0, #6] +; CHECK-NEXT: ldrb w10, [x0, #5] +; CHECK-NEXT: ldrb w11, [x0, #1] +; CHECK-NEXT: ldrb w12, [x0] +; CHECK-NEXT: ldrb w13, [x0, #4] +; CHECK-NEXT: add w8, w9, w8 +; CHECK-NEXT: ldrb w14, [x0, #3] +; CHECK-NEXT: ldrb w15, [x0, #2] +; CHECK-NEXT: add w11, w12, w11 +; CHECK-NEXT: add w10, w13, w10 +; CHECK-NEXT: add w12, w15, w14 +; CHECK-NEXT: add w8, w10, w8 +; CHECK-NEXT: add w9, w11, w12 +; CHECK-NEXT: add w0, w9, w8 ; CHECK-NEXT: ret %wide.load = load <8 x i8>, ptr %p, align 4 %l0 = extractelement <8 x i8> %wide.load, i32 0 @@ -114,22 +112,21 @@ define i8 @scalarize_v8i8(ptr %p) { define i16 @scalarize_v8i16(ptr %p) { ; CHECK-LABEL: scalarize_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: umov w9, v0.h[1] -; CHECK-NEXT: umov w10, v0.h[2] -; 
CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: umov w12, v0.h[4] -; CHECK-NEXT: umov w13, v0.h[5] -; CHECK-NEXT: umov w14, v0.h[6] -; CHECK-NEXT: umov w15, v0.h[7] -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: add w9, w10, w11 -; CHECK-NEXT: add w10, w12, w13 -; CHECK-NEXT: add w11, w14, w15 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: add w9, w10, w11 -; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: ldrh w8, [x0, #14] +; CHECK-NEXT: ldrh w9, [x0, #12] +; CHECK-NEXT: ldrh w10, [x0, #10] +; CHECK-NEXT: ldrh w11, [x0, #2] +; CHECK-NEXT: ldrh w12, [x0] +; CHECK-NEXT: ldrh w13, [x0, #8] +; CHECK-NEXT: add w8, w9, w8 +; CHECK-NEXT: ldrh w14, [x0, #6] +; CHECK-NEXT: ldrh w15, [x0, #4] +; CHECK-NEXT: add w11, w12, w11 +; CHECK-NEXT: add w10, w13, w10 +; CHECK-NEXT: add w12, w15, w14 +; CHECK-NEXT: add w8, w10, w8 +; CHECK-NEXT: add w9, w11, w12 +; CHECK-NEXT: add w0, w9, w8 ; CHECK-NEXT: ret %wide.load = load <8 x i16>, ptr %p, align 4 %l0 = extractelement <8 x i16> %wide.load, i32 0 @@ -153,14 +150,13 @@ define i16 @scalarize_v8i16(ptr %p) { define i16 @scalarize_v4i16(ptr %p) { ; CHECK-LABEL: scalarize_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: umov w9, v0.h[1] -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: add w9, w10, w11 -; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: ldrh w8, [x0, #6] +; CHECK-NEXT: ldrh w9, [x0, #4] +; CHECK-NEXT: ldrh w10, [x0, #2] +; CHECK-NEXT: ldrh w11, [x0] +; CHECK-NEXT: add w8, w9, w8 +; CHECK-NEXT: add w10, w11, w10 +; CHECK-NEXT: add w0, w10, w8 ; CHECK-NEXT: ret %wide.load = load <4 x i16>, ptr %p, align 4 %l0 = extractelement <4 x i16> %wide.load, i32 0 @@ -176,13 +172,10 @@ define i16 @scalarize_v4i16(ptr %p) { define i32 @scalarize_v4i32(ptr %p) { ; CHECK-LABEL: scalarize_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: mov w9, v0.s[2] -; CHECK-NEXT: mov w10, v0.s[3] -; CHECK-NEXT: fmov w11, s0 -; CHECK-NEXT: add w8, w11, w8 -; CHECK-NEXT: add w9, w9, w10 +; CHECK-NEXT: ldp w9, w8, [x0] +; CHECK-NEXT: ldp w10, w11, [x0, #8] +; CHECK-NEXT: add w8, w9, w8 +; CHECK-NEXT: add w9, w10, w11 ; CHECK-NEXT: add w0, w8, w9 ; CHECK-NEXT: ret %wide.load = load <4 x i32>, ptr %p, align 4 @@ -199,11 +192,10 @@ define i32 @scalarize_v4i32(ptr %p) { define i64 @scalarize_v4i64(ptr %p) { ; CHECK-LABEL: scalarize_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: addp d1, v1.2d -; CHECK-NEXT: addp d0, v0.2d -; CHECK-NEXT: fmov x8, d1 -; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: ldp x8, x9, [x0] +; CHECK-NEXT: ldp x10, x11, [x0, #16] +; CHECK-NEXT: add x8, x8, x9 +; CHECK-NEXT: add x9, x10, x11 ; CHECK-NEXT: add x0, x8, x9 ; CHECK-NEXT: ret %wide.load = load <4 x i64>, ptr %p, align 4 @@ -220,14 +212,11 @@ define i64 @scalarize_v4i64(ptr %p) { define i64 @scalarize_v4i32_sext(ptr %p) { ; CHECK-LABEL: scalarize_v4i32_sext: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: sshll2 v1.2d, v0.4s, #0 -; CHECK-NEXT: sshll v0.2d, v0.2s, #0 -; CHECK-NEXT: addp d0, v0.2d -; CHECK-NEXT: addp d1, v1.2d -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: fmov x9, d1 -; CHECK-NEXT: add x0, x8, x9 +; CHECK-NEXT: ldpsw x9, x8, [x0, #8] +; CHECK-NEXT: ldpsw x11, x10, [x0] +; CHECK-NEXT: add x8, x9, x8 +; CHECK-NEXT: add x10, x11, x10 +; CHECK-NEXT: add x0, x10, x8 ; CHECK-NEXT: ret %wide.load = load <4 x i32>, ptr %p, align 4 %ext = sext <4 x i32> %wide.load to <4 x i64> @@ -244,14 +233,11 @@ define i64 
@scalarize_v4i32_sext(ptr %p) {
 define i64 @scalarize_v4i32_zext(ptr %p) {
 ; CHECK-LABEL: scalarize_v4i32_zext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    ushll2 v1.2d, v0.4s, #0
-; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
-; CHECK-NEXT:    addp d0, v0.2d
-; CHECK-NEXT:    addp d1, v1.2d
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    fmov x9, d1
-; CHECK-NEXT:    add x0, x8, x9
+; CHECK-NEXT:    ldp w9, w8, [x0, #8]
+; CHECK-NEXT:    ldp w11, w10, [x0]
+; CHECK-NEXT:    add x8, x9, x8
+; CHECK-NEXT:    add x10, x11, x10
+; CHECK-NEXT:    add x0, x10, x8
 ; CHECK-NEXT:    ret
   %wide.load = load <4 x i32>, ptr %p, align 4
   %ext = zext <4 x i32> %wide.load to <4 x i64>
@@ -340,55 +326,43 @@ define double @scalarize_v4f64(ptr %p) {
 define float @scalarize_into_load(i64 %22, ptr %23, ptr %rawA, ptr %rawB) {
 ; CHECK-LABEL: scalarize_into_load:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp q1, q0, [x1]
-; CHECK-NEXT:    ldp q3, q2, [x1, #96]
-; CHECK-NEXT:    ldp q5, q4, [x1, #64]
-; CHECK-NEXT:    ldp q7, q6, [x1, #32]
-; CHECK-NEXT:    mov x8, v1.d[1]
-; CHECK-NEXT:    mov x10, v0.d[1]
-; CHECK-NEXT:    mov x1, v3.d[1]
-; CHECK-NEXT:    mov x4, v2.d[1]
-; CHECK-NEXT:    mov x16, v5.d[1]
-; CHECK-NEXT:    mov x18, v4.d[1]
-; CHECK-NEXT:    fmov x9, d1
-; CHECK-NEXT:    mov x12, v7.d[1]
-; CHECK-NEXT:    mov x14, v6.d[1]
-; CHECK-NEXT:    fmov x11, d0
-; CHECK-NEXT:    fmov x13, d7
-; CHECK-NEXT:    fmov x15, d6
-; CHECK-NEXT:    fmov x17, d5
-; CHECK-NEXT:    fmov x0, d4
-; CHECK-NEXT:    fmov x3, d3
-; CHECK-NEXT:    fmov x5, d2
-; CHECK-NEXT:    ldr s0, [x2, x9, lsl #2]
-; CHECK-NEXT:    ldr s1, [x2, x8, lsl #2]
-; CHECK-NEXT:    ldr s2, [x2, x11, lsl #2]
-; CHECK-NEXT:    ldr s3, [x2, x10, lsl #2]
-; CHECK-NEXT:    ldr s4, [x2, x13, lsl #2]
-; CHECK-NEXT:    ldr s5, [x2, x12, lsl #2]
-; CHECK-NEXT:    ldr s6, [x2, x15, lsl #2]
-; CHECK-NEXT:    ldr s7, [x2, x14, lsl #2]
-; CHECK-NEXT:    ldr s16, [x2, x17, lsl #2]
-; CHECK-NEXT:    ldr s17, [x2, x16, lsl #2]
-; CHECK-NEXT:    ldr s18, [x2, x0, lsl #2]
-; CHECK-NEXT:    ldr s19, [x2, x18, lsl #2]
-; CHECK-NEXT:    ldr s20, [x2, x3, lsl #2]
-; CHECK-NEXT:    ldr s21, [x2, x1, lsl #2]
-; CHECK-NEXT:    ldr s22, [x2, x5, lsl #2]
-; CHECK-NEXT:    ldr s23, [x2, x4, lsl #2]
+; CHECK-NEXT:    ldp x8, x9, [x1]
+; CHECK-NEXT:    ldp x10, x11, [x1, #16]
+; CHECK-NEXT:    ldp x12, x13, [x1, #64]
+; CHECK-NEXT:    ldr s0, [x2, x8, lsl #2]
+; CHECK-NEXT:    ldr s1, [x2, x9, lsl #2]
+; CHECK-NEXT:    ldp x8, x9, [x1, #32]
+; CHECK-NEXT:    ldr s2, [x2, x10, lsl #2]
+; CHECK-NEXT:    ldr s3, [x2, x11, lsl #2]
 ; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    ldr s6, [x2, x12, lsl #2]
+; CHECK-NEXT:    ldp x10, x11, [x1, #48]
+; CHECK-NEXT:    ldr s7, [x2, x13, lsl #2]
 ; CHECK-NEXT:    fadd s1, s2, s3
-; CHECK-NEXT:    fadd s2, s4, s5
-; CHECK-NEXT:    fadd s3, s6, s7
-; CHECK-NEXT:    fadd s4, s16, s17
-; CHECK-NEXT:    fadd s5, s18, s19
-; CHECK-NEXT:    fadd s6, s20, s21
-; CHECK-NEXT:    fadd s7, s22, s23
+; CHECK-NEXT:    ldr s2, [x2, x8, lsl #2]
+; CHECK-NEXT:    ldr s3, [x2, x9, lsl #2]
+; CHECK-NEXT:    ldp x14, x15, [x1, #80]
+; CHECK-NEXT:    fadd s2, s2, s3
+; CHECK-NEXT:    ldr s4, [x2, x10, lsl #2]
+; CHECK-NEXT:    ldr s5, [x2, x11, lsl #2]
+; CHECK-NEXT:    ldp x16, x17, [x1, #96]
+; CHECK-NEXT:    fadd s3, s4, s5
+; CHECK-NEXT:    fadd s4, s6, s7
 ; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    ldp x18, x0, [x1, #112]
+; CHECK-NEXT:    ldr s16, [x2, x14, lsl #2]
+; CHECK-NEXT:    ldr s17, [x2, x15, lsl #2]
+; CHECK-NEXT:    ldr s18, [x2, x16, lsl #2]
+; CHECK-NEXT:    ldr s19, [x2, x17, lsl #2]
+; CHECK-NEXT:    ldr s20, [x2, x18, lsl #2]
+; CHECK-NEXT:    ldr s21, [x2, x0, lsl #2]
+; CHECK-NEXT:    fadd s5, s16, s17
+; CHECK-NEXT:    fadd s6, s18, s19
 ; CHECK-NEXT:    fadd s1, s2, s3
+; CHECK-NEXT:    fadd s7, s20, s21
 ; CHECK-NEXT:    fadd s2, s4, s5
-; CHECK-NEXT:    fadd s3, s6, s7
 ; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    fadd s3, s6, s7
 ; CHECK-NEXT:    fadd s1, s2, s3
 ; CHECK-NEXT:    fadd s0, s0, s1
 ; CHECK-NEXT:    ret
@@ -463,57 +437,39 @@ entry:
 define float @scalarize_into_load_sext(i64 %22, ptr %23, ptr %rawA, ptr %rawB) {
 ; CHECK-LABEL: scalarize_into_load_sext:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp q0, q2, [x1]
-; CHECK-NEXT:    ldp q4, q1, [x1, #32]
-; CHECK-NEXT:    sshll v3.2d, v0.2s, #0
-; CHECK-NEXT:    sshll2 v0.2d, v0.4s, #0
-; CHECK-NEXT:    sshll2 v6.2d, v2.4s, #0
-; CHECK-NEXT:    sshll2 v5.2d, v1.4s, #0
-; CHECK-NEXT:    sshll v1.2d, v1.2s, #0
-; CHECK-NEXT:    sshll v2.2d, v2.2s, #0
-; CHECK-NEXT:    sshll2 v7.2d, v4.4s, #0
-; CHECK-NEXT:    sshll v4.2d, v4.2s, #0
-; CHECK-NEXT:    mov x8, v3.d[1]
-; CHECK-NEXT:    mov x10, v0.d[1]
-; CHECK-NEXT:    mov x14, v6.d[1]
-; CHECK-NEXT:    mov x12, v2.d[1]
-; CHECK-NEXT:    mov x1, v1.d[1]
-; CHECK-NEXT:    mov x4, v5.d[1]
-; CHECK-NEXT:    mov x16, v4.d[1]
-; CHECK-NEXT:    mov x18, v7.d[1]
-; CHECK-NEXT:    fmov x9, d3
-; CHECK-NEXT:    fmov x11, d0
-; CHECK-NEXT:    fmov x13, d2
-; CHECK-NEXT:    fmov x15, d6
-; CHECK-NEXT:    fmov x17, d4
-; CHECK-NEXT:    fmov x0, d7
-; CHECK-NEXT:    ldr s2, [x2, x8, lsl #2]
-; CHECK-NEXT:    fmov x3, d1
-; CHECK-NEXT:    fmov x5, d5
+; CHECK-NEXT:    ldpsw x9, x8, [x1]
+; CHECK-NEXT:    ldpsw x11, x10, [x1, #8]
+; CHECK-NEXT:    ldpsw x13, x12, [x1, #24]
 ; CHECK-NEXT:    ldr s0, [x2, x9, lsl #2]
-; CHECK-NEXT:    ldr s1, [x2, x11, lsl #2]
+; CHECK-NEXT:    ldr s1, [x2, x8, lsl #2]
+; CHECK-NEXT:    ldpsw x9, x8, [x1, #56]
+; CHECK-NEXT:    ldr s2, [x2, x11, lsl #2]
 ; CHECK-NEXT:    ldr s3, [x2, x10, lsl #2]
+; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    ldpsw x11, x10, [x1, #48]
+; CHECK-NEXT:    ldpsw x15, x14, [x1, #16]
+; CHECK-NEXT:    ldpsw x17, x16, [x1, #40]
+; CHECK-NEXT:    ldpsw x0, x18, [x1, #32]
+; CHECK-NEXT:    fadd s1, s2, s3
+; CHECK-NEXT:    ldr s2, [x2, x15, lsl #2]
+; CHECK-NEXT:    ldr s3, [x2, x14, lsl #2]
 ; CHECK-NEXT:    ldr s4, [x2, x13, lsl #2]
 ; CHECK-NEXT:    ldr s5, [x2, x12, lsl #2]
-; CHECK-NEXT:    ldr s6, [x2, x15, lsl #2]
-; CHECK-NEXT:    ldr s7, [x2, x14, lsl #2]
 ; CHECK-NEXT:    ldr s16, [x2, x17, lsl #2]
+; CHECK-NEXT:    ldr s6, [x2, x0, lsl #2]
+; CHECK-NEXT:    fadd s2, s2, s3
+; CHECK-NEXT:    ldr s7, [x2, x18, lsl #2]
 ; CHECK-NEXT:    ldr s17, [x2, x16, lsl #2]
-; CHECK-NEXT:    ldr s18, [x2, x0, lsl #2]
-; CHECK-NEXT:    ldr s19, [x2, x18, lsl #2]
-; CHECK-NEXT:    ldr s20, [x2, x3, lsl #2]
-; CHECK-NEXT:    ldr s21, [x2, x1, lsl #2]
-; CHECK-NEXT:    ldr s22, [x2, x5, lsl #2]
-; CHECK-NEXT:    ldr s23, [x2, x4, lsl #2]
-; CHECK-NEXT:    fadd s0, s0, s2
-; CHECK-NEXT:    fadd s1, s1, s3
-; CHECK-NEXT:    fadd s2, s4, s5
-; CHECK-NEXT:    fadd s3, s6, s7
-; CHECK-NEXT:    fadd s4, s16, s17
-; CHECK-NEXT:    fadd s5, s18, s19
-; CHECK-NEXT:    fadd s6, s20, s21
-; CHECK-NEXT:    fadd s7, s22, s23
+; CHECK-NEXT:    fadd s3, s4, s5
+; CHECK-NEXT:    ldr s18, [x2, x11, lsl #2]
+; CHECK-NEXT:    ldr s19, [x2, x10, lsl #2]
+; CHECK-NEXT:    fadd s4, s6, s7
 ; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    ldr s20, [x2, x9, lsl #2]
+; CHECK-NEXT:    ldr s21, [x2, x8, lsl #2]
+; CHECK-NEXT:    fadd s5, s16, s17
+; CHECK-NEXT:    fadd s6, s18, s19
+; CHECK-NEXT:    fadd s7, s20, s21
 ; CHECK-NEXT:    fadd s1, s2, s3
 ; CHECK-NEXT:    fadd s2, s4, s5
 ; CHECK-NEXT:    fadd s3, s6, s7
@@ -593,57 +549,39 @@ entry:
 define float @scalarize_into_load_zext(i64 %22, ptr %23, ptr %rawA, ptr %rawB) {
 ; CHECK-LABEL: scalarize_into_load_zext:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldp q0, q2, [x1]
-; CHECK-NEXT:    ldp q4, q1, [x1, #32]
-; CHECK-NEXT:    ushll v3.2d, v0.2s, #0
-; CHECK-NEXT:    ushll2 v0.2d, v0.4s, #0
-; CHECK-NEXT:    ushll2 v6.2d, v2.4s, #0
-; CHECK-NEXT:    ushll2 v5.2d, v1.4s, #0
-; CHECK-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-NEXT:    ushll v2.2d, v2.2s, #0
-; CHECK-NEXT:    ushll2 v7.2d, v4.4s, #0
-; CHECK-NEXT:    ushll v4.2d, v4.2s, #0
-; CHECK-NEXT:    mov x8, v3.d[1]
-; CHECK-NEXT:    mov x10, v0.d[1]
-; CHECK-NEXT:    mov x14, v6.d[1]
-; CHECK-NEXT:    mov x12, v2.d[1]
-; CHECK-NEXT:    mov x1, v1.d[1]
-; CHECK-NEXT:    mov x4, v5.d[1]
-; CHECK-NEXT:    mov x16, v4.d[1]
-; CHECK-NEXT:    mov x18, v7.d[1]
-; CHECK-NEXT:    fmov x9, d3
-; CHECK-NEXT:    fmov x11, d0
-; CHECK-NEXT:    fmov x13, d2
-; CHECK-NEXT:    fmov x15, d6
-; CHECK-NEXT:    fmov x17, d4
-; CHECK-NEXT:    fmov x0, d7
-; CHECK-NEXT:    ldr s2, [x2, x8, lsl #2]
-; CHECK-NEXT:    fmov x3, d1
-; CHECK-NEXT:    fmov x5, d5
+; CHECK-NEXT:    ldp w9, w8, [x1]
+; CHECK-NEXT:    ldp w11, w10, [x1, #8]
+; CHECK-NEXT:    ldp w13, w12, [x1, #24]
 ; CHECK-NEXT:    ldr s0, [x2, x9, lsl #2]
-; CHECK-NEXT:    ldr s1, [x2, x11, lsl #2]
+; CHECK-NEXT:    ldr s1, [x2, x8, lsl #2]
+; CHECK-NEXT:    ldp w9, w8, [x1, #56]
+; CHECK-NEXT:    ldr s2, [x2, x11, lsl #2]
 ; CHECK-NEXT:    ldr s3, [x2, x10, lsl #2]
+; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    ldp w11, w10, [x1, #48]
+; CHECK-NEXT:    ldp w15, w14, [x1, #16]
+; CHECK-NEXT:    ldp w17, w16, [x1, #40]
+; CHECK-NEXT:    ldp w0, w18, [x1, #32]
+; CHECK-NEXT:    fadd s1, s2, s3
+; CHECK-NEXT:    ldr s2, [x2, x15, lsl #2]
+; CHECK-NEXT:    ldr s3, [x2, x14, lsl #2]
 ; CHECK-NEXT:    ldr s4, [x2, x13, lsl #2]
 ; CHECK-NEXT:    ldr s5, [x2, x12, lsl #2]
-; CHECK-NEXT:    ldr s6, [x2, x15, lsl #2]
-; CHECK-NEXT:    ldr s7, [x2, x14, lsl #2]
 ; CHECK-NEXT:    ldr s16, [x2, x17, lsl #2]
+; CHECK-NEXT:    ldr s6, [x2, x0, lsl #2]
+; CHECK-NEXT:    fadd s2, s2, s3
+; CHECK-NEXT:    ldr s7, [x2, x18, lsl #2]
 ; CHECK-NEXT:    ldr s17, [x2, x16, lsl #2]
-; CHECK-NEXT:    ldr s18, [x2, x0, lsl #2]
-; CHECK-NEXT:    ldr s19, [x2, x18, lsl #2]
-; CHECK-NEXT:    ldr s20, [x2, x3, lsl #2]
-; CHECK-NEXT:    ldr s21, [x2, x1, lsl #2]
-; CHECK-NEXT:    ldr s22, [x2, x5, lsl #2]
-; CHECK-NEXT:    ldr s23, [x2, x4, lsl #2]
-; CHECK-NEXT:    fadd s0, s0, s2
-; CHECK-NEXT:    fadd s1, s1, s3
-; CHECK-NEXT:    fadd s2, s4, s5
-; CHECK-NEXT:    fadd s3, s6, s7
-; CHECK-NEXT:    fadd s4, s16, s17
-; CHECK-NEXT:    fadd s5, s18, s19
-; CHECK-NEXT:    fadd s6, s20, s21
-; CHECK-NEXT:    fadd s7, s22, s23
+; CHECK-NEXT:    fadd s3, s4, s5
+; CHECK-NEXT:    ldr s18, [x2, x11, lsl #2]
+; CHECK-NEXT:    ldr s19, [x2, x10, lsl #2]
+; CHECK-NEXT:    fadd s4, s6, s7
 ; CHECK-NEXT:    fadd s0, s0, s1
+; CHECK-NEXT:    ldr s20, [x2, x9, lsl #2]
+; CHECK-NEXT:    ldr s21, [x2, x8, lsl #2]
+; CHECK-NEXT:    fadd s5, s16, s17
+; CHECK-NEXT:    fadd s6, s18, s19
+; CHECK-NEXT:    fadd s7, s20, s21
 ; CHECK-NEXT:    fadd s1, s2, s3
 ; CHECK-NEXT:    fadd s2, s4, s5
 ; CHECK-NEXT:    fadd s3, s6, s7
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
index 505a40c16653b..d00efa7d99d53 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
@@ -374,8 +374,8 @@ define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:    bl foo
 ; CHECK-NEXT:    smstart sm
-; CHECK-NEXT:    ldr z0, [sp, #2, mul vl]
-; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    addvl x8, sp, #2
+; CHECK-NEXT:    ldrb w0, [x8]
 ; CHECK-NEXT:    addvl sp, sp, #3
 ; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
index f6ed2e6a787f0..ba7bee9a94bac 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
@@ -19,14 +19,12 @@ define <4 x i32> @load_zext_v4i16i32(ptr %ap) vscale_range(2,0) #0 {
 define <2 x i256> @load_zext_v2i64i256(ptr %ap) #0 {
 ; CHECK-LABEL: load_zext_v2i64i256:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldp x0, x4, [x0]
 ; CHECK-NEXT:    mov x1, xzr
 ; CHECK-NEXT:    mov x2, xzr
 ; CHECK-NEXT:    mov x3, xzr
 ; CHECK-NEXT:    mov x5, xzr
 ; CHECK-NEXT:    mov x6, xzr
-; CHECK-NEXT:    mov x4, v0.d[1]
-; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    mov x7, xzr
 ; CHECK-NEXT:    ret
   %a = load <2 x i64>, ptr %ap
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
index ebd32c73ec65b..6fd5b820a2242 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll
@@ -438,8 +438,7 @@ define void @masked_gather_v32i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
 define void @masked_gather_v1i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
 ; CHECK-LABEL: masked_gather_v1i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
-; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    // implicit-def: $d0
 ; CHECK-NEXT:    cbnz x8, .LBB15_2
 ; CHECK-NEXT:  // %bb.1: // %cond.load
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
index a69808d32ed73..4f5a5a6dee257 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll
@@ -727,8 +727,8 @@ define void @load_splat_v4f64(ptr %a, ptr %b) vscale_range(2,2) #0 {
 define void @load_splat_v32i8(ptr %a, ptr %b) vscale_range(2,2) #0 {
 ; CHECK-LABEL: load_splat_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr z0, [x0]
-; CHECK-NEXT:    mov z0.b, b0
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1rb { z0.b }, p0/z, [x0]
 ; CHECK-NEXT:    str z0, [x1]
 ; CHECK-NEXT:    ret
   %v = load <32 x i8>, ptr %a
@@ -740,8 +740,8 @@ define void @load_splat_v32i8(ptr %a, ptr %b) vscale_range(2,2) #0 {
 define void @load_splat_v16i16(ptr %a, ptr %b) vscale_range(2,2) #0 {
 ; CHECK-LABEL: load_splat_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr z0, [x0]
-; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x0]
 ; CHECK-NEXT:    str z0, [x1]
 ; CHECK-NEXT:    ret
   %v = load <16 x i16>, ptr %a
@@ -753,8 +753,8 @@ define void @load_splat_v16i16(ptr %a, ptr %b) vscale_range(2,2) #0 {
 define void @load_splat_v8i32(ptr %a, ptr %b) vscale_range(2,2) #0 {
 ; CHECK-LABEL: load_splat_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr z0, [x0]
-; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x0]
 ; CHECK-NEXT:    str z0, [x1]
 ; CHECK-NEXT:    ret
   %v = load <8 x i32>, ptr %a
@@ -766,8 +766,8 @@ define void @load_splat_v8i32(ptr %a, ptr %b) vscale_range(2,2) #0 {
 define void @load_splat_v4i64(ptr %a, ptr %b) vscale_range(2,2) #0 {
 ; CHECK-LABEL: load_splat_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr z0, [x0]
-; CHECK-NEXT:    mov z0.d, d0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    str z0, [x1]
 ; CHECK-NEXT:    ret
   %v = load <4 x i64>, ptr %a
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
index 4d524bc848de6..e433786cfdd1f 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll
@@ -99,16 +99,14 @@ define <2 x i64> @load_zext_v2i32i64(ptr %ap) {
 define <2 x i256> @load_zext_v2i64i256(ptr %ap) {
 ; CHECK-LABEL: load_zext_v2i64i256:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ldp x8, x4, [x0]
 ; CHECK-NEXT:    mov x1, xzr
 ; CHECK-NEXT:    mov x2, xzr
 ; CHECK-NEXT:    mov x3, xzr
 ; CHECK-NEXT:    mov x5, xzr
 ; CHECK-NEXT:    mov x6, xzr
-; CHECK-NEXT:    mov z1.d, z0.d[1]
-; CHECK-NEXT:    fmov x0, d0
 ; CHECK-NEXT:    mov x7, xzr
-; CHECK-NEXT:    fmov x4, d1
+; CHECK-NEXT:    mov x0, x8
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: load_zext_v2i64i256:
@@ -282,14 +280,12 @@ define <4 x i256> @load_sext_v4i32i256(ptr %ap) {
 define <2 x i256> @load_sext_v2i64i256(ptr %ap) {
 ; CHECK-LABEL: load_sext_v2i64i256:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0]
-; CHECK-NEXT:    mov z1.d, z0.d[1]
-; CHECK-NEXT:    fmov x0, d0
-; CHECK-NEXT:    fmov x4, d1
-; CHECK-NEXT:    asr x1, x0, #63
+; CHECK-NEXT:    ldp x8, x4, [x0]
+; CHECK-NEXT:    asr x1, x8, #63
+; CHECK-NEXT:    asr x5, x4, #63
+; CHECK-NEXT:    mov x0, x8
 ; CHECK-NEXT:    mov x2, x1
 ; CHECK-NEXT:    mov x3, x1
-; CHECK-NEXT:    asr x5, x4, #63
 ; CHECK-NEXT:    mov x6, x5
 ; CHECK-NEXT:    mov x7, x5
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
index e6c6003ee6c69..094eaad0cfe80 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll
@@ -115,9 +115,9 @@ define void @alloc_v32i8(ptr %st_ptr) nounwind {
 ; CHECK-NEXT:    adrp x8, .LCPI2_0
 ; CHECK-NEXT:    ldr q0, [sp]
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI2_0]
+; CHECK-NEXT:    ldrb w8, [sp, #16]
 ; CHECK-NEXT:    tbl z0.b, { z0.b }, z1.b
-; CHECK-NEXT:    ldr q1, [sp, #16]
-; CHECK-NEXT:    stur b1, [x19, #8]
+; CHECK-NEXT:    strb w8, [x19, #8]
 ; CHECK-NEXT:    str d0, [x19]
 ; CHECK-NEXT:    ldp x30, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #48
diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
index ea6123edc8b4c..7b9b69e0d9b4d 100644
--- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll
@@ -101,15 +101,13 @@ define <16 x i8> @shuffle_ext_byone_v16i8(<16 x i8> %op1, <16 x i8> %op2) {
 define void @shuffle_ext_byone_v32i8(ptr %a, ptr %b) {
 ; CHECK-LABEL: shuffle_ext_byone_v32i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0, #16]
-; CHECK-NEXT:    ldp q1, q3, [x1]
-; CHECK-NEXT:    mov z0.b, z0.b[15]
-; CHECK-NEXT:    mov z2.b, z1.b[15]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    insr z1.b, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    insr z3.b, w8
-; CHECK-NEXT:    stp q1, q3, [x0]
+; CHECK-NEXT:    ldp q0, q2, [x1]
+; CHECK-NEXT:    ldrb w8, [x0, #31]
+; CHECK-NEXT:    mov z1.b, z0.b[15]
+; CHECK-NEXT:    insr z0.b, w8
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    insr z2.b, w8
+; CHECK-NEXT:    stp q0, q2, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v32i8:
@@ -238,15 +236,13 @@ define <8 x i16> @shuffle_ext_byone_v8i16(<8 x i16> %op1, <8 x i16> %op2) {
 define void @shuffle_ext_byone_v16i16(ptr %a, ptr %b) {
 ; CHECK-LABEL: shuffle_ext_byone_v16i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0, #16]
-; CHECK-NEXT:    ldp q1, q3, [x1]
-; CHECK-NEXT:    mov z0.h, z0.h[7]
-; CHECK-NEXT:    mov z2.h, z1.h[7]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    insr z1.h, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    insr z3.h, w8
-; CHECK-NEXT:    stp q1, q3, [x0]
+; CHECK-NEXT:    ldp q0, q2, [x1]
+; CHECK-NEXT:    ldrh w8, [x0, #30]
+; CHECK-NEXT:    mov z1.h, z0.h[7]
+; CHECK-NEXT:    insr z0.h, w8
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    insr z2.h, w8
+; CHECK-NEXT:    stp q0, q2, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16i16:
@@ -341,15 +337,13 @@ define <4 x i32> @shuffle_ext_byone_v4i32(<4 x i32> %op1, <4 x i32> %op2) {
 define void @shuffle_ext_byone_v8i32(ptr %a, ptr %b) {
 ; CHECK-LABEL: shuffle_ext_byone_v8i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0, #16]
-; CHECK-NEXT:    ldp q1, q3, [x1]
-; CHECK-NEXT:    mov z0.s, z0.s[3]
-; CHECK-NEXT:    mov z2.s, z1.s[3]
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    insr z1.s, w8
-; CHECK-NEXT:    fmov w8, s2
-; CHECK-NEXT:    insr z3.s, w8
-; CHECK-NEXT:    stp q1, q3, [x0]
+; CHECK-NEXT:    ldp q0, q2, [x1]
+; CHECK-NEXT:    ldr w8, [x0, #28]
+; CHECK-NEXT:    mov z1.s, z0.s[3]
+; CHECK-NEXT:    insr z0.s, w8
+; CHECK-NEXT:    fmov w8, s1
+; CHECK-NEXT:    insr z2.s, w8
+; CHECK-NEXT:    stp q0, q2, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i32:
@@ -409,15 +403,13 @@ define <2 x i64> @shuffle_ext_byone_v2i64(<2 x i64> %op1, <2 x i64> %op2) {
 define void @shuffle_ext_byone_v4i64(ptr %a, ptr %b) {
 ; CHECK-LABEL: shuffle_ext_byone_v4i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr q0, [x0, #16]
-; CHECK-NEXT:    ldp q1, q3, [x1]
-; CHECK-NEXT:    mov z0.d, z0.d[1]
-; CHECK-NEXT:    mov z2.d, z1.d[1]
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    insr z1.d, x8
-; CHECK-NEXT:    fmov x8, d2
-; CHECK-NEXT:    insr z3.d, x8
-; CHECK-NEXT:    stp q1, q3, [x0]
+; CHECK-NEXT:    ldp q0, q2, [x1]
+; CHECK-NEXT:    ldr x8, [x0, #24]
+; CHECK-NEXT:    mov z1.d, z0.d[1]
+; CHECK-NEXT:    insr z0.d, x8
+; CHECK-NEXT:    fmov x8, d1
+; CHECK-NEXT:    insr z2.d, x8
+; CHECK-NEXT:    stp q0, q2, [x0]
 ; CHECK-NEXT:    ret
 ;
 ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i64:
diff --git a/llvm/test/CodeGen/AArch64/vector-compress.ll b/llvm/test/CodeGen/AArch64/vector-compress.ll
index 9165493863729..55c343164a1b8 100644
--- a/llvm/test/CodeGen/AArch64/vector-compress.ll
+++ b/llvm/test/CodeGen/AArch64/vector-compress.ll
@@ -462,10 +462,9 @@ define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i
 ; CHECK-NEXT:    orr x8, x9, x8, lsl #1
 ; CHECK-NEXT:    strh w1, [x10]
 ; CHECK-NEXT:    strh w2, [x8]
-; CHECK-NEXT:    ldr d0, [sp, #8]
-; CHECK-NEXT:    umov.h w0, v0[0]
-; CHECK-NEXT:    umov.h w1, v0[1]
-; CHECK-NEXT:    umov.h w2, v0[2]
+; CHECK-NEXT:    ldrh w0, [sp, #8]
+; CHECK-NEXT:    ldrh w1, [sp, #10]
+; CHECK-NEXT:    ldrh w2, [sp, #12]
 ; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
   %out = call <3 x i3> @llvm.experimental.vector.compress(<3 x i3> %vec, <3 x i1> %mask, <3 x i3> undef)